Update TensorRT-LLM #1725

Merged 1 commit on Jun 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ __pycache__/
*.nsys-rep
.VSCodeCounter
build*/
!builders/
*.egg-info/
.coverage
*.onnx
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -46,5 +46,5 @@ repos:
args:
- --skip=".git,3rdparty"
- --exclude-file=examples/whisper/tokenizer.py
- --ignore-words-list=rouge,inout,atleast,strat,nd
- --ignore-words-list=rouge,inout,atleast,strat,nd,subtile
exclude: 'tests/llm-test-defs/turtle/test_input_files'
3 changes: 3 additions & 0 deletions README.md
@@ -75,3 +75,6 @@ To get started with TensorRT-LLM, visit our documentation:
- [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
- [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
- [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)

## Community
- [Model zoo](https://huggingface.co/TheFloat16) (generated by TRT-LLM rel 0.9 a9356d4b7610330e89c1010f342a9ac644215c52)
14 changes: 6 additions & 8 deletions benchmarks/cpp/README.md
@@ -210,8 +210,10 @@ TP=2
PP=1
MAX_LEN=1024
MAX_BATCH=32
MAX_LORA_RANK=32
NUM_LAYERS=40
MAX_LORA_RANK=64
NUM_LORA_MODS=7
EOS_ID=2

SOURCE_LORA=chinese-llama-2-lora-13b
CPP_LORA=chinese-llama-2-lora-13b-cpp
@@ -234,7 +236,7 @@ ${HOME}/.local/bin/trtllm-build \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--lora_target_modules attn_qkv \
--lora_target_modules attn_q attn_k attn_v attn_dense mlp_h_to_4h mlp_4h_to_h mlp_gate \
--max_lora_rank ${MAX_LORA_RANK}

NUM_LORAS=(8 16 24 32 64 128 256)
@@ -252,8 +254,6 @@ mkdir -p $EG_DIR/data
# Prepare dataset without lora_task_id
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist.json" \
--request-rate -1 \
--time-delay-dist constant \
--tokenizer $TOKENIZER \
token-norm-dist \
--num-requests $NUM_REQUESTS \
@@ -263,8 +263,6 @@ python benchmarks/cpp/prepare_dataset.py \
for nloras in ${NUM_LORAS[@]}; do
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--request-rate -1 \
--time-delay-dist constant \
--rand-task-id 0 $(( $nloras - 1 )) \
--tokenizer $TOKENIZER \
token-norm-dist \
@@ -292,7 +290,7 @@ mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \

# Now run inference with various numbers of LoRAs
# The host cache is set large enough to hold all the LoRAs in lora_dir
# GPU cache is set to hold 32 LoRAs
# GPU cache is set to hold 16 LoRAs
# This benchmark will preload all the LoRAs into the host cache
# We run inference on a range of active LoRAs exercising different cache miss rates.
for nloras in ${NUM_LORAS[@]}; do
@@ -303,7 +301,7 @@ for nloras in ${NUM_LORAS[@]}; do
--type IFB \
--dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--lora_host_cache_bytes 8589934592 \
--lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--lora_num_device_mod_layers $(( 16 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--kv_cache_free_gpu_mem_fraction 0.80 \
--log_level info \
--eos_id ${EOS_ID} \
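Note: the updated benchmark targets all seven LoRA modules (`attn_q` through `mlp_gate`) and sizes the GPU LoRA cache for 16 adapters rather than 32. A minimal sketch of the arithmetic behind the new `--lora_num_device_mod_layers` value, reusing the variables defined earlier in this script (illustrative only, not part of the change):

```python
# Sketch of the GPU LoRA cache sizing used above (values from the README variables).
NUM_LAYERS = 40        # decoder layers in the 13B model
NUM_LORA_MODS = 7      # attn_q, attn_k, attn_v, attn_dense, mlp_h_to_4h, mlp_4h_to_h, mlp_gate
MAX_LORA_RANK = 64
GPU_CACHED_LORAS = 16  # the GPU cache is sized to hold 16 LoRAs

lora_num_device_mod_layers = GPU_CACHED_LORAS * NUM_LAYERS * NUM_LORA_MODS * MAX_LORA_RANK
print(lora_num_device_mod_layers)  # 286720
```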
32 changes: 26 additions & 6 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -458,10 +458,6 @@ class Recorder
{
this->recordEnd(requestId, hasError);

if (mRespJsonFile.empty())
return;
int32_t outputSeqLen;

for (auto& tensor : responseTensors)
{
if (tensor.name == inference_request::kOutputIdsTensorName)
@@ -471,7 +467,7 @@ class Recorder
else if (tensor.name == inference_request::kSequenceLengthTensorName)
{
// Tensor of shape nBeams, and we only need the first one
outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
int32_t outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
if (mOutputHasInput)
{
int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
@@ -482,6 +478,30 @@ class Recorder
}
}

void recordEnd(uint64_t requestId, texec::Response const& response)
{

this->recordEnd(requestId, response.hasError());

// Get the actual output length
if (!response.hasError())
{
auto outputTokenIds = response.getResult().outputTokenIds;

int32_t outSeqLen = 0;
for (auto const& beam : outputTokenIds)
{
outSeqLen = std::max(static_cast<int32_t>(beam.size()), outSeqLen);
}
if (mOutputHasInput)
{
int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
outSeqLen -= inputSeqLen;
}
mRequestBenchInfos[requestId].outputLength = outSeqLen;
}
}

float calcPercentile(std::vector<float> const& latencies, int percentile)
{
int const index = static_cast<int>(std::ceil((percentile / 100.0) * latencies.size())) - 1;
@@ -827,7 +847,7 @@ class ExecutorServer
numFinished++;
if (!warmup)
{
mRecorder->recordEnd(reqId, response.hasError());
mRecorder->recordEnd(reqId, response);
}
}
}
3 changes: 3 additions & 0 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -34,6 +34,7 @@
#include <NvInfer.h>
#include <atomic>
#include <chrono>
#include <cuda_profiler_api.h>
#include <cxxopts.hpp>
#include <future>
#include <sstream>
@@ -213,6 +214,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
std::vector<float> latencies;
std::vector<float> generationTimes;
auto generationProfiler = std::make_shared<GptSession::GenerationProfiler>();
cudaProfilerStart();
while (iterIdx < numRuns)
{
auto const start = std::chrono::steady_clock::now();
@@ -242,6 +244,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
break;
}
}
cudaProfilerStop();

TLLM_LOG_INFO(memoryCounter.toString());
done = true;
4 changes: 0 additions & 4 deletions benchmarks/python/benchmark.py
@@ -198,10 +198,6 @@ def parse_arguments():
help=
'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
)
parser.add_argument('--strongly_typed',
default=False,
action='store_true',
help='This option will reduce the building time.')
parser.add_argument(
'--gpu_weights_percent',
type=str,
50 changes: 21 additions & 29 deletions benchmarks/python/build.py
@@ -151,10 +151,6 @@ def parse_arguments():
default=False,
action='store_true',
help="Build engines serially")
parser.add_argument('--strongly_typed',
default=False,
action='store_true',
help='This option will reduce the building time.')
parser.add_argument(
'--multiple_profiles',
default=False,
@@ -251,9 +247,6 @@ def build_gpt(args):
if not args.serial_build:
torch.cuda.set_device(runtime_rank)

strongly_typed = args.strongly_typed
if args.quantization is not None and "fp8" in args.quantization:
strongly_typed = True
num_kv_heads = build_config['num_heads'] \
if build_config['num_kv_heads'] is None else build_config['num_kv_heads']
apply_query_key_layer_scaling = False
@@ -321,7 +314,7 @@ def build_gpt(args):
quant_mode=quant_mode,
use_refit=False,
opt_level=build_config['builder_opt'],
strongly_typed=strongly_typed,
strongly_typed=True,
weight_streaming=is_weight_streaming,
**builder_config_extra_kwargs)
engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -363,8 +356,10 @@ def build_gpt(args):
'apply_query_key_layer_scaling':
builder_config.apply_query_key_layer_scaling,
'rotary_pct': build_config['rotary_pct'],
'moe_num_experts': build_config["moe_num_experts"],
'moe_top_k': build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.GPTForCausalLM(config)
@@ -399,7 +394,7 @@ def build_gpt(args):
elif family == "llama":
config = {
'architecture':
'LLaMAForCausalLM',
'LlamaForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
@@ -430,10 +425,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
}
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)
@@ -602,9 +597,6 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)
tensorrt_llm_model = optimize_model(
tensorrt_llm_model,
use_parallel_embedding=config.use_parallel_embedding)
elif family == "falcon":
config = {
'architecture':
@@ -696,7 +688,7 @@ def build_gpt(args):
elif family == "internlm":
config = {
'architecture':
'LLaMAForCausalLM',
'LlamaForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
@@ -778,10 +770,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
'qwen_type':
'qwen',
}
@@ -821,10 +813,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
'qwen_type':
'qwen2',
}
@@ -1029,7 +1021,7 @@ def build_bert(args):
max_batch_size=max_batch_size,
max_input_len=max_input_len,
opt_level=build_config['builder_opt'],
strongly_typed=args.strongly_typed,
strongly_typed=True,
weight_streaming=is_weight_streaming,
)
engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -1207,7 +1199,7 @@ def enc_dec_build_helper(component, config, args):
cross_attention=(component == 'decoder'),
has_position_embedding=has_position_embedding,
has_token_type_embedding=False, # by default
strongly_typed=False, # by default
strongly_typed=True,
gather_all_token_logits=False, # by default
int8=(quant_mode.has_act_and_weight_quant()
or quant_mode.is_int8_weight_only()),
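Note: the build.py changes move the MoE settings from flat `moe_num_experts` / `moe_top_k` keys into a nested `moe` dict, rename the architecture string to `LlamaForCausalLM`, and always build strongly typed engines. A minimal sketch of the new config shape passed to `PretrainedConfig.from_dict` (the numbers below are placeholders, not values from the benchmark configs):

```python
# Illustrative only: the nested MoE layout now expected by build.py.
build_config = {"moe_num_experts": 8, "moe_top_k": 2}  # placeholder values

config = {
    "architecture": "LlamaForCausalLM",  # previously "LLaMAForCausalLM"
    "dtype": "float16",
    # ... remaining fields as in the diff ...
    "moe": {
        "num_experts": build_config["moe_num_experts"],
        "top_k": build_config["moe_top_k"],
    },
}
```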
19 changes: 16 additions & 3 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -1,4 +1,5 @@
import json
import os
from enum import Enum

import evaluate
@@ -82,9 +83,11 @@ def calculate_toks_per_sample(preds, eos_id):
return avg_len / num_samples


def calculate_rouge_score(preds, targets):
def calculate_rouge_score(preds, targets, rouge_dir=None):
print("Calculating ROUGE scores...")
metric = evaluate.load("rouge")
rouge_dir = rouge_dir if rouge_dir and os.path.exists(
rouge_dir) else "rouge"
metric = evaluate.load(rouge_dir)
preds, targets = postprocess_text(preds, targets[0:len(preds)])
result = metric.compute(predictions=preds,
references=targets,
Expand Down Expand Up @@ -114,6 +117,15 @@ def parse_arguments():
parser.add_argument("--base_model",
type=str,
help="Location of the model used (to create tokenizer)")

parser.add_argument(
'--rouge_dir',
default=None,
type=str,
help=
"evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
)

args = parser.parse_args()

return args
@@ -146,7 +158,8 @@ def main():
tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token)

pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
achieved_scores = calculate_rouge_score(pred_texts, target_texts)
achieved_scores = calculate_rouge_score(pred_texts, target_texts,
args.rouge_dir)

achieved_scores['tokens_per_sample'] = tps_score
targets = ACCURACY_TARGETS[model]
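Note: the new `--rouge_dir` option lets the accuracy script load the ROUGE metric from a local copy so it keeps working without access to the Hugging Face Hub. A minimal sketch of that fallback, mirroring the diff (the local path in the usage comment is a placeholder):

```python
import os

import evaluate


def load_rouge(rouge_dir=None):
    # Use a local copy of the metric when the given directory exists;
    # otherwise fall back to pulling "rouge" from the Hugging Face Hub.
    rouge_dir = rouge_dir if rouge_dir and os.path.exists(rouge_dir) else "rouge"
    return evaluate.load(rouge_dir)


# Hypothetical usage:
# metric = load_rouge("/path/to/cached/rouge")  # offline
# metric = load_rouge()                         # default, requires network access
```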
12 changes: 4 additions & 8 deletions benchmarks/python/gpt_benchmark.py
@@ -279,14 +279,10 @@ def check_memory(self, io_shapes: list, raise_exception=False):
self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
# when MHA is OOTB, it requires extra KV cache size, because OOTB doesn't support in-place updating of the KV cache.
if not self.use_gpt_attention_plugin:
if os.getenv('TRTLLM_DISABLE_OOTB_KVCACHE_REUSE') != 'ON':
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)
else:
# without reusing, we need one for past as engine inputs, one for present as engine outputs.
kv_cache_size_in_bytes *= 2
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)

kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
"MiB")
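Note: with the `TRTLLM_DISABLE_OOTB_KVCACHE_REUSE` branch removed, the OOTB-attention path always applies the one-extra-layer adjustment; the doubling path used when reuse was disabled is gone. A small worked sketch of the adjustment (the figures are placeholders):

```python
from math import ceil

# Placeholder figures for illustration only.
num_layers, pp_size = 40, 1
kv_cache_size_in_bytes = 8_000_000_000  # base estimate covering all local layers

local_n_layer = ceil(num_layers / pp_size)
# OOTB attention cannot update the KV cache in place, so budget roughly one
# extra layer of cache on top of the base estimate (a 1/local_n_layer overhead).
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (local_n_layer + 1)
print(kv_cache_size_in_bytes)  # 8.2e9 for 40 layers, i.e. a 2.5% increase
```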
4 changes: 1 addition & 3 deletions benchmarks/suite/tensorrt_llm_bench/utils/enums.py
@@ -51,9 +51,7 @@ def get_build_options(self, dtype: str) -> List[str]:
List[str]: A list of command line arguments to be added to build
commands.
"""
if self.value == self.FP8:
return ["--strongly_typed"]
else:
if not self.value == self.FP8:
return ["--gemm_plugin", dtype]

