Update TensorRT-LLM #1725

Merged 1 commit on Jun 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ __pycache__/
*.nsys-rep
.VSCodeCounter
build*/
!builders/
*.egg-info/
.coverage
*.onnx
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -46,5 +46,5 @@ repos:
args:
- --skip=".git,3rdparty"
- --exclude-file=examples/whisper/tokenizer.py
- --ignore-words-list=rouge,inout,atleast,strat,nd
- --ignore-words-list=rouge,inout,atleast,strat,nd,subtile
exclude: 'tests/llm-test-defs/turtle/test_input_files'
3 changes: 3 additions & 0 deletions README.md
@@ -75,3 +75,6 @@ To get started with TensorRT-LLM, visit our documentation:
- [Installation Guide for Linux](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
- [Installation Guide for Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html)
- [Supported Hardware, Models, and other Software](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html)

## Community
- [Model zoo](https://huggingface.co/TheFloat16) (generated by TRT-LLM rel 0.9 a9356d4b7610330e89c1010f342a9ac644215c52)
14 changes: 6 additions & 8 deletions benchmarks/cpp/README.md
@@ -210,8 +210,10 @@ TP=2
PP=1
MAX_LEN=1024
MAX_BATCH=32
MAX_LORA_RANK=32
NUM_LAYERS=40
MAX_LORA_RANK=64
NUM_LORA_MODS=7
EOS_ID=2

SOURCE_LORA=chinese-llama-2-lora-13b
CPP_LORA=chinese-llama-2-lora-13b-cpp
@@ -234,7 +236,7 @@ ${HOME}/.local/bin/trtllm-build \
--gemm_plugin float16 \
--lora_plugin float16 \
--use_paged_context_fmha enable \
--lora_target_modules attn_qkv \
--lora_target_modules attn_q attn_k attn_v attn_dense mlp_h_to_4h mlp_4h_to_h mlp_gate \
--max_lora_rank ${MAX_LORA_RANK}

NUM_LORAS=(8 16 24 32 64 128 256)
@@ -252,8 +254,6 @@ mkdir -p $EG_DIR/data
# Prepare dataset without lora_task_id
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist.json" \
--request-rate -1 \
--time-delay-dist constant \
--tokenizer $TOKENIZER \
token-norm-dist \
--num-requests $NUM_REQUESTS \
@@ -263,8 +263,6 @@ python benchmarks/cpp/prepare_dataset.py \
for nloras in ${NUM_LORAS[@]}; do
python benchmarks/cpp/prepare_dataset.py \
--output "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--request-rate -1 \
--time-delay-dist constant \
--rand-task-id 0 $(( $nloras - 1 )) \
--tokenizer $TOKENIZER \
token-norm-dist \
@@ -292,7 +290,7 @@ mpirun -n ${TP} --output-filename ${EG_DIR}/log-base-lora \

# Now run inference with various numbers of LoRAs
# The host cache is set large enough to hold all the LoRAs in lora_dir
# GPU cache is set to hold 32 LoRAs
# GPU cache is set to hold 16 LoRAs
# This benchmark will preload all the LoRAs into the host cache
# We run inference on a range of active LoRAs exercising different cache miss rates.
for nloras in ${NUM_LORAS[@]}; do
@@ -303,7 +301,7 @@ for nloras in ${NUM_LORAS[@]}; do
--type IFB \
--dataset "${EG_DIR}/data/token-norm-dist-lora-${nloras}.json" \
--lora_host_cache_bytes 8589934592 \
--lora_num_device_mod_layers $(( 32 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--lora_num_device_mod_layers $(( 16 * $NUM_LAYERS * $NUM_LORA_MODS * $MAX_LORA_RANK )) \
--kv_cache_free_gpu_mem_fraction 0.80 \
--log_level info \
--eos_id ${EOS_ID} \
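Note: the updated benchmark targets all seven LoRA modules (`attn_q` through `mlp_gate`) and sizes the GPU LoRA cache for 16 adapters rather than 32. A minimal sketch of the arithmetic behind the new `--lora_num_device_mod_layers` value, reusing the variables defined earlier in this script (illustrative only, not part of the change):

```python
# Sketch of the GPU LoRA cache sizing used above (values from the README variables).
NUM_LAYERS = 40        # decoder layers in the 13B model
NUM_LORA_MODS = 7      # attn_q, attn_k, attn_v, attn_dense, mlp_h_to_4h, mlp_4h_to_h, mlp_gate
MAX_LORA_RANK = 64
GPU_CACHED_LORAS = 16  # the GPU cache is sized to hold 16 LoRAs

lora_num_device_mod_layers = GPU_CACHED_LORAS * NUM_LAYERS * NUM_LORA_MODS * MAX_LORA_RANK
print(lora_num_device_mod_layers)  # 286720
```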
32 changes: 26 additions & 6 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -458,10 +458,6 @@ class Recorder
{
this->recordEnd(requestId, hasError);

if (mRespJsonFile.empty())
return;
int32_t outputSeqLen;

for (auto& tensor : responseTensors)
{
if (tensor.name == inference_request::kOutputIdsTensorName)
@@ -471,7 +467,7 @@ class Recorder
else if (tensor.name == inference_request::kSequenceLengthTensorName)
{
// Tensor of shape nBeams, and we only need the first one
outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
int32_t outputSeqLen = *(bufferCast<int32_t>(*(tensor.tensor)));
if (mOutputHasInput)
{
int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
@@ -482,6 +478,30 @@ class Recorder
}
}

void recordEnd(uint64_t requestId, texec::Response const& response)
{

this->recordEnd(requestId, response.hasError());

// Get the actual output length
if (!response.hasError())
{
auto outputTokenIds = response.getResult().outputTokenIds;

int32_t outSeqLen = 0;
for (auto const& beam : outputTokenIds)
{
outSeqLen = std::max(static_cast<int32_t>(beam.size()), outSeqLen);
}
if (mOutputHasInput)
{
int inputSeqLen = mRequestBenchInfos[requestId].inputLength;
outSeqLen -= inputSeqLen;
}
mRequestBenchInfos[requestId].outputLength = outSeqLen;
}
}

float calcPercentile(std::vector<float> const& latencies, int percentile)
{
int const index = static_cast<int>(std::ceil((percentile / 100.0) * latencies.size())) - 1;
@@ -827,7 +847,7 @@ class ExecutorServer
numFinished++;
if (!warmup)
{
mRecorder->recordEnd(reqId, response.hasError());
mRecorder->recordEnd(reqId, response);
}
}
}
3 changes: 3 additions & 0 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -34,6 +34,7 @@
#include <NvInfer.h>
#include <atomic>
#include <chrono>
#include <cuda_profiler_api.h>
#include <cxxopts.hpp>
#include <future>
#include <sstream>
@@ -213,6 +214,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
std::vector<float> latencies;
std::vector<float> generationTimes;
auto generationProfiler = std::make_shared<GptSession::GenerationProfiler>();
cudaProfilerStart();
while (iterIdx < numRuns)
{
auto const start = std::chrono::steady_clock::now();
@@ -242,6 +244,7 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
break;
}
}
cudaProfilerStop();

TLLM_LOG_INFO(memoryCounter.toString());
done = true;
4 changes: 0 additions & 4 deletions benchmarks/python/benchmark.py
@@ -198,10 +198,6 @@ def parse_arguments():
help=
'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
)
parser.add_argument('--strongly_typed',
default=False,
action='store_true',
help='This option will reduce the building time.')
parser.add_argument(
'--gpu_weights_percent',
type=str,
50 changes: 21 additions & 29 deletions benchmarks/python/build.py
@@ -151,10 +151,6 @@ def parse_arguments():
default=False,
action='store_true',
help="Build engines serially")
parser.add_argument('--strongly_typed',
default=False,
action='store_true',
help='This option will reduce the building time.')
parser.add_argument(
'--multiple_profiles',
default=False,
@@ -251,9 +247,6 @@ def build_gpt(args):
if not args.serial_build:
torch.cuda.set_device(runtime_rank)

strongly_typed = args.strongly_typed
if args.quantization is not None and "fp8" in args.quantization:
strongly_typed = True
num_kv_heads = build_config['num_heads'] \
if build_config['num_kv_heads'] is None else build_config['num_kv_heads']
apply_query_key_layer_scaling = False
@@ -321,7 +314,7 @@ def build_gpt(args):
quant_mode=quant_mode,
use_refit=False,
opt_level=build_config['builder_opt'],
strongly_typed=strongly_typed,
strongly_typed=True,
weight_streaming=is_weight_streaming,
**builder_config_extra_kwargs)
engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -363,8 +356,10 @@ def build_gpt(args):
'apply_query_key_layer_scaling':
builder_config.apply_query_key_layer_scaling,
'rotary_pct': build_config['rotary_pct'],
'moe_num_experts': build_config["moe_num_experts"],
'moe_top_k': build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.GPTForCausalLM(config)
@@ -399,7 +394,7 @@ def build_gpt(args):
elif family == "llama":
config = {
'architecture':
'LLaMAForCausalLM',
'LlamaForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
@@ -430,10 +425,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
}
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.LLaMAForCausalLM(config)
@@ -602,9 +597,6 @@ def build_gpt(args):
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM(config)
tensorrt_llm_model = optimize_model(
tensorrt_llm_model,
use_parallel_embedding=config.use_parallel_embedding)
elif family == "falcon":
config = {
'architecture':
@@ -696,7 +688,7 @@ def build_gpt(args):
elif family == "internlm":
config = {
'architecture':
'LLaMAForCausalLM',
'LlamaForCausalLM',
'dtype':
args.dtype,
'num_hidden_layers':
@@ -778,10 +770,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
'qwen_type':
'qwen',
}
@@ -821,10 +813,10 @@ def build_gpt(args):
'world_size': world_size,
'tp_size': world_size
},
'moe_num_experts':
build_config["moe_num_experts"],
'moe_top_k':
build_config["moe_top_k"],
'moe': {
'num_experts': build_config["moe_num_experts"],
'top_k': build_config["moe_top_k"],
},
'qwen_type':
'qwen2',
}
@@ -1029,7 +1021,7 @@ def build_bert(args):
max_batch_size=max_batch_size,
max_input_len=max_input_len,
opt_level=build_config['builder_opt'],
strongly_typed=args.strongly_typed,
strongly_typed=True,
weight_streaming=is_weight_streaming,
)
engine_name = get_engine_name(args.model, args.dtype, world_size,
@@ -1207,7 +1199,7 @@ def enc_dec_build_helper(component, config, args):
cross_attention=(component == 'decoder'),
has_position_embedding=has_position_embedding,
has_token_type_embedding=False, # by default
strongly_typed=False, # by default
strongly_typed=True,
gather_all_token_logits=False, # by default
int8=(quant_mode.has_act_and_weight_quant()
or quant_mode.is_int8_weight_only()),
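Note: the build.py changes move the MoE settings from flat `moe_num_experts` / `moe_top_k` keys into a nested `moe` dict, rename the architecture string to `LlamaForCausalLM`, and always build strongly typed engines. A minimal sketch of the new config shape passed to `PretrainedConfig.from_dict` (the numbers below are placeholders, not values from the benchmark configs):

```python
# Illustrative only: the nested MoE layout now expected by build.py.
build_config = {"moe_num_experts": 8, "moe_top_k": 2}  # placeholder values

config = {
    "architecture": "LlamaForCausalLM",  # previously "LLaMAForCausalLM"
    "dtype": "float16",
    # ... remaining fields as in the diff ...
    "moe": {
        "num_experts": build_config["moe_num_experts"],
        "top_k": build_config["moe_top_k"],
    },
}
```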
19 changes: 16 additions & 3 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -1,4 +1,5 @@
import json
import os
from enum import Enum

import evaluate
@@ -82,9 +83,11 @@ def calculate_toks_per_sample(preds, eos_id):
return avg_len / num_samples


def calculate_rouge_score(preds, targets):
def calculate_rouge_score(preds, targets, rouge_dir=None):
print("Calculating ROUGE scores...")
metric = evaluate.load("rouge")
rouge_dir = rouge_dir if rouge_dir and os.path.exists(
rouge_dir) else "rouge"
metric = evaluate.load(rouge_dir)
preds, targets = postprocess_text(preds, targets[0:len(preds)])
result = metric.compute(predictions=preds,
references=targets,
Expand Down Expand Up @@ -114,6 +117,15 @@ def parse_arguments():
parser.add_argument("--base_model",
type=str,
help="Location of the model used (to create tokenizer)")

parser.add_argument(
'--rouge_dir',
default=None,
type=str,
help=
"evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF."
)

args = parser.parse_args()

return args
@@ -146,7 +158,8 @@ def main():
tps_score = calculate_toks_per_sample(pred_toks, tokenizer.eos_token)

pred_texts = tokenizer.batch_decode(pred_toks, skip_special_tokens=True)
achieved_scores = calculate_rouge_score(pred_texts, target_texts)
achieved_scores = calculate_rouge_score(pred_texts, target_texts,
args.rouge_dir)

achieved_scores['tokens_per_sample'] = tps_score
targets = ACCURACY_TARGETS[model]
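Note: the new `--rouge_dir` option lets the accuracy script load the ROUGE metric from a local copy so it keeps working without access to the Hugging Face Hub. A minimal sketch of that fallback, mirroring the diff (the local path in the usage comment is a placeholder):

```python
import os

import evaluate


def load_rouge(rouge_dir=None):
    # Use a local copy of the metric when the given directory exists;
    # otherwise fall back to pulling "rouge" from the Hugging Face Hub.
    rouge_dir = rouge_dir if rouge_dir and os.path.exists(rouge_dir) else "rouge"
    return evaluate.load(rouge_dir)


# Hypothetical usage:
# metric = load_rouge("/path/to/cached/rouge")  # offline
# metric = load_rouge()                         # default, requires network access
```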
12 changes: 4 additions & 8 deletions benchmarks/python/gpt_benchmark.py
@@ -279,14 +279,10 @@ def check_memory(self, io_shapes: list, raise_exception=False):
self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
# when MHA is OOTB, it requires extra KV cache size, because OOTB doesn't support in-place updating of the KV cache.
if not self.use_gpt_attention_plugin:
if os.getenv('TRTLLM_DISABLE_OOTB_KVCACHE_REUSE') != 'ON':
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)
else:
# without reusing, we need one for past as engine inputs, one for present as engine outputs.
kv_cache_size_in_bytes *= 2
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)

kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
"MiB")
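Note: with the `TRTLLM_DISABLE_OOTB_KVCACHE_REUSE` branch removed, the OOTB-attention path always applies the one-extra-layer adjustment; the doubling path used when reuse was disabled is gone. A small worked sketch of the adjustment (the figures are placeholders):

```python
from math import ceil

# Placeholder figures for illustration only.
num_layers, pp_size = 40, 1
kv_cache_size_in_bytes = 8_000_000_000  # base estimate covering all local layers

local_n_layer = ceil(num_layers / pp_size)
# OOTB attention cannot update the KV cache in place, so budget roughly one
# extra layer of cache on top of the base estimate (a 1/local_n_layer overhead).
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (local_n_layer + 1)
print(kv_cache_size_in_bytes)  # 8.2e9 for 40 layers, i.e. a 2.5% increase
```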
4 changes: 1 addition & 3 deletions benchmarks/suite/tensorrt_llm_bench/utils/enums.py
@@ -51,9 +51,7 @@ def get_build_options(self, dtype: str) -> List[str]:
List[str]: A list of command line arguments to be added to build
commands.
"""
if self.value == self.FP8:
return ["--strongly_typed"]
else:
if not self.value == self.FP8:
return ["--gemm_plugin", dtype]

