Update gemma2 examples with a note about sample generation (#1176)
SUMMARY:
- Add a note advising users to either downgrade transformers from 4.49 or use vLLM for generation
- We should revisit why this only happens during generation with the new release; it can be investigated down the road
dsikka authored Feb 19, 2025
1 parent 45f2b33 commit 6a1ba3c
Showing 3 changed files with 10 additions and 0 deletions.
4 changes: 4 additions & 0 deletions examples/quantization_kv_cache/gemma2_fp8_kv_example.py
@@ -86,6 +86,10 @@ def process_and_tokenize(example):
"Please use vLLM for inference with the quantized kv_cache.",
)
# Confirm generations of the quantized model look sane.

# NOTE: transformers 4.49.0 results in a generation error with gemma2.
# Consider either downgrading transformers to a previous version
# or using vLLM for sample generation.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
3 changes: 3 additions & 0 deletions examples/quantization_w8a8_fp8/gemma2_example.py
@@ -29,6 +29,9 @@
)

# Confirm generations of the quantized model look sane.
# NOTE: transformers 4.49.0 results in a generation error with gemma2.
# Consider either downgrading transformers to a previous version
# or using vLLM for sample generation.
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
3 changes: 3 additions & 0 deletions examples/quantization_w8a8_int8/gemma2_example.py
@@ -68,6 +68,9 @@ def tokenize(sample):
)

# Confirm generations of the quantized model look sane.
# NOTE: transformers 4.49.0 results in a generation error with gemma2.
# Consider either downgrading transformers to a previous version
# or using vLLM for sample generation.
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
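For reference, a minimal sketch of the vLLM route the new note recommends (this snippet is not part of the commit; the model directory name is a hypothetical placeholder for wherever the example script saves the quantized checkpoint):

# Sketch: sample generation via vLLM instead of transformers 4.49.0.
# Alternative workaround from the note: pin an earlier transformers, e.g.
#   pip install "transformers<4.49"
from vllm import LLM, SamplingParams

llm = LLM(model="gemma2-quantized")  # hypothetical save directory
params = SamplingParams(max_tokens=20)  # mirrors max_new_tokens=20 above
outputs = llm.generate(["Hello my name is"], params)
print(outputs[0].outputs[0].text)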
