[ROCm] Enable chunked prefill/paged attention in MLA on ROCm #14316

Open · wants to merge 1 commit into base: main
6 changes: 4 additions & 2 deletions vllm/attention/backends/mla/common.py
@@ -1282,6 +1282,7 @@ def _compute_prefill_context(
         assert prefill_metadata.context_chunk_max_seq_lens is not None
         assert prefill_metadata.context_lens_tensor is not None

+        has_context = prefill_metadata.context_lens_tensor.max() > 0
         output = None
         iters = len(prefill_metadata.context_chunk_seq_tot)
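
For reference, a minimal sketch of what the new flag computes, assuming context_lens_tensor holds the per-sequence lengths of already-cached context (the name and .max() call are from the diff; the setup below is illustrative):

    import torch

    # One entry per prefill sequence: how many tokens are already in the
    # KV cache from earlier chunks. All zeros means no prior context.
    context_lens_tensor = torch.tensor([0, 512, 128])

    # True (as a 0-dim bool tensor) if any sequence carries cached context,
    # i.e. this prefill must attend back over previously processed chunks.
    has_context = context_lens_tensor.max() > 0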

@@ -1322,7 +1323,8 @@ def _compute_prefill_context(
                              [0, q.shape[-1] - v.shape[-1]],
                              value=0)

-            if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
+            if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN and \
+                    has_context is False:
Contributor suggested change:

-                    has_context is False:
+                    not has_context:
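
The suggestion is more than style. Assuming context_lens_tensor is a torch.Tensor (its name and the .max() call suggest it is), context_lens_tensor.max() > 0 yields a 0-dim bool tensor, not a Python bool, so an identity check against False can never be true. A small sketch of the pitfall:

    import torch

    ctx = torch.tensor([0, 0, 0])   # no cached context anywhere
    has_context = ctx.max() > 0     # tensor(False): a 0-dim tensor, not a bool

    print(has_context is False)     # False: a tensor is never the False object
    print(not has_context)          # True: `not` goes through Tensor.__bool__()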

                 attn_output, attn_softmax_lse = self.triton_fa_func(
                     q,
                     k,
@@ -1411,7 +1413,7 @@ def _forward_prefill(
         v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
                                            value=0)

-        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
+        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN and has_context is False:
Contributor suggested change (same tensor-truthiness rationale as the comment above):

-        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN and has_context is False:
+        if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN and not has_context:

             output = self.triton_fa_func(
                 q,
                 k,
4 changes: 2 additions & 2 deletions vllm/config.py
@@ -3433,9 +3433,9 @@ def __post_init__(self):
             self.compilation_config.level = CompilationLevel.NO_COMPILATION

         if self.model_config and self.model_config.use_mla and \
-                not current_platform.is_cuda():
+                not (current_platform.is_cuda() or current_platform.is_rocm()):
             logger.info(
-                "MLA is enabled on a non-cuda platform; forcing chunked "
+                "MLA is enabled on a non-GPU platform; forcing chunked "
                 "prefill and prefix caching to be disabled.")
             self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.chunked_prefill_enabled = False
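
The effect of the config change, as a minimal self-contained sketch (the helper below is hypothetical; only the condition mirrors the diff): MLA now forces chunked prefill and prefix caching off only on platforms that are neither CUDA nor ROCm.

    # Hypothetical stand-in for the platform checks in vllm/config.py.
    def mla_forces_disable(is_cuda: bool, is_rocm: bool, use_mla: bool) -> bool:
        # After this PR, ROCm is treated like CUDA for the MLA gate.
        return use_mla and not (is_cuda or is_rocm)

    assert mla_forces_disable(is_cuda=False, is_rocm=True, use_mla=True) is False
    assert mla_forces_disable(is_cuda=False, is_rocm=False, use_mla=True) is True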