ROCm · Alexei-V-Ivanov-AMD · Feb 4, 2025 · Feb 5, 2025 · Feb 5, 2025 · Feb 5, 2025
@@ -57,7 +57,7 @@ while true; do
 done
 
 echo "--- Pulling container" 
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
 
@@ -92,7 +92,9 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_moe.py \
   --ignore=kernels/test_prefix_prefill.py \
   --ignore=kernels/test_rand.py \
-  --ignore=kernels/test_sampler.py"
+  --ignore=kernels/test_sampler.py \
+  --ignore=kernels/test_cascade_flash_attn.py \
+  --ignore=kernels/test_mamba_mixer2.py"
 fi
 
 #ignore certain Entrypoints tests

@@ -92,7 +92,9 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
+  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amd]
+  amd_gpus: 4   # Just for the sake of queue testing
   fast_check: true
   source_file_dependencies:
   - vllm/core
@@ -105,6 +107,7 @@ steps:
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
+  amd_gpus: 2   # Just for the sake of queue testing
   source_file_dependencies:
   - vllm/
   commands:
@@ -176,6 +179,7 @@ steps:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
+  working_dir: "/vllm-workspace/tests" # optional
 
 - label: V1 Test
   #mirror_hardwares: [amd]
@@ -217,6 +221,7 @@ steps:
     - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
+  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -235,6 +240,7 @@ steps:
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
 - label: LogitsProcessor Test # 5min
+  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/model_executor/layers
@@ -256,7 +262,9 @@ steps:
     - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
+  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amd]
+  amd_gpus: 8
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -282,7 +290,9 @@ steps:
   - pytest -v -s compile/test_full_graph.py
 
 - label: Kernels Test %N # 1h each
+  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amd]
+  amd_gpus: 8
   source_file_dependencies:
   - csrc/
   - vllm/attention
@@ -292,6 +302,7 @@ steps:
   parallelism: 4
 
 - label: Tensorizer Test # 11min
+  working_dir: "/vllm-workspace/tests"
   mirror_hardwares: [amd]
   soft_fail: true
   source_file_dependencies:
@@ -334,6 +345,7 @@ steps:
     - pytest -v -s encoder_decoder
 
 - label: OpenAI-Compatible Tool Use # 20 min
+  working_dir: "/vllm-workspace/tests" 
   fast_check: false
   mirror_hardwares: [ amd ]
   source_file_dependencies:

@@ -1,13 +1,13 @@
 {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
-{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
+{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %}
 {% set default_working_dir = "vllm/tests" %}
 {% set hf_home = "/root/.cache/huggingface" %}
 
 steps:
   - label: ":docker: build image"
     depends_on: ~
     commands:
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ."
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm  --target test --progress plain ."
       - "docker push {{ docker_image_amd }}"
     key: "amd-build"
     env:
@@ -27,7 +27,15 @@ steps:
     depends_on: 
       - "amd-build"
     agents:
-      queue: amd_gpu
+{% if step.amd_gpus and step.amd_gpus==8%}
+      queue: amd_gpu_8
+{% elif step.amd_gpus and step.amd_gpus==4%}
+      queue: amd_gpu_4
+{% elif step.amd_gpus and step.amd_gpus==2%}
+      queue: amd_gpu_4
+{% else%}
+      queue: amd_gpu_1
+{% endif%}
     commands: 
       - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" && ")) | safe }}"
     env:

@@ -108,6 +108,7 @@ ARG COMMON_WORKDIR
 # Copy over the benchmark scripts as well
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
+# "Dummy alteration"
 
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

@@ -7,6 +7,7 @@
 
 from tests.kernels.utils import override_backend_env_variable
 from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
 
 from ....test_utils import xfail_if_rocm62
 from .conftest import get_text_from_llm_generator
@@ -43,6 +44,10 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 
     Additionally, we compare the results of the v1 and v2 managers.
     """
+    if backend == "FLASHINFER" and current_platform.is_rocm():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+    if backend == "XFORMERS" and current_platform.is_rocm():
+        pytest.skip("Xformers does not support ROCm/HIP.")
     override_backend_env_variable(monkeypatch, backend)
 
     sampling_params = SamplingParams(
@@ -103,6 +108,10 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
     The results with and without chunked prefill are not the same due to
     numerical instabilities.
     """
+    if backend == "FLASHINFER" and current_platform.is_rocm():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+    if backend == "XFORMERS" and current_platform.is_rocm():
+        pytest.skip("Xformers does not support ROCm/HIP.")
     override_backend_env_variable(monkeypatch, backend)
 
     sampling_params = SamplingParams(

@@ -12,6 +12,7 @@
 from vllm import SamplingParams, TokensPrompt
 from vllm.core.scheduler import Scheduler
 from vllm.engine.llm_engine import LLMEngine
+from vllm.platforms import current_platform
 
 from ..models.utils import check_outputs_equal
 
@@ -53,6 +54,10 @@ def test_mixed_requests(
     and the others don't. The cached position determines where
     the sequence is at among the batch of prefills.
     """
+    if backend == "FLASHINFER" and current_platform.is_rocm():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+    if backend == "XFORMERS" and current_platform.is_rocm():
+        pytest.skip("Xformers does not support ROCm/HIP.")
     override_backend_env_variable(monkeypatch, backend)
 
     with hf_runner(model, dtype=dtype) as hf_model:
@@ -103,6 +108,11 @@ def test_unstable_prompt_sequence(
     backend: str,
     monkeypatch,
 ) -> None:
+
+    if backend == "FLASHINFER" and current_platform.is_rocm():
+        pytest.skip("Flashinfer does not support ROCm/HIP.")
+    if backend == "XFORMERS" and current_platform.is_rocm():
+        pytest.skip("Xformers does not support ROCm/HIP.")
     override_backend_env_variable(monkeypatch, backend)
 
     with vllm_runner(