diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a13e2cb78218..4f07be9aeb43 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -420,7 +420,7 @@ steps:
     - pytest -v -s kernels/mamba
 
 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md
index 410da5000b11..c24722e9350a 100644
--- a/docs/dev-docker/README.md
+++ b/docs/dev-docker/README.md
@@ -291,7 +291,8 @@ python3 /app/vllm/benchmarks/benchmark_throughput.py \
     --num-prompts $PROMPTS \
     --max-num-seqs $MAX_NUM_SEQS
 ```
-For FP16 models, remove `--kv-cache-dtype fp8`.
+
+For FP16 models, remove `--kv-cache-dtype fp8`.
 
 When measuring models with long context lengths, performance may improve by setting `--max-model-len` to a smaller value (8192 in this example). It is important, however, to ensure that the `--max-model-len` is at least as large as the IN + OUT token counts.
 
@@ -325,6 +326,7 @@ vllm serve amd/Llama-3.1-70B-Instruct-FP8-KV \
     --gpu-memory-utilization 0.99 \
     --num_scheduler-steps 10
 ```
+
 For FP16 models, remove `--kv-cache-dtype fp8`. Change port (for example --port 8005) if port=8000 is currently being used by other processes.
 
 Run client in a separate terminal. Use port_id from previous step else port-id=8000.
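
The hunks above show only the tails of the README's example commands. As a minimal sketch of what the edited guidance amounts to in practice (an FP16 throughput run with `--kv-cache-dtype fp8` removed and `--max-model-len` capped at 8192 while still covering the IN + OUT token budget), the model name, token counts, and variable values below are illustrative assumptions, not part of this diff:

```bash
# Sketch of an FP16 throughput run following the edited README text.
# All values here are assumed; the points being illustrated are the
# absence of --kv-cache-dtype fp8 and the bounded --max-model-len.
MODEL="meta-llama/Llama-3.1-70B-Instruct"  # assumed FP16 model
IN=4096        # input tokens per prompt (assumed)
OUT=512        # output tokens per prompt (assumed)
PROMPTS=500
MAX_NUM_SEQS=256

# 8192 >= IN + OUT (4096 + 512), satisfying the constraint noted above.
python3 /app/vllm/benchmarks/benchmark_throughput.py \
    --model $MODEL \
    --input-len $IN \
    --output-len $OUT \
    --max-model-len 8192 \
    --num-prompts $PROMPTS \
    --max-num-seqs $MAX_NUM_SEQS
```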