|
3435 | 3435 | "outputs": [], |
3436 | 3436 | "source": [ |
3437 | 3437 | "# The MG vLLM model co-hosting serving container.\n", |
3438 | | - "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-imageplatform/vertex-model-garden/vllm-inference-restricted-ubuntu22.04-py3.12:model-garden.vllm-restricted-x86-release_20251028.02_p0\" # @param {type:\"string\"}" |
| 3438 | + "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/vllm-serve-gpu:20251028_0916_RC01\" # @param {type:\"string\"}" |
3439 | 3439 | ] |
3440 | 3440 | }, |
3441 | 3441 | { |
|
3506 | 3506 | "\n", |
3507 | 3507 | "Sample command:\n", |
3508 | 3508 | "\n", |
3509 | | - "# Local\n", |
3510 | | - "python benchmark_util_multi_model.py \\\n", |
3511 | | - " --model /path/to/modelA,/path/to/modelB \\\n", |
3512 | | - " --gpu-memory-partition 0.45,0.45 \\\n", |
3513 | | - " --total-gpus 8 \\\n", |
3514 | | - " --input-length 1200 \\\n", |
3515 | | - " --output-length 250 \\\n", |
3516 | | - " --num-prompts 200 \\\n", |
3517 | | - " --sonnet-prefix-len 49 \\\n", |
3518 | | - " --concurrencies 1 8 16 \\\n", |
3519 | | - " --max-median-ttft-ms 1000 \\\n", |
3520 | | - " --max-p99-ttft-ms 10000 \\\n", |
3521 | | - " --max-median-tpot-ms 100 \\\n", |
3522 | | - " --max-p99-tpot-ms 1000 \\\n", |
3523 | | - " --docker-uri us-docker.pkg.dev/vertex-imageplatform/vertex-model-garden/vllm-inference-restricted-ubuntu22.04-py3.12:model-garden.vllm-restricted-x86-release_20251028.02_p0 \\\n", |
3524 | | - " --server-init-timeout 300 \\\n", |
3525 | | - " --benchmark-script-path /path/to/benchmark_serving.py \\\n", |
3526 | | - " --dataset-path /path/to/sonnet.txt \\\n", |
3527 | | - " --results-output-path /path/to/benchmark_results.csv \\\n", |
3528 | | - " --server-stats-output-path /path/to/server_stats.csv \\\n", |
3529 | | - " --figure-output-dir /path/to/benchmark_figures/ \\\n", |
3530 | | - " --no-skip-concurrencies-given-latency \\\n", |
3531 | | - " --backend local\n", |
3532 | | - "\n", |
3533 | 3509 | "# Vertex\n", |
3534 | 3510 | "# Authenticate credentials\n", |
3535 | 3511 | "gcloud auth application-default login\n", |
|
3546 | 3522 | " --max-p99-ttft-ms 10000 \\\n", |
3547 | 3523 | " --max-median-tpot-ms 100 \\\n", |
3548 | 3524 | " --max-p99-tpot-ms 1000 \\\n", |
3549 | | - " --docker-uri us-docker.pkg.dev/vertex-imageplatform/vertex-model-garden/vllm-inference-restricted-ubuntu22.04-py3.12:model-garden.vllm-restricted-x86-release_20251028.02_p0 \\\n", |
| 3525 | + " --docker-uri us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/vllm-serve-gpu:20251028_0916_RC01 \\\n", |
3550 | 3526 | " --server-init-timeout 300 \\\n", |
3551 | 3527 | " --benchmark-script-path /path/to/benchmark_serving.py \\\n", |
3552 | 3528 | " --dataset-path /path/to/sonnet.txt \\\n", |
|
3683 | 3659 | " \"--docker-uri\",\n", |
3684 | 3660 | " type=str,\n", |
3685 | 3661 | " default=(\n", |
3686 | | - " \"us-docker.pkg.dev/vertex-imageplatform/vertex-model-garden/vllm-inference-restricted-ubuntu22.04-py3.12:model-garden.vllm-restricted-x86-release_20251028.02_p0\",\n", |
| 3662 | + " \"us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/vllm-serve-gpu:20251028_0916_RC01\",\n", |
3687 | 3663 | " ),\n", |
3688 | 3664 | " help=\"Docker image URI for the vLLM server.\",\n", |
3689 | 3665 | ")\n", |
|
5852 | 5828 | "HF_TOKEN = \"\" # @param {type:\"string\"}\n", |
5853 | 5829 | "\n", |
5854 | 5830 | "# The MG vLLM serving container supporting model replicas.\n", |
5855 | | - "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-imageplatform/vertex-model-garden/vllm-inference-restricted-ubuntu22.04-py3.12:model-garden.vllm-restricted-x86-release_20251028.02_p0\" # @param {type:\"string\"}\n", |
| 5831 | + "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/vllm-serve-gpu:20251028_0916_RC01\" # @param {type:\"string\"}\n", |
5856 | 5832 | "SERVER_INIT_TIMEOUT = 300 # @param {type:\"integer\"}\n", |
5857 | 5833 | "\n", |
5858 | 5834 | "# The total number of GPUs available on the machine.\n", |
|
0 commit comments