Update vLLM container version in QwQ deployment notebook.

vertex-mg-bot · copybara-github · commit 85fa955e6de1 · 2025-07-02T10:34:24.000-07:00
PiperOrigin-RevId: 778559325
diff --git a/notebooks/community/model_garden/model_garden_pytorch_qwq_deployment.ipynb b/notebooks/community/model_garden/model_garden_pytorch_qwq_deployment.ipynb
@@ -211,14 +211,14 @@
       "source": [
         "# @title Deploy\n",
         "\n",
-        "# @markdown This section uploads prebuilt Qwen2/Qwen2.5 models to Model Registry and deploys it to a Vertex AI Endpoint. It takes 15 to 30 minutes to finish depending on the size of the model.\n",
+        "# @markdown This section uploads prebuilt QwQ models to Model Registry and deploys them to a Vertex AI Endpoint. It takes 15 to 30 minutes to finish depending on the size of the model.\n",
         "\n",
         "MODEL_ID = \"QwQ-32B\"  # @param [\"QwQ-32B\"] {isTemplate: true}\n",
         "model_path_prefix = \"Qwen\"\n",
         "model_id = os.path.join(model_path_prefix, MODEL_ID)\n",
         "\n",
         "# The pre-built serving docker image for vLLM.\n",
-        "VLLM_DOCKER_URI = \"us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-6.ubuntu2204.py310\"\n",
+        "VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250506_0916_RC01\"\n",
         "\n",
         "# @markdown Set `use_dedicated_endpoint` to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).\n",
         "use_dedicated_endpoint = True  # @param {type:\"boolean\"}\n",
@@ -333,9 +333,10 @@
         "        vllm_args.append(\n",
         "            f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
         "        )\n",
+        "\n",
         "    if enable_yarn_scaling:\n",
         "        vllm_args.append(\n",
-        "            '--rope-scaling=\\'{\"factor\":4.0,\"original_max_position_embeddings\":32768,\"type\":\"yarn\"}\\''\n",
+        "            '--rope-scaling=\\'{\"factor\": 4.0, \"original_max_position_embeddings\": 32768, \"rope_type\": \"yarn\"}\\''\n",
         "        )\n",
         "\n",
         "    if model_type:\n",