Add 4B, 8B, and 32B variants to the qwen3-vl notebook; default to qwen3-vl-8b-instruct on a single H100

vertex-mg-bot · copybara-github · commit bab9c398fd5e · 2025-10-24T00:02:05.000-07:00
PiperOrigin-RevId: 823383168
diff --git a/notebooks/community/model_garden/model_garden_pytorch_qwen3_vl.ipynb b/notebooks/community/model_garden/model_garden_pytorch_qwen3_vl.ipynb
@@ -237,7 +237,7 @@
       },
       "outputs": [],
       "source": [
-        "model_version = \"qwen3-vl-30b-a3b-instruct\"  # @param [\"qwen3-vl-235b-a22b-instruct\", \"qwen3-vl-235b-a22b-instruct-fp8\", \"qwen3-vl-235b-a22b-thinking\", \"qwen3-vl-235b-a22b-thinking-fp8\", \"qwen3-vl-30b-a3b-instruct\", \"qwen3-vl-30b-a3b-instruct-fp8\", \"qwen3-vl-30b-a3b-thinking\", \"qwen3-vl-30b-a3b-thinking-fp8\"] {isTemplate:true}\n",
+        "model_version = \"qwen3-vl-8b-instruct\"  # @param [\"qwen3-vl-235b-a22b-instruct\", \"qwen3-vl-235b-a22b-instruct-fp8\", \"qwen3-vl-235b-a22b-thinking\", \"qwen3-vl-235b-a22b-thinking-fp8\", \"qwen3-vl-30b-a3b-instruct\", \"qwen3-vl-30b-a3b-instruct-fp8\", \"qwen3-vl-30b-a3b-thinking\", \"qwen3-vl-30b-a3b-thinking-fp8\", \"qwen3-vl-32b-instruct\", \"qwen3-vl-32b-instruct-fp8\", \"qwen3-vl-32b-thinking\", \"qwen3-vl-32b-thinking-fp8\", \"qwen3-vl-4b-instruct\", \"qwen3-vl-4b-instruct-fp8\", \"qwen3-vl-4b-thinking\", \"qwen3-vl-4b-thinking-fp8\", \"qwen3-vl-8b-instruct\", \"qwen3-vl-8b-instruct-fp8\", \"qwen3-vl-8b-thinking\", \"qwen3-vl-8b-thinking-fp8\"] {isTemplate:true}\n",
         "MODEL_NAME = f\"qwen/qwen3-vl@{model_version}\""
       ]
     },
@@ -383,9 +383,9 @@
         "    accept_eula=True,\n",
         "    use_dedicated_endpoint=use_dedicated_endpoint,\n",
         "    serving_container_image_uri=\"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20251003_0916_RC01\",\n",
-        "    machine_type=\"a3-highgpu-2g\",\n",
+        "    machine_type=\"a3-highgpu-1g\",\n",
         "    accelerator_type=\"NVIDIA_H100_80GB\",\n",
-        "    accelerator_count=2,\n",
+        "    accelerator_count=1,\n",
         ")"
       ]
     },
@@ -424,7 +424,7 @@
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "P9rp6hTLgif2"
+        "id": "scQowXXcD8Fe"
       },
       "source": [
         "## Inference"
@@ -439,16 +439,14 @@
       },
       "outputs": [],
       "source": [
-        "# @title Chat completion\n",
-        "\n",
+        "# @title Inference\n",
         "if use_dedicated_endpoint:\n",
         "    DEDICATED_ENDPOINT_DNS = endpoint.gca_resource.dedicated_endpoint_dns\n",
         "ENDPOINT_RESOURCE_NAME = endpoint.resource_name\n",
         "\n",
         "# @markdown Because the Qwen3 models generate detailed reasoning steps, the output is expected to be long. We recommend using streaming for a better generation experience.\n",
-        "# @title Chat Completions Inference\n",
         "\n",
-        "# @title Chat Completions Inference\n",
+        "# @title Inference\n",
         "\n",
         "# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.\n",
         "\n",