|
237 | 237 | }, |
238 | 238 | "outputs": [], |
239 | 239 | "source": [ |
240 | | - "model_version = \"qwen3-vl-30b-a3b-instruct\" # @param [\"qwen3-vl-235b-a22b-instruct\", \"qwen3-vl-235b-a22b-instruct-fp8\", \"qwen3-vl-235b-a22b-thinking\", \"qwen3-vl-235b-a22b-thinking-fp8\", \"qwen3-vl-30b-a3b-instruct\", \"qwen3-vl-30b-a3b-instruct-fp8\", \"qwen3-vl-30b-a3b-thinking\", \"qwen3-vl-30b-a3b-thinking-fp8\"] {isTemplate:true}\n", |
| 240 | + "model_version = \"qwen3-vl-8b-instruct\" # @param [\"qwen3-vl-235b-a22b-instruct\", \"qwen3-vl-235b-a22b-instruct-fp8\", \"qwen3-vl-235b-a22b-thinking\", \"qwen3-vl-235b-a22b-thinking-fp8\", \"qwen3-vl-30b-a3b-instruct\", \"qwen3-vl-30b-a3b-instruct-fp8\", \"qwen3-vl-30b-a3b-thinking\", \"qwen3-vl-30b-a3b-thinking-fp8\", \"qwen3-vl-32b-instruct\", \"qwen3-vl-32b-instruct-fp8\", \"qwen3-vl-32b-thinking\", \"qwen3-vl-32b-thinking-fp8\", \"qwen3-vl-4b-instruct\", \"qwen3-vl-4b-instruct-fp8\", \"qwen3-vl-4b-thinking\", \"qwen3-vl-4b-thinking-fp8\", \"qwen3-vl-8b-instruct\", \"qwen3-vl-8b-instruct-fp8\", \"qwen3-vl-8b-thinking\", \"qwen3-vl-8b-thinking-fp8\"] {isTemplate:true}\n", |
241 | 241 | "MODEL_NAME = f\"qwen/qwen3-vl@{model_version}\"" |
242 | 242 | ] |
243 | 243 | }, |
|
383 | 383 | " accept_eula=True,\n", |
384 | 384 | " use_dedicated_endpoint=use_dedicated_endpoint,\n", |
385 | 385 | " serving_container_image_uri=\"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20251003_0916_RC01\",\n", |
386 | | - " machine_type=\"a3-highgpu-2g\",\n", |
| 386 | + " machine_type=\"a3-highgpu-1g\",\n", |
387 | 387 | " accelerator_type=\"NVIDIA_H100_80GB\",\n", |
388 | | - " accelerator_count=2,\n", |
| 388 | + " accelerator_count=1,\n", |
389 | 389 | ")" |
390 | 390 | ] |
391 | 391 | }, |
|
424 | 424 | { |
425 | 425 | "cell_type": "markdown", |
426 | 426 | "metadata": { |
427 | | - "id": "P9rp6hTLgif2" |
| 427 | + "id": "scQowXXcD8Fe" |
428 | 428 | }, |
429 | 429 | "source": [ |
430 | 430 | "## Inference" |
|
439 | 439 | }, |
440 | 440 | "outputs": [], |
441 | 441 | "source": [ |
442 | | - "# @title Chat completion\n", |
443 | | - "\n", |
| 442 | + "# @title Inference\n", |
444 | 443 | "if use_dedicated_endpoint:\n", |
445 | 444 | " DEDICATED_ENDPOINT_DNS = endpoint.gca_resource.dedicated_endpoint_dns\n", |
446 | 445 | "ENDPOINT_RESOURCE_NAME = endpoint.resource_name\n", |
447 | 446 | "\n", |
448 | 447 | "# @markdown Qwen3-VL models (the thinking variants in particular) can generate detailed reasoning steps, so the output may be long. We recommend using streaming for a better generation experience.\n", |
449 | | - "# @title Chat Completions Inference\n", |
450 | 448 | "\n", |
451 | | - "# @title Chat Completions Inference\n", |
| 449 | + "# @title Inference\n", |
452 | 450 | "\n", |
453 | 451 | "# @markdown Once deployment succeeds, you can send requests to the endpoint using the OpenAI SDK.\n", |
454 | 452 | "\n", |
|
0 commit comments