|
611 | 611 | " raise ValueError(\"Do not override base_model flag here.\")\n", |
612 | 612 | "axolotl_flag_overrides.append(f\"--base_model={HF_MODEL_ID}\")\n", |
613 | 613 | "\n", |
614 | | - "# Set model_id and publisher. This is required for Vertex AI fine-tuning job and Vertex AI model deployment.\n", |
615 | | - "publisher = HF_MODEL_ID.split(\"/\")[0]\n", |
616 | | - "model_id = HF_MODEL_ID.split(\"/\")[1]\n", |
| 614 | + "base_model = axolotl_config[\"base_model\"]\n", |
| 615 | + "for overrides in axolotl_flag_overrides:\n", |
| 616 | + " if overrides.startswith(\"--base_model=\"):\n", |
| 617 | + " base_model = overrides.split(\"=\")[1]\n", |
| 618 | + " break\n", |
| 619 | + "publisher = base_model.split(\"/\")[0]\n", |
| 620 | + "model_id = base_model.split(\"/\")[1]\n", |
617 | 621 | "model_id = model_id.replace(\".\", \"-\")" |
618 | 622 | ] |
619 | 623 | }, |
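For reference, the replacement in the hunk above resolves the effective base model (the config value unless a `--base_model=` flag override is present) before splitting it into `publisher` and `model_id`. A minimal standalone sketch of that parsing, using hypothetical values in place of the notebook's config and override list:

# Hypothetical stand-ins for the notebook's axolotl config and flag overrides.
axolotl_config = {"base_model": "Qwen/Qwen3-8B"}
axolotl_flag_overrides = ["--base_model=Qwen/Qwen2.5-7B-Instruct"]

base_model = axolotl_config["base_model"]
for override in axolotl_flag_overrides:
    if override.startswith("--base_model="):
        # A flag override, when present, wins over the config value.
        base_model = override.split("=", 1)[1]
        break

publisher, model_id = base_model.split("/")
model_id = model_id.replace(".", "-")  # dots swapped for dashes, as the notebook does
print(publisher, model_id)  # Qwen Qwen2-5-7B-Instruct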
|
731 | 735 | "CLI_PROMPT = \"What is car?\" # @param {type:\"string\"}\n", |
732 | 736 | "\n", |
733 | 737 | "if INFERENCE_METHOD == \"gradio\":\n", |
734 | | - " ! cd axolotl && export CUDA_VISIBLE_DEVICES=0 && axolotl inference --base-model=$HF_MODEL_ID $local_config_path --lora-model-dir=$AXOLOTL_OUTPUT_DIR --gradio\n", |
| 738 | + " ! cd axolotl && export CUDA_VISIBLE_DEVICES=0 && axolotl inference --base-model=$base_model $local_config_path --lora-model-dir=$AXOLOTL_OUTPUT_DIR --gradio\n", |
735 | 739 | "elif INFERENCE_METHOD == \"cli\":\n", |
736 | 740 | " assert CLI_PROMPT, \"CLI_PROMPT must be set if INFERENCE_METHOD is 'cli'.\"\n", |
737 | 741 | " env = os.environ.copy()\n", |
|
740 | 744 | " \"axolotl\",\n", |
741 | 745 | " \"inference\",\n", |
742 | 746 | " local_config_path,\n", |
743 | | - " f\"--base-model={HF_MODEL_ID}\",\n", |
| 747 | + " f\"--base-model={base_model}\",\n", |
744 | 748 | " f\"--lora-model-dir={AXOLOTL_OUTPUT_DIR}\",\n", |
745 | 749 | " ]\n", |
746 | 750 | " run_cmd_and_check_output(cmd, env, f\"{CLI_PROMPT}\\x04\", f\"{WORKING_DIR}/axolotl/\")\n", |
|
775 | 779 | " \"python3\",\n", |
776 | 780 | " \"-m\",\n", |
777 | 781 | " \"axolotl.cli.merge_lora\",\n", |
778 | | - " f\"--base-model={HF_MODEL_ID}\",\n", |
| 782 | + " f\"--base-model={base_model}\",\n", |
779 | 783 | " f\"--output-dir={AXOLOTL_OUTPUT_DIR}\",\n", |
780 | 784 | " local_config_path,\n", |
781 | 785 | "]\n", |
|
1068 | 1072 | " dtype: str | None = None,\n", |
1069 | 1073 | " enable_trust_remote_code: bool = False,\n", |
1070 | 1074 | " enable_torch_compile: bool = False,\n", |
| 1075 | + " torch_compile_max_bs: int | None = None,\n", |
| 1076 | + " attention_backend: str = \"\",\n", |
1071 | 1077 | " enable_flashinfer_mla: bool = False,\n", |
1072 | 1078 | " disable_cuda_graph: bool = False,\n", |
1073 | 1079 | " speculative_algorithm: str | None = None,\n", |
|
1115 | 1121 | "\n", |
1116 | 1122 | " if enable_torch_compile:\n", |
1117 | 1123 | " sglang_args.append(\"--enable-torch-compile\")\n", |
| 1124 | + " if torch_compile_max_bs:\n", |
| 1125 | + " sglang_args.append(f\"--torch-compile-max-bs={torch_compile_max_bs}\")\n", |
| 1126 | + "\n", |
| 1127 | + " if attention_backend:\n", |
| 1128 | + " sglang_args.append(f\"--attention-backend={attention_backend}\")\n", |
1118 | 1129 | "\n", |
1119 | 1130 | " if enable_flashinfer_mla:\n", |
1120 | 1131 | " sglang_args.append(\"--enable-flashinfer-mla\")\n", |
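The two new deployment parameters are forwarded to the SGLang server as plain `--flag=value` arguments. A small self-contained sketch of that argument assembly, with hypothetical values and flattened control flow rather than the notebook's exact function:

# Hypothetical parameter values for illustration only.
enable_torch_compile = True
torch_compile_max_bs = 8
attention_backend = "flashinfer"

sglang_args = []
if enable_torch_compile:
    sglang_args.append("--enable-torch-compile")
if torch_compile_max_bs:
    sglang_args.append(f"--torch-compile-max-bs={torch_compile_max_bs}")
if attention_backend:
    sglang_args.append(f"--attention-backend={attention_backend}")

print(sglang_args)
# ['--enable-torch-compile', '--torch-compile-max-bs=8', '--attention-backend=flashinfer']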
|
1144 | 1155 | " if enable_jit_deepgemm:\n", |
1145 | 1156 | " env_vars[\"SGL_ENABLE_JIT_DEEPGEMM\"] = \"1\"\n", |
1146 | 1157 | "\n", |
| 1158 | + " # HF_TOKEN is not a compulsory field and may not be defined.\n", |
| 1159 | + " try:\n", |
| 1160 | + " if HF_TOKEN:\n", |
| 1161 | + " env_vars[\"HF_TOKEN\"] = HF_TOKEN\n", |
| 1162 | + " except NameError:\n", |
| 1163 | + " pass\n", |
| 1164 | + "\n", |
1147 | 1165 | " model = aiplatform.Model.upload(\n", |
1148 | 1166 | " display_name=model_name,\n", |
1149 | 1167 | " serving_container_image_uri=SGLANG_DOCKER_URI,\n", |
|
1186 | 1204 | " \"maxReplicaCount\": 1,\n", |
1187 | 1205 | " },\n", |
1188 | 1206 | " \"system_labels\": {\n", |
1189 | | - " \"NOTEBOOK_NAME\": \"model_garden_axolotl_finetuning.ipynb\",\n", |
| 1207 | + " \"NOTEBOOK_NAME\": \"model_garden_axolotl_qwen3_finetuning.ipynb\",\n", |
1190 | 1208 | " \"NOTEBOOK_ENVIRONMENT\": common_util.get_deploy_source(),\n", |
1191 | 1209 | " },\n", |
1192 | 1210 | " },\n", |
|
1268 | 1286 | "prompt = \"<|im_start|>user What is the best way to diagnose and fix a flickering light in my house?<|im_end|><|im_start|>assistant\" # @param {type: \"string\"}\n", |
1269 | 1287 | "\n", |
1270 | 1288 | "# @markdown By default, Qwen3 has thinking capabilities enabled, similar to QwQ-32B. This means the model will use its reasoning abilities to enhance the quality of generated responses.\n", |
1271 | | - "# @markdown The model will generate think content wrapped in a \\...\\ block, followed by the final response.\n", |
| 1289 | + "# @markdown The model will generate think content wrapped in a \\<think>...\\</think> block, followed by the final response.\n", |
1272 | 1290 | "# @markdown `max_new_tokens` may need to be increased to accommodate the additional think content.\n", |
1273 | 1291 | "enable_thinking = True # @param {type:\"boolean\"}\n", |
1274 | 1292 | "if not enable_thinking:\n", |
1275 | | - " prompt += \"\"\n", |
| 1293 | + " prompt += \"<think></think>\"\n", |
1276 | 1294 | "\n", |
1277 | 1295 | "\n", |
1278 | 1296 | "max_new_tokens = 1024 # @param {type:\"integer\"}\n", |
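To make the prompt handling above concrete: when thinking is disabled, the notebook appends an empty think block to the chat-formatted prompt, which signals the model to skip its reasoning trace. A tiny sketch with a hypothetical prompt string:

# Hypothetical prompt; the notebook builds it from the @param field above.
prompt = "<|im_start|>user What is car?<|im_end|><|im_start|>assistant"
enable_thinking = False
if not enable_thinking:
    # An empty <think></think> block suppresses the model's reasoning output.
    prompt += "<think></think>"
print(prompt)  # ...<|im_start|>assistant<think></think>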
|
1310 | 1328 | ")\n", |
1311 | 1329 | "\n", |
1312 | 1330 | "for prediction in response.predictions:\n", |
1313 | | - " print(prediction)\n", |
1314 | | - "\n", |
1315 | | - "# @markdown Click \"Show Code\" to see more details." |
| 1331 | + " print(prediction)" |
1316 | 1332 | ] |
1317 | 1333 | }, |
1318 | 1334 | { |
|