|
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | 5 | "execution_count": null, |
6 | | - "id": "Pr9TgOcV9vAXeqGiyTaTI5kS", |
7 | 6 | "metadata": { |
8 | 7 | "cellView": "form", |
9 | 8 | "id": "Pr9TgOcV9vAXeqGiyTaTI5kS" |
|
27 | 26 | }, |
28 | 27 | { |
29 | 28 | "cell_type": "markdown", |
30 | | - "id": "M1CpgYundFwz", |
31 | 29 | "metadata": { |
32 | 30 | "id": "M1CpgYundFwz" |
33 | 31 | }, |
|
50 | 48 | }, |
51 | 49 | { |
52 | 50 | "cell_type": "markdown", |
53 | | - "id": "t2jj2XOgkS4F", |
54 | 51 | "metadata": { |
55 | 52 | "id": "t2jj2XOgkS4F" |
56 | 53 | }, |
|
124 | 121 | { |
125 | 122 | "cell_type": "code", |
126 | 123 | "execution_count": null, |
127 | | - "id": "XMf-T58TkDy1", |
128 | 124 | "metadata": { |
129 | 125 | "cellView": "form", |
130 | 126 | "id": "XMf-T58TkDy1" |
|
155 | 151 | { |
156 | 152 | "cell_type": "code", |
157 | 153 | "execution_count": null, |
158 | | - "id": "IKGTaN84p8rX", |
159 | 154 | "metadata": { |
160 | 155 | "cellView": "form", |
161 | 156 | "id": "IKGTaN84p8rX" |
162 | 157 | }, |
163 | 158 | "outputs": [], |
164 | 159 | "source": [ |
165 | | - "# @title # Chat completion for text-only models { vertical-output: true}\n", |
166 | | - "# @markdown You may send prompts to the model server for prediction.\n", |
| 160 | + "# @title # Chat completion for text-only models {vertical-output: true}\n", |
| 161 | + "# @markdown Run cell to prompt the model server for prediction.\n", |
167 | 162 | "# @markdown\n", |
168 | 163 | "# @markdown * **user_prompt (string):** This is the text prompt you provide to the language model. It's the question or instruction e (e.g., \"Explain neural networks\").\n", |
169 | 164 | "# @markdown * **temperature (number):** This parameter controls the randomness of the model's output. It influences how the model selects the next token in the sequence it generates. Typical values range from 0.2 to 1.0.\n", |
|
180 | 175 | "REGION = \"\" # @param {type:\"string\", isTemplate:true}\n", |
181 | 176 | "NAMESPACE = \"\" # @param {type:\"string\", isTemplate:true}\n", |
182 | 177 | "DEPLOYMENT = \"\" # @param {type:\"string\", isTemplate:true}\n", |
183 | | - "DEPLOYMENT_APP_LABEL = \"\" # @param {type:\"string\", isTemplate:true}\n", |
| 178 | + "POD_PORT = \"\" # @param {type:\"string\", isTemplate:true}\n", |
184 | 179 | "\n", |
185 | | - "SERVICE = f\"{DEPLOYMENT}-service\"\n", |
186 | 180 | "\n", |
187 | | - "\n", |
188 | | - "def _run_kubectl(cmd):\n", |
189 | | - " \"\"\"Executes a kubectl command and returns its stdout.\"\"\"\n", |
190 | | - " result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=60)\n", |
191 | | - " return result.stdout.strip()\n", |
192 | | - "\n", |
193 | | - "\n", |
194 | | - "def fetch_cluster_credential(cluster, region, project_id):\n", |
| 181 | + "def _run_kubectl(cmd, timeout=60):\n", |
| 182 | + " \"\"\"Executes a kubectl command.\"\"\"\n", |
195 | 183 | " try:\n", |
196 | | - " # Ensure credentials for the target cluster\n", |
197 | | - " cred_cmd = [\n", |
198 | | - " \"gcloud\",\n", |
199 | | - " \"container\",\n", |
200 | | - " \"clusters\",\n", |
201 | | - " \"get-credentials\",\n", |
202 | | - " cluster,\n", |
203 | | - " f\"--location={region}\",\n", |
204 | | - " f\"--project={project_id}\",\n", |
205 | | - " ]\n", |
206 | | - " _run_kubectl(cred_cmd)\n", |
207 | | - " except Exception as e:\n", |
208 | | - " # Original code prints error and returns empty dict\n", |
209 | | - " print(f\"Error fetching cluster credentials: {e}\")\n", |
210 | | - " return {}\n", |
211 | | - "\n", |
212 | | - "\n", |
213 | | - "def get_deployment_pod_name(deployment, namespace, deployment_app_label):\n", |
214 | | - " \"\"\"Finds the running pod name for a given deployment and namespace.\"\"\"\n", |
| 184 | + " result = subprocess.run(\n", |
| 185 | + " cmd, capture_output=True, text=True, check=True, timeout=timeout\n", |
| 186 | + " )\n", |
| 187 | + " return result.stdout.strip()\n", |
| 188 | + " except subprocess.CalledProcessError as e:\n", |
| 189 | + " raise RuntimeError(\n", |
| 190 | + " f\"Kubectl command failed: {' '.join(e.cmd)}\\nStderr: {e.stderr}\"\n", |
| 191 | + " ) from e\n", |
| 192 | + " except subprocess.TimeoutExpired as e:\n", |
| 193 | + " raise RuntimeError(f\"Kubectl command timed out: {' '.join(e.cmd)}\") from e\n", |
| 194 | + "\n", |
| 195 | + "\n", |
| 196 | + "def fetch_cluster_credentials(cluster, region, project_id):\n", |
| 197 | + " \"\"\"Ensures credentials for the target GKE cluster.\"\"\"\n", |
| 198 | + " cred_cmd = [\n", |
| 199 | + " \"gcloud\",\n", |
| 200 | + " \"container\",\n", |
| 201 | + " \"clusters\",\n", |
| 202 | + " \"get-credentials\",\n", |
| 203 | + " cluster,\n", |
| 204 | + " f\"--location={region}\",\n", |
| 205 | + " f\"--project={project_id}\",\n", |
| 206 | + " ]\n", |
| 207 | + " _run_kubectl(cred_cmd)\n", |
215 | 208 | "\n", |
| 209 | + "\n", |
| 210 | + "def get_deployment_selector_labels(deployment_name, namespace):\n", |
| 211 | + " \"\"\"Retrieves the selector labels for a given Kubernetes deployment.\"\"\"\n", |
216 | 212 | " cmd = [\n", |
217 | 213 | " \"kubectl\",\n", |
218 | 214 | " \"get\",\n", |
219 | | - " \"pods\",\n", |
| 215 | + " \"deployment\",\n", |
| 216 | + " deployment_name,\n", |
220 | 217 | " \"-n\",\n", |
221 | 218 | " namespace,\n", |
222 | 219 | " \"-o\",\n", |
223 | 220 | " \"json\",\n", |
224 | | - " \"-l\",\n", |
225 | | - " f\"app={deployment_app_label}\",\n", |
226 | | - " \"--field-selector=status.phase=Running\",\n", |
227 | 221 | " ]\n", |
228 | | - " try:\n", |
229 | | - " pods_json = _run_kubectl(cmd)\n", |
230 | | - " pods = json.loads(pods_json)\n", |
231 | | - " if pods.get(\"items\"):\n", |
232 | | - " return pods[\"items\"][0][\"metadata\"][\"name\"]\n", |
233 | | - " print(f\"No running pods found for {deployment} in {namespace}.\")\n", |
234 | | - " return None\n", |
235 | | - " except (\n", |
236 | | - " subprocess.CalledProcessError,\n", |
237 | | - " json.JSONDecodeError,\n", |
238 | | - " IndexError,\n", |
239 | | - " KeyError,\n", |
240 | | - " ) as e:\n", |
241 | | - " print(f\"Error getting pod name for {deployment} in {namespace}: {e}\")\n", |
242 | | - " return None\n", |
243 | | - "\n", |
244 | | - "\n", |
245 | | - "def check_inference_label(pod_name, namespace):\n", |
246 | | - " \"\"\"Checks if the specified pod has the vLLM inference server label.\"\"\"\n", |
| 222 | + " deployment_json = _run_kubectl(cmd)\n", |
| 223 | + " deployment_data = json.loads(deployment_json)\n", |
| 224 | + "\n", |
| 225 | + " selector_labels = (\n", |
| 226 | + " deployment_data.get(\"spec\", {}).get(\"selector\", {}).get(\"matchLabels\")\n", |
| 227 | + " )\n", |
| 228 | + " if not selector_labels:\n", |
| 229 | + " raise RuntimeError(\n", |
| 230 | + " f\"No selector labels found for deployment '{deployment_name}' in\"\n", |
| 231 | + " f\" namespace '{namespace}'.\"\n", |
| 232 | + " )\n", |
| 233 | + " return selector_labels\n", |
247 | 234 | "\n", |
248 | | - " cmd = [\"kubectl\", \"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"json\"]\n", |
249 | | - " try:\n", |
250 | | - " pod_json = _run_kubectl(cmd)\n", |
251 | | - " labels = json.loads(pod_json).get(\"metadata\", {}).get(\"labels\", {})\n", |
252 | | - " return labels.get(\"ai.gke.io/inference-server\") == \"vllm\"\n", |
253 | | - " except (subprocess.CalledProcessError, json.JSONDecodeError, KeyError) as e:\n", |
254 | | - " print(f\"Error checking labels for pod {pod_name} in {namespace}: {e}\")\n", |
255 | | - " return False\n", |
256 | 235 | "\n", |
| 236 | + "def get_running_pod_name(deployment_name, namespace):\n", |
| 237 | + " \"\"\"Retrieves the name of a running pod associated with a deployment.\"\"\"\n", |
| 238 | + " selector_labels = get_deployment_selector_labels(deployment_name, namespace)\n", |
| 239 | + " label_selector_str = \",\".join(f\"{k}={v}\" for k, v in selector_labels.items())\n", |
257 | 240 | "\n", |
258 | | - "def get_service_endpoint(service, namespace):\n", |
259 | | - " \"\"\"Retrieve the service endpoint of the deployment\"\"\"\n", |
260 | | - " endpoint_cmd = [\n", |
| 241 | + " cmd = [\n", |
261 | 242 | " \"kubectl\",\n", |
262 | 243 | " \"get\",\n", |
263 | | - " \"endpoints\",\n", |
264 | | - " service,\n", |
| 244 | + " \"pods\",\n", |
265 | 245 | " \"-n\",\n", |
266 | 246 | " namespace,\n", |
| 247 | + " \"-o\",\n", |
| 248 | + " \"json\",\n", |
| 249 | + " \"-l\",\n", |
| 250 | + " label_selector_str,\n", |
| 251 | + " \"--field-selector=status.phase=Running\",\n", |
267 | 252 | " ]\n", |
268 | | - " try:\n", |
269 | | - " endpoint_output = _run_kubectl(endpoint_cmd).splitlines()\n", |
270 | | - " if len(endpoint_output) < 2 or len(endpoint_output[1].split()) < 2:\n", |
271 | | - " print(f\"Endpoint data incomplete for {service}.\")\n", |
272 | | - " return None\n", |
273 | | - " endpoint = endpoint_output[1].split()[\n", |
274 | | - " 1\n", |
275 | | - " ] # Assumes format: NAME ENDPOINTS AGE -> service ip:port,... age\n", |
276 | | - " return endpoint\n", |
277 | | - " except subprocess.CalledProcessError as e:\n", |
278 | | - " print(f\"Error getting endpoints for {service}: {e}\")\n", |
279 | | - " return None\n", |
| 253 | + " pods_json = _run_kubectl(cmd)\n", |
| 254 | + " pods_data = json.loads(pods_json)\n", |
| 255 | + "\n", |
| 256 | + " if not pods_data.get(\"items\"):\n", |
| 257 | + " raise RuntimeError(\n", |
| 258 | + " f\"No running pods found for deployment '{deployment_name}' in namespace\"\n", |
| 259 | + " f\" '{namespace}' with selector '{label_selector_str}'.\"\n", |
| 260 | + " )\n", |
| 261 | + " return pods_data[\"items\"][0][\"metadata\"][\"name\"]\n", |
280 | 262 | "\n", |
281 | 263 | "\n", |
282 | | - "def process_response(request, pod_name, pod_endpoint, is_vllm_inference, namespace):\n", |
283 | | - " \"\"\"Sends a request to the pod and processes the response.\"\"\"\n", |
| 264 | + "def check_vllm_inference_label(pod_name, namespace):\n", |
| 265 | + " \"\"\"Checks if the specified pod has the vLLM inference server label.\"\"\"\n", |
| 266 | + " cmd = [\"kubectl\", \"get\", \"pod\", pod_name, \"-n\", namespace, \"-o\", \"json\"]\n", |
| 267 | + " pod_json = _run_kubectl(cmd)\n", |
| 268 | + " labels = json.loads(pod_json).get(\"metadata\", {}).get(\"labels\", {})\n", |
| 269 | + " return labels.get(\"ai.gke.io/inference-server\") == \"vllm\"\n", |
284 | 270 | "\n", |
285 | | - " json_data_escaped = json.dumps(request).replace(\"'\", \"'\\\\''\")\n", |
| 271 | + "\n", |
| 272 | + "def send_inference_request(\n", |
| 273 | + " request_payload, pod_name, pod_port, is_vllm_inference, namespace\n", |
| 274 | + "):\n", |
| 275 | + " \"\"\"Sends an inference request to the specified pod and returns the model's response.\"\"\"\n", |
| 276 | + " json_data_escaped = json.dumps(request_payload).replace(\"'\", \"'\\\\''\")\n", |
286 | 277 | " curl_cmd = (\n", |
287 | 278 | " f\"kubectl exec -n {namespace} -t {pod_name} -- curl -s -X POST\"\n", |
288 | | - " f' http://{pod_endpoint}/generate -H \"Content-Type: application/json\"'\n", |
289 | | - " f\" -d '{json_data_escaped}' 2> /dev/null\"\n", |
| 279 | + " f' http://localhost:{pod_port}/generate -H \"Content-Type:'\n", |
| 280 | + " ' application/json\"'\n", |
| 281 | + " f\" -d '{json_data_escaped}' 2\u003e /dev/null\"\n", |
290 | 282 | " )\n", |
| 283 | + "\n", |
| 284 | + " response_raw = _run_kubectl([\"bash\", \"-c\", curl_cmd])\n", |
| 285 | + "\n", |
| 286 | + " if not response_raw:\n", |
| 287 | + " raise RuntimeError(f\"Empty response received from pod '{pod_name}'.\")\n", |
| 288 | + "\n", |
291 | 289 | " try:\n", |
292 | | - " response_raw = _run_kubectl([\"bash\", \"-c\", curl_cmd])\n", |
293 | | - " if not response_raw:\n", |
294 | | - " return f\"Error: Empty response from pod {pod_name}.\"\n", |
295 | 290 | " first_line = response_raw.splitlines()[0]\n", |
296 | 291 | " data = json.loads(first_line)\n", |
| 292 | + " except json.JSONDecodeError as e:\n", |
| 293 | + " raise RuntimeError(\n", |
| 294 | + " f\"Failed to decode JSON response from pod: {e}. Raw: {response_raw}\"\n", |
| 295 | + " ) from e\n", |
| 296 | + " except IndexError:\n", |
| 297 | + " raise RuntimeError(\n", |
| 298 | + " f\"Unexpected empty response line from pod. Raw: {response_raw}\"\n", |
| 299 | + " )\n", |
| 300 | + "\n", |
| 301 | + " if is_vllm_inference:\n", |
| 302 | + " predictions = data.get(\"predictions\")\n", |
| 303 | + " if isinstance(predictions, list) and predictions:\n", |
| 304 | + " return predictions[0]\n", |
| 305 | + " raise RuntimeError(f\"Unexpected vLLM response format. Raw data: {data}\")\n", |
| 306 | + " else: # TGI format\n", |
| 307 | + " generated_text = data.get(\"generated_text\")\n", |
| 308 | + " if generated_text is not None:\n", |
| 309 | + " return generated_text\n", |
| 310 | + " raise RuntimeError(f\"Unexpected TGI response format. Raw data: {data}\")\n", |
| 311 | + "\n", |
| 312 | + "\n", |
| 313 | + "# --- Main Execution Logic ---\n", |
| 314 | + "\n", |
| 315 | + "\n", |
| 316 | + "def execute_chat_completion(\n", |
| 317 | + " deployment_name, namespace, pod_port, user_prompt, temperature, max_tokens\n", |
| 318 | + "):\n", |
| 319 | + " \"\"\"Executes the full chat completion process: fetches credentials, finds a pod,\n", |
| 320 | + "\n", |
| 321 | + " determines inference type, sends a request, and returns the response.\n", |
| 322 | + " \"\"\"\n", |
| 323 | + " display(Markdown(\"Establishing cluster credentials...\"))\n", |
| 324 | + " fetch_cluster_credentials(CLUSTER, REGION, PROJECT_ID)\n", |
| 325 | + "\n", |
| 326 | + " display(Markdown(\"Retrieving pod information...\"))\n", |
| 327 | + " pod_name = get_running_pod_name(deployment_name, namespace)\n", |
| 328 | + " display(Markdown(f\"Successfully identified pod: `{pod_name}`\"))\n", |
| 329 | + "\n", |
| 330 | + " is_vllm = check_vllm_inference_label(pod_name, namespace)\n", |
| 331 | + "\n", |
| 332 | + " request_payload = {\n", |
| 333 | + " \"max_tokens\": max_tokens,\n", |
| 334 | + " \"temperature\": temperature,\n", |
| 335 | + " \"prompt\" if is_vllm else \"inputs\": user_prompt,\n", |
| 336 | + " }\n", |
| 337 | + " display(Markdown(\"Sending inference request...\"))\n", |
| 338 | + " response = send_inference_request(\n", |
| 339 | + " request_payload, pod_name, pod_port, is_vllm, namespace\n", |
| 340 | + " )\n", |
297 | 341 | "\n", |
298 | | - " if is_vllm_inference: # vLLM format\n", |
299 | | - " predictions = data.get(\"predictions\")\n", |
300 | | - " if isinstance(predictions, (list, tuple)) and predictions:\n", |
301 | | - " return predictions[0]\n", |
302 | | - " return f\"Error: Unexpected vLLM format. Raw: {first_line}\"\n", |
303 | | - " else: # TGI format\n", |
304 | | - " generated_text = data.get(\"generated_text\")\n", |
305 | | - " if generated_text is not None:\n", |
306 | | - " return generated_text\n", |
307 | | - " return f\"Error: Unexpected TGI format. Raw: {first_line}\"\n", |
308 | | - " except Exception as e:\n", |
309 | | - " return f\"Unexpected error during response processing: {e}\"\n", |
| 342 | + " return response\n", |
310 | 343 | "\n", |
311 | 344 | "\n", |
312 | 345 | "# --- Widgets Setup ---\n", |
|
330 | 363 | "\n", |
331 | 364 | "# --- Submit Button Logic ---\n", |
332 | 365 | "def on_submit_clicked(b):\n", |
333 | | - " \"\"\"Handles the submit button click event.\"\"\"\n", |
334 | 366 | " with output_area_response:\n", |
335 | 367 | " clear_output()\n", |
| 368 | + " display(Markdown(\"Loading...\"))\n", |
336 | 369 | "\n", |
337 | | - " fetch_cluster_credential(CLUSTER, REGION, PROJECT_ID)\n", |
338 | | - "\n", |
339 | | - " # retrieve deployment pod\n", |
340 | | - " pod_name = get_deployment_pod_name(DEPLOYMENT, NAMESPACE, DEPLOYMENT_APP_LABEL)\n", |
341 | | - " if not pod_name:\n", |
342 | | - " display(\n", |
343 | | - " Markdown(f\"**Error:** Could not find running pod for `{DEPLOYMENT}`.\")\n", |
344 | | - " )\n", |
345 | | - " return\n", |
346 | | - "\n", |
347 | | - " # build the request message\n", |
348 | | - " is_vllm = check_inference_label(pod_name, NAMESPACE)\n", |
349 | | - " request = {\n", |
350 | | - " \"max_tokens\": max_tokens_widget.value,\n", |
351 | | - " \"temperature\": temperature_widget.value,\n", |
352 | | - " \"prompt\" if is_vllm else \"inputs\": user_prompt_widget.value,\n", |
353 | | - " }\n", |
354 | | - "\n", |
355 | | - " # retrieve service endpoint for the deployment\n", |
356 | | - " endpoint = get_service_endpoint(SERVICE, NAMESPACE)\n", |
357 | | - " if not endpoint:\n", |
358 | | - " display(Markdown(f\"**Error getting endpoints for `{SERVICE}`:**\\n\"))\n", |
359 | | - " return\n", |
360 | | - "\n", |
361 | | - " # prompt test the deployment endpoint\n", |
362 | 370 | " try:\n", |
363 | | - " response = process_response(request, pod_name, endpoint, is_vllm, NAMESPACE)\n", |
364 | | - " display(Markdown(f\"**Response:**\\n\\n{response}\"))\n", |
| 371 | + " model_response = execute_chat_completion(\n", |
| 372 | + " DEPLOYMENT,\n", |
| 373 | + " NAMESPACE,\n", |
| 374 | + " POD_PORT,\n", |
| 375 | + " user_prompt_widget.value,\n", |
| 376 | + " temperature_widget.value,\n", |
| 377 | + " max_tokens_widget.value,\n", |
| 378 | + " )\n", |
| 379 | + " clear_output()\n", |
| 380 | + " display(Markdown(f\"**Response:**\\n\\n{model_response}\"))\n", |
365 | 381 | " except Exception as e:\n", |
366 | | - " display(Markdown(f\"**Unexpected Error:**\\n```\\n{e}\\n```\"))\n", |
| 382 | + " clear_output()\n", |
| 383 | + " display(Markdown(f\"**An error occurred:**\\n```\\n{e}\\n```\"))\n", |
367 | 384 | "\n", |
368 | 385 | "\n", |
369 | 386 | "# --- Display Widgets ---\n", |
|
379 | 396 | }, |
380 | 397 | { |
381 | 398 | "cell_type": "markdown", |
382 | | - "id": "5b6ZM2K3fux0", |
383 | 399 | "metadata": { |
384 | 400 | "id": "5b6ZM2K3fux0" |
385 | 401 | }, |
|
453 | 469 | "metadata": { |
454 | 470 | "colab": { |
455 | 471 | "name": "gke_model_ui_deployment_notebook_auto.ipynb", |
456 | | - "toc_visible": true |
| 472 | + "provenance": [] |
457 | 473 | }, |
458 | 474 | "kernelspec": { |
459 | 475 | "display_name": "Python 3", |
|