1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " code" ,
19+ "source" : [
"%pip install -q \"transformers>=4.49\" \"optimum[onnxruntime]>=1.20.0\" \"datasets>=2.20\" \"evaluate>=0.4\" accelerate\n",
"\n",
"from pathlib import Path\n",
"import os, time, numpy as np, torch\n",
"from datasets import load_dataset\n",
"import evaluate\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline\n",
"from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer\n",
"from optimum.onnxruntime.configuration import QuantizationConfig\n",
"\n",
"# Pin BLAS/OpenMP to one thread so CPU latency numbers are stable across runs\n",
"# (the closing notes suggest tuning these for throughput instead).\n",
"os.environ.setdefault(\"OMP_NUM_THREADS\", \"1\")\n",
"os.environ.setdefault(\"MKL_NUM_THREADS\", \"1\")\n",
"\n",
"MODEL_ID = \"distilbert-base-uncased-finetuned-sst-2-english\"\n",
"ORT_DIR = Path(\"onnx-distilbert\")      # exported FP32 ONNX model lands here\n",
"Q_DIR = Path(\"onnx-distilbert-quant\")  # dynamically quantized INT8 model\n",
"DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"BATCH = 16    # inference batch size\n",
"MAXLEN = 128  # max tokenized sequence length\n",
"N_WARM = 3    # warm-up passes before timing\n",
"N_ITERS = 8   # timed passes\n",
"\n",
"print(f\"Device: {DEVICE} | torch={torch.__version__}\")"
43+ ],
44+ "metadata" : {
45+ "colab" : {
46+ "base_uri" : " https://localhost:8080/"
47+ },
48+ "id" : " Eli2cXUjJsiT" ,
49+ "outputId" : " 7a623347-afc8-4620-99b2-432ca491fac2"
50+ },
51+ "execution_count" : 3 ,
52+ "outputs" : [
53+ {
54+ "output_type" : " stream" ,
55+ "name" : " stdout" ,
56+ "text" : [
57+ " Device: cpu | torch=2.8.0+cu126\n "
58+ ]
59+ }
60+ ]
61+ },
62+ {
63+ "cell_type" : " code" ,
64+ "source" : [
"ds = load_dataset(\"glue\", \"sst2\", split=\"validation[:20%]\")\n",
"texts, labels = ds[\"sentence\"], ds[\"label\"]\n",
"metric = evaluate.load(\"accuracy\")\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
"\n",
"def make_batches(texts, max_len=MAXLEN, batch=BATCH):\n",
"    \"\"\"Yield tokenized mini-batches of `texts` as PyTorch tensors.\"\"\"\n",
"    for start in range(0, len(texts), batch):\n",
"        chunk = texts[start:start + batch]\n",
"        yield tokenizer(chunk, padding=True, truncation=True,\n",
"                        max_length=max_len, return_tensors=\"pt\")\n",
"\n",
"def run_eval(predict_fn, texts, labels):\n",
"    \"\"\"Accuracy of `predict_fn` (token batch -> label ids) over `texts`.\"\"\"\n",
"    preds = [p for toks in make_batches(texts) for p in predict_fn(toks)]\n",
"    return metric.compute(predictions=preds, references=labels)[\"accuracy\"]\n",
"\n",
"def bench(predict_fn, texts, n_warm=N_WARM, n_iters=N_ITERS):\n",
"    \"\"\"Mean/std wall-clock time (ms) for a full pass over `texts`, after warm-up.\"\"\"\n",
"    warm_subset = texts[:BATCH * 2]  # small slice is enough to warm caches/JIT\n",
"    for _ in range(n_warm):\n",
"        for toks in make_batches(warm_subset):\n",
"            predict_fn(toks)\n",
"    times_ms = []\n",
"    for _ in range(n_iters):\n",
"        started = time.time()\n",
"        for toks in make_batches(texts):\n",
"            predict_fn(toks)\n",
"        times_ms.append((time.time() - started) * 1000)\n",
"    return float(np.mean(times_ms)), float(np.std(times_ms))"
92+ ],
93+ "metadata" : {
94+ "id" : " 7E_aUakIJyWU"
95+ },
96+ "execution_count" : 4 ,
97+ "outputs" : []
98+ },
99+ {
100+ "cell_type" : " code" ,
101+ "source" : [
"torch_model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID).to(DEVICE).eval()\n",
"\n",
"@torch.no_grad()\n",
"def pt_predict(toks):\n",
"    \"\"\"Eager PyTorch forward pass -> list of predicted label ids.\"\"\"\n",
"    toks = {k: v.to(DEVICE) for k, v in toks.items()}\n",
"    logits = torch_model(**toks).logits\n",
"    return logits.argmax(-1).detach().cpu().tolist()\n",
"\n",
"pt_ms, pt_sd = bench(pt_predict, texts)\n",
"pt_acc = run_eval(pt_predict, texts, labels)\n",
"print(f\"[PyTorch eager] {pt_ms:.1f}±{pt_sd:.1f} ms | acc={pt_acc:.4f}\")\n",
"\n",
"compiled_model = torch_model\n",
"compile_ok = False\n",
"try:\n",
"    candidate = torch.compile(torch_model, mode=\"reduce-overhead\", fullgraph=False)\n",
"    # torch.compile is lazy: backend failures surface at the *first forward*,\n",
"    # not at compile() time. Trial-run one tiny batch inside the guard so a\n",
"    # broken backend falls back to eager instead of crashing the benchmark.\n",
"    with torch.no_grad():\n",
"        probe = tokenizer(texts[:2], padding=True, truncation=True,\n",
"                          max_length=MAXLEN, return_tensors=\"pt\")\n",
"        candidate(**{k: v.to(DEVICE) for k, v in probe.items()})\n",
"    compiled_model = candidate\n",
"    compile_ok = True\n",
"except Exception as e:\n",
"    print(\"torch.compile unavailable or failed -> skipping:\", repr(e))\n",
"\n",
"@torch.no_grad()\n",
"def ptc_predict(toks):\n",
"    \"\"\"Forward pass through the torch.compile'd model -> list of label ids.\"\"\"\n",
"    toks = {k: v.to(DEVICE) for k, v in toks.items()}\n",
"    logits = compiled_model(**toks).logits\n",
"    return logits.argmax(-1).detach().cpu().tolist()\n",
"\n",
"if compile_ok:\n",
"    ptc_ms, ptc_sd = bench(ptc_predict, texts)\n",
"    ptc_acc = run_eval(ptc_predict, texts, labels)\n",
"    print(f\"[torch.compile] {ptc_ms:.1f}±{ptc_sd:.1f} ms | acc={ptc_acc:.4f}\")"
132+ ],
133+ "metadata" : {
134+ "colab" : {
135+ "base_uri" : " https://localhost:8080/"
136+ },
137+ "id" : " 8uYCXTJHJ3cu" ,
138+ "outputId" : " 4cf2584c-d8f8-4ee5-83fb-47a68164a809"
139+ },
140+ "execution_count" : null ,
141+ "outputs" : [
142+ {
143+ "output_type" : " stream" ,
144+ "name" : " stdout" ,
145+ "text" : [
146+ " [PyTorch eager] 20008.8±6119.6 ms | acc=0.9080\n "
147+ ]
148+ }
149+ ]
150+ },
151+ {
152+ "cell_type" : " code" ,
153+ "source" : [
"provider = \"CUDAExecutionProvider\" if DEVICE == \"cuda\" else \"CPUExecutionProvider\"\n",
"\n",
"# Export to ONNX and *persist* the exported files: `cache_dir` only stores the\n",
"# HF download cache, so ORTQuantizer.from_pretrained(ORT_DIR) would not find a\n",
"# model there. save_pretrained() writes model.onnx + config into ORT_DIR.\n",
"ort_model = ORTModelForSequenceClassification.from_pretrained(\n",
"    MODEL_ID, export=True, provider=provider\n",
")\n",
"ort_model.save_pretrained(ORT_DIR)\n",
"\n",
"@torch.no_grad()\n",
"def ort_predict(toks):\n",
"    \"\"\"ONNX Runtime forward pass -> list of predicted label ids.\"\"\"\n",
"    logits = ort_model(**{k: v.cpu() for k, v in toks.items()}).logits\n",
"    return logits.argmax(-1).cpu().tolist()\n",
"\n",
"ort_ms, ort_sd = bench(ort_predict, texts)\n",
"ort_acc = run_eval(ort_predict, texts, labels)\n",
"print(f\"[ONNX Runtime] {ort_ms:.1f}±{ort_sd:.1f} ms | acc={ort_acc:.4f}\")\n",
"\n",
"# Dynamic (weight-only) INT8 quantization. QuantizationConfig has no\n",
"# `approach` kwarg, and ORTQuantizer.quantize() has no `model_input` kwarg —\n",
"# the documented path is an AutoQuantizationConfig preset with is_static=False.\n",
"from optimum.onnxruntime.configuration import AutoQuantizationConfig\n",
"\n",
"Q_DIR.mkdir(parents=True, exist_ok=True)\n",
"quantizer = ORTQuantizer.from_pretrained(ORT_DIR)\n",
"qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)\n",
"quantizer.quantize(quantization_config=qconfig, save_dir=Q_DIR)\n",
"\n",
"ort_quant = ORTModelForSequenceClassification.from_pretrained(Q_DIR, provider=provider)\n",
"\n",
"@torch.no_grad()\n",
"def ortq_predict(toks):\n",
"    \"\"\"Quantized ONNX Runtime forward pass -> list of predicted label ids.\"\"\"\n",
"    logits = ort_quant(**{k: v.cpu() for k, v in toks.items()}).logits\n",
"    return logits.argmax(-1).cpu().tolist()\n",
"\n",
"oq_ms, oq_sd = bench(ortq_predict, texts)\n",
"oq_acc = run_eval(ortq_predict, texts, labels)\n",
"print(f\"[ORT Quantized] {oq_ms:.1f}±{oq_sd:.1f} ms | acc={oq_acc:.4f}\")"
183+ ],
184+ "metadata" : {
185+ "id" : " znWVaD4BJ7Nc"
186+ },
187+ "execution_count" : null ,
188+ "outputs" : []
189+ },
190+ {
191+ "cell_type" : " code" ,
192+ "execution_count" : null ,
193+ "metadata" : {
194+ "id" : " hHqiX_aNGjsI"
195+ },
196+ "outputs" : [],
197+ "source" : [
"pt_pipe = pipeline(\"sentiment-analysis\", model=torch_model, tokenizer=tokenizer,\n",
"                   device=0 if DEVICE == \"cuda\" else -1)\n",
"ort_pipe = pipeline(\"sentiment-analysis\", model=ort_model, tokenizer=tokenizer, device=-1)\n",
"\n",
"samples = [\n",
"    \"What a fantastic movie—performed brilliantly!\",\n",
"    \"This was a complete waste of time.\",\n",
"    \"I’m not sure how I feel about this one.\"\n",
"]\n",
"print(\"\\nSample predictions (PT | ORT):\")\n",
"for text in samples:\n",
"    pt_label = pt_pipe(text)[0][\"label\"]\n",
"    ort_label = ort_pipe(text)[0][\"label\"]\n",
"    print(f\"- {text}\\n  PT={pt_label} | ORT={ort_label}\")\n",
"\n",
"import pandas as pd\n",
"\n",
"# Same row order as before: torch.compile (when it ran) sits right after eager.\n",
"rows = [[\"PyTorch eager\", pt_ms, pt_sd, pt_acc]]\n",
"if compile_ok:\n",
"    rows.append([\"torch.compile\", ptc_ms, ptc_sd, ptc_acc])\n",
"rows.append([\"ONNX Runtime\", ort_ms, ort_sd, ort_acc])\n",
"rows.append([\"ORT Quantized\", oq_ms, oq_sd, oq_acc])\n",
"df = pd.DataFrame(rows, columns=[\"Engine\", \"Mean ms (↓)\", \"Std ms\", \"Accuracy\"])\n",
"display(df)\n",
"\n",
"print(\"\"\"\n",
"Notes:\n",
"- BetterTransformer is deprecated on transformers>=4.49, hence omitted.\n",
"- For larger gains on GPU, also try FlashAttention2 models or FP8 with TensorRT-LLM.\n",
"- For CPU, tune threads: set OMP_NUM_THREADS/MKL_NUM_THREADS; try NUMA pinning.\n",
"- For static (calibrated) quantization, use QuantizationConfig(approach='static') with a calibration set.\n",
"\"\"\")"
227+ ]
228+ }
229+ ]
230+ }
0 commit comments