1+ # -*- coding: utf-8 -*-
2+ """How to Build an Advanced End-to-End Voice AI Agent Using Hugging Face Pipelines.ipynb
3+
4+ Automatically generated by Colab.
5+
6+ Original file is located at
7+ https://colab.research.google.com/drive/1ggPLWH3fOjLp9r2dXQLZci552ux0fWBl
8+ """
9+
10+ !pip - q install "transformers>=4.42.0" accelerate torchaudio sentencepiece gradio soundfile
11+
12+ import os , torch , tempfile , numpy as np
13+ import gradio as gr
14+ from transformers import pipeline , AutoTokenizer , AutoModelForSeq2SeqLM
15+
# HF pipeline device selector: GPU index 0 when CUDA is present, -1 = CPU.
DEVICE = 0 if torch.cuda.is_available() else -1
17+
# Speech-to-text stage: English-only Whisper (small), long audio handled in
# 30-second chunks; we only need plain text, not word/segment timestamps.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small.en",
    device=DEVICE,
    chunk_length_s=30,
    return_timestamps=False,
)
25+
# Reasoning stage: a small instruction-tuned encoder-decoder model.
# device_map="auto" lets accelerate place the weights (GPU if available).
LLM_MODEL = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL, device_map="auto")
29+
# Text-to-speech stage: Bark (small variant) via the generic HF TTS pipeline.
tts = pipeline("text-to-speech", model="suno/bark-small")
31+
# Persona/instruction text prepended to every LLM prompt (see format_dialog).
SYSTEM_PROMPT = (
    "You are a helpful, concise voice assistant."
    " Prefer direct, structured answers."
    " If the user asks for steps or code, use short bullet points."
)
37+
def format_dialog(history, user_text, system_prompt=None):
    """Build a flat instruction prompt from the chat history plus the new turn.

    The pasted source had extraction-garbled whitespace inside the f-strings
    and literals (e.g. ``f"User: { u } "``, ``"\\n "``) which injected stray
    spaces into the prompt sent to the LLM; restored to clean forms here.

    Args:
        history: list of ``(user, assistant)`` pairs; either side may be
            falsy/None and is then skipped.
        user_text: the new user utterance to append as the final turn.
        system_prompt: optional override for the instruction header; defaults
            to the module-level SYSTEM_PROMPT (kept lazy so the default tracks
            the module constant).

    Returns:
        A single prompt string ending in "Assistant:" for the model to
        continue.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT
    turns = []
    for u, a in history:
        if u:
            turns.append(f"User: {u}")
        if a:
            turns.append(f"Assistant: {a}")
    turns.append(f"User: {user_text}")
    return (
        "Instruction:\n"
        f"{system_prompt}\n\n"
        "Dialog so far:\n" + "\n".join(turns) + "\n\n"
        "Assistant:"
    )
51+
def transcribe(filepath):
    """Run Whisper ASR on an audio file and return the stripped transcript."""
    result = asr(filepath)
    return result["text"].strip()
56+
def generate_reply(history, user_text, max_new_tokens=256):
    """Generate the assistant's next reply with the seq2seq LLM.

    Builds the dialog prompt, tokenizes it (truncated to the model's max
    length), samples a continuation, and returns the decoded text.
    """
    prompt = format_dialog(history, user_text)
    enc = tok(prompt, return_tensors="pt", truncation=True).to(llm.device)
    # Inference only — no gradients needed.
    with torch.no_grad():
        out_ids = llm.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
        )
    return tok.decode(out_ids[0], skip_special_tokens=True).strip()
71+
def synthesize_speech(text):
    """Convert text to audio with Bark.

    Returns a ``(sampling_rate, samples)`` tuple, with samples coerced to a
    float32 numpy array — the shape gradio's Audio component expects.
    """
    speech = tts(text)
    samples = np.asarray(speech["audio"], dtype=np.float32)
    return speech["sampling_rate"], samples
78+
def clear_history():
    """Reset handler: empty both the Chatbot display and the State history."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state
81+
def voice_to_voice(mic_file, history):
    """Full voice turn: transcribe mic audio, generate a reply, speak it.

    The pasted source had garbled whitespace inside the user-visible
    f-strings (e.g. ``f"ASR error: { e } "``); restored to clean forms.

    Args:
        mic_file: path to the recorded audio file (gradio type="filepath"),
            or falsy when nothing was recorded.
        history: list of (user, assistant) pairs, or None on first turn.

    Returns:
        (updated_history, (sampling_rate, samples) or None, status/transcript
        text). Each pipeline stage is wrapped separately so a failure reports
        which stage broke instead of crashing the UI; a TTS failure still
        keeps the text exchange in the history.
    """
    history = history or []
    if not mic_file:
        return history, None, "Please record something!"
    try:
        user_text = transcribe(mic_file)
    except Exception as e:
        return history, None, f"ASR error: {e}"

    if not user_text:
        return history, None, "Didn't catch that. Try again?"

    try:
        reply = generate_reply(history, user_text)
    except Exception as e:
        return history, None, f"LLM error: {e}"

    try:
        sr, wav = synthesize_speech(reply)
    except Exception as e:
        # The text exchange succeeded — keep it even though audio failed.
        return history + [(user_text, reply)], None, f"TTS error: {e}"

    return history + [(user_text, reply)], (sr, wav), f"User: {user_text}\nAssistant: {reply}"
105+
def text_to_voice(user_text, history):
    """Typed-input turn: generate a reply for *user_text* and speak it.

    The pasted source had garbled whitespace inside the transcript f-string
    (``"\\n Assistant:"``); restored to a clean ``"\\nAssistant:"``.

    Args:
        user_text: raw textbox contents; may be None or whitespace-only.
        history: list of (user, assistant) pairs, or None on first turn.

    Returns:
        (updated_history, (sampling_rate, samples) or None, status/transcript
        text) — same contract as voice_to_voice.
    """
    history = history or []
    user_text = (user_text or "").strip()
    if not user_text:
        return history, None, "Type a message first."
    try:
        reply = generate_reply(history, user_text)
        sr, wav = synthesize_speech(reply)
    except Exception as e:
        return history, None, f"Error: {e}"
    return history + [(user_text, reply)], (sr, wav), f"User: {user_text}\nAssistant: {reply}"
117+
def export_chat(history):
    """Write the conversation to a temporary .txt file and return its path.

    Fixes from the pasted source: the join separator was the garbled
    ``"\\n "`` (newline + stray space), and the temp file had no explicit
    encoding — transcripts may contain non-ASCII (emoji, accents), which
    would crash on a non-UTF-8 default locale.

    Args:
        history: list of (user, assistant) pairs, or None/empty.

    Returns:
        Filesystem path of the written file. Contains a placeholder message
        when there is no conversation yet.
    """
    lines = []
    for u, a in history or []:
        # Blank line between exchanges for readability.
        lines += [f"User: {u}", f"Assistant: {a}", ""]
    text = "\n".join(lines).strip() or "No conversation yet."
    # delete=False: gradio serves the file after this handler returns.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as f:
        f.write(text)
        path = f.name
    return path
127+
# ---------------------------------------------------------------------------
# Gradio UI: input controls on the left, assistant voice + transcript on the
# right, shared Chatbot/State below. The pasted source lost all indentation
# inside this `with` block and garbled the Markdown newlines ("\n "); the
# structure is reconstructed here.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Advanced Voice AI Agent (HF Pipelines)") as demo:
    gr.Markdown(
        "## 🎙️ Advanced Voice AI Agent (Hugging Face Pipelines Only)\n"
        "- **ASR**: openai/whisper-small.en\n"
        "- **LLM**: google/flan-t5-base\n"
        "- **TTS**: suno/bark-small\n"
        "Speak or type; the agent replies with voice + text."
    )

    with gr.Row():
        with gr.Column(scale=1):
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Record")
            say_btn = gr.Button("🎤 Speak")
            text_in = gr.Textbox(label="Or type instead", placeholder="Ask me anything…")
            text_btn = gr.Button("💬 Send")
            export_btn = gr.Button("⬇️ Export Chat (.txt)")
            reset_btn = gr.Button("♻️ Reset")
        with gr.Column(scale=1):
            audio_out = gr.Audio(label="Assistant Voice", autoplay=True)
            transcript = gr.Textbox(label="Transcript", lines=6)

    chat = gr.Chatbot(height=360)
    # Canonical conversation history: list of (user, assistant) pairs.
    state = gr.State([])

    def update_chat(history):
        # Mirror the shared history into the Chatbot widget after each turn.
        return [(u, a) for u, a in (history or [])]

    say_btn.click(voice_to_voice, [mic, state], [state, audio_out, transcript]).then(
        update_chat, inputs=state, outputs=chat
    )
    text_btn.click(text_to_voice, [text_in, state], [state, audio_out, transcript]).then(
        update_chat, inputs=state, outputs=chat
    )
    reset_btn.click(clear_history, None, [chat, state])
    export_btn.click(export_chat, state, gr.File(label="Download chat.txt"))

demo.launch(debug=False)