Skip to content

Commit 7a425f3

Browse files
authored
Add files via upload
1 parent 4dc60bf commit 7a425f3

1 file changed

Lines changed: 163 additions & 0 deletions

File tree

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
# -*- coding: utf-8 -*-
2+
"""How to Build an Advanced End-to-End Voice AI Agent Using Hugging Face Pipelines.ipynb
3+
4+
Automatically generated by Colab.
5+
6+
Original file is located at
7+
https://colab.research.google.com/drive/1ggPLWH3fOjLp9r2dXQLZci552ux0fWBl
8+
"""
9+
10+
!pip -q install "transformers>=4.42.0" accelerate torchaudio sentencepiece gradio soundfile
11+
12+
import os, torch, tempfile, numpy as np
13+
import gradio as gr
14+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
15+
16+
# Select GPU (device 0) for HF pipelines when available, else CPU (-1).
DEVICE = 0 if torch.cuda.is_available() else -1

# Speech-to-text: Whisper small, English-only checkpoint.
# chunk_length_s=30 enables chunked long-form transcription beyond
# Whisper's 30 s receptive window; timestamps are not needed here.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small.en",
    device=DEVICE,
    chunk_length_s=30,
    return_timestamps=False
)

# Reply generation: FLAN-T5 base (encoder-decoder).
# device_map="auto" lets accelerate place the weights on the available device(s).
LLM_MODEL = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL, device_map="auto")

# Text-to-speech: Bark small.
# NOTE(review): no device= argument is passed, so this pipeline runs on CPU
# by default even when DEVICE is a GPU — confirm whether that is intended.
tts = pipeline("text-to-speech", model="suno/bark-small")

# System instruction prepended to every prompt built by format_dialog().
SYSTEM_PROMPT = (
    "You are a helpful, concise voice assistant. "
    "Prefer direct, structured answers. "
    "If the user asks for steps or code, use short bullet points."
)
37+
38+
def format_dialog(history, user_text):
    """Build the FLAN-T5 prompt for one turn.

    Concatenates the system instruction, every prior (user, assistant)
    exchange from *history*, and the new *user_text*, ending with an
    "Assistant:" cue for the model to complete.
    """
    lines = []
    for user_msg, assistant_msg in history:
        if user_msg:
            lines.append(f"User: {user_msg}")
        if assistant_msg:
            lines.append(f"Assistant: {assistant_msg}")
    lines.append(f"User: {user_text}")
    dialog = "\n".join(lines)
    return (
        "Instruction:\n"
        f"{SYSTEM_PROMPT}\n\n"
        "Dialog so far:\n" + dialog + "\n\n"
        "Assistant:"
    )
51+
52+
def transcribe(filepath):
    """Run Whisper ASR on the audio file at *filepath*; return the stripped transcript."""
    result = asr(filepath)
    return result["text"].strip()
56+
57+
def generate_reply(history, user_text, max_new_tokens=256):
    """Generate an assistant reply with FLAN-T5.

    Builds a dialog prompt from *history* and *user_text*, samples up to
    *max_new_tokens* tokens (nucleus sampling), and returns the decoded,
    stripped reply text.
    """
    prompt = format_dialog(history, user_text)
    # Truncate to the model's max input length and move tensors to the
    # device the model was placed on by device_map="auto".
    encoded = tok(prompt, return_tensors="pt", truncation=True).to(llm.device)
    with torch.no_grad():
        output_ids = llm.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
        )
    return tok.decode(output_ids[0], skip_special_tokens=True).strip()
71+
72+
def synthesize_speech(text):
    """Synthesize *text* with Bark and return audio for a gr.Audio output.

    Returns:
        tuple: (sampling_rate: int, waveform: 1-D float32 numpy array).
    """
    out = tts(text)
    sr = out["sampling_rate"]
    audio = np.asarray(out["audio"], dtype=np.float32)
    # The TTS pipeline may return the waveform with a leading channel axis
    # (shape (1, n_samples)). gr.Audio interprets a 2-D array as
    # (samples, channels), so a (1, n) array would play as a single frame
    # of n channels — squeeze singleton axes to guarantee a mono 1-D signal.
    audio = np.squeeze(audio)
    return (sr, audio)
78+
79+
def clear_history():
    """Reset the UI: return an empty chatbot list and an empty history state."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state
81+
82+
def voice_to_voice(mic_file, history):
    """Handle one full voice turn: audio in, spoken + text reply out.

    Pipeline: transcribe *mic_file* -> generate a reply -> synthesize speech.
    Each stage reports failure through the status string instead of raising.

    Returns:
        tuple: (updated history, (sr, waveform) or None, status/transcript text).
    """
    history = history or []
    if not mic_file:
        return history, None, "Please record something!"

    # Stage 1: speech -> text.
    try:
        user_text = transcribe(mic_file)
    except Exception as e:
        return history, None, f"ASR error: {e}"
    if not user_text:
        return history, None, "Didn't catch that. Try again?"

    # Stage 2: text -> reply.
    try:
        reply = generate_reply(history, user_text)
    except Exception as e:
        return history, None, f"LLM error: {e}"

    # Stage 3: reply -> speech. On TTS failure the text turn is still kept;
    # only the audio is dropped.
    new_history = history + [(user_text, reply)]
    try:
        sr, wav = synthesize_speech(reply)
    except Exception as e:
        return new_history, None, f"TTS error: {e}"

    return new_history, (sr, wav), f"User: {user_text}\nAssistant: {reply}"
105+
106+
def text_to_voice(user_text, history):
    """Handle one typed turn: generate a reply for *user_text* and speak it.

    Returns:
        tuple: (updated history, (sr, waveform) or None, status/transcript text).
    """
    history = history or []
    message = (user_text or "").strip()
    if not message:
        return history, None, "Type a message first."

    # Generation and synthesis share one guard; any failure is surfaced
    # in the status string rather than raised.
    try:
        reply = generate_reply(history, message)
        sr, wav = synthesize_speech(reply)
    except Exception as e:
        return history, None, f"Error: {e}"

    return history + [(message, reply)], (sr, wav), f"User: {message}\nAssistant: {reply}"
117+
118+
def export_chat(history):
    """Write the conversation to a temporary .txt file and return its path.

    Each (user, assistant) turn becomes a "User:"/"Assistant:" pair followed
    by a blank line; an empty or None history yields a placeholder message.

    Returns:
        str: filesystem path of the written file (kept on disk for download).
    """
    lines = []
    for u, a in history or []:
        lines += [f"User: {u}", f"Assistant: {a}", ""]
    text = "\n".join(lines).strip() or "No conversation yet."
    # Pin UTF-8 explicitly: without encoding=, NamedTemporaryFile uses the
    # platform default, which can raise UnicodeEncodeError on non-ASCII
    # transcripts (accents, emoji, non-Latin scripts).
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as f:
        f.write(text)
        path = f.name
    return path
127+
128+
# ---- Gradio UI: layout, event wiring, launch ---------------------------
with gr.Blocks(title="Advanced Voice AI Agent (HF Pipelines)") as demo:
    gr.Markdown(
        "## 🎙️ Advanced Voice AI Agent (Hugging Face Pipelines Only)\n"
        "- **ASR**: openai/whisper-small.en\n"
        "- **LLM**: google/flan-t5-base\n"
        "- **TTS**: suno/bark-small\n"
        "Speak or type; the agent replies with voice + text."
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Input side: microphone recording or typed text, plus utility buttons.
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Record")
            say_btn = gr.Button("🎤 Speak")
            text_in = gr.Textbox(label="Or type instead", placeholder="Ask me anything…")
            text_btn = gr.Button("💬 Send")
            export_btn = gr.Button("⬇️ Export Chat (.txt)")
            reset_btn = gr.Button("♻️ Reset")
        with gr.Column(scale=1):
            # Output side: synthesized reply audio (autoplayed) and text transcript.
            audio_out = gr.Audio(label="Assistant Voice", autoplay=True)
            transcript = gr.Textbox(label="Transcript", lines=6)
    # Chat display below the input/output row.
    # NOTE(review): original indentation was lost in this paste — confirm the
    # Chatbot was at Blocks level rather than inside the second column.
    chat = gr.Chatbot(height=360)
    # Conversation history as a list of (user, assistant) tuples, shared by
    # all event handlers.
    state = gr.State([])

    def update_chat(history):
        # Mirror the history state into the Chatbot's (user, assistant) format.
        return [(u, a) for u, a in (history or [])]

    # Voice turn, then refresh the chat display from the updated state.
    say_btn.click(voice_to_voice, [mic, state], [state, audio_out, transcript]).then(
        update_chat, inputs=state, outputs=chat
    )
    # Typed turn, same refresh.
    text_btn.click(text_to_voice, [text_in, state], [state, audio_out, transcript]).then(
        update_chat, inputs=state, outputs=chat
    )
    # clear_history returns ([], []), clearing both the Chatbot and the state.
    reset_btn.click(clear_history, None, [chat, state])
    # NOTE(review): the gr.File output is created inside the event wiring, so
    # it renders at the end of the Blocks layout — confirm that placement.
    export_btn.click(export_chat, state, gr.File(label="Download chat.txt"))

demo.launch(debug=False)

0 commit comments

Comments
 (0)