1+ # -*- coding: utf-8 -*-
2+ """How to Build an Advanced End-to-End Voice AI Agent Using Hugging Face Pipelines.ipynb
3+
4+ Automatically generated by Colab.
5+
6+ Original file is located at
7+ https://colab.research.google.com/drive/1ggPLWH3fOjLp9r2dXQLZci552ux0fWBl
8+ """
9+
10+ !pip - q install "transformers>=4.42.0" accelerate torchaudio sentencepiece gradio soundfile
11+
12+ import os , torch , tempfile , numpy as np
13+ import gradio as gr
14+ from transformers import pipeline , AutoTokenizer , AutoModelForSeq2SeqLM
15+
# HF pipeline device selector: GPU index 0 when CUDA is present, -1 = CPU.
DEVICE = 0 if torch.cuda.is_available() else -1
17+
# Speech-to-text stage: English-only Whisper (small), long audio handled in
# 30-second chunks; we only need plain text, not word/segment timestamps.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small.en",
    device=DEVICE,
    chunk_length_s=30,
    return_timestamps=False,
)
25+
# Reasoning stage: a small instruction-tuned encoder-decoder model.
# device_map="auto" lets accelerate place the weights (GPU if available).
LLM_MODEL = "google/flan-t5-base"
tok = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL, device_map="auto")
29+
# Text-to-speech stage: Bark (small variant) via the generic HF TTS pipeline.
tts = pipeline("text-to-speech", model="suno/bark-small")
31+
# Persona/instruction text prepended to every LLM prompt (see format_dialog).
SYSTEM_PROMPT = (
    "You are a helpful, concise voice assistant."
    " Prefer direct, structured answers."
    " If the user asks for steps or code, use short bullet points."
)
37+
def format_dialog(history, user_text, system_prompt=None):
    """Build a flat instruction prompt from the chat history plus the new turn.

    The pasted source had extraction-garbled whitespace inside the f-strings
    and literals (e.g. ``f"User: { u } "``, ``"\\n "``) which injected stray
    spaces into the prompt sent to the LLM; restored to clean forms here.

    Args:
        history: list of ``(user, assistant)`` pairs; either side may be
            falsy/None and is then skipped.
        user_text: the new user utterance to append as the final turn.
        system_prompt: optional override for the instruction header; defaults
            to the module-level SYSTEM_PROMPT (kept lazy so the default tracks
            the module constant).

    Returns:
        A single prompt string ending in "Assistant:" for the model to
        continue.
    """
    if system_prompt is None:
        system_prompt = SYSTEM_PROMPT
    turns = []
    for u, a in history:
        if u:
            turns.append(f"User: {u}")
        if a:
            turns.append(f"Assistant: {a}")
    turns.append(f"User: {user_text}")
    return (
        "Instruction:\n"
        f"{system_prompt}\n\n"
        "Dialog so far:\n" + "\n".join(turns) + "\n\n"
        "Assistant:"
    )
51+
def transcribe(filepath):
    """Run Whisper ASR on an audio file and return the stripped transcript."""
    result = asr(filepath)
    return result["text"].strip()
56+
def generate_reply(history, user_text, max_new_tokens=256):
    """Generate the assistant's next reply with the seq2seq LLM.

    Builds the dialog prompt, tokenizes it (truncated to the model's max
    length), samples a continuation, and returns the decoded text.
    """
    prompt = format_dialog(history, user_text)
    enc = tok(prompt, return_tensors="pt", truncation=True).to(llm.device)
    # Inference only — no gradients needed.
    with torch.no_grad():
        out_ids = llm.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
        )
    return tok.decode(out_ids[0], skip_special_tokens=True).strip()
71+
def synthesize_speech(text):
    """Convert text to audio with Bark.

    Returns a ``(sampling_rate, samples)`` tuple, with samples coerced to a
    float32 numpy array — the shape gradio's Audio component expects.
    """
    speech = tts(text)
    samples = np.asarray(speech["audio"], dtype=np.float32)
    return speech["sampling_rate"], samples
78+
def clear_history():
    """Reset handler: empty both the Chatbot display and the State history."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state
81+
def voice_to_voice(mic_file, history):
    """Full voice turn: transcribe mic audio, generate a reply, speak it.

    The pasted source had garbled whitespace inside the user-visible
    f-strings (e.g. ``f"ASR error: { e } "``); restored to clean forms.

    Args:
        mic_file: path to the recorded audio file (gradio type="filepath"),
            or falsy when nothing was recorded.
        history: list of (user, assistant) pairs, or None on first turn.

    Returns:
        (updated_history, (sampling_rate, samples) or None, status/transcript
        text). Each pipeline stage is wrapped separately so a failure reports
        which stage broke instead of crashing the UI; a TTS failure still
        keeps the text exchange in the history.
    """
    history = history or []
    if not mic_file:
        return history, None, "Please record something!"
    try:
        user_text = transcribe(mic_file)
    except Exception as e:
        return history, None, f"ASR error: {e}"

    if not user_text:
        return history, None, "Didn't catch that. Try again?"

    try:
        reply = generate_reply(history, user_text)
    except Exception as e:
        return history, None, f"LLM error: {e}"

    try:
        sr, wav = synthesize_speech(reply)
    except Exception as e:
        # The text exchange succeeded — keep it even though audio failed.
        return history + [(user_text, reply)], None, f"TTS error: {e}"

    return history + [(user_text, reply)], (sr, wav), f"User: {user_text}\nAssistant: {reply}"
105+
def text_to_voice(user_text, history):
    """Typed-input turn: generate a reply for *user_text* and speak it.

    The pasted source had garbled whitespace inside the transcript f-string
    (``"\\n Assistant:"``); restored to a clean ``"\\nAssistant:"``.

    Args:
        user_text: raw textbox contents; may be None or whitespace-only.
        history: list of (user, assistant) pairs, or None on first turn.

    Returns:
        (updated_history, (sampling_rate, samples) or None, status/transcript
        text) — same contract as voice_to_voice.
    """
    history = history or []
    user_text = (user_text or "").strip()
    if not user_text:
        return history, None, "Type a message first."
    try:
        reply = generate_reply(history, user_text)
        sr, wav = synthesize_speech(reply)
    except Exception as e:
        return history, None, f"Error: {e}"
    return history + [(user_text, reply)], (sr, wav), f"User: {user_text}\nAssistant: {reply}"
117+
def export_chat(history):
    """Write the conversation to a temporary .txt file and return its path.

    Fixes from the pasted source: the join separator was the garbled
    ``"\\n "`` (newline + stray space), and the temp file had no explicit
    encoding — transcripts may contain non-ASCII (emoji, accents), which
    would crash on a non-UTF-8 default locale.

    Args:
        history: list of (user, assistant) pairs, or None/empty.

    Returns:
        Filesystem path of the written file. Contains a placeholder message
        when there is no conversation yet.
    """
    lines = []
    for u, a in history or []:
        # Blank line between exchanges for readability.
        lines += [f"User: {u}", f"Assistant: {a}", ""]
    text = "\n".join(lines).strip() or "No conversation yet."
    # delete=False: gradio serves the file after this handler returns.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as f:
        f.write(text)
        path = f.name
    return path
127+
# ---------------------------------------------------------------------------
# Gradio UI: input controls on the left, assistant voice + transcript on the
# right, shared Chatbot/State below. The pasted source lost all indentation
# inside this `with` block and garbled the Markdown newlines ("\n "); the
# structure is reconstructed here.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Advanced Voice AI Agent (HF Pipelines)") as demo:
    gr.Markdown(
        "## 🎙️ Advanced Voice AI Agent (Hugging Face Pipelines Only)\n"
        "- **ASR**: openai/whisper-small.en\n"
        "- **LLM**: google/flan-t5-base\n"
        "- **TTS**: suno/bark-small\n"
        "Speak or type; the agent replies with voice + text."
    )

    with gr.Row():
        with gr.Column(scale=1):
            mic = gr.Audio(sources=["microphone"], type="filepath", label="Record")
            say_btn = gr.Button("🎤 Speak")
            text_in = gr.Textbox(label="Or type instead", placeholder="Ask me anything…")
            text_btn = gr.Button("💬 Send")
            export_btn = gr.Button("⬇️ Export Chat (.txt)")
            reset_btn = gr.Button("♻️ Reset")
        with gr.Column(scale=1):
            audio_out = gr.Audio(label="Assistant Voice", autoplay=True)
            transcript = gr.Textbox(label="Transcript", lines=6)

    chat = gr.Chatbot(height=360)
    # Canonical conversation history: list of (user, assistant) pairs.
    state = gr.State([])

    def update_chat(history):
        # Mirror the shared history into the Chatbot widget after each turn.
        return [(u, a) for u, a in (history or [])]

    say_btn.click(voice_to_voice, [mic, state], [state, audio_out, transcript]).then(
        update_chat, inputs=state, outputs=chat
    )
    text_btn.click(text_to_voice, [text_in, state], [state, audio_out, transcript]).then(
        update_chat, inputs=state, outputs=chat
    )
    reset_btn.click(clear_history, None, [chat, state])
    export_btn.click(export_chat, state, gr.File(label="Download chat.txt"))

demo.launch(debug=False)