fix: replace PCM16 capture with MediaRecorder (Opus/webm)
PCM16 capture via AudioContext streamed raw audio continuously, so the backend accumulated massive buffers that took ~20s to transcribe. Replaced it with MediaRecorder, which records compressed Opus/WebM and sends a single blob when the recording is stopped (on release) — a much smaller payload that is faster to transcribe. Also removed all unused PCM16/WAV helper functions from both the frontend and the backend.
This commit is contained in:
+4
-9
@@ -96,17 +96,12 @@ async def run_conversation(text: str, user_id: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
async def transcribe_audio(audio_bytes: bytes) -> str | None:
|
||||||
"""Transcribe audio bytes using cheapest STT model.
|
"""Transcribe Opus/webm audio using cheapest STT model."""
|
||||||
|
|
||||||
Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically.
|
|
||||||
"""
|
|
||||||
client = get_openai()
|
client = get_openai()
|
||||||
try:
|
try:
|
||||||
# Wrap raw PCM16 in WAV container for the API
|
|
||||||
wav_bytes = _pcm16_to_wav(audio_bytes)
|
|
||||||
transcript = await client.audio.transcriptions.create(
|
transcript = await client.audio.transcriptions.create(
|
||||||
model="gpt-4o-mini-transcribe",
|
model="gpt-4o-mini-transcribe",
|
||||||
file=("audio.wav", wav_bytes, "audio/wav"),
|
file=("audio.webm", audio_bytes, "audio/webm"),
|
||||||
response_format="text",
|
response_format="text",
|
||||||
)
|
)
|
||||||
return transcript.strip() if transcript and transcript.strip() else None
|
return transcript.strip() if transcript and transcript.strip() else None
|
||||||
@@ -211,8 +206,8 @@ async def conversation_ws(websocket: WebSocket):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# ── Conversation ──
|
# ── Conversation ──
|
||||||
if msg_type == "audio":
|
if msg_type == "audio_chunk":
|
||||||
# Accumulate PCM16 audio chunks
|
# Single Opus/webm blob from MediaRecorder
|
||||||
chunk = base64.b64decode(msg["data"])
|
chunk = base64.b64decode(msg["data"])
|
||||||
audio_buffer.extend(chunk)
|
audio_buffer.extend(chunk)
|
||||||
|
|
||||||
|
|||||||
@@ -25,41 +25,6 @@ function saveUserId(id: string) {
|
|||||||
localStorage.setItem(USER_ID_KEY, id);
|
localStorage.setItem(USER_ID_KEY, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Capture PCM16 mono 24kHz audio from mic and send via callback. */
|
|
||||||
function startPCMCapture(
|
|
||||||
stream: MediaStream,
|
|
||||||
onChunk: (pcm16: Uint8Array) => void,
|
|
||||||
): { stop: () => void } {
|
|
||||||
const ctx = new AudioContext({ sampleRate: 24000 });
|
|
||||||
const source = ctx.createMediaStreamSource(stream);
|
|
||||||
const processor = ctx.createScriptProcessor(4096, 1, 1);
|
|
||||||
let running = true;
|
|
||||||
|
|
||||||
processor.onaudioprocess = (e) => {
|
|
||||||
if (!running) return;
|
|
||||||
const input = e.inputBuffer.getChannelData(0); // Float32Array [-1, 1]
|
|
||||||
// Convert float32 → PCM16 int16
|
|
||||||
const pcm16 = new Int16Array(input.length);
|
|
||||||
for (let i = 0; i < input.length; i++) {
|
|
||||||
const s = Math.max(-1, Math.min(1, input[i]));
|
|
||||||
pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
|
|
||||||
}
|
|
||||||
onChunk(new Uint8Array(pcm16.buffer));
|
|
||||||
};
|
|
||||||
|
|
||||||
source.connect(processor);
|
|
||||||
processor.connect(ctx.destination);
|
|
||||||
|
|
||||||
return {
|
|
||||||
stop: () => {
|
|
||||||
running = false;
|
|
||||||
source.disconnect();
|
|
||||||
processor.disconnect();
|
|
||||||
ctx.close();
|
|
||||||
},
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
export function useConversation() {
|
export function useConversation() {
|
||||||
const [messages, setMessages] = useState<Message[]>([]);
|
const [messages, setMessages] = useState<Message[]>([]);
|
||||||
const [isConnected, setIsConnected] = useState(false);
|
const [isConnected, setIsConnected] = useState(false);
|
||||||
@@ -77,9 +42,8 @@ export function useConversation() {
|
|||||||
|
|
||||||
const wsRef = useRef<WebSocket | null>(null);
|
const wsRef = useRef<WebSocket | null>(null);
|
||||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||||
const captureRef = useRef<{ stop: () => void } | null>(null);
|
const recorderRef = useRef<MediaRecorder | null>(null);
|
||||||
const streamRef = useRef<MediaStream | null>(null);
|
const streamRef = useRef<MediaStream | null>(null);
|
||||||
const audioBufferRef = useRef<Uint8Array[]>([]);
|
|
||||||
|
|
||||||
// Connect WebSocket
|
// Connect WebSocket
|
||||||
const connect = useCallback(() => {
|
const connect = useCallback(() => {
|
||||||
@@ -168,12 +132,10 @@ export function useConversation() {
|
|||||||
|
|
||||||
case 'speaking_end':
|
case 'speaking_end':
|
||||||
setIsKiraSpeaking(false);
|
setIsKiraSpeaking(false);
|
||||||
audioBufferRef.current = [];
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'interruption':
|
case 'interruption':
|
||||||
setIsKiraSpeaking(false);
|
setIsKiraSpeaking(false);
|
||||||
audioBufferRef.current = [];
|
|
||||||
if (audioRef.current) {
|
if (audioRef.current) {
|
||||||
audioRef.current.pause();
|
audioRef.current.pause();
|
||||||
audioRef.current.currentTime = 0;
|
audioRef.current.currentTime = 0;
|
||||||
@@ -235,15 +197,37 @@ export function useConversation() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start PCM16 capture — each chunk sent as WS message
|
// Record Opus/webm — much more efficient than PCM16
|
||||||
const capture = startPCMCapture(stream, (pcm16) => {
|
const chunks: BlobPart[] = [];
|
||||||
if (ws.readyState === WebSocket.OPEN) {
|
const recorder = new MediaRecorder(stream, {
|
||||||
const base64 = arrayBufferToBase64(pcm16.buffer);
|
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
||||||
ws.send(JSON.stringify({ type: 'audio', data: base64 }));
|
? 'audio/webm;codecs=opus'
|
||||||
}
|
: 'audio/webm',
|
||||||
});
|
});
|
||||||
|
|
||||||
captureRef.current = capture;
|
recorder.ondataavailable = (e) => {
|
||||||
|
if (e.data.size > 0) chunks.push(e.data);
|
||||||
|
};
|
||||||
|
|
||||||
|
recorder.onstop = () => {
|
||||||
|
// Send recorded audio as one blob, then transcribe
|
||||||
|
const blob = new Blob(chunks, { type: 'audio/webm' });
|
||||||
|
const reader = new FileReader();
|
||||||
|
reader.onload = () => {
|
||||||
|
const base64 = (reader.result as string).split(',')[1];
|
||||||
|
if (ws.readyState === WebSocket.OPEN) {
|
||||||
|
ws.send(JSON.stringify({ type: 'audio_chunk', data: base64 }));
|
||||||
|
ws.send(JSON.stringify({ type: 'transcribe' }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
reader.readAsDataURL(blob);
|
||||||
|
|
||||||
|
stream.getTracks().forEach((t) => t.stop());
|
||||||
|
setIsRecording(false);
|
||||||
|
};
|
||||||
|
|
||||||
|
recorder.start();
|
||||||
|
recorderRef.current = recorder;
|
||||||
setIsRecording(true);
|
setIsRecording(true);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
const msg = err instanceof Error ? err.message : String(err);
|
const msg = err instanceof Error ? err.message : String(err);
|
||||||
@@ -253,16 +237,7 @@ export function useConversation() {
|
|||||||
}, [addMessage]);
|
}, [addMessage]);
|
||||||
|
|
||||||
const stopRecording = useCallback(() => {
|
const stopRecording = useCallback(() => {
|
||||||
captureRef.current?.stop();
|
recorderRef.current?.stop();
|
||||||
captureRef.current = null;
|
|
||||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
|
||||||
streamRef.current = null;
|
|
||||||
setIsRecording(false);
|
|
||||||
|
|
||||||
// Tell backend to process accumulated audio
|
|
||||||
if (wsRef.current?.readyState === WebSocket.OPEN) {
|
|
||||||
wsRef.current.send(JSON.stringify({ type: 'transcribe' }));
|
|
||||||
}
|
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
// ── Text ──
|
// ── Text ──
|
||||||
@@ -279,7 +254,7 @@ export function useConversation() {
|
|||||||
connect();
|
connect();
|
||||||
return () => {
|
return () => {
|
||||||
wsRef.current?.close();
|
wsRef.current?.close();
|
||||||
captureRef.current?.stop();
|
recorderRef.current?.stop();
|
||||||
streamRef.current?.getTracks().forEach((t) => t.stop());
|
streamRef.current?.getTracks().forEach((t) => t.stop());
|
||||||
};
|
};
|
||||||
}, [connect]);
|
}, [connect]);
|
||||||
@@ -300,61 +275,3 @@ export function useConversation() {
|
|||||||
stopRecording,
|
stopRecording,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Helpers ──
|
|
||||||
|
|
||||||
function arrayBufferToBase64(buffer: ArrayBufferLike): string {
|
|
||||||
const bytes = new Uint8Array(buffer);
|
|
||||||
let binary = '';
|
|
||||||
for (let i = 0; i < bytes.length; i++) {
|
|
||||||
binary += String.fromCharCode(bytes[i]);
|
|
||||||
}
|
|
||||||
return btoa(binary);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Convert raw PCM16 mono 24kHz to a playable WAV blob. */
|
|
||||||
function pcm16ToWav(pcm16: Uint8Array): ArrayBuffer {
|
|
||||||
const numChannels = 1;
|
|
||||||
const sampleRate = 24000;
|
|
||||||
const bitsPerSample = 16;
|
|
||||||
const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
|
|
||||||
const blockAlign = numChannels * (bitsPerSample / 8);
|
|
||||||
const dataSize = pcm16.length;
|
|
||||||
const headerSize = 44;
|
|
||||||
const totalSize = headerSize + dataSize;
|
|
||||||
|
|
||||||
const buf = new ArrayBuffer(totalSize);
|
|
||||||
const view = new DataView(buf);
|
|
||||||
|
|
||||||
// RIFF header
|
|
||||||
writeString(view, 0, 'RIFF');
|
|
||||||
view.setUint32(4, totalSize - 8, true);
|
|
||||||
writeString(view, 8, 'WAVE');
|
|
||||||
|
|
||||||
// fmt subchunk
|
|
||||||
writeString(view, 12, 'fmt ');
|
|
||||||
view.setUint32(16, 16, true); // subchunk size
|
|
||||||
view.setUint16(20, 1, true); // PCM
|
|
||||||
view.setUint16(22, numChannels, true);
|
|
||||||
view.setUint32(24, sampleRate, true);
|
|
||||||
view.setUint32(28, byteRate, true);
|
|
||||||
view.setUint16(32, blockAlign, true);
|
|
||||||
view.setUint16(34, bitsPerSample, true);
|
|
||||||
|
|
||||||
// data subchunk
|
|
||||||
writeString(view, 36, 'data');
|
|
||||||
view.setUint32(40, dataSize, true);
|
|
||||||
|
|
||||||
// PCM data
|
|
||||||
for (let i = 0; i < pcm16.length; i++) {
|
|
||||||
view.setUint8(44 + i, pcm16[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
function writeString(view: DataView, offset: number, str: string) {
|
|
||||||
for (let i = 0; i < str.length; i++) {
|
|
||||||
view.setUint8(offset + i, str.charCodeAt(i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user