diff --git a/backend/main.py b/backend/main.py index 681e8c8..cba6858 100644 --- a/backend/main.py +++ b/backend/main.py @@ -96,12 +96,17 @@ async def run_conversation(text: str, user_id: str) -> str: async def transcribe_audio(audio_bytes: bytes) -> str | None: - """Transcribe audio bytes using cheapest STT model.""" + """Transcribe audio bytes using cheapest STT model. + + Accepts raw PCM16 mono 24kHz data — wraps in WAV container automatically. + """ client = get_openai() try: + # Wrap raw PCM16 in WAV container for the API + wav_bytes = _pcm16_to_wav(audio_bytes) transcript = await client.audio.transcriptions.create( model="gpt-4o-mini-transcribe", - file=("audio.webm", audio_bytes, "audio/webm"), + file=("audio.wav", wav_bytes, "audio/wav"), response_format="text", ) return transcript.strip() if transcript and transcript.strip() else None @@ -126,6 +131,35 @@ async def synthesize_speech(text: str) -> bytes: return b"" +def _pcm16_to_wav(pcm_data: bytes) -> bytes: + """Wrap raw PCM16 mono 24kHz data in a WAV container.""" + import struct + num_channels = 1 + sample_rate = 24000 + bits_per_sample = 16 + byte_rate = sample_rate * num_channels * (bits_per_sample // 8) + block_align = num_channels * (bits_per_sample // 8) + data_size = len(pcm_data) + header_size = 44 + total_size = header_size + data_size + + header = b"RIFF" + header += struct.pack("