feat: restore voice-call plugin parity
This commit is contained in:
303 lines added — extensions/voice-call/src/providers/stt-openai-realtime.ts (new file)
@@ -0,0 +1,303 @@
|
||||
/**
|
||||
* OpenAI Realtime STT Provider
|
||||
*
|
||||
* Uses the OpenAI Realtime API for streaming transcription with:
|
||||
* - Direct mu-law audio support (no conversion needed)
|
||||
* - Built-in server-side VAD for turn detection
|
||||
* - Low-latency streaming transcription
|
||||
* - Partial transcript callbacks for real-time UI updates
|
||||
*/
|
||||
|
||||
import WebSocket from "ws";
|
||||
|
||||
/**
 * Configuration for OpenAI Realtime STT.
 *
 * Only `apiKey` is required; all other fields fall back to provider
 * defaults (see OpenAIRealtimeSTTProvider's constructor).
 */
export interface RealtimeSTTConfig {
  /** OpenAI API key (required — the provider throws without it). */
  apiKey: string;
  /** Model to use (default: gpt-4o-transcribe) */
  model?: string;
  /** Silence duration in ms before considering speech ended (default: 800) */
  silenceDurationMs?: number;
  /** VAD threshold 0-1 (default: 0.5); higher values require louder speech to trigger. */
  vadThreshold?: number;
}
|
||||
|
||||
/**
 * Session for streaming audio and receiving transcripts.
 *
 * Typical lifecycle: `connect()` once, then repeatedly `sendAudio()` while
 * awaiting `waitForTranscript()` (or listening via `onTranscript`), and
 * finally `close()`.
 */
export interface RealtimeSTTSession {
  /** Connect to the transcription service. Resolves once the socket is open. */
  connect(): Promise<void>;
  /** Send mu-law audio data (8kHz mono) */
  sendAudio(audio: Buffer): void;
  /**
   * Wait for next complete transcript (after VAD detects end of speech).
   * Rejects if no transcript arrives within `timeoutMs`.
   */
  waitForTranscript(timeoutMs?: number): Promise<string>;
  /** Set callback for partial transcripts (streaming) */
  onPartial(callback: (partial: string) => void): void;
  /** Set callback for final transcripts */
  onTranscript(callback: (transcript: string) => void): void;
  /** Close the session */
  close(): void;
  /** Check if session is connected */
  isConnected(): boolean;
}
|
||||
|
||||
/**
|
||||
* Provider factory for OpenAI Realtime STT sessions.
|
||||
*/
|
||||
export class OpenAIRealtimeSTTProvider {
|
||||
readonly name = "openai-realtime";
|
||||
private apiKey: string;
|
||||
private model: string;
|
||||
private silenceDurationMs: number;
|
||||
private vadThreshold: number;
|
||||
|
||||
constructor(config: RealtimeSTTConfig) {
|
||||
if (!config.apiKey) {
|
||||
throw new Error("OpenAI API key required for Realtime STT");
|
||||
}
|
||||
this.apiKey = config.apiKey;
|
||||
this.model = config.model || "gpt-4o-transcribe";
|
||||
this.silenceDurationMs = config.silenceDurationMs || 800;
|
||||
this.vadThreshold = config.vadThreshold || 0.5;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new realtime transcription session.
|
||||
*/
|
||||
createSession(): RealtimeSTTSession {
|
||||
return new OpenAIRealtimeSTTSession(
|
||||
this.apiKey,
|
||||
this.model,
|
||||
this.silenceDurationMs,
|
||||
this.vadThreshold,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* WebSocket-based session for real-time speech-to-text.
|
||||
*/
|
||||
class OpenAIRealtimeSTTSession implements RealtimeSTTSession {
|
||||
private static readonly MAX_RECONNECT_ATTEMPTS = 5;
|
||||
private static readonly RECONNECT_DELAY_MS = 1000;
|
||||
|
||||
private ws: WebSocket | null = null;
|
||||
private connected = false;
|
||||
private closed = false;
|
||||
private reconnectAttempts = 0;
|
||||
private pendingTranscript = "";
|
||||
private onTranscriptCallback: ((transcript: string) => void) | null = null;
|
||||
private onPartialCallback: ((partial: string) => void) | null = null;
|
||||
|
||||
constructor(
|
||||
private readonly apiKey: string,
|
||||
private readonly model: string,
|
||||
private readonly silenceDurationMs: number,
|
||||
private readonly vadThreshold: number,
|
||||
) {}
|
||||
|
||||
async connect(): Promise<void> {
|
||||
this.closed = false;
|
||||
this.reconnectAttempts = 0;
|
||||
return this.doConnect();
|
||||
}
|
||||
|
||||
private async doConnect(): Promise<void> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const url = "wss://api.openai.com/v1/realtime?intent=transcription";
|
||||
|
||||
this.ws = new WebSocket(url, {
|
||||
headers: {
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
"OpenAI-Beta": "realtime=v1",
|
||||
},
|
||||
});
|
||||
|
||||
this.ws.on("open", () => {
|
||||
console.log("[RealtimeSTT] WebSocket connected");
|
||||
this.connected = true;
|
||||
this.reconnectAttempts = 0;
|
||||
|
||||
// Configure the transcription session
|
||||
this.sendEvent({
|
||||
type: "transcription_session.update",
|
||||
session: {
|
||||
input_audio_format: "g711_ulaw",
|
||||
input_audio_transcription: {
|
||||
model: this.model,
|
||||
},
|
||||
turn_detection: {
|
||||
type: "server_vad",
|
||||
threshold: this.vadThreshold,
|
||||
prefix_padding_ms: 300,
|
||||
silence_duration_ms: this.silenceDurationMs,
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
resolve();
|
||||
});
|
||||
|
||||
this.ws.on("message", (data: Buffer) => {
|
||||
try {
|
||||
const event = JSON.parse(data.toString());
|
||||
this.handleEvent(event);
|
||||
} catch (e) {
|
||||
console.error("[RealtimeSTT] Failed to parse event:", e);
|
||||
}
|
||||
});
|
||||
|
||||
this.ws.on("error", (error) => {
|
||||
console.error("[RealtimeSTT] WebSocket error:", error);
|
||||
if (!this.connected) reject(error);
|
||||
});
|
||||
|
||||
this.ws.on("close", (code, reason) => {
|
||||
console.log(
|
||||
`[RealtimeSTT] WebSocket closed (code: ${code}, reason: ${reason?.toString() || "none"})`,
|
||||
);
|
||||
this.connected = false;
|
||||
|
||||
// Attempt reconnection if not intentionally closed
|
||||
if (!this.closed) {
|
||||
void this.attemptReconnect();
|
||||
}
|
||||
});
|
||||
|
||||
setTimeout(() => {
|
||||
if (!this.connected) {
|
||||
reject(new Error("Realtime STT connection timeout"));
|
||||
}
|
||||
}, 10000);
|
||||
});
|
||||
}
|
||||
|
||||
private async attemptReconnect(): Promise<void> {
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
this.reconnectAttempts >= OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS
|
||||
) {
|
||||
console.error(
|
||||
`[RealtimeSTT] Max reconnect attempts (${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS}) reached`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
this.reconnectAttempts++;
|
||||
const delay =
|
||||
OpenAIRealtimeSTTSession.RECONNECT_DELAY_MS *
|
||||
2 ** (this.reconnectAttempts - 1);
|
||||
console.log(
|
||||
`[RealtimeSTT] Reconnecting ${this.reconnectAttempts}/${OpenAIRealtimeSTTSession.MAX_RECONNECT_ATTEMPTS} in ${delay}ms...`,
|
||||
);
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, delay));
|
||||
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await this.doConnect();
|
||||
console.log("[RealtimeSTT] Reconnected successfully");
|
||||
} catch (error) {
|
||||
console.error("[RealtimeSTT] Reconnect failed:", error);
|
||||
}
|
||||
}
|
||||
|
||||
private handleEvent(event: {
|
||||
type: string;
|
||||
delta?: string;
|
||||
transcript?: string;
|
||||
error?: unknown;
|
||||
}): void {
|
||||
switch (event.type) {
|
||||
case "transcription_session.created":
|
||||
case "transcription_session.updated":
|
||||
case "input_audio_buffer.speech_stopped":
|
||||
case "input_audio_buffer.committed":
|
||||
console.log(`[RealtimeSTT] ${event.type}`);
|
||||
break;
|
||||
|
||||
case "conversation.item.input_audio_transcription.delta":
|
||||
if (event.delta) {
|
||||
this.pendingTranscript += event.delta;
|
||||
this.onPartialCallback?.(this.pendingTranscript);
|
||||
}
|
||||
break;
|
||||
|
||||
case "conversation.item.input_audio_transcription.completed":
|
||||
if (event.transcript) {
|
||||
console.log(`[RealtimeSTT] Transcript: ${event.transcript}`);
|
||||
this.onTranscriptCallback?.(event.transcript);
|
||||
}
|
||||
this.pendingTranscript = "";
|
||||
break;
|
||||
|
||||
case "input_audio_buffer.speech_started":
|
||||
console.log("[RealtimeSTT] Speech started");
|
||||
this.pendingTranscript = "";
|
||||
break;
|
||||
|
||||
case "error":
|
||||
console.error("[RealtimeSTT] Error:", event.error);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private sendEvent(event: unknown): void {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
this.ws.send(JSON.stringify(event));
|
||||
}
|
||||
}
|
||||
|
||||
sendAudio(muLawData: Buffer): void {
|
||||
if (!this.connected) return;
|
||||
this.sendEvent({
|
||||
type: "input_audio_buffer.append",
|
||||
audio: muLawData.toString("base64"),
|
||||
});
|
||||
}
|
||||
|
||||
onPartial(callback: (partial: string) => void): void {
|
||||
this.onPartialCallback = callback;
|
||||
}
|
||||
|
||||
onTranscript(callback: (transcript: string) => void): void {
|
||||
this.onTranscriptCallback = callback;
|
||||
}
|
||||
|
||||
async waitForTranscript(timeoutMs = 30000): Promise<string> {
|
||||
return new Promise((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
this.onTranscriptCallback = null;
|
||||
reject(new Error("Transcript timeout"));
|
||||
}, timeoutMs);
|
||||
|
||||
this.onTranscriptCallback = (transcript) => {
|
||||
clearTimeout(timeout);
|
||||
this.onTranscriptCallback = null;
|
||||
resolve(transcript);
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
close(): void {
|
||||
this.closed = true;
|
||||
if (this.ws) {
|
||||
this.ws.close();
|
||||
this.ws = null;
|
||||
}
|
||||
this.connected = false;
|
||||
}
|
||||
|
||||
isConnected(): boolean {
|
||||
return this.connected;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user