feat: restore voice-call plugin parity

2026-01-12 21:40:22 +00:00
parent 3467b0ba07
commit 42c17adb5e
27 changed files with 6036 additions and 516 deletions
--- a/extensions/voice-call/src/config.ts
+++ b/extensions/voice-call/src/config.ts
@@ -0,0 +1,355 @@
+import { z } from "zod";
+
+// -----------------------------------------------------------------------------
+// Phone Number Validation
+// -----------------------------------------------------------------------------
+
+/**
+ * E.164 phone number format: +[country code][number]
+ * Examples use 555 prefix (reserved for fictional numbers)
+ */
+export const E164Schema = z
+  .string()
+  .regex(/^\+[1-9]\d{1,14}$/, "Expected E.164 format, e.g. +15550001234");
+
+// -----------------------------------------------------------------------------
+// Inbound Policy
+// -----------------------------------------------------------------------------
+
+/**
+ * Controls how inbound calls are handled:
+ * - "disabled": Block all inbound calls (outbound only)
+ * - "allowlist": Only accept calls from numbers in allowFrom
+ * - "pairing": Unknown callers can request pairing (future)
+ * - "open": Accept all inbound calls (dangerous!)
+ */
+export const InboundPolicySchema = z.enum([
+  "disabled",
+  "allowlist",
+  "pairing",
+  "open",
+]);
+export type InboundPolicy = z.infer<typeof InboundPolicySchema>;
+
+// -----------------------------------------------------------------------------
+// Provider-Specific Configuration
+// -----------------------------------------------------------------------------
+
+export const TelnyxConfigSchema = z.object({
+  /** Telnyx API v2 key */
+  apiKey: z.string().min(1).optional(),
+  /** Telnyx connection ID (from Call Control app) */
+  connectionId: z.string().min(1).optional(),
+  /** Public key for webhook signature verification */
+  publicKey: z.string().min(1).optional(),
+});
+export type TelnyxConfig = z.infer<typeof TelnyxConfigSchema>;
+
+export const TwilioConfigSchema = z.object({
+  /** Twilio Account SID */
+  accountSid: z.string().min(1).optional(),
+  /** Twilio Auth Token */
+  authToken: z.string().min(1).optional(),
+});
+export type TwilioConfig = z.infer<typeof TwilioConfigSchema>;
+
+// -----------------------------------------------------------------------------
+// STT/TTS Configuration
+// -----------------------------------------------------------------------------
+
+export const SttConfigSchema = z
+  .object({
+    /** STT provider (currently only OpenAI supported) */
+    provider: z.literal("openai").default("openai"),
+    /** Whisper model to use */
+    model: z.string().min(1).default("whisper-1"),
+  })
+  .default({ provider: "openai", model: "whisper-1" });
+export type SttConfig = z.infer<typeof SttConfigSchema>;
+
+export const TtsConfigSchema = z
+  .object({
+    /** TTS provider (currently only OpenAI supported) */
+    provider: z.literal("openai").default("openai"),
+    /**
+     * TTS model to use:
+     * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
+     * - tts-1: lower latency
+     * - tts-1-hd: higher quality
+     */
+    model: z.string().min(1).default("gpt-4o-mini-tts"),
+    /**
+     * Voice ID. For best quality, use marin or cedar.
+     * All voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
+     */
+    voice: z.string().min(1).default("coral"),
+    /**
+     * Instructions for speech style (only works with gpt-4o-mini-tts).
+     * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
+     */
+    instructions: z.string().optional(),
+  })
+  .default({ provider: "openai", model: "gpt-4o-mini-tts", voice: "coral" });
+export type TtsConfig = z.infer<typeof TtsConfigSchema>;
+
+// -----------------------------------------------------------------------------
+// Webhook Server Configuration
+// -----------------------------------------------------------------------------
+
+export const VoiceCallServeConfigSchema = z
+  .object({
+    /** Port to listen on */
+    port: z.number().int().positive().default(3334),
+    /** Bind address */
+    bind: z.string().default("127.0.0.1"),
+    /** Webhook path */
+    path: z.string().min(1).default("/voice/webhook"),
+  })
+  .default({ port: 3334, bind: "127.0.0.1", path: "/voice/webhook" });
+export type VoiceCallServeConfig = z.infer<typeof VoiceCallServeConfigSchema>;
+
+export const VoiceCallTailscaleConfigSchema = z
+  .object({
+    /**
+     * Tailscale exposure mode:
+     * - "off": No Tailscale exposure
+     * - "serve": Tailscale serve (private to tailnet)
+     * - "funnel": Tailscale funnel (public HTTPS)
+     */
+    mode: z.enum(["off", "serve", "funnel"]).default("off"),
+    /** Path for Tailscale serve/funnel (should usually match serve.path) */
+    path: z.string().min(1).default("/voice/webhook"),
+  })
+  .default({ mode: "off", path: "/voice/webhook" });
+export type VoiceCallTailscaleConfig = z.infer<
+  typeof VoiceCallTailscaleConfigSchema
+>;
+
+// -----------------------------------------------------------------------------
+// Tunnel Configuration (unified ngrok/tailscale)
+// -----------------------------------------------------------------------------
+
+export const VoiceCallTunnelConfigSchema = z
+  .object({
+    /**
+     * Tunnel provider:
+     * - "none": No tunnel (use publicUrl if set, or manual setup)
+     * - "ngrok": Use ngrok for public HTTPS tunnel
+     * - "tailscale-serve": Tailscale serve (private to tailnet)
+     * - "tailscale-funnel": Tailscale funnel (public HTTPS)
+     */
+    provider: z
+      .enum(["none", "ngrok", "tailscale-serve", "tailscale-funnel"])
+      .default("none"),
+    /** ngrok auth token (optional, enables longer sessions and more features) */
+    ngrokAuthToken: z.string().min(1).optional(),
+    /** ngrok custom domain (paid feature, e.g., "myapp.ngrok.io") */
+    ngrokDomain: z.string().min(1).optional(),
+    /**
+     * Allow ngrok free tier compatibility mode.
+     * When true, signature verification failures on ngrok-free.app URLs
+     * will be logged but allowed through. Less secure, but necessary
+     * for ngrok free tier which may modify URLs.
+     */
+    allowNgrokFreeTier: z.boolean().default(true),
+  })
+  .default({ provider: "none", allowNgrokFreeTier: true });
+export type VoiceCallTunnelConfig = z.infer<typeof VoiceCallTunnelConfigSchema>;
+
+// -----------------------------------------------------------------------------
+// Outbound Call Configuration
+// -----------------------------------------------------------------------------
+
+/**
+ * Call mode determines how outbound calls behave:
+ * - "notify": Deliver message and auto-hangup after delay (one-way notification)
+ * - "conversation": Stay open for back-and-forth until explicit end or timeout
+ */
+export const CallModeSchema = z.enum(["notify", "conversation"]);
+export type CallMode = z.infer<typeof CallModeSchema>;
+
+export const OutboundConfigSchema = z
+  .object({
+    /** Default call mode for outbound calls */
+    defaultMode: CallModeSchema.default("notify"),
+    /** Seconds to wait after TTS before auto-hangup in notify mode */
+    notifyHangupDelaySec: z.number().int().nonnegative().default(3),
+  })
+  .default({ defaultMode: "notify", notifyHangupDelaySec: 3 });
+export type OutboundConfig = z.infer<typeof OutboundConfigSchema>;
+
+// -----------------------------------------------------------------------------
+// Streaming Configuration (OpenAI Realtime STT)
+// -----------------------------------------------------------------------------
+
+export const VoiceCallStreamingConfigSchema = z
+  .object({
+    /** Enable real-time audio streaming (requires WebSocket support) */
+    enabled: z.boolean().default(false),
+    /** STT provider for real-time transcription */
+    sttProvider: z.enum(["openai-realtime"]).default("openai-realtime"),
+    /** OpenAI API key for Realtime API (uses OPENAI_API_KEY env if not set) */
+    openaiApiKey: z.string().min(1).optional(),
+    /** OpenAI transcription model (default: gpt-4o-transcribe) */
+    sttModel: z.string().min(1).default("gpt-4o-transcribe"),
+    /** VAD silence duration in ms before considering speech ended */
+    silenceDurationMs: z.number().int().positive().default(800),
+    /** VAD threshold 0-1 (higher = less sensitive) */
+    vadThreshold: z.number().min(0).max(1).default(0.5),
+    /** WebSocket path for media stream connections */
+    streamPath: z.string().min(1).default("/voice/stream"),
+  })
+  .default({
+    enabled: false,
+    sttProvider: "openai-realtime",
+    sttModel: "gpt-4o-transcribe",
+    silenceDurationMs: 800,
+    vadThreshold: 0.5,
+    streamPath: "/voice/stream",
+  });
+export type VoiceCallStreamingConfig = z.infer<
+  typeof VoiceCallStreamingConfigSchema
+>;
+
+// -----------------------------------------------------------------------------
+// Main Voice Call Configuration
+// -----------------------------------------------------------------------------
+
+export const VoiceCallConfigSchema = z.object({
+  /** Enable voice call functionality */
+  enabled: z.boolean().default(false),
+
+  /** Active provider (telnyx, twilio, or mock) */
+  provider: z.enum(["telnyx", "twilio", "mock"]).optional(),
+
+  /** Telnyx-specific configuration */
+  telnyx: TelnyxConfigSchema.optional(),
+
+  /** Twilio-specific configuration */
+  twilio: TwilioConfigSchema.optional(),
+
+  /** Phone number to call from (E.164) */
+  fromNumber: E164Schema.optional(),
+
+  /** Default phone number to call (E.164) */
+  toNumber: E164Schema.optional(),
+
+  /** Inbound call policy */
+  inboundPolicy: InboundPolicySchema.default("disabled"),
+
+  /** Allowlist of phone numbers for inbound calls (E.164) */
+  allowFrom: z.array(E164Schema).default([]),
+
+  /** Greeting message for inbound calls */
+  inboundGreeting: z.string().optional(),
+
+  /** Outbound call configuration */
+  outbound: OutboundConfigSchema,
+
+  /** Maximum call duration in seconds */
+  maxDurationSeconds: z.number().int().positive().default(300),
+
+  /** Silence timeout for end-of-speech detection (ms) */
+  silenceTimeoutMs: z.number().int().positive().default(800),
+
+  /** Timeout for user transcript (ms) */
+  transcriptTimeoutMs: z.number().int().positive().default(180000),
+
+  /** Ring timeout for outbound calls (ms) */
+  ringTimeoutMs: z.number().int().positive().default(30000),
+
+  /** Maximum concurrent calls */
+  maxConcurrentCalls: z.number().int().positive().default(1),
+
+  /** Webhook server configuration */
+  serve: VoiceCallServeConfigSchema,
+
+  /** Tailscale exposure configuration (legacy, prefer tunnel config) */
+  tailscale: VoiceCallTailscaleConfigSchema,
+
+  /** Tunnel configuration (unified ngrok/tailscale) */
+  tunnel: VoiceCallTunnelConfigSchema,
+
+  /** Real-time audio streaming configuration */
+  streaming: VoiceCallStreamingConfigSchema,
+
+  /** Public webhook URL override (if set, bypasses tunnel auto-detection) */
+  publicUrl: z.string().url().optional(),
+
+  /** Skip webhook signature verification (development only, NOT for production) */
+  skipSignatureVerification: z.boolean().default(false),
+
+  /** STT configuration */
+  stt: SttConfigSchema,
+
+  /** TTS configuration */
+  tts: TtsConfigSchema,
+
+  /** Store path for call logs */
+  store: z.string().optional(),
+
+  /** Model for generating voice responses (e.g., "anthropic/claude-sonnet-4", "openai/gpt-4o") */
+  responseModel: z.string().default("openai/gpt-4o-mini"),
+
+  /** System prompt for voice responses */
+  responseSystemPrompt: z.string().optional(),
+
+  /** Timeout for response generation in ms (default 30s) */
+  responseTimeoutMs: z.number().int().positive().default(30000),
+});
+
+export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
+
+// -----------------------------------------------------------------------------
+// Configuration Helpers
+// -----------------------------------------------------------------------------
+
+/**
+ * Validate that the configuration has all required fields for the selected provider.
+ */
+export function validateProviderConfig(config: VoiceCallConfig): {
+  valid: boolean;
+  errors: string[];
+} {
+  const errors: string[] = [];
+
+  if (!config.enabled) {
+    return { valid: true, errors: [] };
+  }
+
+  if (!config.provider) {
+    errors.push("plugins.entries.voice-call.config.provider is required");
+  }
+
+  if (!config.fromNumber && config.provider !== "mock") {
+    errors.push("plugins.entries.voice-call.config.fromNumber is required");
+  }
+
+  if (config.provider === "telnyx") {
+    if (!config.telnyx?.apiKey) {
+      errors.push(
+        "plugins.entries.voice-call.config.telnyx.apiKey is required (or set TELNYX_API_KEY env)",
+      );
+    }
+    if (!config.telnyx?.connectionId) {
+      errors.push(
+        "plugins.entries.voice-call.config.telnyx.connectionId is required (or set TELNYX_CONNECTION_ID env)",
+      );
+    }
+  }
+
+  if (config.provider === "twilio") {
+    if (!config.twilio?.accountSid) {
+      errors.push(
+        "plugins.entries.voice-call.config.twilio.accountSid is required (or set TWILIO_ACCOUNT_SID env)",
+      );
+    }
+    if (!config.twilio?.authToken) {
+      errors.push(
+        "plugins.entries.voice-call.config.twilio.authToken is required (or set TWILIO_AUTH_TOKEN env)",
+      );
+    }
+  }
+
+  return { valid: errors.length === 0, errors };
+}