From 6a9301c27d179804a2477ab957804b2534af7cc2 Mon Sep 17 00:00:00 2001
From: zhixian
Date: Sun, 25 Jan 2026 17:04:20 +0900
Subject: [PATCH] feat(tts): support custom OpenAI-compatible TTS endpoints
 (#1701)

* feat(tts): support custom OpenAI-compatible TTS endpoints

Add OPENAI_TTS_BASE_URL environment variable to allow using self-hosted
or third-party OpenAI-compatible TTS services like Kokoro, LocalAI, or
OpenedAI-Speech.

Changes:
- Add OPENAI_TTS_BASE_URL env var (defaults to OpenAI official API)
- Relax model/voice validation when using custom endpoints
- Add tts-1 and tts-1-hd to the model allowlist

This enables users to:
- Use local TTS for privacy and cost savings
- Use models with better non-English language support (Chinese, Japanese)
- Reduce latency with local inference

Example usage:
OPENAI_TTS_BASE_URL=http://localhost:8880/v1

Tested with Kokoro-FastAPI.

* fix: strip trailing slashes from OPENAI_TTS_BASE_URL

Address review feedback: normalize the base URL by removing trailing
slashes to prevent double-slash paths like /v1//audio/speech which cause
404 errors on some OpenAI-compatible servers.

* style: format code with oxfmt

* test: update tests for expanded OpenAI TTS model list

- Accept tts-1 and tts-1-hd as valid models
- Update OPENAI_TTS_MODELS length expectation to 3

---------

Co-authored-by: zhixian
---
 src/tts/tts.test.ts | 14 ++++++++------
 src/tts/tts.ts      | 18 ++++++++++++++++--
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts
index a8c9dce9c..8462cba01 100644
--- a/src/tts/tts.test.ts
+++ b/src/tts/tts.test.ts
@@ -109,13 +109,13 @@ describe("tts", () => {
   });
 
   describe("isValidOpenAIModel", () => {
-    it("accepts gpt-4o-mini-tts model", () => {
+    it("accepts supported models", () => {
       expect(isValidOpenAIModel("gpt-4o-mini-tts")).toBe(true);
+      expect(isValidOpenAIModel("tts-1")).toBe(true);
+      expect(isValidOpenAIModel("tts-1-hd")).toBe(true);
     });
 
-    it("rejects other models", () => {
-      expect(isValidOpenAIModel("tts-1")).toBe(false);
-      expect(isValidOpenAIModel("tts-1-hd")).toBe(false);
+    it("rejects unsupported models", () => {
       expect(isValidOpenAIModel("invalid")).toBe(false);
       expect(isValidOpenAIModel("")).toBe(false);
       expect(isValidOpenAIModel("gpt-4")).toBe(false);
@@ -123,9 +123,11 @@
   });
 
   describe("OPENAI_TTS_MODELS", () => {
-    it("contains only gpt-4o-mini-tts", () => {
+    it("contains supported models", () => {
       expect(OPENAI_TTS_MODELS).toContain("gpt-4o-mini-tts");
-      expect(OPENAI_TTS_MODELS).toHaveLength(1);
+      expect(OPENAI_TTS_MODELS).toContain("tts-1");
+      expect(OPENAI_TTS_MODELS).toContain("tts-1-hd");
+      expect(OPENAI_TTS_MODELS).toHaveLength(3);
     });
 
     it("is a non-empty array", () => {
diff --git a/src/tts/tts.ts b/src/tts/tts.ts
index 5fa06f8d4..5f911ec14 100644
--- a/src/tts/tts.ts
+++ b/src/tts/tts.ts
@@ -736,7 +736,17 @@ function parseTtsDirectives(
   };
 }
 
-export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts"] as const;
+export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as const;
+
+/**
+ * Custom OpenAI-compatible TTS endpoint.
+ * When set, model/voice validation is relaxed to allow non-OpenAI models.
+ * Example: OPENAI_TTS_BASE_URL=http://localhost:8880/v1
+ */
+const OPENAI_TTS_BASE_URL = (
+  process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1"
+).replace(/\/+$/, "");
+const isCustomOpenAIEndpoint = OPENAI_TTS_BASE_URL !== "https://api.openai.com/v1";
 export const OPENAI_TTS_VOICES = [
   "alloy",
   "ash",
@@ -752,10 +762,14 @@
 type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number];
 
 function isValidOpenAIModel(model: string): boolean {
+  // Allow any model when using custom endpoint (e.g., Kokoro, LocalAI)
+  if (isCustomOpenAIEndpoint) return true;
   return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
 }
 
 function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice {
+  // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
+  if (isCustomOpenAIEndpoint) return true;
   return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice);
 }
 
@@ -982,7 +996,7 @@ async function openaiTTS(params: {
   const timeout = setTimeout(() => controller.abort(), timeoutMs);
 
   try {
-    const response = await fetch("https://api.openai.com/v1/audio/speech", {
+    const response = await fetch(`${OPENAI_TTS_BASE_URL}/audio/speech`, {
       method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
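
Reviewer note: below is a minimal, standalone sketch of the base-URL handling this
patch describes, assuming only the normalization regex and the /audio/speech path
visible in the diff. The helpers normalizeBaseUrl and speechEndpoint are hypothetical
names used for illustration; they do not exist in src/tts/tts.ts.

// Illustrative TypeScript sketch (not part of the patch): shows how the
// trailing-slash normalization keeps `${base}/audio/speech` free of
// double-slash paths such as /v1//audio/speech.

const DEFAULT_OPENAI_BASE = "https://api.openai.com/v1";

// Hypothetical helper mirroring the inline normalization in the patch.
function normalizeBaseUrl(raw: string | undefined): string {
  return (raw?.trim() || DEFAULT_OPENAI_BASE).replace(/\/+$/, "");
}

// Hypothetical helper showing how the speech endpoint URL is derived.
function speechEndpoint(baseUrl: string): string {
  return `${baseUrl}/audio/speech`;
}

// Local Kokoro-FastAPI server, with a trailing slash in the env value:
console.log(speechEndpoint(normalizeBaseUrl("http://localhost:8880/v1/")));
// -> http://localhost:8880/v1/audio/speech

// Unset variable falls back to the official OpenAI API:
console.log(speechEndpoint(normalizeBaseUrl(undefined)));
// -> https://api.openai.com/v1/audio/speech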