TTS支持参考音频逻辑

2025-10-31 15:50:35 +08:00
parent 7c3a49f55b
commit 2fe5e7c0fa
8 changed files with 435 additions and 316 deletions
--- a/pixelle_video/models/storyboard.py
+++ b/pixelle_video/models/storyboard.py
@@ -29,6 +29,7 @@ class StoryboardConfig:
    voice_id: str = "[Chinese] zh-CN Yunjian"     # Default voice
    tts_workflow: Optional[str] = None         # TTS workflow filename (None = use default)
    tts_speed: float = 1.2                     # TTS speed multiplier (1.0 = normal, >1.0 = faster)
+    ref_audio: Optional[str] = None            # Reference audio for voice cloning (only some workflows support this)
    
    # Image parameters
    image_width: int = 1024
--- a/pixelle_video/services/frame_processor.py
+++ b/pixelle_video/services/frame_processor.py
@@ -125,13 +125,19 @@ class FrameProcessor:
        output_path = get_task_frame_path(config.task_id, frame.index, "audio")
        
        # Call TTS with specific output path and workflow
-        audio_path = await self.core.tts(
-            text=frame.narration,
-            workflow=config.tts_workflow,  # Use workflow from config
-            voice=config.voice_id,
-            speed=config.tts_speed,  # Use speed (not rate) from config
-            output_path=output_path,
-        )
+        tts_params = {
+            "text": frame.narration,
+            "workflow": config.tts_workflow,
+            "voice": config.voice_id,
+            "speed": config.tts_speed,
+            "output_path": output_path,
+        }
+        
+        # Pass ref_audio only when it is set; otherwise omit the keyword entirely
+        if config.ref_audio:
+            tts_params["ref_audio"] = config.ref_audio
+        
+        audio_path = await self.core.tts(**tts_params)
        
        frame.audio_path = audio_path
        
--- a/pixelle_video/services/video_generator.py
+++ b/pixelle_video/services/video_generator.py
@@ -57,6 +57,7 @@ class VideoGeneratorService:
        voice_id: str = "[Chinese] zh-CN Yunjian",
        tts_workflow: Optional[str] = None,
        tts_speed: float = 1.2,
+        ref_audio: Optional[str] = None,  # Reference audio for voice cloning
        output_path: Optional[str] = None,
        
        # === LLM Parameters ===
@@ -225,6 +226,7 @@ class VideoGeneratorService:
            voice_id=voice_id,
            tts_workflow=tts_workflow,
            tts_speed=tts_speed,
+            ref_audio=ref_audio,
            image_width=image_width,
            image_height=image_height,
            image_workflow=image_workflow,