feat(P1): Add align-prompt feature for better text-image relevance

2026-01-06 23:29:41 +08:00
parent 2978622f7f
commit 1d343e55ba
4 changed files with 166 additions and 0 deletions
--- a/api/routers/editor.py
+++ b/api/routers/editor.py
@@ -41,6 +41,8 @@ from api.schemas.editor import (
    ExportRequest,
    ExportResponse,
    ExportStatusResponse,
+    AlignPromptRequest,
+    AlignPromptResponse,
 )
 from fastapi import BackgroundTasks
 import asyncio
@@ -598,6 +600,98 @@ async def regenerate_frame_audio(
        raise HTTPException(status_code=500, detail=str(e))


+@router.post(
+    "/storyboard/{storyboard_id}/frames/{frame_id}/align-prompt",
+    response_model=AlignPromptResponse
+)
+async def align_frame_prompt(
+    storyboard_id: str = Path(..., description="Storyboard/task ID"),
+    frame_id: str = Path(..., description="Frame ID"),
+    request: AlignPromptRequest = None
+):
+    """
+    Align a frame's image prompt with its narration.
+
+    Asks the LLM for a fresh image prompt built from the narration (the
+    request's override when given, otherwise the frame's stored narration)
+    and stores it on the cached frame.
+
+    Args:
+        storyboard_id: Key looked up in ``_storyboard_cache``.
+        frame_id: ID of the frame whose image prompt is regenerated.
+        request: Optional body; ``narration`` overrides the stored text.
+
+    Returns:
+        AlignPromptResponse carrying the new image prompt.
+
+    Raises:
+        HTTPException: 404 if the storyboard or frame is unknown, 400 if no
+            narration text is available, 500 if the LLM call fails or its
+            reply cannot be turned into a usable prompt.
+    """
+    if storyboard_id not in _storyboard_cache:
+        raise HTTPException(status_code=404, detail=f"Storyboard {storyboard_id} not found")
+
+    storyboard = _storyboard_cache[storyboard_id]
+    target_frame = next(
+        (frame for frame in storyboard["frames"] if frame["id"] == frame_id), None
+    )
+    if target_frame is None:
+        raise HTTPException(status_code=404, detail=f"Frame {frame_id} not found")
+
+    # Prefer the caller's override; whitespace-only text counts as missing.
+    narration = request.narration if request and request.narration else target_frame.get("narration", "")
+    if not narration or not str(narration).strip():
+        raise HTTPException(status_code=400, detail="No narration text available")
+
+    try:
+        import json
+        import re
+
+        from api.dependencies import get_pixelle_video
+        from pixelle_video.prompts import build_image_prompt_prompt
+
+        pixelle_video = await get_pixelle_video()
+        prompt = build_image_prompt_prompt(
+            narrations=[narration],
+            min_words=30,
+            max_words=60
+        )
+        response = await pixelle_video.llm(
+            prompt=prompt,
+            temperature=0.7,
+            max_tokens=500
+        )
+
+        # Accept bare JSON first, then JSON wrapped in a markdown code fence.
+        try:
+            result = json.loads(response)
+        except json.JSONDecodeError:
+            match = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', response)
+            if not match:
+                raise ValueError("Failed to parse LLM response")
+            result = json.loads(match.group(1))
+
+        # Validate fully before touching the cache so a malformed reply
+        # cannot leave a non-string or empty prompt on the frame.
+        prompts = result.get("image_prompts") if isinstance(result, dict) else None
+        if not isinstance(prompts, list) or not prompts:
+            raise ValueError("No image prompts in response")
+        new_prompt = prompts[0]
+        if not isinstance(new_prompt, str) or not new_prompt.strip():
+            raise ValueError("Invalid image prompt in response")
+
+        target_frame["image_prompt"] = new_prompt
+        # Explicit write-back kept from the original; redundant if the cache is a plain dict - TODO confirm.
+        _storyboard_cache[storyboard_id] = storyboard
+
+        logger.info(f"Aligned image prompt for frame {frame_id}")
+        return AlignPromptResponse(image_prompt=new_prompt, success=True)
+
+    except Exception as e:
+        logger.error(f"Prompt alignment failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
@router.post(
    "/storyboard/{storyboard_id}/frames/{frame_id}/inpaint",
    response_model=InpaintResponse
--- a/api/schemas/editor.py
+++ b/api/schemas/editor.py
@@ -144,3 +144,12 @@ class ExportStatusResponse(BaseModel):
    error: Optional[str] = None


+class AlignPromptRequest(BaseModel):
+    """Request to align an image prompt with narration; its only field is optional."""
+    narration: Optional[str] = Field(None, description="Override narration text")  # None when omitted
+
+
+class AlignPromptResponse(BaseModel):
+    """Response after aligning a prompt: the new image prompt plus a success flag."""
+    image_prompt: str  # required; the image prompt text returned to the caller
+    success: bool = True  # defaults to True when not set explicitly