feat(P1): Add align-prompt feature for better text-image relevance

2026-01-06 23:29:41 +08:00
parent 2978622f7f
commit 1d343e55ba
4 changed files with 166 additions and 0 deletions
--- a/api/routers/editor.py
+++ b/api/routers/editor.py
@@ -41,6 +41,8 @@ from api.schemas.editor import (
    ExportRequest,
    ExportResponse,
    ExportStatusResponse,
+    AlignPromptRequest,
+    AlignPromptResponse,
 )
 from fastapi import BackgroundTasks
 import asyncio
@@ -598,6 +600,98 @@ async def regenerate_frame_audio(
        raise HTTPException(status_code=500, detail=str(e))


+@router.post(
+    "/storyboard/{storyboard_id}/frames/{frame_id}/align-prompt",
+    response_model=AlignPromptResponse
+)
+async def align_frame_prompt(
+    storyboard_id: str = Path(..., description="Storyboard/task ID"),
+    frame_id: str = Path(..., description="Frame ID"),
+    request: AlignPromptRequest = None
+):
+    """
+    Regenerate a frame's image prompt so that it matches its narration.
+
+    The narration comes from the request body when one is supplied, otherwise
+    from the cached frame. The LLM reply is validated before the cached frame
+    is modified, so a malformed reply never leaves a bad prompt behind.
+
+    Raises:
+        HTTPException: 404 if the storyboard or frame is not in the cache,
+            400 if no narration text is available, 500 if the LLM call or
+            the parsing/validation of its reply fails.
+    """
+    if storyboard_id not in _storyboard_cache:
+        raise HTTPException(status_code=404, detail=f"Storyboard {storyboard_id} not found")
+
+    storyboard = _storyboard_cache[storyboard_id]
+    # First frame whose id matches, or None when no frame has that id.
+    target_frame = next(
+        (frame for frame in storyboard["frames"] if frame["id"] == frame_id),
+        None,
+    )
+    if not target_frame:
+        raise HTTPException(status_code=404, detail=f"Frame {frame_id} not found")
+
+    # A request override wins; otherwise use the narration stored on the frame.
+    narration = request.narration if request and request.narration else target_frame.get("narration", "")
+
+    # Whitespace-only text gives the LLM nothing to align against: reject it too.
+    if not narration or not narration.strip():
+        raise HTTPException(status_code=400, detail="No narration text available")
+
+    try:
+        import json
+        import re
+
+        from api.dependencies import get_pixelle_video
+        from pixelle_video.prompts import build_image_prompt_prompt
+
+        pixelle_video = await get_pixelle_video()
+
+        # Build the prompt for this single narration and query the LLM.
+        prompt = build_image_prompt_prompt(
+            narrations=[narration],
+            min_words=30,
+            max_words=60
+        )
+        response = await pixelle_video.llm(
+            prompt=prompt,
+            temperature=0.7,
+            max_tokens=500
+        )
+
+        # Accept bare JSON first, then JSON wrapped in a markdown code fence.
+        try:
+            result = json.loads(response)
+        except json.JSONDecodeError:
+            match = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', response)
+            if match:
+                result = json.loads(match.group(1))
+            else:
+                raise ValueError("Failed to parse LLM response")
+
+        # Guard the shape: a JSON string or list would otherwise slip past the
+        # membership test or fail with an unrelated TypeError when indexed.
+        prompts = result.get("image_prompts") if isinstance(result, dict) else None
+        if not isinstance(prompts, list) or not prompts:
+            raise ValueError("No image prompts in response")
+
+        # Validate before touching the cache; the response model requires a str.
+        new_prompt = prompts[0]
+        if not isinstance(new_prompt, str) or not new_prompt.strip():
+            raise ValueError("No image prompts in response")
+
+        target_frame["image_prompt"] = new_prompt
+        _storyboard_cache[storyboard_id] = storyboard
+
+        logger.info(f"Aligned image prompt for frame {frame_id}")
+        return AlignPromptResponse(image_prompt=new_prompt, success=True)
+
+    except Exception as e:
+        logger.error(f"Prompt alignment failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
@router.post(
    "/storyboard/{storyboard_id}/frames/{frame_id}/inpaint",
    response_model=InpaintResponse
--- a/api/schemas/editor.py
+++ b/api/schemas/editor.py
@@ -144,3 +144,12 @@ class ExportStatusResponse(BaseModel):
    error: Optional[str] = None


+class AlignPromptRequest(BaseModel):
+    """Optional body for align-prompt; `narration` replaces the frame's stored text."""
+    narration: Optional[str] = Field(default=None, description="Override narration text")
+
+
+class AlignPromptResponse(BaseModel):
+    """Outcome of align-prompt: the regenerated prompt plus a success flag."""
+    image_prompt: str
+    success: bool = Field(default=True)
--- a/frontend/src/app/editor/page.tsx
+++ b/frontend/src/app/editor/page.tsx
@@ -229,6 +229,7 @@ function SelectedFrameDetails() {
    const [isSaving, setIsSaving] = useState(false)
    const [isRegeneratingImage, setIsRegeneratingImage] = useState(false)
    const [isRegeneratingAudio, setIsRegeneratingAudio] = useState(false)
+    const [isAligningPrompt, setIsAligningPrompt] = useState(false)
    const [error, setError] = useState<string | null>(null)

    // Update local state when frame changes
@@ -322,6 +323,31 @@ function SelectedFrameDetails() {
        }
    }

+    const handleAlignPrompt = async () => {
+        // Nothing to align without a loaded storyboard and a selected frame.
+        if (!storyboard || !selectedFrame) return
+
+        setIsAligningPrompt(true)
+        setError(null)
+
+        // Local `narration` wins; fall back to the frame's own narration.
+        const narrationText = narration || selectedFrame.narration
+        try {
+            const { image_prompt: alignedPrompt } = await editorApi.alignPrompt(
+                storyboard.id,
+                selectedFrame.id,
+                narrationText
+            )
+            // Mirror the new prompt into the store and the local prompt state.
+            updateFrame(selectedFrame.id, { imagePrompt: alignedPrompt })
+            setImagePrompt(alignedPrompt)
+        } catch (err: any) {
+            setError(err.message || '对齐提示词失败')
+        } finally {
+            setIsAligningPrompt(false)
+        }
+    }
+
    return (
        <div className="space-y-4">
            {error && (
@@ -434,6 +460,18 @@ function SelectedFrameDetails() {
                        ) : null}
                        重新生成音频
                    </Button>
+                    <Button
+                        size="sm"
+                        variant="outline"
+                        className="w-full"
+                        onClick={handleAlignPrompt}
+                        disabled={isAligningPrompt}
+                    >
+                        {isAligningPrompt ? (
+                            <Loader2 className="h-4 w-4 animate-spin mr-2" />
+                        ) : null}
+                        对齐提示词
+                    </Button>
                </div>
            )}
        </div>
--- a/frontend/src/services/editor-api.ts
+++ b/frontend/src/services/editor-api.ts
@@ -197,6 +197,31 @@ class EditorApiClient {
        return response.json()
    }

+    /**
+     * Align image prompt with narration; throws an Error (server `detail` when it is a string) if not ok.
+     */
+    async alignPrompt(
+        storyboardId: string,
+        frameId: string,
+        narration?: string
+    ): Promise<{ image_prompt: string; success: boolean }> {
+        // Encode the ids so reserved characters cannot alter the request path.
+        const path = `/editor/storyboard/${encodeURIComponent(storyboardId)}/frames/${encodeURIComponent(frameId)}/align-prompt`
+        const response = await fetch(`${this.baseUrl}${path}`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ narration }),
+        })
+
+        if (!response.ok) {
+            const error = await response.json().catch(() => ({ detail: response.statusText }))
+            // `detail` may be a non-string (e.g. a validation error list); only surface strings.
+            const detail = error && typeof error.detail === 'string' ? error.detail : ''
+            throw new Error(detail || `Failed to align prompt: ${response.statusText}`)
+        }
+        return response.json()
+    }
+
    /**
     * Inpaint (局部重绘) image for a frame
     */