feat(P1): Add align-prompt feature for better text-image relevance

2026-01-06 23:29:41 +08:00
parent 2978622f7f
commit 1d343e55ba
4 changed files with 166 additions and 0 deletions
--- a/api/routers/editor.py
+++ b/api/routers/editor.py
@@ -41,6 +41,8 @@ from api.schemas.editor import (
    ExportRequest,
    ExportResponse,
    ExportStatusResponse,
    AlignPromptRequest,
    AlignPromptResponse,
 )
 from fastapi import BackgroundTasks
 import asyncio
@@ -598,6 +600,98 @@ async def regenerate_frame_audio(
        raise HTTPException(status_code=500, detail=str(e))
def _extract_aligned_image_prompt(response) -> str:
    """
    Extract the first image prompt from a raw LLM reply.

    The reply is expected to contain a JSON object with a non-empty
    ``image_prompts`` list of strings. Three shapes are accepted, tried in
    order: the whole reply as JSON, the contents of a markdown code fence,
    and the outermost ``{...}`` span embedded in surrounding prose.

    Args:
        response: Raw text returned by the LLM call.

    Returns:
        The first entry of ``image_prompts`` with surrounding whitespace removed.

    Raises:
        ValueError: If no JSON object can be parsed from the reply, or if it
            does not carry a usable first image prompt.
    """
    # Local imports: the module's top-level import list is not visible here.
    import json
    import re

    if not isinstance(response, str):
        raise ValueError("Failed to parse LLM response")

    candidates = [response]
    fenced = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', response)
    if fenced:
        candidates.append(fenced.group(1))
    # Models often wrap the JSON in explanatory text without a code fence.
    first_brace, last_brace = response.find("{"), response.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(response[first_brace:last_brace + 1])

    result = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Only a JSON object can carry the "image_prompts" key.
        if isinstance(parsed, dict):
            result = parsed
            break
    if result is None:
        raise ValueError("Failed to parse LLM response")

    prompts = result.get("image_prompts")
    if not isinstance(prompts, list) or not prompts:
        raise ValueError("No image prompts in response")

    first_prompt = prompts[0]
    if not isinstance(first_prompt, str) or not first_prompt.strip():
        raise ValueError("No image prompts in response")
    return first_prompt.strip()


@router.post(
    "/storyboard/{storyboard_id}/frames/{frame_id}/align-prompt",
    response_model=AlignPromptResponse
)
async def align_frame_prompt(
    storyboard_id: str = Path(..., description="Storyboard/task ID"),
    frame_id: str = Path(..., description="Frame ID"),
    request: AlignPromptRequest = None
):
    """
    Align image prompt with narration

    Regenerates the image prompt for one frame from its narration by asking
    the LLM for a new prompt, then stores the result on the cached frame.

    Args:
        storyboard_id: Key of the storyboard in the in-memory cache.
        frame_id: ID of the frame whose image prompt is regenerated.
        request: Optional body; a non-blank ``narration`` overrides the
            narration stored on the frame.

    Returns:
        AlignPromptResponse with the new image prompt and ``success=True``.

    Raises:
        HTTPException: 404 if the storyboard or frame is not found, 400 if no
            narration text is available, 500 if the LLM call or the parsing
            of its reply fails.
    """
    if storyboard_id not in _storyboard_cache:
        raise HTTPException(status_code=404, detail=f"Storyboard {storyboard_id} not found")

    storyboard = _storyboard_cache[storyboard_id]

    # Find frame
    target_frame = next(
        (frame for frame in storyboard["frames"] if frame["id"] == frame_id),
        None,
    )
    if target_frame is None:
        raise HTTPException(status_code=404, detail=f"Frame {frame_id} not found")

    # Prefer the caller's override; a blank or whitespace-only override falls
    # back to the stored narration instead of being sent to the LLM.
    override = request.narration if request else None
    if override and override.strip():
        narration = override.strip()
    else:
        narration = (target_frame.get("narration") or "").strip()
    if not narration:
        raise HTTPException(status_code=400, detail="No narration text available")

    try:
        from api.dependencies import get_pixelle_video
        from pixelle_video.prompts import build_image_prompt_prompt

        pixelle_video = await get_pixelle_video()

        # Use LLM to generate aligned image prompt
        prompt = build_image_prompt_prompt(
            narrations=[narration],
            min_words=30,
            max_words=60
        )
        response = await pixelle_video.llm(
            prompt=prompt,
            temperature=0.7,
            max_tokens=500
        )

        new_prompt = _extract_aligned_image_prompt(response)

        # Update frame
        target_frame["image_prompt"] = new_prompt
        # Kept from the original: re-assign the entry in case the cache is more
        # than a plain dict -- TODO confirm whether this is required.
        _storyboard_cache[storyboard_id] = storyboard

        logger.info(f"Aligned image prompt for frame {frame_id}")
        return AlignPromptResponse(
            image_prompt=new_prompt,
            success=True
        )
    except Exception as e:
        # Top-level boundary: log with traceback and surface as a 500.
        logger.exception(f"Prompt alignment failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post(
    "/storyboard/{storyboard_id}/frames/{frame_id}/inpaint",
    response_model=InpaintResponse
--- a/api/schemas/editor.py
+++ b/api/schemas/editor.py
@@ -144,3 +144,12 @@ class ExportStatusResponse(BaseModel):
    error: Optional[str] = None
 class AlignPromptRequest(BaseModel):
    """Request to align image prompt with narration"""
    # Optional override text. When it is omitted or empty, the align-prompt
    # endpoint uses the narration already stored on the target frame.
    narration: Optional[str] = Field(None, description="Override narration text")
 class AlignPromptResponse(BaseModel):
    """Response after aligning prompt"""
    # The regenerated image prompt that the endpoint also wrote to the frame.
    image_prompt: str
    # Defaults to True. The align-prompt endpoint only returns this model on
    # success; its failures are raised as HTTP errors instead.
    success: bool = True
--- a/frontend/src/app/editor/page.tsx
+++ b/frontend/src/app/editor/page.tsx
@@ -229,6 +229,7 @@ function SelectedFrameDetails() {
    const [isSaving, setIsSaving] = useState(false)
    const [isRegeneratingImage, setIsRegeneratingImage] = useState(false)
    const [isRegeneratingAudio, setIsRegeneratingAudio] = useState(false)
    const [isAligningPrompt, setIsAligningPrompt] = useState(false)
    const [error, setError] = useState<string | null>(null)
    // Update local state when frame changes
@@ -322,6 +323,31 @@ function SelectedFrameDetails() {
        }
    }
    // Ask the backend to regenerate the image prompt from the narration
    // (the locally edited narration wins over the stored one), then copy the
    // result into both the shared store and the local input state.
    const handleAlignPrompt = async () => {
        if (!storyboard || !selectedFrame) {
            return
        }

        const frameId = selectedFrame.id

        setIsAligningPrompt(true)
        setError(null)
        try {
            const sourceNarration = narration || selectedFrame.narration
            const response = await editorApi.alignPrompt(storyboard.id, frameId, sourceNarration)
            const alignedPrompt = response.image_prompt

            // Update local store with new image prompt
            updateFrame(frameId, { imagePrompt: alignedPrompt })
            setImagePrompt(alignedPrompt)
        } catch (err: any) {
            setError(err.message || '对齐提示词失败')
        } finally {
            // Always clear the spinner, whether the request succeeded or failed.
            setIsAligningPrompt(false)
        }
    }
    return (
        <div className="space-y-4">
            {error && (
@@ -434,6 +460,18 @@ function SelectedFrameDetails() {
                        ) : null}
                        重新生成音频
                    </Button>
                    <Button
                        size="sm"
                        variant="outline"
                        className="w-full"
                        onClick={handleAlignPrompt}
                        disabled={isAligningPrompt}
                    >
                        {isAligningPrompt ? (
                            <Loader2 className="h-4 w-4 animate-spin mr-2" />
                        ) : null}
                        对齐提示词
                    </Button>
                </div>
            )}
        </div>
--- a/frontend/src/services/editor-api.ts
+++ b/frontend/src/services/editor-api.ts
@@ -197,6 +197,31 @@ class EditorApiClient {
        return response.json()
    }
    /**
     * Align image prompt with narration - regenerate prompt based on narration.
     *
     * Sends a POST to the frame's `align-prompt` endpoint with a JSON body.
     *
     * @param storyboardId - Storyboard ID used in the request path.
     * @param frameId - Frame ID used in the request path.
     * @param narration - Optional narration override. When undefined, the key
     *   is dropped by `JSON.stringify` and an empty object is sent.
     * @returns The parsed JSON response body.
     * @throws Error when the response status is not OK. The message is the
     *   server's `detail` (serialized if it is not a string) or a fallback
     *   built from the status text.
     */
    async alignPrompt(
        storyboardId: string,
        frameId: string,
        narration?: string
    ): Promise<{ image_prompt: string; success: boolean }> {
        // Encode path segments so IDs containing reserved characters
        // ("/", "?", "#", spaces) cannot change the request URL.
        const storyboardSegment = encodeURIComponent(storyboardId)
        const frameSegment = encodeURIComponent(frameId)

        const response = await fetch(
            `${this.baseUrl}/editor/storyboard/${storyboardSegment}/frames/${frameSegment}/align-prompt`,
            {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({ narration }),
            }
        )

        if (!response.ok) {
            const error = await response.json().catch(() => ({ detail: response.statusText }))
            // A JSON body of `null` is valid; guard before reading `detail`.
            const detail = error ? error.detail : undefined

            // `detail` is not always a string (validation errors carry a list),
            // so serialize it rather than letting Error() print "[object Object]".
            let message = ''
            if (typeof detail === 'string') {
                message = detail
            } else if (detail) {
                message = JSON.stringify(detail)
            }

            throw new Error(message || `Failed to align prompt: ${response.statusText}`)
        }

        return response.json()
    }
    /**
     * Inpaint (局部重绘) image for a frame
     */