feat(P1): Add align-prompt feature for better text-image relevance

This commit is contained in:
empty
2026-01-06 23:29:41 +08:00
parent 2978622f7f
commit 1d343e55ba
4 changed files with 166 additions and 0 deletions

View File

@@ -41,6 +41,8 @@ from api.schemas.editor import (
ExportRequest,
ExportResponse,
ExportStatusResponse,
AlignPromptRequest,
AlignPromptResponse,
)
from fastapi import BackgroundTasks
import asyncio
@@ -598,6 +600,98 @@ async def regenerate_frame_audio(
raise HTTPException(status_code=500, detail=str(e))
def _extract_first_image_prompt(response: str) -> str:
    """
    Pull the first image prompt out of a raw LLM reply.

    The reply is expected to be a JSON object holding a non-empty
    ``image_prompts`` list, either as bare JSON or wrapped in a markdown
    code fence.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        The first entry of ``image_prompts``, unchanged.

    Raises:
        ValueError: If no JSON can be parsed from the reply, or the parsed
            payload does not contain a usable first image prompt.
    """
    import json
    import re

    try:
        result = json.loads(response)
    except json.JSONDecodeError:
        # Fall back to the contents of a ```json ... ``` (or bare ```) fence
        match = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', response)
        if not match:
            raise ValueError("Failed to parse LLM response") from None
        try:
            result = json.loads(match.group(1))
        except json.JSONDecodeError as e:
            # Keep the error message uniform whichever parse attempt failed
            raise ValueError("Failed to parse LLM response") from e

    # Valid JSON is not necessarily an object; guard before key lookup
    prompts = result.get("image_prompts") if isinstance(result, dict) else None
    if not isinstance(prompts, list) or not prompts:
        raise ValueError("No image prompts in response")

    first = prompts[0]
    # Reject non-string / blank entries here so the caller never stores them
    if not isinstance(first, str) or not first.strip():
        raise ValueError("Invalid image prompt in response")
    return first


@router.post(
    "/storyboard/{storyboard_id}/frames/{frame_id}/align-prompt",
    response_model=AlignPromptResponse
)
async def align_frame_prompt(
    storyboard_id: str = Path(..., description="Storyboard/task ID"),
    frame_id: str = Path(..., description="Frame ID"),
    request: AlignPromptRequest = None
):
    """
    Align image prompt with narration

    Regenerates the frame's image prompt from its narration by asking the
    LLM (via ``build_image_prompt_prompt``) for a new prompt, then stores the
    result on the cached frame.

    Args:
        storyboard_id: Key of the storyboard in ``_storyboard_cache``.
        frame_id: ``id`` of the frame inside the storyboard's ``frames`` list.
        request: Optional body; a non-empty ``narration`` overrides the
            narration stored on the frame.

    Returns:
        AlignPromptResponse carrying the new image prompt.

    Raises:
        HTTPException: 404 if the storyboard or frame is not found, 400 if
            no narration text is available, 500 if prompt generation or
            parsing of the LLM reply fails.
    """
    if storyboard_id not in _storyboard_cache:
        raise HTTPException(status_code=404, detail=f"Storyboard {storyboard_id} not found")

    storyboard = _storyboard_cache[storyboard_id]
    frames = storyboard["frames"]

    # Find frame
    target_frame = next((frame for frame in frames if frame["id"] == frame_id), None)
    if target_frame is None:
        raise HTTPException(status_code=404, detail=f"Frame {frame_id} not found")

    # Get narration to use: request override first, then the frame's own text
    narration = request.narration if request and request.narration else target_frame.get("narration", "")
    # Whitespace-only text carries nothing to align against, treat it as missing
    if not narration or (isinstance(narration, str) and not narration.strip()):
        raise HTTPException(status_code=400, detail="No narration text available")

    try:
        from api.dependencies import get_pixelle_video
        pixelle_video = await get_pixelle_video()

        # Use LLM to generate aligned image prompt
        from pixelle_video.prompts import build_image_prompt_prompt

        prompt = build_image_prompt_prompt(
            narrations=[narration],
            min_words=30,
            max_words=60
        )

        response = await pixelle_video.llm(
            prompt=prompt,
            temperature=0.7,
            max_tokens=500
        )

        # Parse and validate BEFORE touching the cached frame, so a bad LLM
        # reply can never leave a broken image_prompt behind
        new_prompt = _extract_first_image_prompt(response)

        # Update frame
        target_frame["image_prompt"] = new_prompt
        # Re-assign in case the cache needs an explicit write-back
        # NOTE(review): redundant if _storyboard_cache is a plain dict - confirm
        _storyboard_cache[storyboard_id] = storyboard

        logger.info(f"Aligned image prompt for frame {frame_id}")

        return AlignPromptResponse(
            image_prompt=new_prompt,
            success=True
        )

    except Exception as e:
        logger.error(f"Prompt alignment failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post(
"/storyboard/{storyboard_id}/frames/{frame_id}/inpaint",
response_model=InpaintResponse

View File

@@ -144,3 +144,12 @@ class ExportStatusResponse(BaseModel):
error: Optional[str] = None
class AlignPromptRequest(BaseModel):
    """
    Body of the align-prompt request.

    Every field is optional, so an empty body is valid.
    """

    # Narration text to use in place of the one stored on the frame
    narration: Optional[str] = Field(
        default=None,
        description="Override narration text",
    )
class AlignPromptResponse(BaseModel):
    """Result payload returned once a prompt has been aligned."""

    # The newly generated image prompt
    image_prompt: str
    # Outcome flag; defaults to True when not supplied
    success: bool = Field(default=True)