feat: Implement Character Memory V1 - VLM analysis and prompt injection

2026-01-07 03:08:29 +08:00
parent da98d0842a
commit b3cf9e64e5
3 changed files with 307 additions and 6 deletions
--- a/api/routers/editor.py
+++ b/api/routers/editor.py
@@ -473,8 +473,35 @@ async def regenerate_frame_image(
        else:
            logger.warning(f"[REGEN-IMG] No style anchor found for {storyboard_id}")
-        # Apply style prefix to prompt
+        # Get character descriptions for prompt injection
-        final_prompt = f"{style_prefix}, {prompt}" if style_prefix else prompt
+        character_prefix = ""
        from api.routers.quality import _character_stores
        if storyboard_id in _character_stores:
            char_descriptions = []
            for char_data in _character_stores[storyboard_id].values():
                appearance = char_data.get("appearance_description", "")
                clothing = char_data.get("clothing_description", "")
                name = char_data.get("name", "character")
                if appearance or clothing:
                    parts = [f"{name}:"]
                    if appearance:
                        parts.append(appearance)
                    if clothing:
                        parts.append(f"wearing {clothing}")
                    char_descriptions.append(" ".join(parts))
            if char_descriptions:
                character_prefix = "Characters: " + "; ".join(char_descriptions) + ". "
                logger.info(f"[REGEN-IMG] Injecting character descriptions: {character_prefix[:80]}...")
        # Apply style prefix and character descriptions to prompt
        final_prompt = ""
        if style_prefix:
            final_prompt += f"{style_prefix}, "
        if character_prefix:
            final_prompt += character_prefix
        final_prompt += prompt
        logger.info(f"[REGEN-IMG] Final prompt: {final_prompt[:120]}...")
        # Use MediaService to generate image via RunningHub workflow
--- a/api/routers/quality.py
+++ b/api/routers/quality.py
@@ -45,6 +45,20 @@ class CharacterCreateRequest(BaseModel):
    clothing_description: str = Field("", description="Clothing description")
    distinctive_features: List[str] = Field(default_factory=list)
    character_type: str = Field("person")
    reference_image_path: Optional[str] = Field(None, description="Reference image path for VLM analysis")
 class CharacterAnalyzeRequest(BaseModel):
    """Request to analyze a character image"""
    # Required field (the `...` sentinel means there is no default); its value
    # is forwarded unchanged to CharacterAnalyzer.analyze_reference_image.
    image_path: str = Field(
        ...,
        description="Path to the reference image",
    )
 class CharacterAnalyzeResponse(BaseModel):
    """Response from character image analysis"""
    # Every field defaults to "empty" so a failed analysis still serializes.
    appearance_description: str = ""
    clothing_description: str = ""
    # default_factory matches CharacterCreateRequest.distinctive_features and
    # avoids a shared mutable literal as the declared default.
    distinctive_features: List[str] = Field(default_factory=list)
    prompt_description: str = ""  # Combined description for prompt injection
 class ContentCheckRequest(BaseModel):
@@ -115,20 +129,49 @@ async def create_character(
    storyboard_id: str = Path(..., description="Storyboard ID"),
    request: CharacterCreateRequest = Body(...)
 ):
-    """Register a new character"""
+    """
    Register a new character
    If reference_image_path is provided and appearance_description is empty,
    VLM will analyze the image to extract appearance descriptions automatically.
    """
    import uuid
    if storyboard_id not in _character_stores:
        _character_stores[storyboard_id] = {}
    # Auto-analyze reference image if provided and no description
    appearance_desc = request.appearance_description
    clothing_desc = request.clothing_description
    distinctive = request.distinctive_features
    ref_image = request.reference_image_path
    if ref_image and not appearance_desc:
        try:
            from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer
            analyzer = CharacterAnalyzer()
            result = await analyzer.analyze_reference_image(ref_image)
            if result.appearance_description:
                appearance_desc = result.appearance_description
            if result.clothing_description:
                clothing_desc = result.clothing_description
            if result.distinctive_features:
                distinctive = result.distinctive_features
            logger.info(f"Auto-analyzed character from image: {ref_image}")
        except Exception as e:
            logger.warning(f"Failed to auto-analyze character image: {e}")
    char_id = f"char_{uuid.uuid4().hex[:8]}"
    character = CharacterSchema(
        id=char_id,
        name=request.name,
-        appearance_description=request.appearance_description,
+        appearance_description=appearance_desc,
-        clothing_description=request.clothing_description,
+        clothing_description=clothing_desc,
-        distinctive_features=request.distinctive_features,
+        distinctive_features=distinctive,
        character_type=request.character_type,
        reference_image=ref_image,
    )
    _character_stores[storyboard_id][char_id] = character.model_dump()
@@ -184,6 +227,34 @@ async def delete_character(
    return {"deleted": True}
@router.post(
    "/characters/{storyboard_id}/analyze-image",
    response_model=CharacterAnalyzeResponse,
)
async def analyze_character_image(
    storyboard_id: str = Path(..., description="Storyboard ID"),
    request: CharacterAnalyzeRequest = Body(...),
):
    """
    Analyze a character reference image using VLM
    Extracts detailed appearance descriptions that can be used
    to maintain character consistency across frames.
    """
    # Imported lazily, inside the handler, exactly as the rest of this router
    # does for the analyzer service.
    from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer

    # storyboard_id is only used for logging here; the analysis result is
    # returned to the caller and not stored.
    logger.info(f"Analyzing character image for storyboard {storyboard_id}: {request.image_path}")

    # NOTE(review): image_path comes straight from the request body and is
    # handed to the analyzer as a filesystem path - confirm that callers are
    # trusted or that the path is restricted to an allowed directory.
    analysis = await CharacterAnalyzer().analyze_reference_image(request.image_path)

    response_fields = {
        "appearance_description": analysis.appearance_description,
        "clothing_description": analysis.clothing_description,
        "distinctive_features": analysis.distinctive_features,
        "prompt_description": analysis.to_prompt_description(),
    }
    return CharacterAnalyzeResponse(**response_fields)
 # ============================================================
 # Content Filter Endpoints
 # ============================================================
--- a/pixelle_video/services/quality/character_analyzer.py
+++ b/pixelle_video/services/quality/character_analyzer.py
@@ -0,0 +1,203 @@
 # Copyright (C) 2025 AIDC-AI
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 CharacterAnalyzer - VLM-based character appearance extraction
 Analyzes reference images to extract detailed character descriptions
 for maintaining visual consistency across video frames.
 """
 import base64
 import json
 import os
 import re
 from dataclasses import dataclass
 from typing import List, Optional
 from loguru import logger
 from openai import AsyncOpenAI
@dataclass
class CharacterAnalysisResult:
    """Result of character image analysis.

    All fields default to "empty", so a bare ``CharacterAnalysisResult()``
    represents "nothing could be extracted".
    """

    appearance_description: str = ""     # Physical features
    clothing_description: str = ""       # What they're wearing
    # Unique identifying features. None is accepted (and is the default) and is
    # normalized to a fresh empty list in __post_init__, so instances never
    # share one list object.
    distinctive_features: Optional[List[str]] = None

    def __post_init__(self):
        """Replace a missing feature list with a new empty list."""
        if self.distinctive_features is None:
            self.distinctive_features = []

    def to_prompt_description(self) -> str:
        """Generate a prompt-ready character description.

        Returns:
            The non-empty parts joined with ", " in the order appearance,
            "wearing <clothing>", "with <features>"; "" when nothing is set.
        """
        parts = []
        if self.appearance_description:
            parts.append(self.appearance_description)
        if self.clothing_description:
            parts.append(f"wearing {self.clothing_description}")

        # Drop None/blank entries and coerce the rest to text so that a sloppy
        # feature list cannot produce "with , " fragments or break str.join.
        features = [
            str(feature).strip()
            for feature in (self.distinctive_features or [])
            if feature is not None
        ]
        features = [feature for feature in features if feature]
        if features:
            parts.append("with " + ", ".join(features))

        return ", ".join(parts)

    def to_dict(self) -> dict:
        """Return the three fields as a plain dict.

        The feature list is copied so that mutating the returned dict does not
        alter this result object.
        """
        return {
            "appearance_description": self.appearance_description,
            "clothing_description": self.clothing_description,
            "distinctive_features": list(self.distinctive_features or []),
        }
 class CharacterAnalyzer:
    """
    VLM-based character appearance analyzer
    Analyzes reference images to extract detailed character descriptions
    that can be injected into image generation prompts.
    Example:
        >>> analyzer = CharacterAnalyzer()
        >>> result = await analyzer.analyze_reference_image("character.png")
        >>> print(result.appearance_description)
        "young woman with long black hair, round face, fair skin"
        >>> print(result.to_prompt_description())
        "young woman with long black hair, round face, fair skin, wearing blue hoodie, with round glasses"
    """

    # Media type sent in the data URL, keyed by lower-cased file extension.
    # Unlisted extensions fall back to JPEG (the previous behavior for
    # everything that was not ".png").
    _MEDIA_TYPES = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }

    def __init__(self):
        """Initialize CharacterAnalyzer (no state is kept on the instance)."""

    async def analyze_reference_image(
        self,
        image_path: str,
    ) -> CharacterAnalysisResult:
        """
        Analyze a reference image to extract character appearance

        The image is read from disk, base64-encoded into a data URL and sent,
        together with a JSON-only instruction prompt, to the chat-completions
        endpoint configured under ``config_manager.config.llm``.

        Args:
            image_path: Path to the reference image
        Returns:
            CharacterAnalysisResult with extracted descriptions. A missing
            file, or any error while reading, calling the model or parsing,
            is logged and yields an empty result instead of raising.
        """
        logger.info(f"Analyzing character reference image: {image_path}")
        # isfile() rather than exists(): a directory path is rejected here
        # instead of failing later inside open().
        if not image_path or not os.path.isfile(image_path):
            logger.warning(f"Image not found: {image_path}")
            return CharacterAnalysisResult()
        try:
            # Read and encode image
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode("utf-8")
            # Determine image type from the file extension
            ext = os.path.splitext(image_path)[1].lower()
            media_type = self._MEDIA_TYPES.get(ext, "image/jpeg")
            # VLM prompt for character analysis
            analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions.
 Provide your analysis in JSON format:
 {
    "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.",
    "clothing_description": "What they're wearing - describe colors, style, and notable items.",
    "distinctive_features": ["list", "of", "unique", "identifying", "features"]
 }
 Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character.
 Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"
 Output ONLY the JSON object, no additional text."""
            # Build multimodal message
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": analysis_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{media_type};base64,{image_data}"
                            }
                        }
                    ]
                }
            ]
            # Get LLM config
            from pixelle_video.config import config_manager
            llm_config = config_manager.config.llm
            # Create OpenAI client for VLM call
            client = AsyncOpenAI(
                api_key=llm_config.api_key,
                base_url=llm_config.base_url
            )
            # Call VLM
            response = await client.chat.completions.create(
                model=llm_config.model,
                messages=messages,
                temperature=0.3,
                max_tokens=800
            )
            # content may be None; normalize so the slice below cannot fail
            vlm_response = response.choices[0].message.content or ""
            logger.debug(f"VLM character analysis response: {vlm_response[:150]}...")
            # Parse response
            return self._parse_response(vlm_response)
        except Exception as e:
            # Deliberate best-effort boundary: callers treat an empty result
            # as "no description available".
            logger.error(f"Character analysis failed: {e}")
            return CharacterAnalysisResult()

    def _parse_response(self, response: str) -> CharacterAnalysisResult:
        """Parse VLM response into CharacterAnalysisResult

        The first JSON object found anywhere in the text is used, so code
        fences or commentary around it are tolerated. Field values are
        normalized (null -> empty, a single string feature -> one-item list).
        If no JSON object is present, a short free-text reply (more than 20
        and fewer than 500 characters after stripping) is used as the
        appearance description; otherwise an empty result is returned.
        """
        text = (response or "").strip()
        data = self._extract_json_object(text)
        if data is None:
            logger.warning("Failed to parse VLM response: no JSON object found")
            # Try to use the raw response as appearance description
            if 20 < len(text) < 500:
                return CharacterAnalysisResult(appearance_description=text)
            return CharacterAnalysisResult()

        result = CharacterAnalysisResult(
            appearance_description=self._as_text(data.get("appearance_description")),
            clothing_description=self._as_text(data.get("clothing_description")),
            distinctive_features=self._as_feature_list(data.get("distinctive_features")),
        )
        logger.info(f"Character analysis extracted: {result.appearance_description[:80]}...")
        return result

    @staticmethod
    def _extract_json_object(text: str) -> Optional[dict]:
        """Return the first decodable JSON object embedded in text, or None."""
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode stops at the end of the first complete value, so
                # trailing text (even text containing braces) is ignored.
                candidate, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = text.find("{", start + 1)
        return None

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a JSON field to a stripped string; null becomes ""."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            pieces = [str(item).strip() for item in value if item is not None]
            return ", ".join(piece for piece in pieces if piece)
        return str(value).strip()

    @staticmethod
    def _as_feature_list(value) -> List[str]:
        """Coerce a JSON field to a list of non-blank strings."""
        if value is None:
            return []
        # A bare string must become one feature, not a list of characters.
        items = value if isinstance(value, (list, tuple)) else [value]
        cleaned = [str(item).strip() for item in items if item is not None]
        return [item for item in cleaned if item]