From b3cf9e64e57c76cb4dab158d36e72291cf1e89fc Mon Sep 17 00:00:00 2001 From: empty Date: Wed, 7 Jan 2026 03:08:29 +0800 Subject: [PATCH] feat: Implement Character Memory V1 - VLM analysis and prompt injection --- api/routers/editor.py | 31 ++- api/routers/quality.py | 79 ++++++- .../services/quality/character_analyzer.py | 203 ++++++++++++++++++ 3 files changed, 307 insertions(+), 6 deletions(-) create mode 100644 pixelle_video/services/quality/character_analyzer.py diff --git a/api/routers/editor.py b/api/routers/editor.py index fd37203..00fb884 100644 --- a/api/routers/editor.py +++ b/api/routers/editor.py @@ -473,8 +473,35 @@ async def regenerate_frame_image( else: logger.warning(f"[REGEN-IMG] No style anchor found for {storyboard_id}") - # Apply style prefix to prompt - final_prompt = f"{style_prefix}, {prompt}" if style_prefix else prompt + # Get character descriptions for prompt injection + character_prefix = "" + from api.routers.quality import _character_stores + if storyboard_id in _character_stores: + char_descriptions = [] + for char_data in _character_stores[storyboard_id].values(): + appearance = char_data.get("appearance_description", "") + clothing = char_data.get("clothing_description", "") + name = char_data.get("name", "character") + + if appearance or clothing: + parts = [f"{name}:"] + if appearance: + parts.append(appearance) + if clothing: + parts.append(f"wearing {clothing}") + char_descriptions.append(" ".join(parts)) + + if char_descriptions: + character_prefix = "Characters: " + "; ".join(char_descriptions) + ". 
" + logger.info(f"[REGEN-IMG] Injecting character descriptions: {character_prefix[:80]}...") + + # Apply style prefix and character descriptions to prompt + final_prompt = "" + if style_prefix: + final_prompt += f"{style_prefix}, " + if character_prefix: + final_prompt += character_prefix + final_prompt += prompt logger.info(f"[REGEN-IMG] Final prompt: {final_prompt[:120]}...") # Use MediaService to generate image via RunningHub workflow diff --git a/api/routers/quality.py b/api/routers/quality.py index b8fe2e0..0e84c42 100644 --- a/api/routers/quality.py +++ b/api/routers/quality.py @@ -45,6 +45,20 @@ class CharacterCreateRequest(BaseModel): clothing_description: str = Field("", description="Clothing description") distinctive_features: List[str] = Field(default_factory=list) character_type: str = Field("person") + reference_image_path: Optional[str] = Field(None, description="Reference image path for VLM analysis") + + +class CharacterAnalyzeRequest(BaseModel): + """Request to analyze a character image""" + image_path: str = Field(..., description="Path to the reference image") + + +class CharacterAnalyzeResponse(BaseModel): + """Response from character image analysis""" + appearance_description: str = "" + clothing_description: str = "" + distinctive_features: List[str] = [] + prompt_description: str = "" # Combined description for prompt injection class ContentCheckRequest(BaseModel): @@ -115,20 +129,49 @@ async def create_character( storyboard_id: str = Path(..., description="Storyboard ID"), request: CharacterCreateRequest = Body(...) ): - """Register a new character""" + """ + Register a new character + + If reference_image_path is provided and appearance_description is empty, + VLM will analyze the image to extract appearance descriptions automatically. 
+ """ import uuid if storyboard_id not in _character_stores: _character_stores[storyboard_id] = {} + # Auto-analyze reference image if provided and no description + appearance_desc = request.appearance_description + clothing_desc = request.clothing_description + distinctive = request.distinctive_features + ref_image = request.reference_image_path + + if ref_image and not appearance_desc: + try: + from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer + analyzer = CharacterAnalyzer() + result = await analyzer.analyze_reference_image(ref_image) + + if result.appearance_description: + appearance_desc = result.appearance_description + if result.clothing_description: + clothing_desc = result.clothing_description + if result.distinctive_features: + distinctive = result.distinctive_features + + logger.info(f"Auto-analyzed character from image: {ref_image}") + except Exception as e: + logger.warning(f"Failed to auto-analyze character image: {e}") + char_id = f"char_{uuid.uuid4().hex[:8]}" character = CharacterSchema( id=char_id, name=request.name, - appearance_description=request.appearance_description, - clothing_description=request.clothing_description, - distinctive_features=request.distinctive_features, + appearance_description=appearance_desc, + clothing_description=clothing_desc, + distinctive_features=distinctive, character_type=request.character_type, + reference_image=ref_image, ) _character_stores[storyboard_id][char_id] = character.model_dump() @@ -184,6 +227,34 @@ async def delete_character( return {"deleted": True} +@router.post( + "/characters/{storyboard_id}/analyze-image", + response_model=CharacterAnalyzeResponse +) +async def analyze_character_image( + storyboard_id: str = Path(..., description="Storyboard ID"), + request: CharacterAnalyzeRequest = Body(...) +): + """ + Analyze a character reference image using VLM + + Extracts detailed appearance descriptions that can be used + to maintain character consistency across frames. 
+ """ + from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer + + logger.info(f"Analyzing character image for storyboard {storyboard_id}: {request.image_path}") + + analyzer = CharacterAnalyzer() + result = await analyzer.analyze_reference_image(request.image_path) + + return CharacterAnalyzeResponse( + appearance_description=result.appearance_description, + clothing_description=result.clothing_description, + distinctive_features=result.distinctive_features, + prompt_description=result.to_prompt_description() + ) + # ============================================================ # Content Filter Endpoints # ============================================================ diff --git a/pixelle_video/services/quality/character_analyzer.py b/pixelle_video/services/quality/character_analyzer.py new file mode 100644 index 0000000..9b3c0d1 --- /dev/null +++ b/pixelle_video/services/quality/character_analyzer.py @@ -0,0 +1,203 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CharacterAnalyzer - VLM-based character appearance extraction + +Analyzes reference images to extract detailed character descriptions +for maintaining visual consistency across video frames. 
+""" + +import base64 +import json +import os +import re +from dataclasses import dataclass +from typing import List, Optional + +from loguru import logger +from openai import AsyncOpenAI + + +@dataclass +class CharacterAnalysisResult: + """Result of character image analysis""" + + appearance_description: str = "" # Physical features + clothing_description: str = "" # What they're wearing + distinctive_features: List[str] = None # Unique identifying features + + def __post_init__(self): + if self.distinctive_features is None: + self.distinctive_features = [] + + def to_prompt_description(self) -> str: + """Generate a prompt-ready character description""" + parts = [] + + if self.appearance_description: + parts.append(self.appearance_description) + + if self.clothing_description: + parts.append(f"wearing {self.clothing_description}") + + if self.distinctive_features: + features = ", ".join(self.distinctive_features) + parts.append(f"with {features}") + + return ", ".join(parts) if parts else "" + + def to_dict(self) -> dict: + return { + "appearance_description": self.appearance_description, + "clothing_description": self.clothing_description, + "distinctive_features": self.distinctive_features, + } + + +class CharacterAnalyzer: + """ + VLM-based character appearance analyzer + + Analyzes reference images to extract detailed character descriptions + that can be injected into image generation prompts. 
+ + Example: + >>> analyzer = CharacterAnalyzer() + >>> result = await analyzer.analyze_reference_image("character.png") + >>> print(result.appearance_description) + "young woman with long black hair, round face, fair skin" + >>> print(result.to_prompt_description()) + "young woman with long black hair, round face, fair skin, wearing blue hoodie, with round glasses" + """ + + def __init__(self): + """Initialize CharacterAnalyzer""" + pass + + async def analyze_reference_image( + self, + image_path: str, + ) -> CharacterAnalysisResult: + """ + Analyze a reference image to extract character appearance + + Args: + image_path: Path to the reference image + + Returns: + CharacterAnalysisResult with extracted descriptions + """ + logger.info(f"Analyzing character reference image: {image_path}") + + # Check if file exists + if not os.path.exists(image_path): + logger.warning(f"Image not found: {image_path}") + return CharacterAnalysisResult() + + try: + # Read and encode image + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode("utf-8") + + # Determine image type + ext = os.path.splitext(image_path)[1].lower() + media_type = "image/png" if ext == ".png" else "image/jpeg" + + # VLM prompt for character analysis + analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions. + +Provide your analysis in JSON format: +{ + "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.", + "clothing_description": "What they're wearing - describe colors, style, and notable items.", + "distinctive_features": ["list", "of", "unique", "identifying", "features"] +} + +Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character. 
 

Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"

Output ONLY the JSON object, no additional text.""" + + # Build multimodal message + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": analysis_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:{media_type};base64,{image_data}" + } + } + ] + } + ] + + # Get LLM config + from pixelle_video.config import config_manager + llm_config = config_manager.config.llm + + # Create OpenAI client for VLM call + client = AsyncOpenAI( + api_key=llm_config.api_key, + base_url=llm_config.base_url + ) + + # Call VLM + response = await client.chat.completions.create( + model=llm_config.model, + messages=messages, + temperature=0.3, + max_tokens=800 + ) + vlm_response = response.choices[0].message.content + logger.debug(f"VLM character analysis response: {vlm_response[:150]}...") + + # Parse response + return self._parse_response(vlm_response) + + except Exception as e: + logger.error(f"Character analysis failed: {e}") + return CharacterAnalysisResult() + + def _parse_response(self, response: str) -> CharacterAnalysisResult: + """Parse VLM response into CharacterAnalysisResult""" + try: + # Try to extract JSON from response + match = re.search(r'\{[\s\S]*\}', response) + if match: + data = json.loads(match.group()) + else: + data = json.loads(response) + + result = CharacterAnalysisResult( + appearance_description=data.get("appearance_description", ""), + clothing_description=data.get("clothing_description", ""), + distinctive_features=data.get("distinctive_features", []), + ) + + logger.info(f"Character analysis extracted: {result.appearance_description[:80]}...") + return result + + except (json.JSONDecodeError, KeyError) as e: + logger.warning(f"Failed to parse VLM response: {e}") + + # Try to use the raw response as appearance description + if len(response) < 500 and len(response) > 20: + return CharacterAnalysisResult( + appearance_description=response.strip() + ) + + return CharacterAnalysisResult()