feat: Implement Character Memory V1 - VLM analysis and prompt injection
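
This diff adds only the analyzer half of Character Memory V1. As a rough sketch of the intended prompt-injection step (names such as `scene_prompt` and `reference_image_path`, and the simple comma-joined composition, are illustrative assumptions, not code from this commit):

    # Sketch only: inside an async caller that already has a per-frame scene prompt.
    analyzer = CharacterAnalyzer()
    result = await analyzer.analyze_reference_image(reference_image_path)
    character_desc = result.to_prompt_description()
    if character_desc:
        scene_prompt = f"{character_desc}, {scene_prompt}"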

pixelle_video/services/quality/character_analyzer.py (new file, 203 lines)
@@ -0,0 +1,203 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
CharacterAnalyzer - VLM-based character appearance extraction

Analyzes reference images to extract detailed character descriptions
for maintaining visual consistency across video frames.
"""

import base64
import json
import os
import re
from dataclasses import dataclass
from typing import List, Optional

from loguru import logger
from openai import AsyncOpenAI


@dataclass
class CharacterAnalysisResult:
    """Result of character image analysis"""

    appearance_description: str = ""  # Physical features
    clothing_description: str = ""  # What they're wearing
    distinctive_features: Optional[List[str]] = None  # Unique identifying features

    def __post_init__(self):
        if self.distinctive_features is None:
            self.distinctive_features = []

    def to_prompt_description(self) -> str:
        """Generate a prompt-ready character description"""
        parts = []

        if self.appearance_description:
            parts.append(self.appearance_description)

        if self.clothing_description:
            parts.append(f"wearing {self.clothing_description}")

        if self.distinctive_features:
            features = ", ".join(self.distinctive_features)
            parts.append(f"with {features}")

        return ", ".join(parts) if parts else ""

    def to_dict(self) -> dict:
        return {
            "appearance_description": self.appearance_description,
            "clothing_description": self.clothing_description,
            "distinctive_features": self.distinctive_features,
        }
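
# Illustrative example (not part of the original commit): how
# to_prompt_description() composes the fields above; the sample values are
# hypothetical.
#
#     result = CharacterAnalysisResult(
#         appearance_description="young woman with long black hair",
#         clothing_description="blue hoodie",
#         distinctive_features=["round glasses"],
#     )
#     result.to_prompt_description()
#     # -> "young woman with long black hair, wearing blue hoodie, with round glasses"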


class CharacterAnalyzer:
    """
    VLM-based character appearance analyzer

    Analyzes reference images to extract detailed character descriptions
    that can be injected into image generation prompts.

    Example:
        >>> analyzer = CharacterAnalyzer()
        >>> result = await analyzer.analyze_reference_image("character.png")
        >>> print(result.appearance_description)
        "young woman with long black hair, round face, fair skin"
        >>> print(result.to_prompt_description())
        "young woman with long black hair, round face, fair skin, wearing blue hoodie, with round glasses"
    """

    def __init__(self):
        """Initialize CharacterAnalyzer"""
        pass

    async def analyze_reference_image(
        self,
        image_path: str,
    ) -> CharacterAnalysisResult:
        """
        Analyze a reference image to extract character appearance

        Args:
            image_path: Path to the reference image

        Returns:
            CharacterAnalysisResult with extracted descriptions
        """
        logger.info(f"Analyzing character reference image: {image_path}")

        # Check if file exists
        if not os.path.exists(image_path):
            logger.warning(f"Image not found: {image_path}")
            return CharacterAnalysisResult()

        try:
            # Read and encode image
            with open(image_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode("utf-8")

            # Determine image type
            ext = os.path.splitext(image_path)[1].lower()
            media_type = "image/png" if ext == ".png" else "image/jpeg"

            # VLM prompt for character analysis
            analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions.

Provide your analysis in JSON format:
{
    "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.",
    "clothing_description": "What they're wearing - describe colors, style, and notable items.",
    "distinctive_features": ["list", "of", "unique", "identifying", "features"]
}

Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character.

Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"

Output ONLY the JSON object, no additional text."""

            # Build multimodal message
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": analysis_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{media_type};base64,{image_data}"
                            }
                        }
                    ]
                }
            ]

            # Get LLM config
            from pixelle_video.config import config_manager
            llm_config = config_manager.config.llm

            # Create OpenAI client for VLM call
            client = AsyncOpenAI(
                api_key=llm_config.api_key,
                base_url=llm_config.base_url
            )

            # Call VLM
            response = await client.chat.completions.create(
                model=llm_config.model,
                messages=messages,
                temperature=0.3,
                max_tokens=800
            )
            vlm_response = response.choices[0].message.content
            logger.debug(f"VLM character analysis response: {vlm_response[:150]}...")

            # Parse response
            return self._parse_response(vlm_response)

        except Exception as e:
            logger.error(f"Character analysis failed: {e}")
            return CharacterAnalysisResult()

    def _parse_response(self, response: str) -> CharacterAnalysisResult:
        """Parse VLM response into CharacterAnalysisResult"""
        try:
            # Try to extract JSON from response
            match = re.search(r'\{[\s\S]*\}', response)
            if match:
                data = json.loads(match.group())
            else:
                data = json.loads(response)

            result = CharacterAnalysisResult(
                appearance_description=data.get("appearance_description", ""),
                clothing_description=data.get("clothing_description", ""),
                distinctive_features=data.get("distinctive_features", []),
            )

            logger.info(f"Character analysis extracted: {result.appearance_description[:80]}...")
            return result

        except (json.JSONDecodeError, KeyError) as e:
            logger.warning(f"Failed to parse VLM response: {e}")

            # Try to use the raw response as appearance description
            if len(response) < 500 and len(response) > 20:
                return CharacterAnalysisResult(
                    appearance_description=response.strip()
                )

            return CharacterAnalysisResult()
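

if __name__ == "__main__":
    # Illustrative smoke test, not part of the original commit: "character.png"
    # is a placeholder path, and a reachable VLM endpoint must be configured via
    # config_manager for the call to succeed.
    import asyncio

    async def _demo():
        analyzer = CharacterAnalyzer()
        result = await analyzer.analyze_reference_image("character.png")
        print(result.to_prompt_description())
        print(result.to_dict())

    asyncio.run(_demo())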