# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
CharacterAnalyzer - VLM-based character appearance extraction

Analyzes reference images to extract detailed character descriptions
for maintaining visual consistency across video frames.
"""

import base64
import json
import os
import re
from dataclasses import dataclass, field
from typing import Any, List, Optional

from loguru import logger
from openai import AsyncOpenAI


# Media types sent in the data URL, keyed by lower-cased file extension.
# Extensions not listed here fall back to JPEG.
_MEDIA_TYPES = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
    ".gif": "image/gif",
    ".bmp": "image/bmp",
}
_DEFAULT_MEDIA_TYPE = "image/jpeg"

# Instruction sent to the VLM together with the reference image.
_ANALYSIS_PROMPT = """Analyze this character/person image and extract detailed visual descriptions.

Provide your analysis in JSON format:
{
    "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.",
    "clothing_description": "What they're wearing - describe colors, style, and notable items.",
    "distinctive_features": ["list", "of", "unique", "identifying", "features"]
}

Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character.

Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"

Output ONLY the JSON object, no additional text."""


def _as_text(value: Any) -> str:
    """Coerce a JSON value into a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def _as_feature_list(value: Any) -> List[str]:
    """Coerce a JSON value into a list of non-empty feature strings.

    A bare string is treated as a single feature (joining it later would
    otherwise split it into characters). Anything that is not a string,
    list or tuple yields an empty list.
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    cleaned = (str(item).strip() for item in value if item is not None)
    return [item for item in cleaned if item]


@dataclass
class CharacterAnalysisResult:
    """Result of character image analysis"""

    appearance_description: str = ""  # Physical features
    clothing_description: str = ""  # What they're wearing
    # Unique identifying features
    distinctive_features: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Callers may still pass None explicitly; normalise it to an empty list.
        if self.distinctive_features is None:
            self.distinctive_features = []

    def to_prompt_description(self) -> str:
        """Generate a prompt-ready character description

        Returns:
            The non-empty parts (appearance, "wearing <clothing>",
            "with <features>") joined by ", ", or "" when all are empty.
        """
        parts = []
        if self.appearance_description:
            parts.append(self.appearance_description)
        if self.clothing_description:
            parts.append(f"wearing {self.clothing_description}")
        if self.distinctive_features:
            features = ", ".join(self.distinctive_features)
            parts.append(f"with {features}")
        return ", ".join(parts)

    def to_dict(self) -> dict:
        """Return the three result fields as a plain dict"""
        return {
            "appearance_description": self.appearance_description,
            "clothing_description": self.clothing_description,
            "distinctive_features": self.distinctive_features,
        }


class CharacterAnalyzer:
    """
    VLM-based character appearance analyzer

    Analyzes reference images to extract detailed character descriptions
    that can be injected into image generation prompts.

    Example:
        >>> analyzer = CharacterAnalyzer()
        >>> result = await analyzer.analyze_reference_image("character.png")
        >>> print(result.appearance_description)
        "young woman with long black hair, round face, fair skin"
        >>> print(result.to_prompt_description())
        "young woman with long black hair, round face, fair skin, wearing blue hoodie, with round glasses"
    """

    def __init__(self):
        """Initialize CharacterAnalyzer"""
        pass

    async def analyze_reference_image(
        self,
        image_path: str,
    ) -> CharacterAnalysisResult:
        """
        Analyze a reference image to extract character appearance

        This is best-effort: a missing file, a failed VLM call or an
        empty/unparseable answer is logged and yields an empty result
        instead of raising.

        Args:
            image_path: Path to the reference image

        Returns:
            CharacterAnalysisResult with extracted descriptions
        """
        logger.info(f"Analyzing character reference image: {image_path}")

        # Check if file exists
        if not os.path.exists(image_path):
            logger.warning(f"Image not found: {image_path}")
            return CharacterAnalysisResult()

        try:
            # Build multimodal message
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _ANALYSIS_PROMPT},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": self._encode_image_as_data_url(image_path)
                            },
                        },
                    ],
                }
            ]

            # Get LLM config (imported here, as in the original code)
            from pixelle_video.config import config_manager

            llm_config = config_manager.config.llm

            # Create OpenAI client for the VLM call; the context manager
            # closes the underlying HTTP client once the call is done.
            async with AsyncOpenAI(
                api_key=llm_config.api_key,
                base_url=llm_config.base_url,
            ) as client:
                response = await client.chat.completions.create(
                    model=llm_config.model,
                    messages=messages,
                    temperature=0.3,
                    max_tokens=800,
                )

            vlm_response = response.choices[0].message.content
            # The message content can be None; treat it as "no answer".
            if not vlm_response:
                logger.warning("VLM returned an empty character analysis response")
                return CharacterAnalysisResult()

            logger.debug(f"VLM character analysis response: {vlm_response[:150]}...")

            # Parse response
            return self._parse_response(vlm_response)

        except Exception as e:
            # Deliberate catch-all: analysis must never break the caller.
            logger.error(f"Character analysis failed: {e}")
            return CharacterAnalysisResult()

    @staticmethod
    def _encode_image_as_data_url(image_path: str) -> str:
        """Read the image and return it as a base64 ``data:`` URL"""
        with open(image_path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode("utf-8")

        ext = os.path.splitext(image_path)[1].lower()
        media_type = _MEDIA_TYPES.get(ext, _DEFAULT_MEDIA_TYPE)
        return f"data:{media_type};base64,{encoded}"

    @staticmethod
    def _extract_json_object(text: str) -> dict:
        """Return the first JSON object embedded in ``text``

        Decoding is attempted from every "{" in turn, so surrounding prose,
        code fences or trailing text after the object do not matter.

        Raises:
            TypeError: if ``text`` is not a string.
            ValueError: if no JSON object can be decoded.
        """
        if not isinstance(text, str):
            raise TypeError(f"expected str, got {type(text).__name__}")

        decoder = json.JSONDecoder()
        last_error: Optional[Exception] = None
        start = text.find("{")
        while start != -1:
            try:
                # A successful decode starting at "{" is always a dict.
                candidate, _ = decoder.raw_decode(text, start)
                return candidate
            except json.JSONDecodeError as e:
                last_error = e
            start = text.find("{", start + 1)

        raise ValueError("no JSON object found in response") from last_error

    def _parse_response(self, response: str) -> CharacterAnalysisResult:
        """Parse VLM response into CharacterAnalysisResult

        Args:
            response: Raw text returned by the VLM

        Returns:
            The parsed result. If no JSON object can be extracted and the
            response is between 21 and 499 characters long, the stripped raw
            text is used as the appearance description; otherwise an empty
            result is returned.
        """
        try:
            data = self._extract_json_object(response)
        except (TypeError, ValueError) as e:
            logger.warning(f"Failed to parse VLM response: {e}")

            # Try to use the raw response as appearance description
            if isinstance(response, str) and 20 < len(response) < 500:
                return CharacterAnalysisResult(
                    appearance_description=response.strip()
                )

            return CharacterAnalysisResult()

        # Coerce every field: the model may emit null, numbers or a bare
        # string where text or a list is expected.
        result = CharacterAnalysisResult(
            appearance_description=_as_text(data.get("appearance_description")),
            clothing_description=_as_text(data.get("clothing_description")),
            distinctive_features=_as_feature_list(data.get("distinctive_features")),
        )

        logger.info(f"Character analysis extracted: {result.appearance_description[:80]}...")
        return result