From b3cf9e64e57c76cb4dab158d36e72291cf1e89fc Mon Sep 17 00:00:00 2001 From: empty Date: Wed, 7 Jan 2026 03:08:29 +0800 Subject: [PATCH] feat: Implement Character Memory V1 - VLM analysis and prompt injection --- api/routers/editor.py | 31 ++- api/routers/quality.py | 79 ++++++- .../services/quality/character_analyzer.py | 203 ++++++++++++++++++ 3 files changed, 307 insertions(+), 6 deletions(-) create mode 100644 pixelle_video/services/quality/character_analyzer.py diff --git a/api/routers/editor.py b/api/routers/editor.py index fd37203..00fb884 100644 --- a/api/routers/editor.py +++ b/api/routers/editor.py @@ -473,8 +473,35 @@ async def regenerate_frame_image( else: logger.warning(f"[REGEN-IMG] No style anchor found for {storyboard_id}") - # Apply style prefix to prompt - final_prompt = f"{style_prefix}, {prompt}" if style_prefix else prompt + # Get character descriptions for prompt injection + character_prefix = "" + from api.routers.quality import _character_stores + if storyboard_id in _character_stores: + char_descriptions = [] + for char_data in _character_stores[storyboard_id].values(): + appearance = char_data.get("appearance_description", "") + clothing = char_data.get("clothing_description", "") + name = char_data.get("name", "character") + + if appearance or clothing: + parts = [f"{name}:"] + if appearance: + parts.append(appearance) + if clothing: + parts.append(f"wearing {clothing}") + char_descriptions.append(" ".join(parts)) + + if char_descriptions: + character_prefix = "Characters: " + "; ".join(char_descriptions) + ". 
" + logger.info(f"[REGEN-IMG] Injecting character descriptions: {character_prefix[:80]}...") + + # Apply style prefix and character descriptions to prompt + final_prompt = "" + if style_prefix: + final_prompt += f"{style_prefix}, " + if character_prefix: + final_prompt += character_prefix + final_prompt += prompt logger.info(f"[REGEN-IMG] Final prompt: {final_prompt[:120]}...") # Use MediaService to generate image via RunningHub workflow diff --git a/api/routers/quality.py b/api/routers/quality.py index b8fe2e0..0e84c42 100644 --- a/api/routers/quality.py +++ b/api/routers/quality.py @@ -45,6 +45,20 @@ class CharacterCreateRequest(BaseModel): clothing_description: str = Field("", description="Clothing description") distinctive_features: List[str] = Field(default_factory=list) character_type: str = Field("person") + reference_image_path: Optional[str] = Field(None, description="Reference image path for VLM analysis") + + +class CharacterAnalyzeRequest(BaseModel): + """Request to analyze a character image""" + image_path: str = Field(..., description="Path to the reference image") + + +class CharacterAnalyzeResponse(BaseModel): + """Response from character image analysis""" + appearance_description: str = "" + clothing_description: str = "" + distinctive_features: List[str] = [] + prompt_description: str = "" # Combined description for prompt injection class ContentCheckRequest(BaseModel): @@ -115,20 +129,49 @@ async def create_character( storyboard_id: str = Path(..., description="Storyboard ID"), request: CharacterCreateRequest = Body(...) ): - """Register a new character""" + """ + Register a new character + + If reference_image_path is provided and appearance_description is empty, + VLM will analyze the image to extract appearance descriptions automatically. 
+ """ import uuid if storyboard_id not in _character_stores: _character_stores[storyboard_id] = {} + # Auto-analyze reference image if provided and no description + appearance_desc = request.appearance_description + clothing_desc = request.clothing_description + distinctive = request.distinctive_features + ref_image = request.reference_image_path + + if ref_image and not appearance_desc: + try: + from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer + analyzer = CharacterAnalyzer() + result = await analyzer.analyze_reference_image(ref_image) + + if result.appearance_description: + appearance_desc = result.appearance_description + if result.clothing_description: + clothing_desc = result.clothing_description + if result.distinctive_features: + distinctive = result.distinctive_features + + logger.info(f"Auto-analyzed character from image: {ref_image}") + except Exception as e: + logger.warning(f"Failed to auto-analyze character image: {e}") + char_id = f"char_{uuid.uuid4().hex[:8]}" character = CharacterSchema( id=char_id, name=request.name, - appearance_description=request.appearance_description, - clothing_description=request.clothing_description, - distinctive_features=request.distinctive_features, + appearance_description=appearance_desc, + clothing_description=clothing_desc, + distinctive_features=distinctive, character_type=request.character_type, + reference_image=ref_image, ) _character_stores[storyboard_id][char_id] = character.model_dump() @@ -184,6 +227,34 @@ async def delete_character( return {"deleted": True} +@router.post( + "/characters/{storyboard_id}/analyze-image", + response_model=CharacterAnalyzeResponse +) +async def analyze_character_image( + storyboard_id: str = Path(..., description="Storyboard ID"), + request: CharacterAnalyzeRequest = Body(...) +): + """ + Analyze a character reference image using VLM + + Extracts detailed appearance descriptions that can be used + to maintain character consistency across frames. 
+ """ + from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer + + logger.info(f"Analyzing character image for storyboard {storyboard_id}: {request.image_path}") + + analyzer = CharacterAnalyzer() + result = await analyzer.analyze_reference_image(request.image_path) + + return CharacterAnalyzeResponse( + appearance_description=result.appearance_description, + clothing_description=result.clothing_description, + distinctive_features=result.distinctive_features, + prompt_description=result.to_prompt_description() + ) + # ============================================================ # Content Filter Endpoints # ============================================================ diff --git a/pixelle_video/services/quality/character_analyzer.py b/pixelle_video/services/quality/character_analyzer.py new file mode 100644 index 0000000..9b3c0d1 --- /dev/null +++ b/pixelle_video/services/quality/character_analyzer.py @@ -0,0 +1,203 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CharacterAnalyzer - VLM-based character appearance extraction + +Analyzes reference images to extract detailed character descriptions +for maintaining visual consistency across video frames. 
+""" + +import base64 +import json +import os +import re +from dataclasses import dataclass +from typing import List, Optional + +from loguru import logger +from openai import AsyncOpenAI + + +@dataclass +class CharacterAnalysisResult: + """Result of character image analysis""" + + appearance_description: str = "" # Physical features + clothing_description: str = "" # What they're wearing + distinctive_features: List[str] = None # Unique identifying features + + def __post_init__(self): + if self.distinctive_features is None: + self.distinctive_features = [] + + def to_prompt_description(self) -> str: + """Generate a prompt-ready character description""" + parts = [] + + if self.appearance_description: + parts.append(self.appearance_description) + + if self.clothing_description: + parts.append(f"wearing {self.clothing_description}") + + if self.distinctive_features: + features = ", ".join(self.distinctive_features) + parts.append(f"with {features}") + + return ", ".join(parts) if parts else "" + + def to_dict(self) -> dict: + return { + "appearance_description": self.appearance_description, + "clothing_description": self.clothing_description, + "distinctive_features": self.distinctive_features, + } + + +class CharacterAnalyzer: + """ + VLM-based character appearance analyzer + + Analyzes reference images to extract detailed character descriptions + that can be injected into image generation prompts. 
+ + Example: + >>> analyzer = CharacterAnalyzer() + >>> result = await analyzer.analyze_reference_image("character.png") + >>> print(result.appearance_description) + "young woman with long black hair, round face, fair skin" + >>> print(result.to_prompt_description()) + "young woman with long black hair, round face, fair skin, wearing blue hoodie, with round glasses" + """ + + def __init__(self): + """Initialize CharacterAnalyzer""" + pass + + async def analyze_reference_image( + self, + image_path: str, + ) -> CharacterAnalysisResult: + """ + Analyze a reference image to extract character appearance + + Args: + image_path: Path to the reference image + + Returns: + CharacterAnalysisResult with extracted descriptions + """ + logger.info(f"Analyzing character reference image: {image_path}") + + # Check if file exists + if not os.path.exists(image_path): + logger.warning(f"Image not found: {image_path}") + return CharacterAnalysisResult() + + try: + # Read and encode image + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode("utf-8") + + # Determine image type + ext = os.path.splitext(image_path)[1].lower() + media_type = "image/png" if ext == ".png" else "image/jpeg" + + # VLM prompt for character analysis + analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions. + +Provide your analysis in JSON format: +{ + "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.", + "clothing_description": "What they're wearing - describe colors, style, and notable items.", + "distinctive_features": ["list", "of", "unique", "identifying", "features"] +} + +Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character. 
 

Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"

Output ONLY the JSON object, no additional text.""" + + # Build multimodal message + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": analysis_prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:{media_type};base64,{image_data}" + } + } + ] + } + ] + + # Get LLM config + from pixelle_video.config import config_manager + llm_config = config_manager.config.llm + + # Create OpenAI client for VLM call + client = AsyncOpenAI( + api_key=llm_config.api_key, + base_url=llm_config.base_url + ) + + # Call VLM + response = await client.chat.completions.create( + model=llm_config.model, + messages=messages, + temperature=0.3, + max_tokens=800 + ) + vlm_response = response.choices[0].message.content + logger.debug(f"VLM character analysis response: {vlm_response[:150]}...") + + # Parse response + return self._parse_response(vlm_response) + + except Exception as e: + logger.error(f"Character analysis failed: {e}") + return CharacterAnalysisResult() + + def _parse_response(self, response: str) -> CharacterAnalysisResult: + """Parse VLM response into CharacterAnalysisResult""" + try: + # Try to extract JSON from response + match = re.search(r'\{[\s\S]*\}', response) + if match: + data = json.loads(match.group()) + else: + data = json.loads(response) + + result = CharacterAnalysisResult( + appearance_description=data.get("appearance_description", ""), + clothing_description=data.get("clothing_description", ""), + distinctive_features=data.get("distinctive_features", []), + ) + + logger.info(f"Character analysis extracted: {result.appearance_description[:80]}...") + return result + + except (json.JSONDecodeError, KeyError) as e: + logger.warning(f"Failed to parse VLM response: {e}") + + # Try to use the raw response as appearance description + if len(response) < 500 and len(response) > 20: + return CharacterAnalysisResult( + appearance_description=response.strip() + ) + + return CharacterAnalysisResult()