From 90ceb762964a2d99f6af9cf86fd096e9cbce2ae4 Mon Sep 17 00:00:00 2001 From: empty Date: Wed, 7 Jan 2026 09:44:27 +0800 Subject: [PATCH] feat: Optimize VLM prompt for storyboard consistency, focus on constant features --- .../services/quality/character_analyzer.py | 55 +++++++++++++------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/pixelle_video/services/quality/character_analyzer.py b/pixelle_video/services/quality/character_analyzer.py index 249c255..40f8e9d 100644 --- a/pixelle_video/services/quality/character_analyzer.py +++ b/pixelle_video/services/quality/character_analyzer.py @@ -113,21 +113,36 @@ class CharacterAnalyzer: ext = os.path.splitext(image_path)[1].lower() media_type = "image/png" if ext == ".png" else "image/jpeg" - # VLM prompt for character analysis - analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions. + # VLM prompt for character analysis - optimized for storyboard consistency + # Focus on CONSTANT features (identity), exclude VARIABLE features (pose/expression) + analysis_prompt = """Analyze this character/person for VIDEO STORYBOARD consistency. -Provide your analysis in JSON format: +GOAL: Extract features that should remain CONSISTENT across different video frames. +The output will be injected into image generation prompts for multiple scenes. + +Extract ONLY these CONSTANT features: +1. Identity: gender, approximate age group (child/young/middle-aged/elderly) +2. Hair: color, length, style (NOT affected by wind/movement) +3. Face: skin tone, face shape (NOT expressions) +4. Clothing: type and colors (assume same outfit throughout video) +5. Distinctive: glasses, accessories, tattoos, scars, unique marks + +DO NOT include: +- Expressions (smile, frown) - changes per scene +- Poses/gestures - changes per scene +- View angle - determined by scene composition +- Lighting/shadows - scene-dependent +- Background elements + +Output JSON format (simple strings for direct prompt injection): { - "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.", - "clothing_description": "What they're wearing - describe colors, style, and notable items.", - "distinctive_features": ["list", "of", "unique", "identifying", "features"] + "identity": "elderly man" or "young woman" etc, + "appearance": "short gray hair, light skin, round face", + "clothing": "brown sweater vest over white shirt, dark trousers", + "distinctive": ["round glasses", "silver watch"] } -Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character. - -Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick" - -Output ONLY the JSON object, no additional text.""" +Output ONLY the JSON, no explanation.""" # Build multimodal message messages = [ @@ -243,20 +258,28 @@ Output ONLY the JSON object, no additional text.""" data = json.loads(cleaned) # Handle nested JSON structures - flatten to strings - appearance = data.get("appearance_description", "") + # New field names: identity, appearance, clothing, distinctive + identity = data.get("identity", "") + appearance = data.get("appearance", "") or data.get("appearance_description", "") + if isinstance(appearance, dict): # Flatten nested object to descriptive string parts = [] for key, value in appearance.items(): if isinstance(value, dict): - # Further nested (e.g., hair: {color, length, style}) details = ", ".join(f"{k}: {v}" for k, v in value.items()) parts.append(f"{key} ({details})") else: parts.append(f"{key}: {value}") appearance = "; ".join(parts) - clothing = data.get("clothing_description", "") + # Combine identity + appearance for full description + if identity and appearance: + full_appearance = f"{identity}, {appearance}" + else: + full_appearance = identity or appearance + + clothing = data.get("clothing", "") or data.get("clothing_description", "") if isinstance(clothing, dict): # Flatten nested clothing description parts = [] @@ -268,12 +291,12 @@ Output ONLY the JSON object, no additional text.""" parts.append(f"{person}: {items}") clothing = "; ".join(parts) - distinctive = data.get("distinctive_features", []) + distinctive = data.get("distinctive", []) or data.get("distinctive_features", []) if not isinstance(distinctive, list): distinctive = [str(distinctive)] result = CharacterAnalysisResult( - appearance_description=appearance, + appearance_description=full_appearance, clothing_description=clothing, distinctive_features=distinctive, )