From 90ceb762964a2d99f6af9cf86fd096e9cbce2ae4 Mon Sep 17 00:00:00 2001
From: empty <let5sne.mac@gmail.com>
Date: Wed, 7 Jan 2026 09:44:27 +0800
Subject: [PATCH] feat: Optimize VLM prompt for storyboard consistency, focus
 on constant features

---
 .../services/quality/character_analyzer.py    | 55 +++++++++++++------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/pixelle_video/services/quality/character_analyzer.py b/pixelle_video/services/quality/character_analyzer.py
index 249c255..40f8e9d 100644
--- a/pixelle_video/services/quality/character_analyzer.py
+++ b/pixelle_video/services/quality/character_analyzer.py
@@ -113,21 +113,36 @@ class CharacterAnalyzer:
             ext = os.path.splitext(image_path)[1].lower()
             media_type = "image/png" if ext == ".png" else "image/jpeg"
             
-            # VLM prompt for character analysis
-            analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions.
+            # VLM prompt for character analysis - optimized for storyboard consistency
+            # Focus on CONSTANT features (identity), exclude VARIABLE features (pose/expression)
+            analysis_prompt = """Analyze this character/person for VIDEO STORYBOARD consistency.
 
-Provide your analysis in JSON format:
+GOAL: Extract features that should remain CONSISTENT across different video frames.
+The output will be injected into image generation prompts for multiple scenes.
+
+Extract ONLY these CONSTANT features:
+1. Identity: gender, approximate age group (child/young/middle-aged/elderly)
+2. Hair: color, length, style (NOT affected by wind/movement)
+3. Face: skin tone, face shape (NOT expressions)
+4. Clothing: type and colors (assume same outfit throughout video)
+5. Distinctive: glasses, accessories, tattoos, scars, unique marks
+
+DO NOT include:
+- Expressions (smile, frown) - changes per scene
+- Poses/gestures - changes per scene
+- View angle - determined by scene composition
+- Lighting/shadows - scene-dependent
+- Background elements
+
+Output JSON format (simple strings for direct prompt injection):
 {
-    "appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.",
-    "clothing_description": "What they're wearing - describe colors, style, and notable items.",
-    "distinctive_features": ["list", "of", "unique", "identifying", "features"]
+    "identity": "elderly man" or "young woman" etc,
+    "appearance": "short gray hair, light skin, round face",
+    "clothing": "brown sweater vest over white shirt, dark trousers",
+    "distinctive": ["round glasses", "silver watch"]
 }
 
-Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character.
-
-Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"
-
-Output ONLY the JSON object, no additional text."""
+Output ONLY the JSON, no explanation."""
 
             # Build multimodal message
             messages = [
@@ -243,20 +258,28 @@ Output ONLY the JSON object, no additional text."""
                 data = json.loads(cleaned)
             
             # Handle nested JSON structures - flatten to strings
-            appearance = data.get("appearance_description", "")
+            # Prefer new keys (identity, appearance, clothing, distinctive); fall back to legacy *_description / distinctive_features keys
+            identity = data.get("identity", "")
+            appearance = data.get("appearance", "") or data.get("appearance_description", "")
+            
             if isinstance(appearance, dict):
                 # Flatten nested object to descriptive string
                 parts = []
                 for key, value in appearance.items():
                     if isinstance(value, dict):
-                        # Further nested (e.g., hair: {color, length, style})
                         details = ", ".join(f"{k}: {v}" for k, v in value.items())
                         parts.append(f"{key} ({details})")
                     else:
                         parts.append(f"{key}: {value}")
                 appearance = "; ".join(parts)
             
-            clothing = data.get("clothing_description", "")
+            # Combine identity + appearance for full description
+            if identity and appearance:
+                full_appearance = f"{identity}, {appearance}"
+            else:
+                full_appearance = identity or appearance
+            
+            clothing = data.get("clothing", "") or data.get("clothing_description", "")
             if isinstance(clothing, dict):
                 # Flatten nested clothing description
                 parts = []
@@ -268,12 +291,12 @@ Output ONLY the JSON object, no additional text."""
                         parts.append(f"{person}: {items}")
                 clothing = "; ".join(parts)
             
-            distinctive = data.get("distinctive_features", [])
+            distinctive = data.get("distinctive", []) or data.get("distinctive_features", [])
             if not isinstance(distinctive, list):
                 distinctive = [str(distinctive)]
             
             result = CharacterAnalysisResult(
-                appearance_description=appearance,
+                appearance_description=full_appearance,
                 clothing_description=clothing,
                 distinctive_features=distinctive,
             )