feat: Optimize VLM prompt for storyboard consistency, focus on constant features
This commit is contained in:
@@ -113,21 +113,36 @@ class CharacterAnalyzer:
|
|||||||
ext = os.path.splitext(image_path)[1].lower()
|
ext = os.path.splitext(image_path)[1].lower()
|
||||||
media_type = "image/png" if ext == ".png" else "image/jpeg"
|
media_type = "image/png" if ext == ".png" else "image/jpeg"
|
||||||
|
|
||||||
# VLM prompt for character analysis
|
# VLM prompt for character analysis - optimized for storyboard consistency
|
||||||
analysis_prompt = """Analyze this character/person image and extract detailed visual descriptions.
|
# Focus on CONSTANT features (identity), exclude VARIABLE features (pose/expression)
|
||||||
|
analysis_prompt = """Analyze this character/person for VIDEO STORYBOARD consistency.
|
||||||
|
|
||||||
Provide your analysis in JSON format:
|
GOAL: Extract features that should remain CONSISTENT across different video frames.
|
||||||
|
The output will be injected into image generation prompts for multiple scenes.
|
||||||
|
|
||||||
|
Extract ONLY these CONSTANT features:
|
||||||
|
1. Identity: gender, approximate age group (child/young/middle-aged/elderly)
|
||||||
|
2. Hair: color, length, style (NOT affected by wind/movement)
|
||||||
|
3. Face: skin tone, face shape (NOT expressions)
|
||||||
|
4. Clothing: type and colors (assume same outfit throughout video)
|
||||||
|
5. Distinctive: glasses, accessories, tattoos, scars, unique marks
|
||||||
|
|
||||||
|
DO NOT include:
|
||||||
|
- Expressions (smile, frown) - changes per scene
|
||||||
|
- Poses/gestures - changes per scene
|
||||||
|
- View angle - determined by scene composition
|
||||||
|
- Lighting/shadows - scene-dependent
|
||||||
|
- Background elements
|
||||||
|
|
||||||
|
Output JSON format (simple strings for direct prompt injection):
|
||||||
{
|
{
|
||||||
"appearance_description": "Detailed physical features including: hair (color, length, style), face shape, eye color, skin tone, approximate age, gender, body type. Be specific and descriptive.",
|
"identity": "elderly man" or "young woman" etc,
|
||||||
"clothing_description": "What they're wearing - describe colors, style, and notable items.",
|
"appearance": "short gray hair, light skin, round face",
|
||||||
"distinctive_features": ["list", "of", "unique", "identifying", "features"]
|
"clothing": "brown sweater vest over white shirt, dark trousers",
|
||||||
|
"distinctive": ["round glasses", "silver watch"]
|
||||||
}
|
}
|
||||||
|
|
||||||
Focus on visually distinctive and reproducible features. Be specific enough that another image generator could recreate a similar-looking character.
|
Output ONLY the JSON, no explanation."""
|
||||||
|
|
||||||
Examples of good distinctive_features: "round glasses", "freckles", "scar on left cheek", "silver earrings", "bright red lipstick"
|
|
||||||
|
|
||||||
Output ONLY the JSON object, no additional text."""
|
|
||||||
|
|
||||||
# Build multimodal message
|
# Build multimodal message
|
||||||
messages = [
|
messages = [
|
||||||
@@ -243,20 +258,28 @@ Output ONLY the JSON object, no additional text."""
|
|||||||
data = json.loads(cleaned)
|
data = json.loads(cleaned)
|
||||||
|
|
||||||
# Handle nested JSON structures - flatten to strings
|
# Handle nested JSON structures - flatten to strings
|
||||||
appearance = data.get("appearance_description", "")
|
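# Prefer the new field names (identity, appearance, clothing, distinctive);
# fall back to the legacy keys (appearance_description, clothing_description, distinctive_features)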
|
||||||
|
identity = data.get("identity", "")
|
||||||
|
appearance = data.get("appearance", "") or data.get("appearance_description", "")
|
||||||
|
|
||||||
if isinstance(appearance, dict):
|
if isinstance(appearance, dict):
|
||||||
# Flatten nested object to descriptive string
|
# Flatten nested object to descriptive string
|
||||||
parts = []
|
parts = []
|
||||||
for key, value in appearance.items():
|
for key, value in appearance.items():
|
||||||
if isinstance(value, dict):
|
if isinstance(value, dict):
|
||||||
# Further nested (e.g., hair: {color, length, style})
|
|
||||||
details = ", ".join(f"{k}: {v}" for k, v in value.items())
|
details = ", ".join(f"{k}: {v}" for k, v in value.items())
|
||||||
parts.append(f"{key} ({details})")
|
parts.append(f"{key} ({details})")
|
||||||
else:
|
else:
|
||||||
parts.append(f"{key}: {value}")
|
parts.append(f"{key}: {value}")
|
||||||
appearance = "; ".join(parts)
|
appearance = "; ".join(parts)
|
||||||
|
|
||||||
clothing = data.get("clothing_description", "")
|
# Combine identity + appearance for full description
|
||||||
|
if identity and appearance:
|
||||||
|
full_appearance = f"{identity}, {appearance}"
|
||||||
|
else:
|
||||||
|
full_appearance = identity or appearance
|
||||||
|
|
||||||
|
clothing = data.get("clothing", "") or data.get("clothing_description", "")
|
||||||
if isinstance(clothing, dict):
|
if isinstance(clothing, dict):
|
||||||
# Flatten nested clothing description
|
# Flatten nested clothing description
|
||||||
parts = []
|
parts = []
|
||||||
@@ -268,12 +291,12 @@ Output ONLY the JSON object, no additional text."""
|
|||||||
parts.append(f"{person}: {items}")
|
parts.append(f"{person}: {items}")
|
||||||
clothing = "; ".join(parts)
|
clothing = "; ".join(parts)
|
||||||
|
|
||||||
distinctive = data.get("distinctive_features", [])
|
distinctive = data.get("distinctive", []) or data.get("distinctive_features", [])
|
||||||
if not isinstance(distinctive, list):
|
if not isinstance(distinctive, list):
|
||||||
distinctive = [str(distinctive)]
|
distinctive = [str(distinctive)]
|
||||||
|
|
||||||
result = CharacterAnalysisResult(
|
result = CharacterAnalysisResult(
|
||||||
appearance_description=appearance,
|
appearance_description=full_appearance,
|
||||||
clothing_description=clothing,
|
clothing_description=clothing,
|
||||||
distinctive_features=distinctive,
|
distinctive_features=distinctive,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user