From 2978622f7f95b237d4ddc199a848b88969d546c4 Mon Sep 17 00:00:00 2001
From: empty <let5sne.mac@gmail.com>
Date: Tue, 6 Jan 2026 23:04:20 +0800
Subject: [PATCH] feat(P0): Enhance image prompt generation with core imagery
 extraction for better text-image alignment

---
 pixelle_video/prompts/image_generation.py | 49 +++++++++++++----------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/pixelle_video/prompts/image_generation.py b/pixelle_video/prompts/image_generation.py
index 0890666..19b1ff2 100644
--- a/pixelle_video/prompts/image_generation.py
+++ b/pixelle_video/prompts/image_generation.py
@@ -58,31 +58,37 @@ Based on the existing video script, create corresponding **English** image promp
 # Input Content
 {narrations_json}
 
+# ⭐ Core Imagery Extraction (Critical for Relevance)
+For EACH narration, you MUST do the following internally (do not include this analysis in the output):
+1. **Extract 2-3 core visual images or metaphors** that best represent the narration's meaning
+2. **Identify the emotional tone** (hopeful, melancholic, inspiring, etc.)
+3. **Determine concrete visual elements** that embody these abstract concepts
+
+Example thought process:
+- Narration: "给自己一个不设限的探索时间"
+- Core Imagery: exploration, freedom, open paths
+- Emotional Tone: hopeful, adventurous
+- Visual Elements: open road, person looking at horizon, map with unmarked routes
+
 # Output Requirements
 
 ## Image Prompt Specifications
 - Language: **Must use English** (for AI image generation models)
-- Description structure: scene + character action + emotion + symbolic elements
-- Description length: Ensure clear, complete, and creative descriptions (recommended 50-100 English words)
+- **REQUIRED Structure**: [Core imagery] + [Scene description] + [Character action] + [Emotional atmosphere]
+- Description length: 50-100 English words
+- **The image prompt MUST directly reflect the extracted core imagery from the narration**
 
 ## Visual Creative Requirements
 - Each image must accurately reflect the specific content and emotion of the corresponding narration
-- Use symbolic techniques to visualize abstract concepts (e.g., use paths to represent life choices, chains to represent constraints, etc.)
+- **Prioritize core visual metaphors** - the main visual elements must embody the narration's key message
+- Use symbolic techniques to visualize abstract concepts (e.g., paths=choices, chains=constraints, open doors=opportunities)
 - Scenes should express rich emotions and actions to enhance visual impact
-- Highlight themes through composition and element arrangement, avoid overly literal representations
 
-## Key English Vocabulary Reference
-- Symbolic elements: symbolic elements
-- Expression: expression / facial expression
-- Action: action / gesture / movement
-- Scene: scene / setting
-- Atmosphere: atmosphere / mood
-
-## Visual and Copy Coordination Principles
-- Images should serve the copy, becoming a visual extension of the copy content
-- Avoid visual elements unrelated to or contradicting the copy content
-- Choose visual presentation methods that best enhance the persuasiveness of the copy
-- Ensure the audience can quickly understand the core viewpoint of the copy through images
+## Visual and Narration Coordination Principles (Most Important)
+- **Direct semantic connection**: The main visual elements MUST represent the narration's core meaning
+- **Avoid decorative scenes**: Don't add unrelated beautiful scenery that doesn't support the message
+- **Ask yourself**: If someone saw only the image, could they guess what the narration is about?
+- **Test question**: What is the ONE THING this narration is about? Make sure that thing is visible in the image.
 
 ## Creative Guidance
 1. **Phenomenon Description Copy**: Use intuitive scenes to represent social phenomena
@@ -97,8 +103,8 @@ Strictly output in the following JSON format, **image prompts must be in English
 ```json
 {{
   "image_prompts": [
-    "[detailed English image prompt following the style requirements]",
-    "[detailed English image prompt following the style requirements]"
+    "[Core imagery visible] + [Scene with semantic connection to narration] + [Character/action reflecting the message] + [Emotional atmosphere]",
+    "[Next image prompt following the same structure]"
   ]
 }}
 ```
@@ -109,14 +115,15 @@ Strictly output in the following JSON format, **image prompts must be in English
 3. Input is {{"narrations": [narration array]}} format, output is {{"image_prompts": [image prompt array]}} format
 4. **The output image_prompts array must contain exactly {narrations_count} elements, corresponding one-to-one with the input narrations array**
 5. **Image prompts must use English** (for AI image generation models)
-6. Image prompts must accurately reflect the specific content and emotion of the corresponding narration
-7. Each image must be creative and visually impactful, avoid being monotonous
-8. Ensure visual scenes can enhance the persuasiveness of the copy and audience understanding
+6. **⭐ Most Critical: Each image prompt must have DIRECT semantic relevance to its narration**
+7. Before writing each prompt, mentally extract the core visual metaphor from the narration
+8. Verify: Could someone understand the narration's message from the image alone?
 
 Now, please create {narrations_count} corresponding **English** image prompts for the above {narrations_count} narrations. Only output JSON, no other content.
 """
 
 
+
 def build_image_prompt_prompt(
     narrations: List[str],
     min_words: int,