161 lines
6.7 KiB
Python
161 lines
6.7 KiB
Python
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
"""
|
|
Image prompt generation template

Defines preset image styles and the LLM prompt template used for
generating English image prompts from narrations.
"""
|
|
|
|
import json
|
|
from typing import List, Optional
|
|
|
|
|
|
# ==================== PRESET IMAGE STYLES ====================
# Predefined visual styles for different use cases.
#
# Each preset is keyed by a short style identifier and holds three strings:
#   name        - human-readable label of the style
#   description - English phrase describing the look of the style
#   use_case    - short note on the kind of content the style suits

IMAGE_STYLE_PRESETS = {
    "stick_figure": {
        "name": "Stick Figure Sketch",
        "description": "stick figure style sketch, black and white lines, pure white background, minimalist hand-drawn feel",
        "use_case": "General scenes, simple and intuitive",
    },

    "minimal": {
        "name": "Minimalist Abstract",
        "description": "minimalist abstract art, geometric shapes, clean composition, modern design, soft pastel colors",
        "use_case": "Modern, artistic feel",
    },

    "concept": {
        "name": "Conceptual Visual",
        "description": "conceptual visual metaphors, symbolic elements, thought-provoking imagery, artistic interpretation",
        "use_case": "Deep content, philosophical thinking",
    },
}

# Default preset; this value is one of the keys of IMAGE_STYLE_PRESETS.
DEFAULT_IMAGE_STYLE = "stick_figure"
|
|
|
|
|
|
# Prompt template for turning narrations into image prompts.
#
# The template is meant to be filled with str.format(). Placeholders:
#   {narrations_json}  - the input narrations, already serialized as JSON text
#   {narrations_count} - number of narrations (repeated so the model returns
#                        exactly one image prompt per narration)
#   {min_words}        - lower bound of the image prompt length, in words
#   {max_words}        - upper bound of the image prompt length, in words
#
# Literal braces that must survive formatting (the JSON examples) are escaped
# as "{{" and "}}".
#
# NOTE: the description length used to be hard-coded as "50-100", so the word
# limits supplied to .format() were silently ignored. Every caller of
# .format() on this template must now supply min_words and max_words.
IMAGE_PROMPT_GENERATION_PROMPT = """# Role Definition
You are a professional visual creative designer, skilled at creating expressive and symbolic image prompts for video scripts, transforming abstract concepts into concrete visual scenes.

# Core Task
Based on the existing video script, create corresponding **English** image prompts for each storyboard's "narration content", ensuring visual scenes perfectly match the narrative content and enhance audience understanding and memory.

**Important: The input contains {narrations_count} narrations. You must generate one corresponding image prompt for each narration, totaling {narrations_count} image prompts.**

# Input Content
{narrations_json}

# ⭐ Core Imagery Extraction (Critical for Relevance)
For EACH narration, you MUST:
1. **Extract 2-3 core visual imagery/metaphors** that best represent the narration's meaning
2. **Identify the emotional tone** (hopeful, melancholic, inspiring, etc.)
3. **Determine concrete visual elements** that embody these abstract concepts

Example thought process:
- Narration: "给自己一个不设限的探索时间"
- Core Imagery: exploration, freedom, open paths
- Emotional Tone: hopeful, adventurous
- Visual Elements: open road, person looking at horizon, map with unmarked routes

# Output Requirements

## Image Prompt Specifications
- Language: **Must use English** (for AI image generation models)
- **REQUIRED Structure**: [Core imagery] + [Scene description] + [Character action] + [Emotional atmosphere]
- Description length: {min_words}-{max_words} English words
- **The image prompt MUST directly reflect the extracted core imagery from the narration**

## Visual Creative Requirements
- Each image must accurately reflect the specific content and emotion of the corresponding narration
- **Prioritize core visual metaphors** - the main visual elements must embody the narration's key message
- Use symbolic techniques to visualize abstract concepts (e.g., paths=choices, chains=constraints, open doors=opportunities)
- Scenes should express rich emotions and actions to enhance visual impact

## Visual and Narration Coordination Principles (Most Important)
- **Direct semantic connection**: The main visual elements MUST represent the narration's core meaning
- **Avoid decorative scenes**: Don't add unrelated beautiful scenery that doesn't support the message
- **Ask yourself**: If someone saw only the image, could they guess what the narration is about?
- **Test question**: What is the ONE THING this narration is about? Make sure that thing is visible in the image.

## Creative Guidance
1. **Phenomenon Description Copy**: Use intuitive scenes to represent social phenomena
2. **Cause Analysis Copy**: Use visual metaphors of cause-and-effect relationships to represent internal logic
3. **Impact Argumentation Copy**: Use consequence scenes or contrast techniques to represent the degree of impact
4. **In-depth Discussion Copy**: Use concretization of abstract concepts to represent deep thinking
5. **Conclusion Inspiration Copy**: Use open-ended scenes or guiding elements to represent inspiration

# Output Format
Strictly output in the following JSON format, **image prompts must be in English**:

```json
{{
  "image_prompts": [
    "[Core imagery visible] + [Scene with semantic connection to narration] + [Character/action reflecting the message] + [Emotional atmosphere]",
    "[Next image prompt following the same structure]"
  ]
}}
```

# Important Reminders
1. Only output JSON format content, do not add any explanations
2. Ensure JSON format is strictly correct and can be directly parsed by the program
3. Input is {{"narrations": [narration array]}} format, output is {{"image_prompts": [image prompt array]}} format
4. **The output image_prompts array must contain exactly {narrations_count} elements, corresponding one-to-one with the input narrations array**
5. **Image prompts must use English** (for AI image generation models)
6. **⭐ Most Critical: Each image prompt must have DIRECT semantic relevance to its narration**
7. Before writing each prompt, mentally extract the core visual metaphor from the narration
8. Verify: Could someone understand the narration's message from the image alone?

Now, please create {narrations_count} corresponding **English** image prompts for the above {narrations_count} narrations. Only output JSON, no other content.
"""
|
|
|
|
|
|
|
|
def build_image_prompt_prompt(
    narrations: List[str],
    min_words: int,
    max_words: int
) -> str:
    """
    Build image prompt generation prompt

    Serializes the narrations as a ``{"narrations": [...]}`` JSON object and
    fills IMAGE_PROMPT_GENERATION_PROMPT with that JSON text, the number of
    narrations and the word limits.

    Note: Style/prefix will be applied later via prompt_prefix in config.

    Args:
        narrations: List of narrations
        min_words: Minimum word count, passed to the template as ``min_words``
        max_words: Maximum word count, passed to the template as ``max_words``

    Returns:
        Formatted prompt for LLM

    Note:
        ``str.format`` ignores keyword arguments that have no matching
        placeholder, so the word limits only show up in the result if the
        template contains ``{min_words}`` / ``{max_words}``.

    Example:
        >>> prompt = build_image_prompt_prompt(["First narration"], 50, 100)
        >>> "First narration" in prompt
        True
    """
    # ensure_ascii=False keeps non-ASCII narration text readable in the prompt
    # instead of turning it into \uXXXX escapes.
    narrations_json = json.dumps(
        {"narrations": narrations},
        ensure_ascii=False,
        indent=2
    )

    return IMAGE_PROMPT_GENERATION_PROMPT.format(
        narrations_json=narrations_json,
        narrations_count=len(narrations),
        min_words=min_words,
        max_words=max_words
    )
|
|
|