# AI-Video/pixelle_video/prompts/image_generation.py

# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Image prompt generation template

For generating image prompts from narrations.
"""

import json
from typing import List, Optional


# ==================== PRESET IMAGE STYLES ====================
# Predefined visual styles for different use cases

# Each preset maps a short identifier to three text fields:
#   name        - human-readable title
#   description - style wording for the image prompt
#   use_case    - short hint about when the style fits
IMAGE_STYLE_PRESETS = {
    "stick_figure": {
        "name": "Stick Figure Sketch",
        "description": (
            "stick figure style sketch, black and white lines, "
            "pure white background, minimalist hand-drawn feel"
        ),
        "use_case": "General scenes, simple and intuitive",
    },
    "minimal": {
        "name": "Minimalist Abstract",
        "description": (
            "minimalist abstract art, geometric shapes, clean composition, "
            "modern design, soft pastel colors"
        ),
        "use_case": "Modern, artistic feel",
    },
    "concept": {
        "name": "Conceptual Visual",
        "description": (
            "conceptual visual metaphors, symbolic elements, "
            "thought-provoking imagery, artistic interpretation"
        ),
        "use_case": "Deep content, philosophical thinking",
    },
}

# Identifier of the default preset; it is a key of IMAGE_STYLE_PRESETS.
DEFAULT_IMAGE_STYLE = "stick_figure"


# LLM instruction template, filled in with ``str.format``.
# Replacement fields: {narrations_json}, {narrations_count}, {min_words},
# {max_words}. Literal JSON braces in the text are escaped by doubling them.
# The description-length line uses {min_words}/{max_words} so the word limits
# supplied by the caller actually reach the model instead of a fixed range.
IMAGE_PROMPT_GENERATION_PROMPT = """# Role Definition
You are a professional visual creative designer, skilled at creating expressive and symbolic image prompts for video scripts, transforming abstract concepts into concrete visual scenes.

# Core Task
Based on the existing video script, create corresponding **English** image prompts for each storyboard's "narration content", ensuring visual scenes perfectly match the narrative content and enhance audience understanding and memory.

**Important: The input contains {narrations_count} narrations. You must generate one corresponding image prompt for each narration, totaling {narrations_count} image prompts.**

# Input Content
{narrations_json}

# ⭐ Core Imagery Extraction (Critical for Relevance)
For EACH narration, you MUST:
1. **Extract 2-3 core visual imagery/metaphors** that best represent the narration's meaning
2. **Identify the emotional tone** (hopeful, melancholic, inspiring, etc.)
3. **Determine concrete visual elements** that embody these abstract concepts

Example thought process:
- Narration: "给自己一个不设限的探索时间"
- Core Imagery: exploration, freedom, open paths
- Emotional Tone: hopeful, adventurous
- Visual Elements: open road, person looking at horizon, map with unmarked routes

# Output Requirements

## Image Prompt Specifications
- Language: **Must use English** (for AI image generation models)
- **REQUIRED Structure**: [Core imagery] + [Scene description] + [Character action] + [Emotional atmosphere]
- Description length: {min_words}-{max_words} English words
- **The image prompt MUST directly reflect the extracted core imagery from the narration**

## Visual Creative Requirements
- Each image must accurately reflect the specific content and emotion of the corresponding narration
- **Prioritize core visual metaphors** - the main visual elements must embody the narration's key message
- Use symbolic techniques to visualize abstract concepts (e.g., paths=choices, chains=constraints, open doors=opportunities)
- Scenes should express rich emotions and actions to enhance visual impact

## Visual and Narration Coordination Principles (Most Important)
- **Direct semantic connection**: The main visual elements MUST represent the narration's core meaning
- **Avoid decorative scenes**: Don't add unrelated beautiful scenery that doesn't support the message
- **Ask yourself**: If someone saw only the image, could they guess what the narration is about?
- **Test question**: What is the ONE THING this narration is about? Make sure that thing is visible in the image.

## Creative Guidance
1. **Phenomenon Description Copy**: Use intuitive scenes to represent social phenomena
2. **Cause Analysis Copy**: Use visual metaphors of cause-and-effect relationships to represent internal logic
3. **Impact Argumentation Copy**: Use consequence scenes or contrast techniques to represent the degree of impact
4. **In-depth Discussion Copy**: Use concretization of abstract concepts to represent deep thinking
5. **Conclusion Inspiration Copy**: Use open-ended scenes or guiding elements to represent inspiration

# Output Format
Strictly output in the following JSON format, **image prompts must be in English**:

```json
{{
  "image_prompts": [
    "[Core imagery visible] + [Scene with semantic connection to narration] + [Character/action reflecting the message] + [Emotional atmosphere]",
    "[Next image prompt following the same structure]"
  ]
}}
```

# Important Reminders
1. Only output JSON format content, do not add any explanations
2. Ensure JSON format is strictly correct and can be directly parsed by the program
3. Input is {{"narrations": [narration array]}} format, output is {{"image_prompts": [image prompt array]}} format
4. **The output image_prompts array must contain exactly {narrations_count} elements, corresponding one-to-one with the input narrations array**
5. **Image prompts must use English** (for AI image generation models)
6. **⭐ Most Critical: Each image prompt must have DIRECT semantic relevance to its narration**
7. Before writing each prompt, mentally extract the core visual metaphor from the narration
8. Verify: Could someone understand the narration's message from the image alone?

Now, please create {narrations_count} corresponding **English** image prompts for the above {narrations_count} narrations. Only output JSON, no other content.
"""


def build_image_prompt_prompt(
    narrations: List[str],
    min_words: int,
    max_words: int,
) -> str:
    """
    Render the image-prompt generation instructions for a list of narrations.

    The narrations are wrapped as ``{"narrations": [...]}``, serialized to
    pretty-printed JSON (non-ASCII characters kept as-is), and substituted
    into ``IMAGE_PROMPT_GENERATION_PROMPT`` together with the narration count
    and the word limits.

    Note: Style/prefix is not handled here; per the original note it is
    applied later via prompt_prefix in config.

    Args:
        narrations: List of narrations, one per storyboard frame
        min_words: Minimum word count, passed to the template as ``min_words``
        max_words: Maximum word count, passed to the template as ``max_words``

    Returns:
        Formatted prompt for LLM

    Example:
        ``build_image_prompt_prompt(["first line", "second line"], 50, 100)``
    """
    payload = {"narrations": narrations}

    # Every named field is supplied in one mapping; str.format ignores any
    # keyword the template does not reference.
    template_fields = {
        "narrations_json": json.dumps(payload, ensure_ascii=False, indent=2),
        "narrations_count": len(narrations),
        "min_words": min_words,
        "max_words": max_words,
    }
    return IMAGE_PROMPT_GENERATION_PROMPT.format(**template_fields)