# AI-Video/pixelle_video/prompts/image_generation.py

# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Image prompt generation template

For generating image prompts from narrations.
"""

import json
from typing import List, Optional


# ==================== PRESET IMAGE STYLES ====================
# Built-in visual styles, keyed by preset id. Every preset carries:
#   name        - display name of the style
#   description - English style description text
#   use_case    - short note on when the style fits

IMAGE_STYLE_PRESETS = dict(
    stick_figure=dict(
        name="火柴人简笔画",
        description="stick figure style sketch, black and white lines, pure white background, minimalist hand-drawn feel",
        use_case="通用场景，简单直观",
    ),
    minimal=dict(
        name="极简抽象",
        description="minimalist abstract art, geometric shapes, clean composition, modern design, soft pastel colors",
        use_case="现代感、艺术感",
    ),
    concept=dict(
        name="概念化视觉",
        description="conceptual visual metaphors, symbolic elements, thought-provoking imagery, artistic interpretation",
        use_case="深度内容、哲学思考",
    ),
)

# Preset id used by default; it is one of the keys of IMAGE_STYLE_PRESETS.
DEFAULT_IMAGE_STYLE = "stick_figure"


# Prompt template asking an LLM to write one English image prompt per
# narration. It is filled with ``str.format`` and expects these fields:
#   {narrations_json}  - JSON text of the form {"narrations": [...]}
#   {narrations_count} - number of narrations (repeated to stress the
#                        one-to-one mapping between narrations and prompts)
#   {min_words}        - lower bound of the suggested prompt length, in words
#   {max_words}        - upper bound of the suggested prompt length, in words
# Literal braces in the JSON examples are doubled ({{ and }}) so that
# ``str.format`` emits them verbatim instead of treating them as fields.
IMAGE_PROMPT_GENERATION_PROMPT = """# 角色定位
你是一个专业的视觉创意设计师，擅长为视频脚本创作富有表现力和象征性的图像提示词，将抽象概念转化为具象的视觉画面。

# 核心任务
基于已有的视频脚本，为每个分镜的"旁白内容"创作对应的**英文**图像提示词，确保视觉画面与叙述内容完美配合，增强观众的理解和记忆。

**重要：输入包含 {narrations_count} 个旁白，你必须为每个旁白都生成一个对应的图像提示词，总共输出 {narrations_count} 个图像提示词。**

# 输入内容
{narrations_json}

# 输出要求

## 图像提示词规范
- 语言：**必须使用英文**（用于 AI 图像生成模型）
- 描述结构：scene + character action + emotion + symbolic elements
- 描述长度：确保描述清晰完整且富有创意（建议 {min_words}-{max_words} 个英文单词）

## 视觉创意要求
- 每个图像都要准确反映对应旁白的具体内容和情感
- 使用象征手法将抽象概念视觉化（如用路径代表人生选择，用锁链代表束缚等）
- 画面要表现出丰富的情感和动作，增强视觉冲击力
- 通过构图和元素安排突出主题，避免过于直白的表现方式

## 关键英文词汇参考
- 象征元素：symbolic elements
- 表情：expression / facial expression
- 动作：action / gesture / movement
- 场景：scene / setting
- 氛围：atmosphere / mood

## 视觉与文案配合原则
- 图像要服务于文案，成为文案内容的视觉延伸
- 避免与文案内容无关或矛盾的视觉元素
- 选择最能增强文案说服力的视觉表现方式
- 确保观众能通过图像快速理解文案的核心观点

## 创意指导
1. **现象描述类文案**：用直观的场景表现社会现象
2. **原因分析类文案**：用因果关系的视觉比喻表现内在逻辑
3. **影响论证类文案**：用后果场景或对比手法表现影响程度
4. **深入探讨类文案**：用抽象概念的具象化表现深刻思考
5. **结论启发类文案**：用开放式场景或指引性元素表现启发性

# 输出格式
严格按照以下JSON格式输出，**图像提示词必须是英文**：

```json
{{
  "image_prompts": [
    "[detailed English image prompt following the style requirements]",
    "[detailed English image prompt following the style requirements]"
  ]
}}
```

# 重要提醒
1. 只输出JSON格式内容，不要添加任何解释说明
2. 确保JSON格式严格正确，可以被程序直接解析
3. 输入是 {{"narrations": [旁白数组]}} 格式，输出是 {{"image_prompts": [图像提示词数组]}} 格式
4. **输出的image_prompts数组必须恰好包含 {narrations_count} 个元素，与输入的narrations数组一一对应**
5. **图像提示词必须使用英文**（for AI image generation models）
6. 图像提示词必须准确反映对应旁白的具体内容和情感
7. 每个图像都要有创意性和视觉冲击力，避免千篇一律
8. 确保视觉画面能增强文案的说服力和观众的理解度

现在，请为上述 {narrations_count} 个旁白创作对应的 {narrations_count} 个**英文**图像提示词。只输出JSON，不要其他内容。
"""


def build_image_prompt_prompt(
    narrations: List[str],
    min_words: int,
    max_words: int
) -> str:
    """
    Render the image prompt generation template for a batch of narrations.

    The narrations are wrapped as ``{"narrations": [...]}`` and serialized
    to 2-space-indented JSON with non-ASCII characters left unescaped. That
    JSON, the number of narrations and the two word bounds are then passed
    as keyword fields to ``IMAGE_PROMPT_GENERATION_PROMPT.format``.

    Note: Style/prefix will be applied later via prompt_prefix in config.

    Args:
        narrations: Narration texts, one per storyboard frame
        min_words: Minimum word count, passed to the template as ``min_words``
        max_words: Maximum word count, passed to the template as ``max_words``

    Returns:
        The formatted prompt text to send to the LLM

    Example:
        >>> build_image_prompt_prompt(narrations, 50, 100)
    """
    payload = {"narrations": narrations}

    # Collect every template field first, then expand them in one call.
    template_fields = {
        "narrations_json": json.dumps(payload, ensure_ascii=False, indent=2),
        "narrations_count": len(narrations),
        "min_words": min_words,
        "max_words": max_words,
    }
    return IMAGE_PROMPT_GENERATION_PROMPT.format(**template_fields)