feat: Add smart paragraph merging mode with AI grouping

- Add "smart" split mode that uses LLM to intelligently merge related paragraphs
- Implement two-step approach: analyze text structure, then group by semantic relevance
- Add paragraph_merging.py with analysis and grouping prompts
- Update UI to support smart mode selection with auto-detect hint
- Add i18n translations for smart mode (en_US, zh_CN)
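
A rough usage sketch of the new mode (illustrative only: `demo`, `llm`, and `text` are placeholder names, and `split_narration_script` is assumed to be importable from the narration module, whose path is not shown in this diff):

```python
# Illustrative sketch, not part of this commit's diff
async def demo(llm, text: str):
    # "smart" first splits on blank lines, then asks the LLM to merge related paragraphs
    return await split_narration_script(
        text,
        split_mode="smart",
        llm_service=llm,        # required for smart mode
        target_segments=None,   # None lets the analysis step recommend a count
    )
```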

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
empty
2026-01-17 00:19:46 +08:00
parent 3a8ec576ee
commit 3d3aba3670
8 changed files with 427 additions and 6 deletions

View File

@@ -124,7 +124,13 @@ class StandardPipeline(LinearVideoPipeline):
        else:  # fixed
            self._report_progress(ctx.progress_callback, "splitting_script", 0.05)
            split_mode = ctx.params.get("split_mode", "paragraph")
-           ctx.narrations = await split_narration_script(text, split_mode=split_mode)
            target_segments = ctx.params.get("target_segments", 8)
            ctx.narrations = await split_narration_script(
                text,
                split_mode=split_mode,
                llm_service=self.llm if split_mode == "smart" else None,
                target_segments=target_segments
            )
            logger.info(f"✅ Split script into {len(ctx.narrations)} segments (mode={split_mode})")
            logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode")

View File

@@ -29,6 +29,13 @@ from pixelle_video.prompts.image_generation import (
)
from pixelle_video.prompts.style_conversion import build_style_conversion_prompt

# Paragraph merging (two-step: analysis + grouping)
from pixelle_video.prompts.paragraph_merging import (
    build_paragraph_analysis_prompt,
    build_paragraph_grouping_prompt,
    build_paragraph_merging_prompt,  # Legacy support
)

__all__ = [
    # Narration builders
@@ -40,6 +47,11 @@ __all__ = [
    "build_image_prompt_prompt",
    "build_style_conversion_prompt",
    # Paragraph merging (two-step)
    "build_paragraph_analysis_prompt",
    "build_paragraph_grouping_prompt",
    "build_paragraph_merging_prompt",  # Legacy
    # Image style presets
    "IMAGE_STYLE_PRESETS",
    "DEFAULT_IMAGE_STYLE",

View File

@@ -0,0 +1,202 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Paragraph merging prompt
For intelligently merging short paragraphs into longer segments suitable for video storyboards.
Uses a two-step approach: first analyze, then group.
"""
import json
from typing import List
# Step 1: Analyze text and recommend segment count
PARAGRAPH_ANALYSIS_PROMPT = """# 任务定义
你是一个专业的视频分镜规划师。请分析以下文本,推荐最佳分镜数量。
# 核心任务
分析文本结构,根据以下原则推荐分镜数量:
## 分析原则
1. **语义边界**:识别场景切换、话题转换、情绪变化点
2. **叙事完整性**:保持对话回合完整(问-答不拆分)
3. **时长控制**:每个分镜语音时长建议 15-45 秒(约 60-180 字)
4. **视觉多样性**:确保分镜之间有足够的画面变化
## 文本信息
- 总段落数:{total_paragraphs}
- 预估总字数:{total_chars}
- 预估总时长:{estimated_duration}
## 输入段落预览
{paragraphs_preview}
# 输出格式
返回 JSON 格式的分析结果:
```json
{{
"recommended_segments": 8,
"reasoning": "文本包含开场设定、分手对话、争吵升级、离别等多个场景切换点...",
"scene_boundaries": [
{{"after_paragraph": 3, "reason": "场景从背景介绍转入对话"}},
{{"after_paragraph": 7, "reason": "对话情绪升级"}},
...
]
}}
```
# 重要提醒
1. recommended_segments 应该在 3-15 之间
2. 每个分镜平均字数建议 80-200 字
3. scene_boundaries 标记主要的场景切换点,用于后续分组参考
4. 只输出 JSON,不要添加其他解释
"""
# Step 2: Group paragraphs based on analysis
PARAGRAPH_GROUPING_PROMPT = """# 任务定义
你是一个专业的文本分段专家。根据分析结果,将段落分组。
# 核心任务
{total_paragraphs} 个段落(编号 0 到 {max_index})分成 **{target_segments}** 个分组。
# 分析建议
{analysis_hint}
# 分组原则
1. **语义关联**:将描述同一场景、同一对话回合的段落放在一起
2. **对话完整**:一轮完整的对话(问与答)应该在同一分组
3. **场景统一**:同一时间、地点发生的事件应该在同一分组
4. **长度均衡**:每个分组的字数尽量均衡(目标 80-200 字/分组)
5. **顺序保持**:分组内段落必须连续
# 输入段落
{paragraphs_preview}
# 输出格式
返回 JSON 格式,包含每个分组的起始和结束索引(包含)。
```json
{{
"groups": [
{{"start": 0, "end": 3}},
{{"start": 4, "end": 7}},
{{"start": 8, "end": 12}}
]
}}
```
# 重要提醒
1. 必须输出正好 {target_segments} 个分组
2. 分组必须覆盖所有段落(从 0 到 {max_index})
3. 每个分组的 start 必须等于上一个 end + 1
4. 只输出 JSON,不要添加其他解释
"""
def build_paragraph_analysis_prompt(
    paragraphs: List[str],
) -> str:
    """
    Build prompt for analyzing text and recommending segment count

    Args:
        paragraphs: List of original paragraphs

    Returns:
        Formatted prompt for analysis
    """
    # Calculate stats
    total_chars = sum(len(p) for p in paragraphs)
    # Estimate: ~250 chars/minute for Chinese speech
    estimated_duration = int(total_chars / 250 * 60)

    # Create preview for each paragraph (first 50 chars)
    previews = []
    for i, para in enumerate(paragraphs):
        preview = para[:50].replace('\n', ' ')
        char_count = len(para)
        if len(para) > 50:
            preview += "..."
        previews.append(f"[{i}] ({char_count}字) {preview}")
    paragraphs_preview = "\n".join(previews)

    return PARAGRAPH_ANALYSIS_PROMPT.format(
        paragraphs_preview=paragraphs_preview,
        total_paragraphs=len(paragraphs),
        total_chars=total_chars,
        estimated_duration=estimated_duration
    )


def build_paragraph_grouping_prompt(
    paragraphs: List[str],
    target_segments: int,
    analysis_result: dict = None,
) -> str:
    """
    Build prompt for grouping paragraphs based on analysis

    Args:
        paragraphs: List of original paragraphs
        target_segments: Target number of segments (from analysis)
        analysis_result: Optional analysis result for context

    Returns:
        Formatted prompt for grouping
    """
    # Create preview with char counts
    previews = []
    for i, para in enumerate(paragraphs):
        preview = para[:50].replace('\n', ' ')
        char_count = len(para)
        if len(para) > 50:
            preview += "..."
        previews.append(f"[{i}] ({char_count}字) {preview}")
    paragraphs_preview = "\n".join(previews)

    # Build analysis hint if available
    analysis_hint = ""
    if analysis_result:
        if "reasoning" in analysis_result:
            analysis_hint += f"分析理由:{analysis_result['reasoning']}\n"
        if "scene_boundaries" in analysis_result:
            boundaries = [str(b.get("after_paragraph", "")) for b in analysis_result["scene_boundaries"]]
            analysis_hint += f"建议场景切换点(段落后):{', '.join(boundaries)}"
    if not analysis_hint:
        analysis_hint = "无额外分析信息"

    return PARAGRAPH_GROUPING_PROMPT.format(
        paragraphs_preview=paragraphs_preview,
        target_segments=target_segments,
        total_paragraphs=len(paragraphs),
        max_index=len(paragraphs) - 1,
        analysis_hint=analysis_hint
    )


# Legacy support - keep original function name for backward compatibility
def build_paragraph_merging_prompt(
    paragraphs: List[str],
    target_segments: int = 8,
) -> str:
    """
    Legacy function for backward compatibility.
    Now delegates to build_paragraph_grouping_prompt.
    """
    return build_paragraph_grouping_prompt(paragraphs, target_segments)
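
A minimal sketch of how the two builders are meant to chain together (assumes this module is importable; `paragraphs` and the hand-written `analysis_result` below stand in for real input and for a parsed LLM reply):

```python
# Sketch only: inputs are made-up placeholders
from pixelle_video.prompts.paragraph_merging import (
    build_paragraph_analysis_prompt,
    build_paragraph_grouping_prompt,
)

paragraphs = ["大家好,今天分享三个学习技巧。", "第一个技巧是专注力训练。", "第二个技巧是主动回忆。"]

# Step 1: this prompt would be sent to the LLM; its parsed JSON reply drives Step 2
analysis_prompt = build_paragraph_analysis_prompt(paragraphs)

# Step 2: feed the (here hand-written) analysis back in when building the grouping prompt
analysis_result = {"recommended_segments": 3, "reasoning": "三个独立要点", "scene_boundaries": []}
grouping_prompt = build_paragraph_grouping_prompt(
    paragraphs,
    target_segments=analysis_result["recommended_segments"],
    analysis_result=analysis_result,
)
print(grouping_prompt)
```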

View File

@@ -208,7 +208,9 @@ async def generate_narrations_from_content(
async def split_narration_script(
    script: str,
-   split_mode: Literal["paragraph", "line", "sentence"] = "paragraph",
    split_mode: Literal["paragraph", "line", "sentence", "smart"] = "paragraph",
    llm_service = None,
    target_segments: int = 8,
) -> List[str]:
    """
    Split user-provided narration script into segments
@@ -219,6 +221,9 @@ async def split_narration_script(
        - "paragraph": Split by double newline (\\n\\n), preserve single newlines within paragraphs
        - "line": Split by single newline (\\n), each line is a segment
        - "sentence": Split by sentence-ending punctuation (。.!?)
        - "smart": First split by paragraph, then use LLM to intelligently merge related paragraphs
        llm_service: LLM service instance (required for "smart" mode)
        target_segments: Target number of segments for "smart" mode (default: 8)

    Returns:
        List of narration segments
@@ -227,7 +232,31 @@ async def split_narration_script(
    narrations = []

-   if split_mode == "paragraph":
    if split_mode == "smart":
        # Smart mode: first split by paragraph, then merge intelligently
        if llm_service is None:
            raise ValueError("llm_service is required for 'smart' split mode")

        # Step 1: Split by paragraph first
        paragraphs = re.split(r'\n\s*\n', script)
        paragraphs = [p.strip() for p in paragraphs if p.strip()]
        logger.info(f" Initial split: {len(paragraphs)} paragraphs")

        # Step 2: Merge intelligently using LLM
        # If target_segments is None, merge_paragraphs_smart will auto-analyze
        if target_segments is not None and len(paragraphs) <= target_segments:
            # No need to merge if already within target
            logger.info(f" Paragraphs count ({len(paragraphs)}) <= target ({target_segments}), no merge needed")
            narrations = paragraphs
        else:
            narrations = await merge_paragraphs_smart(
                llm_service=llm_service,
                paragraphs=paragraphs,
                target_segments=target_segments  # Can be None for auto-analysis
            )
        logger.info(f"✅ Smart split: {len(paragraphs)} paragraphs -> {len(narrations)} segments")

    elif split_mode == "paragraph":
        # Split by double newline (paragraph mode)
        # Preserve single newlines within paragraphs
        paragraphs = re.split(r'\n\s*\n', script)
@@ -266,6 +295,150 @@ async def split_narration_script(
    return narrations
async def merge_paragraphs_smart(
    llm_service,
    paragraphs: List[str],
    target_segments: int = None,  # Now optional - auto-analyze if not provided
    max_retries: int = 3,
) -> List[str]:
    """
    Use LLM to intelligently merge paragraphs based on semantic relevance.

    Two-step approach:
    1. If target_segments is not provided, first analyze text to recommend optimal count
    2. Then group paragraphs based on the target count

    Args:
        llm_service: LLM service instance
        paragraphs: List of original paragraphs
        target_segments: Target number of merged segments (auto-analyzed if None)
        max_retries: Maximum retry attempts for each step

    Returns:
        List of merged paragraphs
    """
    from pixelle_video.prompts import (
        build_paragraph_analysis_prompt,
        build_paragraph_grouping_prompt
    )

    # ========================================
    # Step 1: Analyze and recommend segment count (if not provided)
    # ========================================
    if target_segments is None:
        logger.info(f"Analyzing {len(paragraphs)} paragraphs to recommend segment count...")
        analysis_prompt = build_paragraph_analysis_prompt(paragraphs)

        analysis_result = None
        for attempt in range(1, max_retries + 1):
            try:
                response = await llm_service(
                    prompt=analysis_prompt,
                    temperature=0.3,
                    max_tokens=1500
                )
                logger.debug(f"Analysis response length: {len(response)} chars")

                result = _parse_json(response)
                if "recommended_segments" not in result:
                    raise KeyError("Missing 'recommended_segments' in analysis")

                target_segments = result["recommended_segments"]
                analysis_result = result

                # Validate range
                if target_segments < 3:
                    target_segments = 3
                elif target_segments > 15:
                    target_segments = 15

                reasoning = result.get("reasoning", "N/A")
                logger.info(f"✅ Analysis complete: recommended {target_segments} segments")
                logger.info(f" Reasoning: {reasoning[:100]}...")
                break
            except Exception as e:
                logger.error(f"Analysis attempt {attempt} failed: {e}")
                if attempt >= max_retries:
                    # Fallback: use simple heuristic
                    target_segments = max(3, min(12, len(paragraphs) // 3))
                    logger.warning(f"Using fallback: {target_segments} segments (paragraphs/3)")
                    analysis_result = None
                    break
                logger.info("Retrying analysis...")
    else:
        analysis_result = None
        logger.info(f"Using provided target: {target_segments} segments")

    # ========================================
    # Step 2: Group paragraphs
    # ========================================
    logger.info(f"Grouping {len(paragraphs)} paragraphs into {target_segments} segments...")
    grouping_prompt = build_paragraph_grouping_prompt(
        paragraphs=paragraphs,
        target_segments=target_segments,
        analysis_result=analysis_result
    )

    for attempt in range(1, max_retries + 1):
        try:
            response = await llm_service(
                prompt=grouping_prompt,
                temperature=0.3,
                max_tokens=2000
            )
            logger.debug(f"Grouping response length: {len(response)} chars")

            result = _parse_json(response)
            if "groups" not in result:
                raise KeyError("Invalid response format: missing 'groups'")

            groups = result["groups"]

            # Validate count
            if len(groups) != target_segments:
                logger.warning(
                    f"Grouping attempt {attempt}: expected {target_segments} groups, got {len(groups)}"
                )
                if attempt < max_retries:
                    continue
                logger.warning(f"Accepting {len(groups)} groups after {max_retries} attempts")

            # Validate group boundaries
            for i, group in enumerate(groups):
                if "start" not in group or "end" not in group:
                    raise ValueError(f"Group {i} missing 'start' or 'end'")
                if group["start"] > group["end"]:
                    raise ValueError(f"Group {i} has invalid range: start > end")
                if group["start"] < 0 or group["end"] >= len(paragraphs):
                    raise ValueError(f"Group {i} has out-of-bounds indices")

            # Merge paragraphs based on groups
            merged = []
            for group in groups:
                start, end = group["start"], group["end"]
                merged_text = "\n\n".join(paragraphs[start:end + 1])
                merged.append(merged_text)

            logger.info(f"✅ Successfully merged into {len(merged)} segments")
            return merged
        except Exception as e:
            logger.error(f"Grouping attempt {attempt} failed: {e}")
            if attempt >= max_retries:
                raise
            logger.info("Retrying grouping...")

    # Fallback: should not reach here
    return paragraphs
async def generate_image_prompts(
    llm_service,
    narrations: List[str],
@@ -489,8 +662,8 @@ def _parse_json(text: str) -> dict:
    except json.JSONDecodeError:
        pass

-   # Try to find any JSON object in the text
-   json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts")\s*:\s*\[[^\]]*\][^{}]*\}'
    # Try to find any JSON object with known keys (including analysis keys)
    json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts"|"video_prompts"|"merged_paragraphs"|"groups"|"recommended_segments"|"scene_boundaries")\s*:\s*[^{}]*\}'
    match = re.search(json_pattern, text, re.DOTALL)
    if match:
        try:
@@ -498,6 +671,17 @@ def _parse_json(text: str) -> dict:
        except json.JSONDecodeError:
            pass

    # Try to find any JSON object that looks like it contains an array
    # This is a more aggressive fallback for complex nested arrays
    json_start = text.find('{')
    json_end = text.rfind('}')
    if json_start != -1 and json_end != -1 and json_end > json_start:
        potential_json = text[json_start:json_end + 1]
        try:
            return json.loads(potential_json)
        except json.JSONDecodeError:
            pass

    # If all fails, raise error
    raise json.JSONDecodeError("No valid JSON found", text, 0)
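
A sketch that drives the two-step merge end to end with a stubbed LLM (assumes `merge_paragraphs_smart` is importable from the narration module above, whose path is not shown in this diff; `fake_llm` and its canned JSON replies are invented for illustration):

```python
import asyncio

# Stub that mimics the llm_service call signature used in merge_paragraphs_smart
async def fake_llm(prompt: str, temperature: float = 0.3, max_tokens: int = 2000) -> str:
    if "recommended_segments" in prompt:
        # Canned reply for the Step 1 analysis prompt
        return '{"recommended_segments": 3, "reasoning": "three topics", "scene_boundaries": []}'
    # Canned reply for the Step 2 grouping prompt, wrapped in prose on purpose;
    # the new first-'{' / last-'}' fallback in _parse_json still recovers it
    return ('Here is the grouping: '
            '{"groups": [{"start": 0, "end": 1}, {"start": 2, "end": 3}, {"start": 4, "end": 5}]}')

paragraphs = [f"段落 {i}" for i in range(6)]
merged = asyncio.run(merge_paragraphs_smart(fake_llm, paragraphs, target_segments=None))
print(len(merged), merged[0])  # 3 segments; the first joins paragraphs 0 and 1 with "\n\n"
```

Passing an explicit integer for `target_segments` skips Step 1 entirely; the UI below sends `None` for smart mode so the analysis step picks the count.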

View File

@@ -65,6 +65,7 @@ def render_content_input():
"paragraph": tr("split.mode_paragraph"), "paragraph": tr("split.mode_paragraph"),
"line": tr("split.mode_line"), "line": tr("split.mode_line"),
"sentence": tr("split.mode_sentence"), "sentence": tr("split.mode_sentence"),
"smart": tr("split.mode_smart"),
} }
split_mode = st.selectbox( split_mode = st.selectbox(
tr("split.mode_label"), tr("split.mode_label"),
@@ -73,8 +74,16 @@ def render_content_input():
            index=0,  # Default to paragraph mode
            help=tr("split.mode_help")
        )

        # Show info for smart mode (auto-detect segment count)
        if split_mode == "smart":
            st.info(tr("split.smart_auto_hint"))
            target_segments = None  # Auto-detect
        else:
            target_segments = None  # Not used for other modes
    else:
        split_mode = "paragraph"  # Default for generate mode (not used)
        target_segments = None

    # Title input (optional for both modes)
    title = st.text_input(
@@ -105,7 +114,8 @@ def render_content_input():
"text": text, "text": text,
"title": title, "title": title,
"n_scenes": n_scenes, "n_scenes": n_scenes,
"split_mode": split_mode "split_mode": split_mode,
"target_segments": target_segments
} }
else: else:

View File

@@ -52,6 +52,7 @@ def render_single_output(pixelle_video, video_params):
    title = video_params.get("title")
    n_scenes = video_params.get("n_scenes", 5)
    split_mode = video_params.get("split_mode", "paragraph")
    target_segments = video_params.get("target_segments", 8)

    bgm_path = video_params.get("bgm_path")
    bgm_volume = video_params.get("bgm_volume", 0.2)
@@ -116,6 +117,7 @@ def render_single_output(pixelle_video, video_params):
"title": title if title else None, "title": title if title else None,
"n_scenes": n_scenes, "n_scenes": n_scenes,
"split_mode": split_mode, "split_mode": split_mode,
"target_segments": target_segments,
"media_workflow": workflow_key, "media_workflow": workflow_key,
"frame_template": frame_template, "frame_template": frame_template,
"prompt_prefix": prompt_prefix, "prompt_prefix": prompt_prefix,
@@ -222,6 +224,7 @@ def render_single_output(pixelle_video, video_params):
"title": title if title else None, "title": title if title else None,
"n_scenes": n_scenes, "n_scenes": n_scenes,
"split_mode": split_mode, "split_mode": split_mode,
"target_segments": target_segments,
"media_workflow": workflow_key, "media_workflow": workflow_key,
"frame_template": frame_template, "frame_template": frame_template,
"prompt_prefix": prompt_prefix, "prompt_prefix": prompt_prefix,

View File

@@ -26,6 +26,8 @@
"split.mode_paragraph": "📄 By Paragraph (\\n\\n)", "split.mode_paragraph": "📄 By Paragraph (\\n\\n)",
"split.mode_line": "📝 By Line (\\n)", "split.mode_line": "📝 By Line (\\n)",
"split.mode_sentence": "✂️ By Sentence (。.!?)", "split.mode_sentence": "✂️ By Sentence (。.!?)",
"split.mode_smart": "🧠 Smart Merge (AI Grouping)",
"split.smart_auto_hint": "🤖 AI will analyze text structure, recommend optimal segment count, and intelligently merge related paragraphs (dialogues, same scene)",
"input.content": "Content", "input.content": "Content",
"input.content_placeholder": "Used directly without modification (split by strategy below)\nExample:\nHello everyone, today I'll share three study tips.\n\nThe first tip is focus training, meditate for 10 minutes daily.\n\nThe second tip is active recall, review immediately after learning.", "input.content_placeholder": "Used directly without modification (split by strategy below)\nExample:\nHello everyone, today I'll share three study tips.\n\nThe first tip is focus training, meditate for 10 minutes daily.\n\nThe second tip is active recall, review immediately after learning.",
"input.content_help": "Provide your own content for video generation", "input.content_help": "Provide your own content for video generation",

View File

@@ -26,6 +26,8 @@
"split.mode_paragraph": "📄 按段落(\\n\\n", "split.mode_paragraph": "📄 按段落(\\n\\n",
"split.mode_line": "📝 按行(\\n", "split.mode_line": "📝 按行(\\n",
"split.mode_sentence": "✂️ 按句号(。.!?", "split.mode_sentence": "✂️ 按句号(。.!?",
"split.mode_smart": "🧠 智能合并AI 分组)",
"split.smart_auto_hint": "🤖 AI 将自动分析文本结构,推荐最佳分镜数量,并智能合并相关段落(对话、同一场景)",
"input.content": "内容", "input.content": "内容",
"input.content_placeholder": "直接使用,不做改写(根据下方分割方式切分)\n例如\n大家好今天跟你分享三个学习技巧。\n\n第一个技巧是专注力训练每天冥想10分钟。\n\n第二个技巧是主动回忆学完立即复述。", "input.content_placeholder": "直接使用,不做改写(根据下方分割方式切分)\n例如\n大家好今天跟你分享三个学习技巧。\n\n第一个技巧是专注力训练每天冥想10分钟。\n\n第二个技巧是主动回忆学完立即复述。",
"input.content_help": "提供您自己的内容用于视频生成", "input.content_help": "提供您自己的内容用于视频生成",