From 3d3aba3670bfc58fffd36016a5d2a05fa8316384 Mon Sep 17 00:00:00 2001 From: empty Date: Sat, 17 Jan 2026 00:19:46 +0800 Subject: [PATCH] feat: Add smart paragraph merging mode with AI grouping - Add "smart" split mode that uses LLM to intelligently merge related paragraphs - Implement two-step approach: analyze text structure, then group by semantic relevance - Add paragraph_merging.py with analysis and grouping prompts - Update UI to support smart mode selection with auto-detect hint - Add i18n translations for smart mode (en_US, zh_CN) Co-Authored-By: Claude Opus 4.5 --- pixelle_video/pipelines/standard.py | 8 +- pixelle_video/prompts/__init__.py | 12 ++ pixelle_video/prompts/paragraph_merging.py | 202 +++++++++++++++++++++ pixelle_video/utils/content_generators.py | 192 +++++++++++++++++++- web/components/content_input.py | 12 +- web/components/output_preview.py | 3 + web/i18n/locales/en_US.json | 2 + web/i18n/locales/zh_CN.json | 2 + 8 files changed, 427 insertions(+), 6 deletions(-) create mode 100644 pixelle_video/prompts/paragraph_merging.py diff --git a/pixelle_video/pipelines/standard.py b/pixelle_video/pipelines/standard.py index 774b6c8..769d9e0 100644 --- a/pixelle_video/pipelines/standard.py +++ b/pixelle_video/pipelines/standard.py @@ -124,7 +124,13 @@ class StandardPipeline(LinearVideoPipeline): else: # fixed self._report_progress(ctx.progress_callback, "splitting_script", 0.05) split_mode = ctx.params.get("split_mode", "paragraph") - ctx.narrations = await split_narration_script(text, split_mode=split_mode) + target_segments = ctx.params.get("target_segments", 8) + ctx.narrations = await split_narration_script( + text, + split_mode=split_mode, + llm_service=self.llm if split_mode == "smart" else None, + target_segments=target_segments + ) logger.info(f"✅ Split script into {len(ctx.narrations)} segments (mode={split_mode})") logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode") diff --git a/pixelle_video/prompts/__init__.py 
b/pixelle_video/prompts/__init__.py index 4e1998f..201a7aa 100644 --- a/pixelle_video/prompts/__init__.py +++ b/pixelle_video/prompts/__init__.py @@ -29,6 +29,13 @@ from pixelle_video.prompts.image_generation import ( ) from pixelle_video.prompts.style_conversion import build_style_conversion_prompt +# Paragraph merging (two-step: analysis + grouping) +from pixelle_video.prompts.paragraph_merging import ( + build_paragraph_analysis_prompt, + build_paragraph_grouping_prompt, + build_paragraph_merging_prompt, # Legacy support +) + __all__ = [ # Narration builders @@ -40,6 +47,11 @@ __all__ = [ "build_image_prompt_prompt", "build_style_conversion_prompt", + # Paragraph merging (two-step) + "build_paragraph_analysis_prompt", + "build_paragraph_grouping_prompt", + "build_paragraph_merging_prompt", # Legacy + # Image style presets "IMAGE_STYLE_PRESETS", "DEFAULT_IMAGE_STYLE", diff --git a/pixelle_video/prompts/paragraph_merging.py b/pixelle_video/prompts/paragraph_merging.py new file mode 100644 index 0000000..2dfc7cd --- /dev/null +++ b/pixelle_video/prompts/paragraph_merging.py @@ -0,0 +1,202 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Paragraph merging prompt + +For intelligently merging short paragraphs into longer segments suitable for video storyboards. +Uses a two-step approach: first analyze, then group. 
+""" + +import json +from typing import List + + +# Step 1: Analyze text and recommend segment count +PARAGRAPH_ANALYSIS_PROMPT = """# 任务定义 +你是一个专业的视频分镜规划师。请分析以下文本,推荐最佳分镜数量。 + +# 核心任务 +分析文本结构,根据以下原则推荐分镜数量: + +## 分析原则 +1. **语义边界**:识别场景切换、话题转换、情绪变化点 +2. **叙事完整性**:保持对话回合完整(问-答不拆分) +3. **时长控制**:每个分镜语音时长建议 15-45 秒(约 60-180 字) +4. **视觉多样性**:确保分镜之间有足够的画面变化 + +## 文本信息 +- 总段落数:{total_paragraphs} +- 预估总字数:{total_chars} 字 +- 预估总时长:{estimated_duration} 秒 + +## 输入段落预览 +{paragraphs_preview} + +# 输出格式 +返回 JSON 格式的分析结果: + +```json +{{ + "recommended_segments": 8, + "reasoning": "文本包含开场设定、分手对话、争吵升级、离别等多个场景切换点...", + "scene_boundaries": [ + {{"after_paragraph": 3, "reason": "场景从背景介绍转入对话"}}, + {{"after_paragraph": 7, "reason": "对话情绪升级"}}, + ... + ] +}} +``` + +# 重要提醒 +1. recommended_segments 应该在 3-15 之间 +2. 每个分镜平均字数建议 80-200 字 +3. scene_boundaries 标记主要的场景切换点,用于后续分组参考 +4. 只输出 JSON,不要添加其他解释 +""" + + +# Step 2: Group paragraphs based on analysis +PARAGRAPH_GROUPING_PROMPT = """# 任务定义 +你是一个专业的文本分段专家。根据分析结果,将段落分组。 + +# 核心任务 +将 {total_paragraphs} 个段落(编号 0 到 {max_index})分成 **{target_segments}** 个分组。 + +# 分析建议 +{analysis_hint} + +# 分组原则 +1. **语义关联**:将描述同一场景、同一对话回合的段落放在一起 +2. **对话完整**:一轮完整的对话(问与答)应该在同一分组 +3. **场景统一**:同一时间、地点发生的事件应该在同一分组 +4. **长度均衡**:每个分组的字数尽量均衡(目标 80-200 字/分组) +5. **顺序保持**:分组内段落必须连续 + +# 输入段落 +{paragraphs_preview} + +# 输出格式 +返回 JSON 格式,包含每个分组的起始和结束索引(包含)。 + +```json +{{ + "groups": [ + {{"start": 0, "end": 3}}, + {{"start": 4, "end": 7}}, + {{"start": 8, "end": 12}} + ] +}} +``` + +# 重要提醒 +1. 必须输出正好 {target_segments} 个分组 +2. 分组必须覆盖所有段落(从 0 到 {max_index}) +3. 每个分组的 start 必须等于上一个 end + 1 +4. 
def _format_paragraph_previews(paragraphs: List[str], limit: int = 50) -> str:
    """
    Render a compact, numbered preview of every paragraph.

    Each output line has the form ``[index] (N字) preview`` where ``N`` is the
    full character count of the paragraph and ``preview`` is its first
    ``limit`` characters with newlines flattened to spaces. ``...`` is appended
    when the paragraph was truncated.

    Args:
        paragraphs: List of original paragraphs
        limit: Maximum number of characters shown per paragraph

    Returns:
        Newline-joined preview lines (empty string for an empty list)
    """
    lines = []
    for index, para in enumerate(paragraphs):
        preview = para[:limit].replace('\n', ' ')
        if len(para) > limit:
            preview += "..."
        lines.append(f"[{index}] ({len(para)}字) {preview}")
    return "\n".join(lines)


def build_paragraph_analysis_prompt(
    paragraphs: List[str],
) -> str:
    """
    Build prompt for analyzing text and recommending segment count

    Args:
        paragraphs: List of original paragraphs

    Returns:
        Formatted prompt for analysis
    """
    total_chars = sum(len(p) for p in paragraphs)
    # Estimate: ~250 chars/minute for Chinese speech
    estimated_duration = int(total_chars / 250 * 60)

    return PARAGRAPH_ANALYSIS_PROMPT.format(
        paragraphs_preview=_format_paragraph_previews(paragraphs),
        total_paragraphs=len(paragraphs),
        total_chars=total_chars,
        estimated_duration=estimated_duration
    )


def build_paragraph_grouping_prompt(
    paragraphs: List[str],
    target_segments: int,
    analysis_result: dict = None,
) -> str:
    """
    Build prompt for grouping paragraphs based on analysis

    The optional ``analysis_result`` is parsed LLM output, so it is treated as
    untrusted: a missing/empty ``reasoning`` is skipped, and entries of
    ``scene_boundaries`` that are not dicts or carry no ``after_paragraph``
    are ignored instead of crashing or producing empty list items.

    Args:
        paragraphs: List of original paragraphs
        target_segments: Target number of segments (from analysis)
        analysis_result: Optional analysis result for context

    Returns:
        Formatted prompt for grouping
    """
    hint_lines = []
    if isinstance(analysis_result, dict):
        reasoning = analysis_result.get("reasoning")
        if reasoning:
            hint_lines.append(f"分析理由:{reasoning}")

        # `or []` covers an explicit JSON null for scene_boundaries
        boundaries = [
            str(b["after_paragraph"])
            for b in (analysis_result.get("scene_boundaries") or [])
            if isinstance(b, dict) and b.get("after_paragraph") is not None
        ]
        if boundaries:
            hint_lines.append(f"建议场景切换点(段落后):{', '.join(boundaries)}")

    analysis_hint = "\n".join(hint_lines) or "无额外分析信息"

    return PARAGRAPH_GROUPING_PROMPT.format(
        paragraphs_preview=_format_paragraph_previews(paragraphs),
        target_segments=target_segments,
        total_paragraphs=len(paragraphs),
        max_index=len(paragraphs) - 1,
        analysis_hint=analysis_hint
    )


# Legacy support - keep original function name for backward compatibility
def build_paragraph_merging_prompt(
    paragraphs: List[str],
    target_segments: int = 8,
) -> str:
    """
    Legacy function for backward compatibility.
    Now delegates to build_paragraph_grouping_prompt.

    Args:
        paragraphs: List of original paragraphs
        target_segments: Target number of segments (default: 8)

    Returns:
        Formatted prompt for grouping
    """
    return build_paragraph_grouping_prompt(paragraphs, target_segments)
def _validate_paragraph_groups(groups, paragraph_count: int) -> List[tuple]:
    """
    Check that LLM-produced groups partition the paragraphs exactly.

    Every group must be a dict with integer ``start``/``end`` (inclusive),
    the first group must start at 0, each following group must start right
    after the previous one ends, and the last group must end at the final
    paragraph. This guarantees no paragraph is dropped or duplicated.

    Args:
        groups: The ``groups`` value parsed from the LLM response
        paragraph_count: Number of original paragraphs

    Returns:
        List of ``(start, end)`` tuples in order

    Raises:
        ValueError: If the groups are malformed, out of bounds, overlapping,
            or leave paragraphs uncovered
    """
    if not isinstance(groups, list) or not groups:
        raise ValueError("'groups' must be a non-empty list")

    ranges = []
    expected_start = 0
    for i, group in enumerate(groups):
        if not isinstance(group, dict) or "start" not in group or "end" not in group:
            raise ValueError(f"Group {i} missing 'start' or 'end'")
        start, end = group["start"], group["end"]
        for value in (start, end):
            # bool is a subclass of int; reject it explicitly
            if not isinstance(value, int) or isinstance(value, bool):
                raise ValueError(f"Group {i} has non-integer indices")
        if start > end:
            raise ValueError(f"Group {i} has invalid range: start > end")
        if start < 0 or end >= paragraph_count:
            raise ValueError(f"Group {i} has out-of-bounds indices")
        if start != expected_start:
            raise ValueError(
                f"Group {i} starts at {start}, expected {expected_start} "
                "(groups must be contiguous and cover every paragraph)"
            )
        ranges.append((start, end))
        expected_start = end + 1

    if expected_start != paragraph_count:
        raise ValueError(
            f"Groups end at paragraph {expected_start - 1}, "
            f"but the last paragraph is {paragraph_count - 1}"
        )
    return ranges


async def merge_paragraphs_smart(
    llm_service,
    paragraphs: List[str],
    target_segments: int = None,  # Optional - auto-analyzed when None
    max_retries: int = 3,
) -> List[str]:
    """
    Use LLM to intelligently merge paragraphs based on semantic relevance.

    Two-step approach:
    1. If target_segments is not provided, first analyze text to recommend optimal count
    2. Then group paragraphs based on the target count

    The LLM is not called when there is nothing to merge (zero or one
    paragraph, or a target that is not smaller than the paragraph count);
    the paragraphs are returned unchanged in that case.

    Args:
        llm_service: LLM service instance (awaitable callable taking
            ``prompt``, ``temperature`` and ``max_tokens`` keyword arguments)
        paragraphs: List of original paragraphs
        target_segments: Target number of merged segments (auto-analyzed if None)
        max_retries: Maximum retry attempts for each step

    Returns:
        List of merged paragraphs. Merged text joins the original paragraphs
        with a blank line, and every input paragraph appears exactly once.

    Raises:
        Exception: Re-raises the last error if grouping still fails on the
            final retry (invalid JSON, malformed or non-contiguous groups,
            or an LLM service error).
    """
    paragraph_count = len(paragraphs)

    # Nothing to merge: asking the LLM for more groups than paragraphs can
    # never succeed, so short-circuit before any prompt is built.
    if paragraph_count <= 1:
        return list(paragraphs)
    if target_segments is not None:
        target_segments = max(1, target_segments)
        if target_segments >= paragraph_count:
            return list(paragraphs)

    from pixelle_video.prompts import (
        build_paragraph_analysis_prompt,
        build_paragraph_grouping_prompt
    )

    # ========================================
    # Step 1: Analyze and recommend segment count (if not provided)
    # ========================================
    analysis_result = None
    if target_segments is None:
        logger.info(f"Analyzing {paragraph_count} paragraphs to recommend segment count...")

        analysis_prompt = build_paragraph_analysis_prompt(paragraphs)

        for attempt in range(1, max_retries + 1):
            try:
                response = await llm_service(
                    prompt=analysis_prompt,
                    temperature=0.3,
                    max_tokens=1500
                )

                logger.debug(f"Analysis response length: {len(response)} chars")

                result = _parse_json(response)

                if "recommended_segments" not in result:
                    raise KeyError("Missing 'recommended_segments' in analysis")

                # The model may return "8" or 8.0; anything non-numeric
                # raises here and triggers a retry.
                recommended = int(result["recommended_segments"])

                # Validate range
                target_segments = min(max(recommended, 3), 15)
                analysis_result = result

                reasoning = str(result.get("reasoning", "N/A"))
                logger.info(f"✅ Analysis complete: recommended {target_segments} segments")
                logger.info(f"   Reasoning: {reasoning[:100]}...")
                break

            except Exception as e:
                logger.error(f"Analysis attempt {attempt} failed: {e}")
                if attempt < max_retries:
                    logger.info("Retrying analysis...")

        if target_segments is None:
            # Every attempt failed (or max_retries <= 0): use simple heuristic
            target_segments = max(3, min(12, paragraph_count // 3))
            logger.warning(f"Using fallback: {target_segments} segments (paragraphs/3)")
            analysis_result = None

        # Never ask for more groups than there are paragraphs
        target_segments = min(target_segments, paragraph_count)
        if target_segments >= paragraph_count:
            logger.info(
                f"Target ({target_segments}) covers all {paragraph_count} paragraphs, no merge needed"
            )
            return list(paragraphs)
    else:
        logger.info(f"Using provided target: {target_segments} segments")

    # ========================================
    # Step 2: Group paragraphs
    # ========================================
    logger.info(f"Grouping {paragraph_count} paragraphs into {target_segments} segments...")

    grouping_prompt = build_paragraph_grouping_prompt(
        paragraphs=paragraphs,
        target_segments=target_segments,
        analysis_result=analysis_result
    )

    for attempt in range(1, max_retries + 1):
        try:
            response = await llm_service(
                prompt=grouping_prompt,
                temperature=0.3,
                max_tokens=2000
            )

            logger.debug(f"Grouping response length: {len(response)} chars")

            result = _parse_json(response)

            if "groups" not in result:
                raise KeyError("Invalid response format: missing 'groups'")

            # Reject gaps/overlaps so no paragraph is lost or repeated
            ranges = _validate_paragraph_groups(result["groups"], paragraph_count)

            # Validate count
            if len(ranges) != target_segments:
                logger.warning(
                    f"Grouping attempt {attempt}: expected {target_segments} groups, got {len(ranges)}"
                )
                if attempt < max_retries:
                    continue
                # A complete partition with a different count is still usable
                logger.warning(f"Accepting {len(ranges)} groups after {max_retries} attempts")

            # Merge paragraphs based on groups
            merged = ["\n\n".join(paragraphs[start:end + 1]) for start, end in ranges]

            logger.info(f"✅ Successfully merged into {len(merged)} segments")
            return merged

        except Exception as e:
            logger.error(f"Grouping attempt {attempt} failed: {e}")
            if attempt >= max_retries:
                raise
            logger.info("Retrying grouping...")

    # Only reachable when max_retries <= 0: return the input unmerged
    return list(paragraphs)
for generate mode (not used) + target_segments = None # Title input (optional for both modes) title = st.text_input( @@ -105,7 +114,8 @@ def render_content_input(): "text": text, "title": title, "n_scenes": n_scenes, - "split_mode": split_mode + "split_mode": split_mode, + "target_segments": target_segments } else: diff --git a/web/components/output_preview.py b/web/components/output_preview.py index 5221040..9c35308 100644 --- a/web/components/output_preview.py +++ b/web/components/output_preview.py @@ -52,6 +52,7 @@ def render_single_output(pixelle_video, video_params): title = video_params.get("title") n_scenes = video_params.get("n_scenes", 5) split_mode = video_params.get("split_mode", "paragraph") + target_segments = video_params.get("target_segments", 8) bgm_path = video_params.get("bgm_path") bgm_volume = video_params.get("bgm_volume", 0.2) @@ -116,6 +117,7 @@ def render_single_output(pixelle_video, video_params): "title": title if title else None, "n_scenes": n_scenes, "split_mode": split_mode, + "target_segments": target_segments, "media_workflow": workflow_key, "frame_template": frame_template, "prompt_prefix": prompt_prefix, @@ -222,6 +224,7 @@ def render_single_output(pixelle_video, video_params): "title": title if title else None, "n_scenes": n_scenes, "split_mode": split_mode, + "target_segments": target_segments, "media_workflow": workflow_key, "frame_template": frame_template, "prompt_prefix": prompt_prefix, diff --git a/web/i18n/locales/en_US.json b/web/i18n/locales/en_US.json index 582f96b..bea54fd 100644 --- a/web/i18n/locales/en_US.json +++ b/web/i18n/locales/en_US.json @@ -26,6 +26,8 @@ "split.mode_paragraph": "📄 By Paragraph (\\n\\n)", "split.mode_line": "📝 By Line (\\n)", "split.mode_sentence": "✂️ By Sentence (。.!?)", + "split.mode_smart": "🧠 Smart Merge (AI Grouping)", + "split.smart_auto_hint": "🤖 AI will analyze text structure, recommend optimal segment count, and intelligently merge related paragraphs (dialogues, same scene)", 
"input.content": "Content", "input.content_placeholder": "Used directly without modification (split by strategy below)\nExample:\nHello everyone, today I'll share three study tips.\n\nThe first tip is focus training, meditate for 10 minutes daily.\n\nThe second tip is active recall, review immediately after learning.", "input.content_help": "Provide your own content for video generation", diff --git a/web/i18n/locales/zh_CN.json b/web/i18n/locales/zh_CN.json index 25ab389..427d49b 100644 --- a/web/i18n/locales/zh_CN.json +++ b/web/i18n/locales/zh_CN.json @@ -26,6 +26,8 @@ "split.mode_paragraph": "📄 按段落(\\n\\n)", "split.mode_line": "📝 按行(\\n)", "split.mode_sentence": "✂️ 按句号(。.!?)", + "split.mode_smart": "🧠 智能合并(AI 分组)", + "split.smart_auto_hint": "🤖 AI 将自动分析文本结构,推荐最佳分镜数量,并智能合并相关段落(对话、同一场景)", "input.content": "内容", "input.content_placeholder": "直接使用,不做改写(根据下方分割方式切分)\n例如:\n大家好,今天跟你分享三个学习技巧。\n\n第一个技巧是专注力训练,每天冥想10分钟。\n\n第二个技巧是主动回忆,学完立即复述。", "input.content_help": "提供您自己的内容用于视频生成",