From 3d3aba3670bfc58fffd36016a5d2a05fa8316384 Mon Sep 17 00:00:00 2001
From: empty <let5sne.mac@gmail.com>
Date: Sat, 17 Jan 2026 00:19:46 +0800
Subject: [PATCH] feat: Add smart paragraph merging mode with AI grouping

- Add "smart" split mode that uses LLM to intelligently merge related paragraphs
- Implement two-step approach: analyze text structure, then group by semantic relevance
- Add paragraph_merging.py with analysis and grouping prompts
- Update UI to support smart mode selection with auto-detect hint
- Add i18n translations for smart mode (en_US, zh_CN)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pixelle_video/pipelines/standard.py        |   8 +-
 pixelle_video/prompts/__init__.py          |  12 ++
 pixelle_video/prompts/paragraph_merging.py | 202 +++++++++++++++++++++
 pixelle_video/utils/content_generators.py  | 192 +++++++++++++++++++-
 web/components/content_input.py            |  12 +-
 web/components/output_preview.py           |   3 +
 web/i18n/locales/en_US.json                |   2 +
 web/i18n/locales/zh_CN.json                |   2 +
 8 files changed, 427 insertions(+), 6 deletions(-)
 create mode 100644 pixelle_video/prompts/paragraph_merging.py

diff --git a/pixelle_video/pipelines/standard.py b/pixelle_video/pipelines/standard.py
index 774b6c8..769d9e0 100644
--- a/pixelle_video/pipelines/standard.py
+++ b/pixelle_video/pipelines/standard.py
@@ -124,7 +124,13 @@ class StandardPipeline(LinearVideoPipeline):
         else:  # fixed
             self._report_progress(ctx.progress_callback, "splitting_script", 0.05)
             split_mode = ctx.params.get("split_mode", "paragraph")
-            ctx.narrations = await split_narration_script(text, split_mode=split_mode)
+            target_segments = ctx.params.get("target_segments", 8)
+            ctx.narrations = await split_narration_script(
+                text, 
+                split_mode=split_mode,
+                llm_service=self.llm if split_mode == "smart" else None,
+                target_segments=target_segments
+            )
             logger.info(f"✅ Split script into {len(ctx.narrations)} segments (mode={split_mode})")
             logger.info(f"   Note: n_scenes={n_scenes} is ignored in fixed mode")
 
diff --git a/pixelle_video/prompts/__init__.py b/pixelle_video/prompts/__init__.py
index 4e1998f..201a7aa 100644
--- a/pixelle_video/prompts/__init__.py
+++ b/pixelle_video/prompts/__init__.py
@@ -29,6 +29,13 @@ from pixelle_video.prompts.image_generation import (
 )
 from pixelle_video.prompts.style_conversion import build_style_conversion_prompt
 
+# Paragraph merging (two-step: analysis + grouping)
+from pixelle_video.prompts.paragraph_merging import (
+    build_paragraph_analysis_prompt,
+    build_paragraph_grouping_prompt,
+    build_paragraph_merging_prompt,  # Legacy support
+)
+
 
 __all__ = [
     # Narration builders
@@ -40,6 +47,11 @@ __all__ = [
     "build_image_prompt_prompt",
     "build_style_conversion_prompt",
     
+    # Paragraph merging (two-step)
+    "build_paragraph_analysis_prompt",
+    "build_paragraph_grouping_prompt",
+    "build_paragraph_merging_prompt",  # Legacy
+    
     # Image style presets
     "IMAGE_STYLE_PRESETS",
     "DEFAULT_IMAGE_STYLE",
diff --git a/pixelle_video/prompts/paragraph_merging.py b/pixelle_video/prompts/paragraph_merging.py
new file mode 100644
index 0000000..2dfc7cd
--- /dev/null
+++ b/pixelle_video/prompts/paragraph_merging.py
@@ -0,0 +1,202 @@
+# Copyright (C) 2025 AIDC-AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Paragraph merging prompt
+
+For intelligently merging short paragraphs into longer segments suitable for video storyboards.
+Uses a two-step approach: first analyze, then group.
+"""
+
+import json
+from typing import List
+
+
+# Step 1 template (analysis): filled via str.format, so {name} is a placeholder and {{ }} is a literal brace
+PARAGRAPH_ANALYSIS_PROMPT = """# 任务定义
+你是一个专业的视频分镜规划师。请分析以下文本，推荐最佳分镜数量。
+
+# 核心任务
+分析文本结构，根据以下原则推荐分镜数量：
+
+## 分析原则
+1. **语义边界**：识别场景切换、话题转换、情绪变化点
+2. **叙事完整性**：保持对话回合完整（问-答不拆分）
+3. **时长控制**：每个分镜语音时长建议 15-45 秒（约 60-180 字）
+4. **视觉多样性**：确保分镜之间有足够的画面变化
+
+## 文本信息
+- 总段落数：{total_paragraphs}
+- 预估总字数：{total_chars} 字
+- 预估总时长：{estimated_duration} 秒
+
+## 输入段落预览
+{paragraphs_preview}
+
+# 输出格式
+返回 JSON 格式的分析结果：
+
+```json
+{{
+  "recommended_segments": 8,
+  "reasoning": "文本包含开场设定、分手对话、争吵升级、离别等多个场景切换点...",
+  "scene_boundaries": [
+    {{"after_paragraph": 3, "reason": "场景从背景介绍转入对话"}},
+    {{"after_paragraph": 7, "reason": "对话情绪升级"}},
+    ...
+  ]
+}}
+```
+
+# 重要提醒
+1. recommended_segments 应该在 3-15 之间
+2. 每个分镜平均字数建议 80-200 字
+3. scene_boundaries 标记主要的场景切换点，用于后续分组参考
+4. 只输出 JSON，不要添加其他解释
+"""
+
+
+# Step 2 template (grouping): filled via str.format; asks for inclusive {{"start", "end"}} index ranges
+PARAGRAPH_GROUPING_PROMPT = """# 任务定义
+你是一个专业的文本分段专家。根据分析结果，将段落分组。
+
+# 核心任务
+将 {total_paragraphs} 个段落（编号 0 到 {max_index}）分成 **{target_segments}** 个分组。
+
+# 分析建议
+{analysis_hint}
+
+# 分组原则
+1. **语义关联**：将描述同一场景、同一对话回合的段落放在一起
+2. **对话完整**：一轮完整的对话（问与答）应该在同一分组
+3. **场景统一**：同一时间、地点发生的事件应该在同一分组
+4. **长度均衡**：每个分组的字数尽量均衡（目标 80-200 字/分组）
+5. **顺序保持**：分组内段落必须连续
+
+# 输入段落
+{paragraphs_preview}
+
+# 输出格式
+返回 JSON 格式，包含每个分组的起始和结束索引（包含）。
+
+```json
+{{
+  "groups": [
+    {{"start": 0, "end": 3}},
+    {{"start": 4, "end": 7}},
+    {{"start": 8, "end": 12}}
+  ]
+}}
+```
+
+# 重要提醒
+1. 必须输出正好 {target_segments} 个分组
+2. 分组必须覆盖所有段落（从 0 到 {max_index}）
+3. 每个分组的 start 必须等于上一个 end + 1
+4. 只输出 JSON，不要添加其他解释
+"""
+
+
+def build_paragraph_analysis_prompt(
+    paragraphs: List[str],
+) -> str:
+    """
+    Render the step-1 (analysis) prompt for a list of paragraphs.
+
+    The prompt carries the paragraph count, the total character count, a
+    speech-duration estimate and a one-line preview of every paragraph.
+
+    Args:
+        paragraphs: Original paragraphs, in document order
+
+    Returns:
+        PARAGRAPH_ANALYSIS_PROMPT with all placeholders filled in
+    """
+    preview_lines = []
+    char_total = 0
+    for index, paragraph in enumerate(paragraphs):
+        length = len(paragraph)
+        char_total += length
+        # Show at most 50 characters on one line; mark truncation with "..."
+        head = paragraph[:50].replace('\n', ' ')
+        suffix = "..." if length > 50 else ""
+        preview_lines.append(f"[{index}] ({length}字) {head}{suffix}")
+
+    # Duration in seconds, assuming roughly 250 characters per minute
+    seconds = int(char_total / 250 * 60)
+
+    return PARAGRAPH_ANALYSIS_PROMPT.format(
+        paragraphs_preview="\n".join(preview_lines),
+        total_paragraphs=len(paragraphs),
+        total_chars=char_total,
+        estimated_duration=seconds,
+    )
+
+
+def build_paragraph_grouping_prompt(
+    paragraphs: List[str],
+    target_segments: int,
+    analysis_result: dict = None,
+) -> str:
+    """
+    Build prompt for grouping paragraphs based on analysis
+
+    The analysis result is parsed LLM output, so its optional fields are
+    read defensively: a malformed "scene_boundaries" value is ignored
+    instead of raising while the prompt is being built.
+
+    Args:
+        paragraphs: List of original paragraphs
+        target_segments: Target number of segments (from analysis)
+        analysis_result: Optional analysis result for context
+
+    Returns:
+        Formatted prompt for grouping
+    """
+    # One preview line per paragraph: index, length, first 50 characters
+    previews = []
+    for i, para in enumerate(paragraphs):
+        preview = para[:50].replace('\n', ' ')
+        if len(para) > 50:
+            preview += "..."
+        previews.append(f"[{i}] ({len(para)}字) {preview}")
+
+    hint_lines = []
+    if isinstance(analysis_result, dict):
+        if analysis_result.get("reasoning"):
+            hint_lines.append(f"分析理由：{analysis_result['reasoning']}")
+        raw = analysis_result.get("scene_boundaries")
+        items = raw if isinstance(raw, list) else []
+        marks = [b.get("after_paragraph") for b in items if isinstance(b, dict)]
+        marks = [str(m) for m in marks if m is not None and m != ""]
+        if marks:
+            hint_lines.append(f"建议场景切换点（段落后）：{', '.join(marks)}")
+
+    return PARAGRAPH_GROUPING_PROMPT.format(
+        paragraphs_preview="\n".join(previews),
+        target_segments=target_segments,
+        total_paragraphs=len(paragraphs),
+        max_index=len(paragraphs) - 1,
+        analysis_hint="\n".join(hint_lines) or "无额外分析信息",
+    )
+
+
+# Legacy wrapper: keeps the original function name for backward compatibility
+def build_paragraph_merging_prompt(
+    paragraphs: List[str],
+    target_segments: int = 8,
+) -> str:
+    """Return the grouping prompt for *paragraphs* without an analysis hint."""
+    legacy_prompt = build_paragraph_grouping_prompt(
+        paragraphs=paragraphs, target_segments=target_segments
+    )
+    return legacy_prompt
diff --git a/pixelle_video/utils/content_generators.py b/pixelle_video/utils/content_generators.py
index 8a821f7..7cd411b 100644
--- a/pixelle_video/utils/content_generators.py
+++ b/pixelle_video/utils/content_generators.py
@@ -208,7 +208,9 @@ async def generate_narrations_from_content(
 
 async def split_narration_script(
     script: str,
-    split_mode: Literal["paragraph", "line", "sentence"] = "paragraph",
+    split_mode: Literal["paragraph", "line", "sentence", "smart"] = "paragraph",
+    llm_service = None,
+    target_segments: int = 8,
 ) -> List[str]:
     """
     Split user-provided narration script into segments
@@ -219,6 +221,9 @@ async def split_narration_script(
             - "paragraph": Split by double newline (\\n\\n), preserve single newlines within paragraphs
             - "line": Split by single newline (\\n), each line is a segment
             - "sentence": Split by sentence-ending punctuation (。.!?！？)
+            - "smart": First split by paragraph, then use LLM to intelligently merge related paragraphs
+        llm_service: LLM service instance (required for "smart" mode)
+        target_segments: Target number of segments for "smart" mode (default: 8)
     
     Returns:
         List of narration segments
@@ -227,7 +232,31 @@ async def split_narration_script(
     
     narrations = []
     
-    if split_mode == "paragraph":
+    if split_mode == "smart":
+        # Smart mode: first split by paragraph, then merge intelligently
+        if llm_service is None:
+            raise ValueError("llm_service is required for 'smart' split mode")
+        
+        # Step 1: Split by paragraph first
+        paragraphs = re.split(r'\n\s*\n', script)
+        paragraphs = [p.strip() for p in paragraphs if p.strip()]
+        logger.info(f"   Initial split: {len(paragraphs)} paragraphs")
+        
+        # Step 2: Merge intelligently using LLM
+        # If target_segments is None, merge_paragraphs_smart will auto-analyze
+        if target_segments is not None and len(paragraphs) <= target_segments:
+            # No need to merge if already within target
+            logger.info(f"   Paragraphs count ({len(paragraphs)}) <= target ({target_segments}), no merge needed")
+            narrations = paragraphs
+        else:
+            narrations = await merge_paragraphs_smart(
+                llm_service=llm_service,
+                paragraphs=paragraphs,
+                target_segments=target_segments  # Can be None for auto-analysis
+            )
+        logger.info(f"✅ Smart split: {len(paragraphs)} paragraphs -> {len(narrations)} segments")
+    
+    elif split_mode == "paragraph":
         # Split by double newline (paragraph mode)
         # Preserve single newlines within paragraphs
         paragraphs = re.split(r'\n\s*\n', script)
@@ -266,6 +295,150 @@ async def split_narration_script(
     return narrations
 
 
+async def merge_paragraphs_smart(
+    llm_service,
+    paragraphs: List[str],
+    target_segments: int = None,  # Now optional - auto-analyze if not provided
+    max_retries: int = 3,
+) -> List[str]:
+    """
+    Use LLM to intelligently merge paragraphs based on semantic relevance.
+
+    Two-step approach:
+    1. If target_segments is not provided, first analyze text to recommend optimal count
+    2. Then group paragraphs based on the target count
+
+    The target is capped at the number of paragraphs, and a grouping is only
+    accepted when its ranges are contiguous and cover every paragraph, so no
+    paragraph is dropped or duplicated in the merged output.
+
+    Args:
+        llm_service: LLM service instance
+        paragraphs: List of original paragraphs
+        target_segments: Target number of merged segments (auto-analyzed if None)
+        max_retries: Maximum retry attempts for each step
+
+    Returns:
+        List of merged paragraphs
+
+    Raises:
+        Exception: The last grouping error, once max_retries is exhausted
+    """
+    from pixelle_video.prompts import (
+        build_paragraph_analysis_prompt,
+        build_paragraph_grouping_prompt
+    )
+
+    total = len(paragraphs)
+    if total <= 1:
+        # Nothing to merge, so skip the LLM calls entirely
+        return list(paragraphs)
+
+    # ========================================
+    # Step 1: Analyze and recommend segment count (if not provided)
+    # ========================================
+    analysis_result = None
+    if target_segments is None:
+        logger.info(f"Analyzing {total} paragraphs to recommend segment count...")
+        analysis_prompt = build_paragraph_analysis_prompt(paragraphs)
+        for attempt in range(1, max_retries + 1):
+            try:
+                response = await llm_service(
+                    prompt=analysis_prompt,
+                    temperature=0.3,
+                    max_tokens=1500
+                )
+                logger.debug(f"Analysis response length: {len(response)} chars")
+                result = _parse_json(response)
+                if "recommended_segments" not in result:
+                    raise KeyError("Missing 'recommended_segments' in analysis")
+                # int() rejects null/garbage before anything is committed
+                recommended = int(result["recommended_segments"])
+                # Validate range
+                target_segments = max(3, min(15, recommended))
+                analysis_result = result
+                reasoning = str(result.get("reasoning", "N/A"))
+                logger.info(f"✅ Analysis complete: recommended {target_segments} segments")
+                logger.info(f"   Reasoning: {reasoning[:100]}...")
+                break
+            except Exception as e:
+                logger.error(f"Analysis attempt {attempt} failed: {e}")
+                if attempt < max_retries:
+                    logger.info("Retrying analysis...")
+        if target_segments is None:
+            # Fallback: use simple heuristic
+            target_segments = max(3, min(12, total // 3))
+            logger.warning(f"Using fallback: {target_segments} segments (paragraphs/3)")
+    else:
+        logger.info(f"Using provided target: {target_segments} segments")
+
+    # Never request more groups than there are paragraphs (or fewer than one)
+    target_segments = max(1, min(target_segments, total))
+    if target_segments == total:
+        logger.info(f"Target equals paragraph count ({total}), no merge needed")
+        return list(paragraphs)
+
+    # ========================================
+    # Step 2: Group paragraphs
+    # ========================================
+    logger.info(f"Grouping {total} paragraphs into {target_segments} segments...")
+
+    grouping_prompt = build_paragraph_grouping_prompt(
+        paragraphs=paragraphs,
+        target_segments=target_segments,
+        analysis_result=analysis_result
+    )
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            response = await llm_service(
+                prompt=grouping_prompt,
+                temperature=0.3,
+                max_tokens=2000
+            )
+            logger.debug(f"Grouping response length: {len(response)} chars")
+            result = _parse_json(response)
+            if "groups" not in result:
+                raise KeyError("Invalid response format: missing 'groups'")
+            groups = result["groups"]
+
+            # Validate count
+            if len(groups) != target_segments:
+                logger.warning(
+                    f"Grouping attempt {attempt}: expected {target_segments} groups, got {len(groups)}"
+                )
+                if attempt < max_retries:
+                    continue
+                logger.warning(f"Accepting {len(groups)} groups after {max_retries} attempts")
+
+            # Validate group boundaries: ordered, gap-free, covering 0..total-1
+            next_start = 0
+            for i, group in enumerate(groups):
+                if "start" not in group or "end" not in group:
+                    raise ValueError(f"Group {i} missing 'start' or 'end'")
+                start, end = group["start"], group["end"]
+                if start > end:
+                    raise ValueError(f"Group {i} has invalid range: start > end")
+                if start != next_start or end >= total:
+                    raise ValueError(f"Group {i} is not contiguous or is out of bounds")
+                next_start = end + 1
+            if next_start != total:
+                raise ValueError(f"Groups cover {next_start} of {total} paragraphs")
+
+            # Merge paragraphs based on groups
+            merged = ["\n\n".join(paragraphs[g["start"]:g["end"] + 1]) for g in groups]
+            logger.info(f"✅ Successfully merged into {len(merged)} segments")
+            return merged
+        except Exception as e:
+            logger.error(f"Grouping attempt {attempt} failed: {e}")
+            if attempt >= max_retries:
+                raise
+            logger.info("Retrying grouping...")
+
+    # Only reachable when max_retries < 1, i.e. no grouping attempt was made
+    return paragraphs
+
+
 async def generate_image_prompts(
     llm_service,
     narrations: List[str],
@@ -489,8 +662,8 @@ def _parse_json(text: str) -> dict:
         except json.JSONDecodeError:
             pass
     
-    # Try to find any JSON object in the text
-    json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts")\s*:\s*\[[^\]]*\][^{}]*\}'
+    # Try to find any JSON object with known keys (including analysis keys)
+    json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts"|"video_prompts"|"merged_paragraphs"|"groups"|"recommended_segments"|"scene_boundaries")\s*:\s*[^{}]*\}'
     match = re.search(json_pattern, text, re.DOTALL)
     if match:
         try:
@@ -498,6 +671,17 @@ def _parse_json(text: str) -> dict:
         except json.JSONDecodeError:
             pass
     
+    # Try to find any JSON object that looks like it contains an array
+    # This is a more aggressive fallback for complex nested arrays
+    json_start = text.find('{')
+    json_end = text.rfind('}')
+    if json_start != -1 and json_end != -1 and json_end > json_start:
+        potential_json = text[json_start:json_end + 1]
+        try:
+            return json.loads(potential_json)
+        except json.JSONDecodeError:
+            pass
+    
     # If all fails, raise error
     raise json.JSONDecodeError("No valid JSON found", text, 0)
 
diff --git a/web/components/content_input.py b/web/components/content_input.py
index 7076451..5890846 100644
--- a/web/components/content_input.py
+++ b/web/components/content_input.py
@@ -65,6 +65,7 @@ def render_content_input():
                     "paragraph": tr("split.mode_paragraph"),
                     "line": tr("split.mode_line"),
                     "sentence": tr("split.mode_sentence"),
+                    "smart": tr("split.mode_smart"),
                 }
                 split_mode = st.selectbox(
                     tr("split.mode_label"),
@@ -73,8 +74,16 @@ def render_content_input():
                     index=0,  # Default to paragraph mode
                     help=tr("split.mode_help")
                 )
+                
+                # Show info for smart mode (auto-detect segment count)
+                if split_mode == "smart":
+                    st.info(tr("split.smart_auto_hint"))
+                    target_segments = None  # Auto-detect
+                else:
+                    target_segments = None  # Not used for other modes
             else:
                 split_mode = "paragraph"  # Default for generate mode (not used)
+                target_segments = None
             
             # Title input (optional for both modes)
             title = st.text_input(
@@ -105,7 +114,8 @@ def render_content_input():
                 "text": text,
                 "title": title,
                 "n_scenes": n_scenes,
-                "split_mode": split_mode
+                "split_mode": split_mode,
+                "target_segments": target_segments
             }
         
         else:
diff --git a/web/components/output_preview.py b/web/components/output_preview.py
index 5221040..9c35308 100644
--- a/web/components/output_preview.py
+++ b/web/components/output_preview.py
@@ -52,6 +52,7 @@ def render_single_output(pixelle_video, video_params):
     title = video_params.get("title")
     n_scenes = video_params.get("n_scenes", 5)
     split_mode = video_params.get("split_mode", "paragraph")
+    target_segments = video_params.get("target_segments", 8)
     bgm_path = video_params.get("bgm_path")
     bgm_volume = video_params.get("bgm_volume", 0.2)
     
@@ -116,6 +117,7 @@ def render_single_output(pixelle_video, video_params):
                         "title": title if title else None,
                         "n_scenes": n_scenes,
                         "split_mode": split_mode,
+                        "target_segments": target_segments,
                         "media_workflow": workflow_key,
                         "frame_template": frame_template,
                         "prompt_prefix": prompt_prefix,
@@ -222,6 +224,7 @@ def render_single_output(pixelle_video, video_params):
                     "title": title if title else None,
                     "n_scenes": n_scenes,
                     "split_mode": split_mode,
+                    "target_segments": target_segments,
                     "media_workflow": workflow_key,
                     "frame_template": frame_template,
                     "prompt_prefix": prompt_prefix,
diff --git a/web/i18n/locales/en_US.json b/web/i18n/locales/en_US.json
index 582f96b..bea54fd 100644
--- a/web/i18n/locales/en_US.json
+++ b/web/i18n/locales/en_US.json
@@ -26,6 +26,8 @@
     "split.mode_paragraph": "📄 By Paragraph (\\n\\n)",
     "split.mode_line": "📝 By Line (\\n)",
     "split.mode_sentence": "✂️ By Sentence (。.!?)",
+    "split.mode_smart": "🧠 Smart Merge (AI Grouping)",
+    "split.smart_auto_hint": "🤖 AI will analyze text structure, recommend optimal segment count, and intelligently merge related paragraphs (dialogues, same scene)",
     "input.content": "Content",
     "input.content_placeholder": "Used directly without modification (split by strategy below)\nExample:\nHello everyone, today I'll share three study tips.\n\nThe first tip is focus training, meditate for 10 minutes daily.\n\nThe second tip is active recall, review immediately after learning.",
     "input.content_help": "Provide your own content for video generation",
diff --git a/web/i18n/locales/zh_CN.json b/web/i18n/locales/zh_CN.json
index 25ab389..427d49b 100644
--- a/web/i18n/locales/zh_CN.json
+++ b/web/i18n/locales/zh_CN.json
@@ -26,6 +26,8 @@
     "split.mode_paragraph": "📄 按段落（\\n\\n）",
     "split.mode_line": "📝 按行（\\n）",
     "split.mode_sentence": "✂️ 按句号（。.!?）",
+    "split.mode_smart": "🧠 智能合并（AI 分组）",
+    "split.smart_auto_hint": "🤖 AI 将自动分析文本结构，推荐最佳分镜数量，并智能合并相关段落（对话、同一场景）",
     "input.content": "内容",
     "input.content_placeholder": "直接使用，不做改写（根据下方分割方式切分）\n例如：\n大家好，今天跟你分享三个学习技巧。\n\n第一个技巧是专注力训练，每天冥想10分钟。\n\n第二个技巧是主动回忆，学完立即复述。",
     "input.content_help": "提供您自己的内容用于视频生成",