feat: Add smart paragraph merging mode with AI grouping

- Add "smart" split mode that uses LLM to intelligently merge related paragraphs
- Implement two-step approach: analyze text structure, then group by semantic relevance
- Add paragraph_merging.py with analysis and grouping prompts
- Update UI to support smart mode selection with auto-detect hint
- Add i18n translations for smart mode (en_US, zh_CN)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
empty
2026-01-17 00:19:46 +08:00
parent 3a8ec576ee
commit 3d3aba3670
8 changed files with 427 additions and 6 deletions
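
For illustration only, invoking the new mode might look like the sketch below. The module path and the llm_service signature are assumptions inferred from the diff (an async callable taking prompt, temperature, and max_tokens, returning the raw model text):

    import asyncio

    # Hypothetical import path - the diff below only shows the function bodies.
    from pixelle_video.narration import split_narration_script

    async def canned_llm(prompt: str, temperature: float = 0.3, max_tokens: int = 2000) -> str:
        # Stand-in for a real LLM call: always returns a fixed grouping payload.
        return '{"groups": [{"start": 0, "end": 1}, {"start": 2, "end": 2}]}'

    script = "Scene one.\n\nScene one, continued.\n\nScene two."
    segments = asyncio.run(split_narration_script(
        script,
        split_mode="smart",
        llm_service=canned_llm,
        target_segments=2,  # 3 paragraphs > 2, so merge_paragraphs_smart runs
    ))
    # -> ["Scene one.\n\nScene one, continued.", "Scene two."]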


@@ -208,7 +208,9 @@ async def generate_narrations_from_content(
 async def split_narration_script(
     script: str,
-    split_mode: Literal["paragraph", "line", "sentence"] = "paragraph",
+    split_mode: Literal["paragraph", "line", "sentence", "smart"] = "paragraph",
+    llm_service = None,
+    target_segments: int = 8,
 ) -> List[str]:
     """
     Split user-provided narration script into segments
@@ -219,6 +221,9 @@ async def split_narration_script(
         - "paragraph": Split by double newline (\\n\\n), preserve single newlines within paragraphs
         - "line": Split by single newline (\\n), each line is a segment
         - "sentence": Split by sentence-ending punctuation (。.!?)
+        - "smart": First split by paragraph, then use LLM to intelligently merge related paragraphs
+        llm_service: LLM service instance (required for "smart" mode)
+        target_segments: Target number of segments for "smart" mode (default: 8)
 
     Returns:
         List of narration segments
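
For comparison, a sketch of what the three non-LLM modes yield on the same input; the "paragraph" regex is taken from the hunk below, while the "sentence" split is an assumption reconstructed from the docstring's 。.!? list (that branch is not part of this diff):

    import re

    script = "Intro line one.\nIntro line two.\n\nSecond paragraph."

    # "paragraph": split on blank lines, single newlines stay inside the segment
    paragraphs = [p.strip() for p in re.split(r'\n\s*\n', script) if p.strip()]
    # -> ["Intro line one.\nIntro line two.", "Second paragraph."]

    # "line": every newline starts a new segment
    lines = [l.strip() for l in script.split("\n") if l.strip()]
    # -> ["Intro line one.", "Intro line two.", "Second paragraph."]

    # "sentence" (assumed): cut after each sentence-ending punctuation mark
    sentences = [s.strip() for s in re.findall(r'[^。.!?]+[。.!?]?', script) if s.strip()]
    # -> ["Intro line one.", "Intro line two.", "Second paragraph."]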
@@ -227,7 +232,31 @@ async def split_narration_script(
     narrations = []
 
-    if split_mode == "paragraph":
+    if split_mode == "smart":
+        # Smart mode: first split by paragraph, then merge intelligently
+        if llm_service is None:
+            raise ValueError("llm_service is required for 'smart' split mode")
+
+        # Step 1: Split by paragraph first
+        paragraphs = re.split(r'\n\s*\n', script)
+        paragraphs = [p.strip() for p in paragraphs if p.strip()]
+        logger.info(f" Initial split: {len(paragraphs)} paragraphs")
+
+        # Step 2: Merge intelligently using LLM
+        # If target_segments is None, merge_paragraphs_smart will auto-analyze
+        if target_segments is not None and len(paragraphs) <= target_segments:
+            # No need to merge if already within target
+            logger.info(f" Paragraphs count ({len(paragraphs)}) <= target ({target_segments}), no merge needed")
+            narrations = paragraphs
+        else:
+            narrations = await merge_paragraphs_smart(
+                llm_service=llm_service,
+                paragraphs=paragraphs,
+                target_segments=target_segments  # Can be None for auto-analysis
+            )
+
+        logger.info(f"✅ Smart split: {len(paragraphs)} paragraphs -> {len(narrations)} segments")
+    elif split_mode == "paragraph":
         # Split by double newline (paragraph mode)
         # Preserve single newlines within paragraphs
         paragraphs = re.split(r'\n\s*\n', script)
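
Worth noting: the `\n\s*\n` pattern treats lines containing only spaces or tabs as paragraph breaks, not just truly empty lines. A quick stdlib check:

    import re

    script = "First paragraph.\n   \nSecond paragraph."
    parts = [p.strip() for p in re.split(r'\n\s*\n', script) if p.strip()]
    assert parts == ["First paragraph.", "Second paragraph."]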
@@ -266,6 +295,150 @@ async def split_narration_script(
 
     return narrations
 
+
+async def merge_paragraphs_smart(
+    llm_service,
+    paragraphs: List[str],
+    target_segments: int = None,  # Now optional - auto-analyze if not provided
+    max_retries: int = 3,
+) -> List[str]:
+    """
+    Use LLM to intelligently merge paragraphs based on semantic relevance.
+
+    Two-step approach:
+    1. If target_segments is not provided, first analyze text to recommend optimal count
+    2. Then group paragraphs based on the target count
+
+    Args:
+        llm_service: LLM service instance
+        paragraphs: List of original paragraphs
+        target_segments: Target number of merged segments (auto-analyzed if None)
+        max_retries: Maximum retry attempts for each step
+
+    Returns:
+        List of merged paragraphs
+    """
+    from pixelle_video.prompts import (
+        build_paragraph_analysis_prompt,
+        build_paragraph_grouping_prompt
+    )
+
+    # ========================================
+    # Step 1: Analyze and recommend segment count (if not provided)
+    # ========================================
+    if target_segments is None:
+        logger.info(f"Analyzing {len(paragraphs)} paragraphs to recommend segment count...")
+        analysis_prompt = build_paragraph_analysis_prompt(paragraphs)
+
+        analysis_result = None
+        for attempt in range(1, max_retries + 1):
+            try:
+                response = await llm_service(
+                    prompt=analysis_prompt,
+                    temperature=0.3,
+                    max_tokens=1500
+                )
+                logger.debug(f"Analysis response length: {len(response)} chars")
+
+                result = _parse_json(response)
+                if "recommended_segments" not in result:
+                    raise KeyError("Missing 'recommended_segments' in analysis")
+
+                target_segments = result["recommended_segments"]
+                analysis_result = result
+
+                # Validate range
+                if target_segments < 3:
+                    target_segments = 3
+                elif target_segments > 15:
+                    target_segments = 15
+
+                reasoning = result.get("reasoning", "N/A")
+                logger.info(f"✅ Analysis complete: recommended {target_segments} segments")
+                logger.info(f" Reasoning: {reasoning[:100]}...")
+                break
+            except Exception as e:
+                logger.error(f"Analysis attempt {attempt} failed: {e}")
+                if attempt >= max_retries:
+                    # Fallback: use simple heuristic
+                    target_segments = max(3, min(12, len(paragraphs) // 3))
+                    logger.warning(f"Using fallback: {target_segments} segments (paragraphs/3)")
+                    analysis_result = None
+                    break
+                logger.info("Retrying analysis...")
+    else:
+        analysis_result = None
+        logger.info(f"Using provided target: {target_segments} segments")
+
+    # ========================================
+    # Step 2: Group paragraphs
+    # ========================================
+    logger.info(f"Grouping {len(paragraphs)} paragraphs into {target_segments} segments...")
+    grouping_prompt = build_paragraph_grouping_prompt(
+        paragraphs=paragraphs,
+        target_segments=target_segments,
+        analysis_result=analysis_result
+    )
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            response = await llm_service(
+                prompt=grouping_prompt,
+                temperature=0.3,
+                max_tokens=2000
+            )
+            logger.debug(f"Grouping response length: {len(response)} chars")
+
+            result = _parse_json(response)
+            if "groups" not in result:
+                raise KeyError("Invalid response format: missing 'groups'")
+
+            groups = result["groups"]
+
+            # Validate count
+            if len(groups) != target_segments:
+                logger.warning(
+                    f"Grouping attempt {attempt}: expected {target_segments} groups, got {len(groups)}"
+                )
+                if attempt < max_retries:
+                    continue
+                logger.warning(f"Accepting {len(groups)} groups after {max_retries} attempts")
+
+            # Validate group boundaries
+            for i, group in enumerate(groups):
+                if "start" not in group or "end" not in group:
+                    raise ValueError(f"Group {i} missing 'start' or 'end'")
+                if group["start"] > group["end"]:
+                    raise ValueError(f"Group {i} has invalid range: start > end")
+                if group["start"] < 0 or group["end"] >= len(paragraphs):
+                    raise ValueError(f"Group {i} has out-of-bounds indices")
+
+            # Merge paragraphs based on groups
+            merged = []
+            for group in groups:
+                start, end = group["start"], group["end"]
+                merged_text = "\n\n".join(paragraphs[start:end + 1])
+                merged.append(merged_text)
+
+            logger.info(f"✅ Successfully merged into {len(merged)} segments")
+            return merged
+        except Exception as e:
+            logger.error(f"Grouping attempt {attempt} failed: {e}")
+            if attempt >= max_retries:
+                raise
+            logger.info("Retrying grouping...")
+
+    # Fallback: should not reach here
+    return paragraphs
+
+
 async def generate_image_prompts(
     llm_service,
     narrations: List[str],
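
For reference, the grouping step above expects JSON of roughly this shape. The field names ("groups", "start", "end", "recommended_segments", "reasoning") come straight from the validation code; everything else about the payload is an assumption about the prompt contract defined in paragraph_merging.py:

    # Hypothetical grouping response for 6 paragraphs with target_segments=3.
    # "start"/"end" are inclusive, zero-based paragraph indices.
    grouping_response = {
        "groups": [
            {"start": 0, "end": 1},  # paragraphs 0-1 -> segment 1
            {"start": 2, "end": 4},  # paragraphs 2-4 -> segment 2
            {"start": 5, "end": 5},  # paragraph 5 stands alone
        ]
    }

    # The analysis step (when target_segments is None) must return at least
    # "recommended_segments"; "reasoning" is read with .get(), so it is optional.
    analysis_response = {"recommended_segments": 6, "reasoning": "Six distinct scenes."}

    paragraphs = [f"Paragraph {i}" for i in range(6)]
    merged = ["\n\n".join(paragraphs[g["start"]:g["end"] + 1])
              for g in grouping_response["groups"]]
    assert len(merged) == 3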
@@ -489,8 +662,8 @@ def _parse_json(text: str) -> dict:
     except json.JSONDecodeError:
         pass
 
-    # Try to find any JSON object in the text
-    json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts")\s*:\s*\[[^\]]*\][^{}]*\}'
+    # Try to find any JSON object with known keys (including analysis keys)
+    json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts"|"video_prompts"|"merged_paragraphs"|"groups"|"recommended_segments"|"scene_boundaries")\s*:\s*[^{}]*\}'
     match = re.search(json_pattern, text, re.DOTALL)
     if match:
         try:
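
One property of the widened pattern worth noting: `[^{}]*` cannot cross a brace, so this branch only catches flat, single-level objects; a nested payload such as a "groups" array of objects falls through to the fallback added below. A quick check of that reading:

    import re

    json_pattern = (
        r'\{[^{}]*(?:"narrations"|"image_prompts"|"video_prompts"|"merged_paragraphs"'
        r'|"groups"|"recommended_segments"|"scene_boundaries")\s*:\s*[^{}]*\}'
    )

    flat = 'Model says: {"recommended_segments": 6, "reasoning": "six scenes"}'
    nested = 'Model says: {"groups": [{"start": 0, "end": 1}]}'

    assert re.search(json_pattern, flat, re.DOTALL) is not None
    assert re.search(json_pattern, nested, re.DOTALL) is None  # nested braces stop [^{}]*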
@@ -498,6 +671,17 @@ def _parse_json(text: str) -> dict:
         except json.JSONDecodeError:
             pass
 
+    # Try to find any JSON object that looks like it contains an array
+    # This is a more aggressive fallback for complex nested arrays
+    json_start = text.find('{')
+    json_end = text.rfind('}')
+    if json_start != -1 and json_end != -1 and json_end > json_start:
+        potential_json = text[json_start:json_end + 1]
+        try:
+            return json.loads(potential_json)
+        except json.JSONDecodeError:
+            pass
+
     # If all fails, raise error
     raise json.JSONDecodeError("No valid JSON found", text, 0)
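
The final fallback simply brackets everything between the first '{' and the last '}', which is what rescues chatty replies that wrap valid (possibly nested) JSON in prose. A small check of that behavior:

    import json

    reply = 'Here is the grouping you asked for: {"groups": [{"start": 0, "end": 2}]} Let me know!'
    start, end = reply.find('{'), reply.rfind('}')
    candidate = reply[start:end + 1]
    assert json.loads(candidate) == {"groups": [{"start": 0, "end": 2}]}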