Compare commits


33 Commits

3d3aba3670 feat: Add smart paragraph merging mode with AI grouping
- Add "smart" split mode that uses LLM to intelligently merge related paragraphs
- Implement two-step approach: analyze text structure, then group by semantic relevance
- Add paragraph_merging.py with analysis and grouping prompts
- Update UI to support smart mode selection with auto-detect hint
- Add i18n translations for smart mode (en_US, zh_CN)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 00:19:46 +08:00
3a8ec576ee docs: Add port configuration guide
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-10 16:13:23 +08:00
3f59b324ad fix: Remove hardcoded ports, support custom port configuration
- Replace all hardcoded localhost:8000/3000/8501 with environment variables
- Frontend: Use API_PORT env var in next.config.ts
- Backend: Use API_PORT env var in editor.py and quality.py
- Web UI: Use API_PORT and EDITOR_PORT env vars in all Streamlit pages
- Update dev.sh to pass environment variables to all services
- Add .env.example with port configuration template

Now supports custom ports via environment variables:
  API_PORT=8080 EDITOR_PORT=3001 WEB_PORT=8502 ./dev.sh

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-10 16:13:02 +08:00
6bf16936af chore: Update dependency lock file
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-10 15:56:22 +08:00
c65b040fe3 fix: Filter non-serializable objects in pipeline metadata
- Skip CharacterMemory and callback functions during serialization
- Add .pids/ and .serena/ to gitignore
- Add workflow integration documentation

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-10 15:56:06 +08:00
be2639c596 fix: Add to_dict method to CharacterMemory for JSON serialization 2026-01-07 15:49:11 +08:00
c854bd80e0 feat: Switch to flux2 workflow for image regeneration 2026-01-07 09:56:03 +08:00
90ceb76296 feat: Optimize VLM prompt for storyboard consistency, focus on constant features 2026-01-07 09:44:27 +08:00
1b54552fec fix: Handle nested JSON structures in VLM response parsing 2026-01-07 09:40:21 +08:00
f19804facb feat: Add VLMConfig to schema for proper config.yaml VLM support 2026-01-07 09:38:45 +08:00
bc077475c6 fix: Remove duplicate import os causing UnboundLocalError 2026-01-07 09:35:45 +08:00
9675b9c23b feat: Add VLM config to config.example.yaml with config.yaml support 2026-01-07 09:32:16 +08:00
92183b083b feat: Add Qwen VL support for character analysis, configurable via VLM_PROVIDER 2026-01-07 09:29:43 +08:00
be216eacad fix: Increase VLM max_tokens to 2000 to avoid response truncation 2026-01-07 03:37:55 +08:00
8d82cf91d5 fix: Auto-detect and use GLM-4V vision model for character analysis 2026-01-07 03:33:56 +08:00
8c35b0066f fix: Enhance VLM response parsing to handle markdown code blocks 2026-01-07 03:31:42 +08:00
44249889df fix: Add file upload endpoint and fix frontend upload path 2026-01-07 03:17:58 +08:00
49e667cc94 feat: Enhance CharacterPanel with image upload and VLM analysis 2026-01-07 03:11:41 +08:00
b3cf9e64e5 feat: Implement Character Memory V1 - VLM analysis and prompt injection 2026-01-07 03:08:29 +08:00
da98d0842a feat: Persist regenerated image/audio paths to storyboard.json 2026-01-07 00:34:38 +08:00
c0eb4ed320 fix: Add onClick handlers to top Settings and Save buttons with alerts 2026-01-07 00:31:50 +08:00
2be9256c48 fix: Use OpenAI multimodal message format for VLM style extraction 2026-01-07 00:22:33 +08:00
297f3ccda4 feat: Enhance StyleGuard with VLM-based style extraction for specific style_prefix 2026-01-07 00:16:57 +08:00
a3ab12e87c feat: Add detailed debug logging to editor functions 2026-01-07 00:05:53 +08:00
4d3c89a8f6 fix: Await async extract_style_anchor call 2026-01-06 23:54:35 +08:00
7da6ed6a74 feat: Auto-apply style anchor prefix when regenerating frame images 2026-01-06 23:50:27 +08:00
bf5a2af4fd fix: Await get_pixelle_video in regenerate_frame_audio 2026-01-06 23:34:53 +08:00
1d343e55ba feat(P1): Add align-prompt feature for better text-image relevance 2026-01-06 23:29:41 +08:00
2978622f7f feat(P0): Enhance image prompt generation with core imagery extraction for better text-image alignment 2026-01-06 23:04:20 +08:00
b62fdb6958 fix: Specify runninghub/image_flux.json workflow explicitly 2026-01-06 18:14:49 +08:00
f8b102c2e0 fix: Await get_pixelle_video coroutine in regenerate_frame_image 2026-01-06 18:13:57 +08:00
4b86803692 feat: Update regenerate_frame_image to use MediaService with RunningHub support 2026-01-06 18:11:19 +08:00
e29615a885 fix: Sync currentTime when user selects a frame in timeline 2026-01-06 17:55:36 +08:00
32 changed files with 4226 additions and 1799 deletions

.env.example (new file)

@@ -0,0 +1,21 @@
# Pixelle-Video Environment Configuration
# Copy this file to .env and customize as needed
# ============================================================================
# Port Configuration
# ============================================================================
# FastAPI Backend Port
API_PORT=8000
# Next.js Editor Port
EDITOR_PORT=3000
# Streamlit Web UI Port
WEB_PORT=8501
# ============================================================================
# Other Configuration
# ============================================================================
# Add other environment variables here as needed

.gitignore (vendored)


@@ -76,3 +76,5 @@ examples/
repositories/
*.out
.pids/
.serena/

api/routers/editor.py

@@ -41,28 +41,37 @@ from api.schemas.editor import (
ExportRequest,
ExportResponse,
ExportStatusResponse,
AlignPromptRequest,
AlignPromptResponse,
)
from fastapi import BackgroundTasks
import asyncio
import uuid as uuid_module
import os
router = APIRouter(prefix="/editor", tags=["Editor"])
# Export task storage
_export_tasks: dict = {}
# Get API port from environment
API_PORT = os.getenv("API_PORT", "8000")
def _path_to_url(file_path: str, base_url: str = "http://localhost:8000") -> str:
def _path_to_url(file_path: str, base_url: str = None) -> str:
"""Convert local file path to URL accessible through API"""
if not file_path:
return None
if base_url is None:
base_url = f"http://localhost:{API_PORT}"
import os
from pathlib import Path
# Normalize path separators
file_path = file_path.replace("\\", "/")
# Extract relative path from output directory
parts = file_path.split("/")
try:
@@ -71,7 +80,7 @@ def _path_to_url(file_path: str, base_url: str = "http://localhost:8000") -> str
relative_path = "/".join(relative_parts)
except ValueError:
relative_path = Path(file_path).name
return f"{base_url}/api/files/{relative_path}"
@@ -450,22 +459,69 @@ async def regenerate_frame_image(
raise HTTPException(status_code=400, detail="No image prompt available")
try:
# Import and use PixelleVideo core for image generation
# Import and use PixelleVideo core services
from api.dependencies import get_pixelle_video
from pixelle_video.models.storyboard import StoryboardFrame, StoryboardConfig
from api.routers.quality import _style_anchors
pixelle_video = get_pixelle_video()
logger.debug(f"[REGEN-IMG] Starting image regeneration for frame {frame_id}")
logger.debug(f"[REGEN-IMG] Original prompt: {prompt[:100]}...")
# Generate image using ComfyKit
result = await pixelle_video.comfy(
workflow="image_gen",
prompt=prompt,
task_id=storyboard_id,
pixelle_video = await get_pixelle_video()
# Get style anchor prefix if available
style_prefix = ""
logger.debug(f"[REGEN-IMG] Checking style anchors for storyboard {storyboard_id}")
logger.debug(f"[REGEN-IMG] Available style anchors: {list(_style_anchors.keys())}")
if storyboard_id in _style_anchors:
style_data = _style_anchors[storyboard_id]
style_prefix = style_data.get("style_prefix", "")
logger.info(f"[REGEN-IMG] Found style anchor: {style_prefix[:80] if style_prefix else 'EMPTY'}...")
else:
logger.warning(f"[REGEN-IMG] No style anchor found for {storyboard_id}")
# Get character descriptions for prompt injection
character_prefix = ""
from api.routers.quality import _character_stores
if storyboard_id in _character_stores:
char_descriptions = []
for char_data in _character_stores[storyboard_id].values():
appearance = char_data.get("appearance_description", "")
clothing = char_data.get("clothing_description", "")
name = char_data.get("name", "character")
if appearance or clothing:
parts = [f"{name}:"]
if appearance:
parts.append(appearance)
if clothing:
parts.append(f"wearing {clothing}")
char_descriptions.append(" ".join(parts))
if char_descriptions:
character_prefix = "Characters: " + "; ".join(char_descriptions) + ". "
logger.info(f"[REGEN-IMG] Injecting character descriptions: {character_prefix[:80]}...")
# Apply style prefix and character descriptions to prompt
final_prompt = ""
if style_prefix:
final_prompt += f"{style_prefix}, "
if character_prefix:
final_prompt += character_prefix
final_prompt += prompt
logger.info(f"[REGEN-IMG] Final prompt: {final_prompt[:120]}...")
# Use MediaService to generate image via RunningHub workflow
# Use image_flux2 workflow (FLUX.1 Kontext model for better consistency)
logger.debug(f"[REGEN-IMG] Calling pixelle_video.image with workflow=runninghub/image_flux2.json")
result = await pixelle_video.image(
prompt=final_prompt,
media_type="image",
workflow="runninghub/image_flux2.json",
)
if result and result.get("images"):
if result and result.url:
# Download and save image
image_url = result["images"][0]
import aiohttp
import os
@@ -473,17 +529,47 @@ async def regenerate_frame_image(
os.makedirs(output_dir, exist_ok=True)
image_path = f"{output_dir}/frame_{frame_index}_regenerated.png"
async with aiohttp.ClientSession() as session:
async with session.get(image_url) as resp:
if resp.status == 200:
with open(image_path, 'wb') as f:
f.write(await resp.read())
# Check if URL is remote or local
if result.url.startswith("http"):
async with aiohttp.ClientSession() as session:
async with session.get(result.url) as resp:
if resp.status == 200:
with open(image_path, 'wb') as f:
f.write(await resp.read())
else:
# Local file, copy it
import shutil
if os.path.exists(result.url):
shutil.copy2(result.url, image_path)
else:
image_path = result.url
# Update frame
target_frame["image_path"] = _path_to_url(image_path)
_storyboard_cache[storyboard_id] = storyboard
logger.info(f"Regenerated image for frame {frame_id}")
# Persist changes to storyboard.json
try:
from pixelle_video.services.persistence import PersistenceService
persistence = PersistenceService()
# Load existing storyboard model
storyboard_model = await persistence.load_storyboard(storyboard_id)
if storyboard_model:
# Update the specific frame's image_path
for frame in storyboard_model.frames:
if f"frame-{frame.index}" == frame_id:
frame.image_path = image_path
logger.debug(f"[PERSIST] Updated frame {frame_id} image_path in model")
break
# Save back to JSON
await persistence.save_storyboard(storyboard_id, storyboard_model)
logger.info(f"[PERSIST] Saved storyboard to JSON for {storyboard_id}")
except Exception as pe:
logger.warning(f"[PERSIST] Failed to persist storyboard: {pe}")
logger.info(f"Regenerated image for frame {frame_id} via RunningHub")
return RegenerateImageResponse(
image_path=target_frame["image_path"],
@@ -542,7 +628,7 @@ async def regenerate_frame_audio(
from api.dependencies import get_pixelle_video
import os
pixelle_video = get_pixelle_video()
pixelle_video = await get_pixelle_video()
# Create output path
output_dir = f"output/{storyboard_id}"
@@ -574,6 +660,31 @@ async def regenerate_frame_audio(
storyboard["total_duration"] = sum(f.get("duration", 3.0) for f in frames)
_storyboard_cache[storyboard_id] = storyboard
# Persist changes to storyboard.json
try:
from pixelle_video.services.persistence import PersistenceService
persistence = PersistenceService()
# Load existing storyboard model
storyboard_model = await persistence.load_storyboard(storyboard_id)
if storyboard_model:
# Update the specific frame's audio_path and duration
for frame in storyboard_model.frames:
if f"frame-{frame.index}" == frame_id:
frame.audio_path = result_path
frame.duration = duration
logger.debug(f"[PERSIST] Updated frame {frame_id} audio_path in model")
break
# Update total duration
storyboard_model.total_duration = sum(f.duration or 3.0 for f in storyboard_model.frames)
# Save back to JSON
await persistence.save_storyboard(storyboard_id, storyboard_model)
logger.info(f"[PERSIST] Saved storyboard to JSON for {storyboard_id}")
except Exception as pe:
logger.warning(f"[PERSIST] Failed to persist storyboard: {pe}")
logger.info(f"Regenerated audio for frame {frame_id}, duration: {duration}s")
return RegenerateAudioResponse(
@@ -590,6 +701,98 @@ async def regenerate_frame_audio(
raise HTTPException(status_code=500, detail=str(e))
@router.post(
"/storyboard/{storyboard_id}/frames/{frame_id}/align-prompt",
response_model=AlignPromptResponse
)
async def align_frame_prompt(
storyboard_id: str = Path(..., description="Storyboard/task ID"),
frame_id: str = Path(..., description="Frame ID"),
request: AlignPromptRequest = None
):
"""
Align image prompt with narration
Regenerates the image prompt based on the frame's narration using
enhanced core imagery extraction for better semantic relevance.
"""
if storyboard_id not in _storyboard_cache:
raise HTTPException(status_code=404, detail=f"Storyboard {storyboard_id} not found")
storyboard = _storyboard_cache[storyboard_id]
frames = storyboard["frames"]
# Find frame
target_frame = None
for frame in frames:
if frame["id"] == frame_id:
target_frame = frame
break
if not target_frame:
raise HTTPException(status_code=404, detail=f"Frame {frame_id} not found")
# Get narration to use
narration = request.narration if request and request.narration else target_frame.get("narration", "")
if not narration:
raise HTTPException(status_code=400, detail="No narration text available")
try:
from api.dependencies import get_pixelle_video
pixelle_video = await get_pixelle_video()
# Use LLM to generate aligned image prompt
from pixelle_video.prompts import build_image_prompt_prompt
prompt = build_image_prompt_prompt(
narrations=[narration],
min_words=30,
max_words=60
)
response = await pixelle_video.llm(
prompt=prompt,
temperature=0.7,
max_tokens=500
)
# Parse response
import json
import re
# Try to extract JSON
try:
result = json.loads(response)
except json.JSONDecodeError:
# Try markdown code block
match = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', response)
if match:
result = json.loads(match.group(1))
else:
raise ValueError("Failed to parse LLM response")
if "image_prompts" not in result or len(result["image_prompts"]) == 0:
raise ValueError("No image prompts in response")
new_prompt = result["image_prompts"][0]
# Update frame
target_frame["image_prompt"] = new_prompt
_storyboard_cache[storyboard_id] = storyboard
logger.info(f"Aligned image prompt for frame {frame_id}")
return AlignPromptResponse(
image_prompt=new_prompt,
success=True
)
except Exception as e:
logger.error(f"Prompt alignment failed: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.post(
"/storyboard/{storyboard_id}/frames/{frame_id}/inpaint",
response_model=InpaintResponse
@@ -755,8 +958,9 @@ async def export_video(
for frame in sorted_frames:
path = frame.get("video_segment_path", "")
if path.startswith("http"):
# Extract path from URL
path = path.replace("http://localhost:8000/api/files/", "output/")
# Extract path from URL (format: http://localhost:{port}/api/files/{relative_path})
if "/api/files/" in path:
path = "output/" + path.split("/api/files/")[-1]
video_segments.append(path)
_export_tasks[task_id]["progress"] = 0.3

api/routers/quality.py

@@ -15,10 +15,13 @@ Provides endpoints for:
- Quality gate evaluation
"""
from fastapi import APIRouter, HTTPException, Path, Body
from fastapi import APIRouter, HTTPException, Path, Body, File, UploadFile, Query
from pydantic import BaseModel, Field
from typing import List, Optional
from loguru import logger
import os
import shutil
from datetime import datetime
router = APIRouter(prefix="/quality", tags=["Quality"])
@@ -45,6 +48,20 @@ class CharacterCreateRequest(BaseModel):
clothing_description: str = Field("", description="Clothing description")
distinctive_features: List[str] = Field(default_factory=list)
character_type: str = Field("person")
reference_image_path: Optional[str] = Field(None, description="Reference image path for VLM analysis")
class CharacterAnalyzeRequest(BaseModel):
"""Request to analyze a character image"""
image_path: str = Field(..., description="Path to the reference image")
class CharacterAnalyzeResponse(BaseModel):
"""Response from character image analysis"""
appearance_description: str = ""
clothing_description: str = ""
distinctive_features: List[str] = []
prompt_description: str = "" # Combined description for prompt injection
class ContentCheckRequest(BaseModel):
@@ -115,20 +132,49 @@ async def create_character(
storyboard_id: str = Path(..., description="Storyboard ID"),
request: CharacterCreateRequest = Body(...)
):
"""Register a new character"""
"""
Register a new character
If reference_image_path is provided and appearance_description is empty,
VLM will analyze the image to extract appearance descriptions automatically.
"""
import uuid
if storyboard_id not in _character_stores:
_character_stores[storyboard_id] = {}
# Auto-analyze reference image if provided and no description
appearance_desc = request.appearance_description
clothing_desc = request.clothing_description
distinctive = request.distinctive_features
ref_image = request.reference_image_path
if ref_image and not appearance_desc:
try:
from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer
analyzer = CharacterAnalyzer()
result = await analyzer.analyze_reference_image(ref_image)
if result.appearance_description:
appearance_desc = result.appearance_description
if result.clothing_description:
clothing_desc = result.clothing_description
if result.distinctive_features:
distinctive = result.distinctive_features
logger.info(f"Auto-analyzed character from image: {ref_image}")
except Exception as e:
logger.warning(f"Failed to auto-analyze character image: {e}")
char_id = f"char_{uuid.uuid4().hex[:8]}"
character = CharacterSchema(
id=char_id,
name=request.name,
appearance_description=request.appearance_description,
clothing_description=request.clothing_description,
distinctive_features=request.distinctive_features,
appearance_description=appearance_desc,
clothing_description=clothing_desc,
distinctive_features=distinctive,
character_type=request.character_type,
reference_image=ref_image,
)
_character_stores[storyboard_id][char_id] = character.model_dump()
@@ -184,6 +230,75 @@ async def delete_character(
return {"deleted": True}
@router.post(
"/characters/{storyboard_id}/analyze-image",
response_model=CharacterAnalyzeResponse
)
async def analyze_character_image(
storyboard_id: str = Path(..., description="Storyboard ID"),
request: CharacterAnalyzeRequest = Body(...)
):
"""
Analyze a character reference image using VLM
Extracts detailed appearance descriptions that can be used
to maintain character consistency across frames.
"""
from pixelle_video.services.quality.character_analyzer import CharacterAnalyzer
logger.info(f"Analyzing character image for storyboard {storyboard_id}: {request.image_path}")
analyzer = CharacterAnalyzer()
result = await analyzer.analyze_reference_image(request.image_path)
return CharacterAnalyzeResponse(
appearance_description=result.appearance_description,
clothing_description=result.clothing_description,
distinctive_features=result.distinctive_features,
prompt_description=result.to_prompt_description()
)
@router.post("/upload")
async def upload_file(
file: UploadFile = File(...),
storyboard_id: str = Query(..., description="Storyboard ID"),
type: str = Query("character", description="File type (character, reference)")
):
"""
Upload a file for character reference or other purposes.
Returns the saved file path that can be used for analysis.
"""
try:
# Create output directory
output_dir = f"output/{storyboard_id}"
os.makedirs(output_dir, exist_ok=True)
# Generate filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
ext = os.path.splitext(file.filename)[1] or ".png"
filename = f"{type}_{timestamp}{ext}"
file_path = os.path.join(output_dir, filename)
# Save file
with open(file_path, "wb") as buffer:
content = await file.read()
buffer.write(content)
logger.info(f"Uploaded file to: {file_path}")
return {
"success": True,
"path": file_path,
"file_path": file_path,
"filename": filename
}
except Exception as e:
logger.error(f"Failed to upload file: {e}")
raise HTTPException(status_code=500, detail=str(e))
# ============================================================
# Content Filter Endpoints
# ============================================================
@@ -234,8 +349,9 @@ async def extract_style(
# Convert URL to file path if needed
actual_path = image_path
if image_path.startswith("http"):
# Extract path from URL like http://localhost:8000/api/files/...
actual_path = image_path.replace("http://localhost:8000/api/files/", "output/")
# Extract path from URL (format: http://localhost:{port}/api/files/{relative_path})
if "/api/files/" in image_path:
actual_path = "output/" + image_path.split("/api/files/")[-1]
# Check if file exists
import os
@@ -254,9 +370,14 @@ async def extract_style(
return style_schema
from pixelle_video.services.quality.style_guard import StyleGuard
from api.dependencies import get_pixelle_video
style_guard = StyleGuard()
anchor = style_guard.extract_style_anchor(actual_path)
# Get LLM service for VLM-based style extraction
pixelle_video = await get_pixelle_video()
llm_service = pixelle_video.llm if pixelle_video else None
style_guard = StyleGuard(llm_service=llm_service)
anchor = await style_guard.extract_style_anchor(actual_path)
style_schema = StyleAnchorSchema(
color_palette=anchor.color_palette,

View File

@@ -144,3 +144,12 @@ class ExportStatusResponse(BaseModel):
error: Optional[str] = None
class AlignPromptRequest(BaseModel):
"""Request to align image prompt with narration"""
narration: Optional[str] = Field(None, description="Override narration text")
class AlignPromptResponse(BaseModel):
"""Response after aligning prompt"""
image_prompt: str
success: bool = True

config.example.yaml

@@ -17,6 +17,20 @@ llm:
# DeepSeek: base_url: "https://api.deepseek.com" model: "deepseek-chat"
# Ollama (Local): base_url: "http://localhost:11434/v1" model: "llama3.2"
# ==================== VLM Configuration (Vision Language Model) ====================
# Used for character analysis and image understanding
# If not configured, will try to use LLM config with vision model auto-detection
vlm:
provider: "qwen" # Options: qwen, glm, openai
api_key: "" # Leave empty to use DASHSCOPE_API_KEY or VLM_API_KEY env var
base_url: "" # Leave empty for auto-detection based on provider
model: "" # Leave empty for default model based on provider
# VLM Provider presets:
# Qwen (通义千问): provider: "qwen" model: "qwen-vl-plus" or "qwen-vl-max" or "qwen3-vl-plus"
# GLM (智谱): provider: "glm" model: "glm-4v-flash" or "glm-4v"
# OpenAI: provider: "openai" model: "gpt-4-vision-preview" or "gpt-4o"
# ==================== ComfyUI Configuration ====================
comfyui:
# Global ComfyUI settings

dev.sh

@@ -54,7 +54,7 @@ print_banner() {
start_api() {
echo -e "${GREEN}🚀 Starting FastAPI Backend...${NC}"
uv run python api/app.py --port $API_PORT --reload &
API_PORT=$API_PORT uv run python api/app.py --port $API_PORT --reload &
echo $! > "$PID_DIR/api.pid"
echo -e " ${GREEN}${NC} API running at: ${YELLOW}http://localhost:$API_PORT${NC}"
echo -e " ${GREEN}${NC} API Docs at: ${YELLOW}http://localhost:$API_PORT/docs${NC}"
@@ -63,7 +63,7 @@ start_api() {
start_editor() {
echo -e "${GREEN}🎬 Starting Next.js Editor...${NC}"
cd "$PROJECT_ROOT/frontend"
PORT=$EDITOR_PORT npm run dev &
API_PORT=$API_PORT PORT=$EDITOR_PORT npm run dev &
echo $! > "$PID_DIR/editor.pid"
cd "$PROJECT_ROOT"
echo -e " ${GREEN}${NC} Editor running at: ${YELLOW}http://localhost:$EDITOR_PORT${NC}"
@@ -71,7 +71,7 @@ start_editor() {
start_web() {
echo -e "${GREEN}🌐 Starting Streamlit Web UI...${NC}"
uv run streamlit run web/app.py --server.port $WEB_PORT &
API_PORT=$API_PORT EDITOR_PORT=$EDITOR_PORT uv run streamlit run web/app.py --server.port $WEB_PORT &
echo $! > "$PID_DIR/web.pid"
echo -e " ${GREEN}${NC} Web UI running at: ${YELLOW}http://localhost:$WEB_PORT${NC}"
}

(port configuration guide, new file)

@@ -0,0 +1,54 @@
# Port Configuration Guide

## Default Ports

| Service | Default Port | Notes |
|------|---------|------|
| FastAPI backend | 8000 | API service and docs |
| Next.js editor | 3000 | Timeline editor |
| Streamlit Web UI | 8501 | Web interface |

## Custom Ports

### Option 1: Environment variables (temporary)
```bash
# Customize all ports
API_PORT=8080 EDITOR_PORT=3001 WEB_PORT=8502 ./dev.sh
# Customize only some ports
API_PORT=8080 ./dev.sh
```

### Option 2: .env file (persistent)
1. Copy the example configuration file:
```bash
cp .env.example .env
```
2. Edit the `.env` file and change the ports:
```bash
API_PORT=8080
EDITOR_PORT=3001
WEB_PORT=8502
```
3. Start the services:
```bash
./dev.sh
```

## Notes
1. **Port conflicts**: make sure the chosen ports are not already in use by other programs
2. **Firewall**: configure firewall rules if external access is needed
3. **Frontend rebuild**: after changing ports, the Next.js frontend rebuilds automatically (the first start is slower)

## Verifying the Configuration
After starting, visit the following URLs to confirm the services are up:
- API docs: `http://localhost:{API_PORT}/docs`
- Editor: `http://localhost:{EDITOR_PORT}`
- Web UI: `http://localhost:{WEB_PORT}`
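On the Python side, each service resolves its port the same way the backend diff above does: `os.getenv` with the default as fallback. A minimal sketch (the variable names mirror `.env.example`):

```python
import os

# Resolve each service port from the environment, falling back to the defaults
API_PORT = int(os.getenv("API_PORT", "8000"))
EDITOR_PORT = int(os.getenv("EDITOR_PORT", "3000"))
WEB_PORT = int(os.getenv("WEB_PORT", "8501"))

print(f"API docs: http://localhost:{API_PORT}/docs")
```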

(RunningHub workflow integration guide, new file)

@@ -0,0 +1,434 @@
# Complete Workflow Integration Example
# 📝 RunningHub AI Workflow Interactive Usage Manual (workflow edition)

## 1. Overview

This script calls the RunningHub AI platform's OpenAPI to run the full flow: load a workflow JSON from disk, modify node information, upload files, submit the task, and automatically poll for results.

Main features:
- Read a local workflow configuration (JSON) file
- Generate the list of modifiable node information (nodeInfoList)
- Modify node values by type (image, text, etc.)
- Upload image, audio, and video files
- Submit tasks to RunningHub and poll their status in real time
- Print the file URLs of the final generated results

✅ Intended for advanced users who have a custom workflow (workflowId); the AI workflow can be executed automatically without opening the web page.

---

## 2. Files and Main Functions

### 💡 Main files

| File | Purpose |
|-------------|------|
| workflow.py | Main executable script |
| api.json | Workflow configuration downloaded from RunningHub (contains the node definitions) |

### 🔧 Core functions

| Function | Description |
|--------|----------|
| load_json(file_path) | Read and parse the workflow JSON file from disk |
| convert_to_node_info_list(data) | Convert the JSON into a list of node-info entries |
| upload_file(API_KEY, file_path) | Upload a local file (image/audio/video) to RunningHub |
| submit_task(workflowId, node_info_list, API_KEY) | Submit a task to start the AI workflow |
| query_task_outputs(task_id, API_KEY) | Poll the task status and fetch the result outputs |
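For example, `convert_to_node_info_list` flattens the exported workflow JSON into the nodeInfoList entries the API expects. A standalone copy of the function (as defined in workflow.py below) with a sample input:

```python
import json

def convert_to_node_info_list(data):
    # Flatten {nodeId: {"inputs": {field: value}}} into a list of node-info entries
    node_info_list = []
    for node_id, node_content in data.items():
        for field_name, field_value in node_content.get("inputs", {}).items():
            # Lists/dicts are serialized to strings; everything else is stringified
            if isinstance(field_value, (list, dict)):
                field_value = json.dumps(field_value)
            else:
                field_value = str(field_value)
            node_info_list.append({
                "nodeId": str(node_id),
                "fieldName": str(field_name),
                "fieldValue": field_value,
            })
    return node_info_list

sample = {"10": {"inputs": {"image": "xxx.jpg", "seed": 42}}}
print(convert_to_node_info_list(sample))
```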
---
## 3. Step-by-Step Instructions

### Step 1: Enter the required information

After the script starts, it prompts for the following:

```text
Enter your api_key:
```
This key is obtained from "API 调用" (API access) in the RunningHub console.
Example: `0s2d1***********2n3mk4`

```
Enter the workflowId:
```
Example: `1980468315921559554`
Taken from the end of the workflow URL: https://www.runninghub.cn/workflow/1980237776367083521?source=workspace

Then enter the path to the local workflow JSON file:
```
Enter the path to your JSON file (it must come from your own workspace, obtained by exporting the workflow API JSON locally):
```
Example: `C:\Users\Mayn\Downloads\api.json`

At this point the script prints every node in the workflow:
```
Generating node_info_list (contains all modifiable nodes)
{'3': {'inputs': {...}}, '4': {...}, '6': {...}, ...}
```

---

### Step 2: Inspect and modify nodes

The script prompts:

```text
Enter a nodeId (type 'exit' to finish editing):
```

Enter a nodeId (e.g. 10) and the script lists all fields of that node:
```
🧩 Fields of node 10:
(0, {'nodeId': '10', 'fieldName': 'image', 'fieldValue': 'xxx.jpg'})
```

Then enter the name of the field to modify:
```
Enter the fieldName to modify:
```
Example: `image`

---

### Step 3: Change the field value

#### 📷 File fields (image/audio/video)
```
Enter the local image file path:
```
Example input: `D:\R.jpg`

After a successful upload:
```
Uploading file...
Upload result: {'code': 0, 'msg': 'success', 'data': {'fileName': 'api/xxx.jpg', 'fileType': 'input'}}
✅ Updated image fieldValue: api/xxx.jpg
```

#### 📝 Text or numeric fields
```
Enter the new fieldValue (text):
```
Example input: `1 girl in classroom`
Output:
```
✅ Updated fieldValue: 1 girl in classroom
```

> You can modify as many nodes as needed; type `exit` to finish.

---
### Step 4: Submit the task

Once editing is done, the script submits the task automatically:
```
Submitting task, please wait
📌 Task submission response: {'code': 0, 'msg': 'success', 'data': {...}}
📝 taskId: 1980471280073846785
✅ No node errors; task submitted successfully.
```

---

### Step 5: Poll the task status

The script queries the task status every 5 seconds:
```
⏳ Task running...
⏳ Task running...
🎉 Generation complete!
```

If the task fails, the detailed reason is printed:
```
❌ Task failed!
Node SaveImage failure reason: 'str' object has no attribute 'shape'
Traceback: [...]
```

---
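The 5-second polling loop described above can be sketched in isolation. This is a minimal illustration, not the script itself: `fake_query` and the in-progress code `805` are stand-ins for the real `query_task_outputs` responses.

```python
import time

def poll_outputs(query, task_id, api_key, interval=5, timeout=600):
    # Poll until the task reports success (code == 0 with data) or the timeout expires
    start = time.time()
    while time.time() - start <= timeout:
        result = query(task_id, api_key)
        if result.get("code") == 0 and result.get("data"):
            return result["data"]
        time.sleep(interval)
    raise TimeoutError("task did not finish in time")

# Stubbed query that succeeds on the third call (805 is a hypothetical "running" code)
calls = {"n": 0}
def fake_query(task_id, api_key):
    calls["n"] += 1
    if calls["n"] < 3:
        return {"code": 805, "msg": "RUNNING"}
    return {"code": 0, "data": [{"fileUrl": "https://example.com/out.png"}]}

print(poll_outputs(fake_query, "task-1", "key", interval=0))
```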
### Step 6: View the result files

On success, the script prints the URLs of the generated files:
```
🎉 Generation complete!
[{'fileUrl': 'https://rh-images.xiaoyaoyou.com/f24a6365b08fa3bc02f55cd1f63e74a7/output/ComfyUI_00001_hnqxe_1761016156.png',
'fileType': 'png',
'taskCostTime': '35',
'nodeId': '17'}]
✅ Task finished!
```
Open the `fileUrl` to view the AI-generated image.

## 4. Full Run Overview

1⃣ Enter the API_KEY and workflowId
2⃣ Load the local workflow JSON
3⃣ Auto-generate the list of modifiable nodes
4⃣ Modify the desired node parameters
5⃣ Upload files (e.g. images)
6⃣ Submit the task to RunningHub
7⃣ Poll the task status
8⃣ Fetch and print the result URLs

---
## 5. Sample Output

```
Enter your api_key: a0fada**************b2ke21
Enter the workflowId: ***8315921559***
Enter the path to your JSON file (export the workflow API JSON from your own workspace): C:\Users\Mayn\Downloads\api.json
```
```
🧩 Fields of node 10:
(0, {'nodeId': '10', 'fieldName': 'image', 'fieldValue': 'xxx.jpg'})
✅ Updated image fieldValue: api/xxx.jpg
```
```
Submitting task, please wait
📌 Task submission response: {...}
⏳ Task running...
🎉 Generation complete!
✅ Task finished!
```

---
## 6. Tips

- Python 3.8+ is recommended
- The script can be run directly from a terminal:
```bash
python workflow.py
```
- Windows users: use double backslashes `\\` in file paths
- If you are behind a proxy or on a cloud host, make sure port 443 can reach `www.runninghub.cn`
```python
import http.client
import json
import time
import requests

API_HOST = "www.runninghub.cn"

def load_json(file_path):
    # Open and read the JSON file
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)  # Parse the JSON content into a Python object (dict or list)
    # Print the loaded data
    print(data)
    return data
def convert_to_node_info_list(data):
    node_info_list = []
    for node_id, node_content in data.items():
        inputs = node_content.get("inputs", {})
        for field_name, field_value in inputs.items():
            # If field_value is a list or dict, serialize it to a string
            if isinstance(field_value, (list, dict)):
                field_value = json.dumps(field_value)
            else:
                field_value = str(field_value)
            node_info_list.append({
                "nodeId": str(node_id),
                "fieldName": str(field_name),
                "fieldValue": field_value
            })
    return node_info_list
def upload_file(API_KEY, file_path):
    """Upload a file to the RunningHub platform"""
    url = "https://www.runninghub.cn/task/openapi/upload"
    headers = {
        'Host': 'www.runninghub.cn'
    }
    data = {
        'apiKey': API_KEY,
        'fileType': 'input'
    }
    with open(file_path, 'rb') as f:
        files = {'file': f}
        response = requests.post(url, headers=headers, files=files, data=data)
    return response.json()
# 1⃣ Submit a task
def submit_task(workflowId, node_info_list, API_KEY):
    conn = http.client.HTTPSConnection(API_HOST)
    payload = json.dumps({
        "apiKey": API_KEY,
        "workflowId": workflowId,
        "nodeInfoList": node_info_list
    })
    headers = {
        'Host': API_HOST,
        'Content-Type': 'application/json'
    }
    conn.request("POST", "/task/openapi/create", payload, headers)
    res = conn.getresponse()
    # ✅ Note: json.loads (for a string), not json.load (for a file object)
    data = json.loads(res.read().decode("utf-8"))
    print(data)
    return data

def query_task_outputs(task_id, API_KEY):
    conn = http.client.HTTPSConnection(API_HOST)
    payload = json.dumps({
        "apiKey": API_KEY,
        "taskId": task_id
    })
    headers = {
        'Host': API_HOST,
        'Content-Type': 'application/json'
    }
    conn.request("POST", "/task/openapi/outputs", payload, headers)
    res = conn.getresponse()
    data = json.loads(res.read().decode("utf-8"))
    conn.close()
    return data
if __name__ == "__main__":
    print("The next two inputs identify the AI workflow: the api_key is your secret key (from 'API 调用' in the console); the workflowId is the number at the end of the workflow page URL, e.g. https://www.runninghub.cn/workflow/1980237776367083521?source=workspace")
    Api_key = input("Enter your api_key: ").strip()
    workflowId = input("Enter the workflowId: ").strip()
    print("Please download your workflow API JSON locally first")
    file_path = input("Enter the path to your JSON file (it must come from your own workspace, obtained by exporting the workflow API JSON locally): ").strip()
    print("Generating node_info_list (contains all modifiable nodes)")
    data = load_json(file_path)
    node_info_list = convert_to_node_info_list(data)
    print(node_info_list)
    print("Enter the nodeId and fieldName of the node you want to change to locate it, then enter the new fieldValue; repeat as needed before the workflow request is sent")
    modified_nodes = []
    while True:
        node_id_input = input("Enter a nodeId (type 'exit' to finish editing): ").strip()
        if node_id_input.lower() == "exit":
            break
        # Collect all fields belonging to this nodeId
        node_fields = [n for n in node_info_list if n['nodeId'] == node_id_input]
        if not node_fields:
            print("❌ No node found for that nodeId")
            continue
        print(f"\n🧩 Fields of node {node_id_input}:")
        for field in enumerate(node_fields):
            print(field)
        # Let the user pick the field to modify
        field_name_input = input("\nEnter the fieldName to modify: ").strip()
        target_node = next(
            (f for f in node_fields if f['fieldName'] == field_name_input), None
        )
        if not target_node:
            print("❌ fieldName not found")
            continue
print(f"选中字段: {target_node}")
        # Handle by field type
if target_node['fieldName'] in ["image", "audio", "video"]:
file_path = input(f"请输入您本地{target_node['fieldName']}文件路径: ").strip()
print("等待文件上传中")
upload_result = upload_file(Api_key, file_path)
print("上传结果:", upload_result)
            # Assumes upload_file already returns the parsed JSON dict
if upload_result and upload_result.get("msg") == "success":
uploaded_file_name = upload_result.get("data", {}).get("fileName")
if uploaded_file_name:
target_node['fieldValue'] = uploaded_file_name
print(f"✅ 已更新 {target_node['fieldName']} fieldValue:", uploaded_file_name)
else:
print("❌ 上传失败或返回格式异常:", upload_result)
else:
            # Other types: set the value directly
new_value = input(f"请输入新的 fieldValue ({target_node['fieldName']}): ").strip()
target_node['fieldValue'] = new_value
print("✅ 已更新 fieldValue:", new_value)
modified_nodes.append({
"nodeId": target_node['nodeId'],
"fieldName": target_node['fieldName'],
"fieldValue": target_node['fieldValue']
})
print(modified_nodes)
print("开始提交任务,请等待")
    # Submit the task
    submit_result = submit_task(workflowId, modified_nodes, Api_key)
print("📌 提交任务返回:", submit_result)
if submit_result.get("code") != 0:
print("❌ 提交任务失败:", submit_result)
exit()
task_id = submit_result["data"]["taskId"]
print(f"📝 taskId: {task_id}")
    # Parse the success response
prompt_tips_str = submit_result["data"].get("promptTips")
if prompt_tips_str:
try:
prompt_tips = json.loads(prompt_tips_str)
node_errors = prompt_tips.get("node_errors", {})
if node_errors:
print("⚠️ 节点错误信息如下:")
for node_id, err in node_errors.items():
print(f" 节点 {node_id} 错误: {err}")
else:
print("✅ 无节点错误,任务提交成功。")
except Exception as e:
print("⚠️ 无法解析 promptTips:", e)
else:
print("⚠️ 未返回 promptTips 字段。")
timeout = 600
start_time = time.time()
while True:
outputs_result = query_task_outputs(task_id, Api_key)
code = outputs_result.get("code")
msg = outputs_result.get("msg")
data = outputs_result.get("data")
        if code == 0 and data:  # success
            file_url = data[0].get("fileUrl")
            print("🎉 生成结果完成!文件链接:", file_url)
print(data)
break
        elif code == 805:  # task failed
failed_reason = data.get("failedReason") if data else None
print("❌ 任务失败!")
if failed_reason:
print(f"节点 {failed_reason.get('node_name')} 失败原因: {failed_reason.get('exception_message')}")
print("Traceback:", failed_reason.get("traceback"))
else:
print(outputs_result)
break
        elif code in (804, 813):  # running or queued
status_text = "运行中" if code == 804 else "排队中"
print(f"⏳ 任务{status_text}...")
else:
print("⚠️ 未知状态:", outputs_result)
        # Timeout check
        if time.time() - start_time > timeout:
            print("⏰ 等待超时:超过 10 分钟任务未完成。")
break
time.sleep(5)
    print("轮询结束。")
```
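The status-polling loop at the end of the script can be factored into a reusable helper. A minimal sketch, assuming the same response shape and status codes as above (0 = done, 805 = failed, 804 = running, 813 = queued); `query_fn` is a hypothetical stand-in for `query_task_outputs` with the API key already bound:

```python
import time

def wait_for_task(query_fn, task_id, timeout=600, interval=5):
    """Poll query_fn(task_id) until the task finishes, fails, or times out.

    query_fn is assumed to return a dict shaped like the /task/openapi/outputs
    response: {"code": ..., "msg": ..., "data": ...}.
    """
    start = time.time()
    while True:
        result = query_fn(task_id)
        code = result.get("code")
        if code == 0 and result.get("data"):
            return result  # finished successfully
        if code == 805:
            raise RuntimeError(f"Task failed: {result}")
        if code not in (804, 813):
            print("Unknown status:", result)
        if time.time() - start > timeout:
            raise TimeoutError(f"Task {task_id} not finished after {timeout}s")
        time.sleep(interval)
```

A caller would do `wait_for_task(lambda tid: query_task_outputs(tid, Api_key), task_id)`, keeping the main block free of polling details.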

View File

@@ -2,10 +2,11 @@ import type { NextConfig } from "next";
const nextConfig: NextConfig = {
async rewrites() {
const apiPort = process.env.API_PORT || '8000';
return [
{
source: '/api/:path*',
destination: 'http://localhost:8000/api/:path*',
destination: `http://localhost:${apiPort}/api/:path*`,
},
]
},

View File

@@ -74,6 +74,9 @@ export default function EditorPage() {
const [exportDownloadUrl, setExportDownloadUrl] = useState<string | null>(null)
const [exportError, setExportError] = useState<string | null>(null)
// Save all state
const [isSavingAll, setIsSavingAll] = useState(false)
useEffect(() => {
async function loadStoryboard() {
// Get storyboard_id from URL, default to demo-1
@@ -160,12 +163,39 @@ export default function EditorPage() {
</div>
<div className="flex items-center gap-2">
<Button variant="ghost" size="sm">
<Button variant="ghost" size="sm" onClick={() => {
console.log('[SETTINGS] Settings clicked - not implemented yet')
alert('设置功能开发中...')
}}>
<Settings className="h-4 w-4 mr-2" />
</Button>
<Button variant="ghost" size="sm">
<Save className="h-4 w-4 mr-2" />
<Button
variant="ghost"
size="sm"
disabled={isSavingAll}
onClick={async () => {
if (!storyboard) return
console.log('[SAVE-ALL] Starting save all frames...')
setIsSavingAll(true)
try {
// Save is handled automatically by updateFrame during edits
// This button confirms the current state is synced
console.log('[SAVE-ALL] Current storyboard state:', storyboard.frames.length, 'frames')
alert('当前状态已保存!\n\n提示:编辑分镜后点击右侧「保存」按钮可保存单个分镜的修改。')
} catch (err: any) {
console.error('[SAVE-ALL] Error:', err)
alert('保存失败: ' + err.message)
} finally {
setIsSavingAll(false)
}
}}
>
{isSavingAll ? (
<Loader2 className="h-4 w-4 mr-2 animate-spin" />
) : (
<Save className="h-4 w-4 mr-2" />
)}
</Button>
<ExportButton
@@ -229,6 +259,7 @@ function SelectedFrameDetails() {
const [isSaving, setIsSaving] = useState(false)
const [isRegeneratingImage, setIsRegeneratingImage] = useState(false)
const [isRegeneratingAudio, setIsRegeneratingAudio] = useState(false)
const [isAligningPrompt, setIsAligningPrompt] = useState(false)
const [error, setError] = useState<string | null>(null)
// Update local state when frame changes
@@ -250,14 +281,19 @@ function SelectedFrameDetails() {
const handleSave = async () => {
if (!storyboard || !selectedFrame) return
console.log('[SAVE] Starting save for frame:', selectedFrame.id)
console.log('[SAVE] Narration:', narration)
console.log('[SAVE] Image Prompt:', imagePrompt?.slice(0, 50))
setIsSaving(true)
setError(null)
try {
await editorApi.updateFrame(storyboard.id, selectedFrame.id, {
const result = await editorApi.updateFrame(storyboard.id, selectedFrame.id, {
narration,
image_prompt: imagePrompt,
})
console.log('[SAVE] API response:', result)
// Update local store
updateFrame(selectedFrame.id, {
@@ -265,8 +301,10 @@ function SelectedFrameDetails() {
imagePrompt,
})
console.log('[SAVE] Success!')
setIsEditing(false)
} catch (err: any) {
console.error('[SAVE] Error:', err)
setError(err.message || '保存失败')
} finally {
setIsSaving(false)
@@ -276,6 +314,9 @@ function SelectedFrameDetails() {
const handleRegenerateImage = async () => {
if (!storyboard || !selectedFrame) return
console.log('[REGEN-IMG] Starting regenerate image for frame:', selectedFrame.id)
console.log('[REGEN-IMG] Image prompt:', imagePrompt?.slice(0, 80))
setIsRegeneratingImage(true)
setError(null)
@@ -285,12 +326,15 @@ function SelectedFrameDetails() {
selectedFrame.id,
imagePrompt
)
console.log('[REGEN-IMG] API response:', result)
// Update local store with new image path
updateFrame(selectedFrame.id, {
imagePath: result.image_path,
})
console.log('[REGEN-IMG] Success! New image path:', result.image_path)
} catch (err: any) {
console.error('[REGEN-IMG] Error:', err)
setError(err.message || '重新生成图片失败')
} finally {
setIsRegeneratingImage(false)
@@ -322,6 +366,31 @@ function SelectedFrameDetails() {
}
}
const handleAlignPrompt = async () => {
if (!storyboard || !selectedFrame) return
setIsAligningPrompt(true)
setError(null)
try {
const result = await editorApi.alignPrompt(
storyboard.id,
selectedFrame.id,
narration || selectedFrame.narration
)
// Update local store with new image prompt
updateFrame(selectedFrame.id, {
imagePrompt: result.image_prompt,
})
setImagePrompt(result.image_prompt)
} catch (err: any) {
setError(err.message || '对齐提示词失败')
} finally {
setIsAligningPrompt(false)
}
}
return (
<div className="space-y-4">
{error && (
@@ -434,6 +503,18 @@ function SelectedFrameDetails() {
) : null}
</Button>
<Button
size="sm"
variant="outline"
className="w-full"
onClick={handleAlignPrompt}
disabled={isAligningPrompt}
>
{isAligningPrompt ? (
<Loader2 className="h-4 w-4 animate-spin mr-2" />
) : null}
</Button>
</div>
)}
</div>

View File

@@ -88,6 +88,23 @@ export function PreviewPlayer() {
const selectedFrame = storyboard?.frames.find((f) => f.id === selectedFrameId)
// Sync currentTime when user selects a frame (not during playback)
useEffect(() => {
if (isPlaying || !storyboard?.frames.length || !selectedFrameId) return
// Calculate start time of selected frame
let startTime = 0
for (const frame of storyboard.frames) {
if (frame.id === selectedFrameId) break
startTime += frame.duration
}
// Only update if different (avoid infinite loop)
if (Math.abs(currentTime - startTime) > 0.1) {
setCurrentTime(startTime)
}
}, [selectedFrameId, isPlaying])
// Audio playback sync
useEffect(() => {
if (!audioRef.current) return

View File

@@ -1,8 +1,8 @@
'use client'
import { useState, useEffect } from 'react'
import { useState, useEffect, useRef } from 'react'
import { Button } from '@/components/ui/button'
import { Plus, User, Trash2, Edit, Loader2 } from 'lucide-react'
import { Plus, User, Trash2, Image, Loader2, Wand2 } from 'lucide-react'
import { qualityApi, type Character } from '@/services/quality-api'
interface CharacterPanelProps {
@@ -13,12 +13,14 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
const [characters, setCharacters] = useState<Character[]>([])
const [isLoading, setIsLoading] = useState(false)
const [isAdding, setIsAdding] = useState(false)
const [editingId, setEditingId] = useState<string | null>(null)
const [isAnalyzing, setIsAnalyzing] = useState(false)
const fileInputRef = useRef<HTMLInputElement>(null)
// Form state
const [name, setName] = useState('')
const [appearance, setAppearance] = useState('')
const [clothing, setClothing] = useState('')
const [refImagePath, setRefImagePath] = useState('')
useEffect(() => {
loadCharacters()
@@ -36,8 +38,70 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
}
}
const handleAnalyzeImage = async () => {
if (!refImagePath) {
alert('请先上传参考图片')
return
}
try {
setIsAnalyzing(true)
const result = await qualityApi.analyzeCharacterImage(storyboardId, refImagePath)
// Auto-fill form with VLM results
if (result.appearance_description) {
setAppearance(result.appearance_description)
}
if (result.clothing_description) {
setClothing(result.clothing_description)
}
console.log('[CHARACTER] VLM analysis result:', result)
} catch (e) {
console.error('Failed to analyze image:', e)
alert('图片分析失败,请重试')
} finally {
setIsAnalyzing(false)
}
}
const handleImageUpload = async (e: React.ChangeEvent<HTMLInputElement>) => {
const file = e.target.files?.[0]
if (!file) return
try {
// Upload to server - use quality API endpoint
const formData = new FormData()
formData.append('file', file)
const apiBase = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000/api'
const response = await fetch(`${apiBase}/quality/upload?storyboard_id=${storyboardId}&type=character`, {
method: 'POST',
body: formData,
})
if (response.ok) {
const data = await response.json()
setRefImagePath(data.path || data.file_path)
console.log('[CHARACTER] Image uploaded:', data.path)
} else {
// Fallback: use local file path for demo
const localPath = `output/${storyboardId}/character_${Date.now()}.png`
setRefImagePath(localPath)
console.log('[CHARACTER] Using fallback path:', localPath)
}
} catch (e) {
console.error('Failed to upload image:', e)
// Fallback for demo
setRefImagePath(`output/${storyboardId}/character_ref.png`)
}
}
const handleAdd = async () => {
if (!name.trim()) return
if (!name.trim()) {
alert('请输入角色名称')
return
}
try {
const newChar = await qualityApi.createCharacter(storyboardId, {
@@ -46,6 +110,7 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
clothing_description: clothing,
distinctive_features: [],
character_type: 'person',
reference_image_path: refImagePath || undefined,
})
setCharacters([...characters, newChar])
resetForm()
@@ -67,8 +132,8 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
setName('')
setAppearance('')
setClothing('')
setRefImagePath('')
setIsAdding(false)
setEditingId(null)
}
return (
@@ -98,10 +163,10 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
key={char.id}
className="flex items-center justify-between p-2 bg-muted/50 rounded text-sm"
>
<div>
<div className="flex-1 min-w-0">
<div className="font-medium">{char.name}</div>
{char.appearance_description && (
<div className="text-xs text-muted-foreground truncate max-w-[150px]">
<div className="text-xs text-muted-foreground truncate">
{char.appearance_description}
</div>
)}
@@ -109,7 +174,7 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
<Button
variant="ghost"
size="icon"
className="h-6 w-6"
className="h-6 w-6 shrink-0"
onClick={() => handleDelete(char.id)}
>
<Trash2 className="h-3 w-3 text-destructive" />
@@ -119,7 +184,7 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
{characters.length === 0 && !isAdding && (
<p className="text-xs text-muted-foreground text-center py-2">
+
</p>
)}
</div>
@@ -127,19 +192,54 @@ export function CharacterPanel({ storyboardId }: CharacterPanelProps) {
{isAdding && (
<div className="space-y-2 pt-2 border-t">
{/* Reference Image Upload */}
<div className="flex items-center gap-2">
<input
ref={fileInputRef}
type="file"
accept="image/*"
onChange={handleImageUpload}
className="hidden"
/>
<Button
variant="outline"
size="sm"
className="flex-1"
onClick={() => fileInputRef.current?.click()}
>
<Image className="h-4 w-4 mr-2" />
{refImagePath ? '已上传参考图' : '上传参考图'}
</Button>
<Button
variant="secondary"
size="sm"
onClick={handleAnalyzeImage}
disabled={!refImagePath || isAnalyzing}
>
{isAnalyzing ? (
<Loader2 className="h-4 w-4 animate-spin" />
) : (
<>
<Wand2 className="h-4 w-4 mr-1" />
</>
)}
</Button>
</div>
<input
type="text"
value={name}
onChange={(e) => setName(e.target.value)}
placeholder="角色名称"
placeholder="角色名称 *"
className="w-full p-2 text-sm border rounded bg-background"
/>
<input
type="text"
<textarea
value={appearance}
onChange={(e) => setAppearance(e.target.value)}
placeholder="外貌描述"
className="w-full p-2 text-sm border rounded bg-background"
placeholder="外貌描述(可通过分析自动生成)"
rows={2}
className="w-full p-2 text-sm border rounded bg-background resize-none"
/>
<input
type="text"

View File

@@ -197,6 +197,31 @@ class EditorApiClient {
return response.json()
}
/**
* Align image prompt with narration - regenerate prompt based on narration
*/
async alignPrompt(
storyboardId: string,
frameId: string,
narration?: string
): Promise<{ image_prompt: string; success: boolean }> {
const response = await fetch(
`${this.baseUrl}/editor/storyboard/${storyboardId}/frames/${frameId}/align-prompt`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ narration }),
}
)
if (!response.ok) {
const error = await response.json().catch(() => ({ detail: response.statusText }))
throw new Error(error.detail || `Failed to align prompt: ${response.statusText}`)
}
return response.json()
}
/**
* Inpaint (局部重绘) image for a frame
*/

View File

@@ -20,6 +20,22 @@ export interface Character {
reference_image?: string
}
export interface CharacterAnalysisResult {
appearance_description: string
clothing_description: string
distinctive_features: string[]
prompt_description: string
}
export interface CharacterCreateData {
name: string
appearance_description: string
clothing_description: string
distinctive_features: string[]
character_type: string
reference_image_path?: string
}
export interface ContentCheckResult {
passed: boolean
category: 'safe' | 'sensitive' | 'blocked'
@@ -65,7 +81,7 @@ class QualityApiClient {
async createCharacter(
storyboardId: string,
data: Omit<Character, 'id'>
data: CharacterCreateData
): Promise<Character> {
const response = await fetch(
`${this.baseUrl}/quality/characters/${storyboardId}`,
@@ -110,6 +126,24 @@ class QualityApiClient {
}
}
async analyzeCharacterImage(
storyboardId: string,
imagePath: string
): Promise<CharacterAnalysisResult> {
const response = await fetch(
`${this.baseUrl}/quality/characters/${storyboardId}/analyze-image`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ image_path: imagePath }),
}
)
if (!response.ok) {
throw new Error('Failed to analyze character image')
}
return response.json()
}
// ============================================================
// Content Filter
// ============================================================

View File

@@ -29,7 +29,7 @@ Usage:
if config_manager.validate():
print("Config is valid!")
"""
from .schema import PixelleVideoConfig, LLMConfig, ComfyUIConfig, TTSSubConfig, ImageSubConfig, VideoSubConfig
from .schema import PixelleVideoConfig, LLMConfig, VLMConfig, ComfyUIConfig, TTSSubConfig, ImageSubConfig, VideoSubConfig
from .manager import ConfigManager
from .loader import load_config_dict, save_config_dict
@@ -38,7 +38,8 @@ config_manager = ConfigManager()
__all__ = [
"PixelleVideoConfig",
"LLMConfig",
"LLMConfig",
"VLMConfig",
"ComfyUIConfig",
"TTSSubConfig",
"ImageSubConfig",

View File

@@ -26,6 +26,14 @@ class LLMConfig(BaseModel):
model: str = Field(default="", description="LLM Model Name")
class VLMConfig(BaseModel):
"""VLM (Vision Language Model) configuration for character analysis"""
provider: str = Field(default="qwen", description="VLM provider: qwen, glm, openai")
api_key: str = Field(default="", description="VLM API Key")
base_url: str = Field(default="", description="VLM API Base URL (auto-detected if empty)")
model: str = Field(default="", description="VLM Model Name (defaults based on provider)")
class TTSLocalConfig(BaseModel):
"""Local TTS configuration (Edge TTS)"""
voice: str = Field(default="zh-CN-YunjianNeural", description="Edge TTS voice ID")
@@ -92,6 +100,7 @@ class PixelleVideoConfig(BaseModel):
"""Pixelle-Video main configuration"""
project_name: str = Field(default="Pixelle-Video", description="Project name")
llm: LLMConfig = Field(default_factory=LLMConfig)
vlm: VLMConfig = Field(default_factory=VLMConfig)
comfyui: ComfyUIConfig = Field(default_factory=ComfyUIConfig)
template: TemplateConfig = Field(default_factory=TemplateConfig)

View File

@@ -124,7 +124,13 @@ class StandardPipeline(LinearVideoPipeline):
else: # fixed
self._report_progress(ctx.progress_callback, "splitting_script", 0.05)
split_mode = ctx.params.get("split_mode", "paragraph")
ctx.narrations = await split_narration_script(text, split_mode=split_mode)
target_segments = ctx.params.get("target_segments", 8)
ctx.narrations = await split_narration_script(
text,
split_mode=split_mode,
llm_service=self.llm if split_mode == "smart" else None,
target_segments=target_segments
)
logger.info(f"✅ Split script into {len(ctx.narrations)} segments (mode={split_mode})")
logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode")
@@ -495,11 +501,26 @@ class StandardPipeline(LinearVideoPipeline):
logger.warning("No task_id in storyboard, skipping persistence")
return
# Build metadata
input_with_title = ctx.params.copy()
input_with_title["text"] = ctx.input_text # Ensure text is included
if not input_with_title.get("title"):
input_with_title["title"] = storyboard.title
# Build metadata - filter out non-serializable objects
clean_input = {}
for key, value in ctx.params.items():
# Skip non-serializable objects like CharacterMemory
if key == "character_memory":
# Convert to serializable dict if present
if value is not None and hasattr(value, 'to_dict'):
clean_input["character_memory"] = value.to_dict()
elif key == "progress_callback":
# Skip callback functions
continue
elif callable(value):
# Skip any callable objects
continue
else:
clean_input[key] = value
clean_input["text"] = ctx.input_text # Ensure text is included
if not clean_input.get("title"):
clean_input["title"] = storyboard.title
metadata = {
"task_id": task_id,
@@ -507,7 +528,7 @@ class StandardPipeline(LinearVideoPipeline):
"completed_at": storyboard.completed_at.isoformat() if storyboard.completed_at else None,
"status": "completed",
"input": input_with_title,
"input": clean_input,
"result": {
"video_path": result.video_path,

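The metadata-cleaning hunk above special-cases `character_memory` and callables. The same idea can be generalized by probing each value with `json.dumps` and falling back to a `to_dict()` conversion; a sketch under that assumption (`clean_params` is illustrative, not part of the codebase):

```python
import json

def clean_params(params: dict) -> dict:
    """Keep only JSON-serializable values; convert objects exposing to_dict()."""
    clean = {}
    for key, value in params.items():
        if callable(value):
            continue  # skip callbacks such as progress_callback
        if hasattr(value, "to_dict"):
            clean[key] = value.to_dict()  # e.g. CharacterMemory-like objects
            continue
        try:
            json.dumps(value)
        except (TypeError, ValueError):
            continue  # drop anything json cannot encode
        clean[key] = value
    return clean
```

This avoids growing a list of special-cased keys as new non-serializable context objects are added to `ctx.params`.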
View File

@@ -29,6 +29,13 @@ from pixelle_video.prompts.image_generation import (
)
from pixelle_video.prompts.style_conversion import build_style_conversion_prompt
# Paragraph merging (two-step: analysis + grouping)
from pixelle_video.prompts.paragraph_merging import (
build_paragraph_analysis_prompt,
build_paragraph_grouping_prompt,
build_paragraph_merging_prompt, # Legacy support
)
__all__ = [
# Narration builders
@@ -40,6 +47,11 @@ __all__ = [
"build_image_prompt_prompt",
"build_style_conversion_prompt",
# Paragraph merging (two-step)
"build_paragraph_analysis_prompt",
"build_paragraph_grouping_prompt",
"build_paragraph_merging_prompt", # Legacy
# Image style presets
"IMAGE_STYLE_PRESETS",
"DEFAULT_IMAGE_STYLE",

View File

@@ -58,31 +58,37 @@ Based on the existing video script, create corresponding **English** image promp
# Input Content
{narrations_json}
# ⭐ Core Imagery Extraction (Critical for Relevance)
For EACH narration, you MUST:
1. **Extract 2-3 core visual imagery/metaphors** that best represent the narration's meaning
2. **Identify the emotional tone** (hopeful, melancholic, inspiring, etc.)
3. **Determine concrete visual elements** that embody these abstract concepts
Example thought process:
- Narration: "给自己一个不设限的探索时间"
- Core Imagery: exploration, freedom, open paths
- Emotional Tone: hopeful, adventurous
- Visual Elements: open road, person looking at horizon, map with unmarked routes
# Output Requirements
## Image Prompt Specifications
- Language: **Must use English** (for AI image generation models)
- Description structure: scene + character action + emotion + symbolic elements
- Description length: Ensure clear, complete, and creative descriptions (recommended 50-100 English words)
- **REQUIRED Structure**: [Core imagery] + [Scene description] + [Character action] + [Emotional atmosphere]
- Description length: 50-100 English words
- **The image prompt MUST directly reflect the extracted core imagery from the narration**
## Visual Creative Requirements
- Each image must accurately reflect the specific content and emotion of the corresponding narration
- Use symbolic techniques to visualize abstract concepts (e.g., use paths to represent life choices, chains to represent constraints, etc.)
- **Prioritize core visual metaphors** - the main visual elements must embody the narration's key message
- Use symbolic techniques to visualize abstract concepts (e.g., paths=choices, chains=constraints, open doors=opportunities)
- Scenes should express rich emotions and actions to enhance visual impact
- Highlight themes through composition and element arrangement, avoid overly literal representations
## Key English Vocabulary Reference
- Symbolic elements: symbolic elements
- Expression: expression / facial expression
- Action: action / gesture / movement
- Scene: scene / setting
- Atmosphere: atmosphere / mood
## Visual and Copy Coordination Principles
- Images should serve the copy, becoming a visual extension of the copy content
- Avoid visual elements unrelated to or contradicting the copy content
- Choose visual presentation methods that best enhance the persuasiveness of the copy
- Ensure the audience can quickly understand the core viewpoint of the copy through images
## Visual and Narration Coordination Principles (Most Important)
- **Direct semantic connection**: The main visual elements MUST represent the narration's core meaning
- **Avoid decorative scenes**: Don't add unrelated beautiful scenery that doesn't support the message
- **Ask yourself**: If someone saw only the image, could they guess what the narration is about?
- **Test question**: What is the ONE THING this narration is about? Make sure that thing is visible in the image.
## Creative Guidance
1. **Phenomenon Description Copy**: Use intuitive scenes to represent social phenomena
@@ -97,8 +103,8 @@ Strictly output in the following JSON format, **image prompts must be in English
```json
{{
"image_prompts": [
"[detailed English image prompt following the style requirements]",
"[detailed English image prompt following the style requirements]"
"[Core imagery visible] + [Scene with semantic connection to narration] + [Character/action reflecting the message] + [Emotional atmosphere]",
"[Next image prompt following the same structure]"
]
}}
```
@@ -109,14 +115,15 @@ Strictly output in the following JSON format, **image prompts must be in English
3. Input is {{"narrations": [narration array]}} format, output is {{"image_prompts": [image prompt array]}} format
4. **The output image_prompts array must contain exactly {narrations_count} elements, corresponding one-to-one with the input narrations array**
5. **Image prompts must use English** (for AI image generation models)
6. Image prompts must accurately reflect the specific content and emotion of the corresponding narration
7. Each image must be creative and visually impactful, avoid being monotonous
8. Ensure visual scenes can enhance the persuasiveness of the copy and audience understanding
6. **⭐ Most Critical: Each image prompt must have DIRECT semantic relevance to its narration**
7. Before writing each prompt, mentally extract the core visual metaphor from the narration
8. Verify: Could someone understand the narration's message from the image alone?
Now, please create {narrations_count} corresponding **English** image prompts for the above {narrations_count} narrations. Only output JSON, no other content.
"""
def build_image_prompt_prompt(
narrations: List[str],
min_words: int,

View File

@@ -0,0 +1,202 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Paragraph merging prompt
For intelligently merging short paragraphs into longer segments suitable for video storyboards.
Uses a two-step approach: first analyze, then group.
"""
import json
from typing import List
# Step 1: Analyze text and recommend segment count
PARAGRAPH_ANALYSIS_PROMPT = """# 任务定义
你是一个专业的视频分镜规划师。请分析以下文本,推荐最佳分镜数量。
# 核心任务
分析文本结构,根据以下原则推荐分镜数量:
## 分析原则
1. **语义边界**:识别场景切换、话题转换、情绪变化点
2. **叙事完整性**:保持对话回合完整(问-答不拆分)
3. **时长控制**:每个分镜语音时长建议 15-45 秒(约 60-180 字)
4. **视觉多样性**:确保分镜之间有足够的画面变化
## 文本信息
- 总段落数:{total_paragraphs}
- 预估总字数:{total_chars}
- 预估总时长:{estimated_duration}
## 输入段落预览
{paragraphs_preview}
# 输出格式
返回 JSON 格式的分析结果:
```json
{{
"recommended_segments": 8,
"reasoning": "文本包含开场设定、分手对话、争吵升级、离别等多个场景切换点...",
"scene_boundaries": [
{{"after_paragraph": 3, "reason": "场景从背景介绍转入对话"}},
{{"after_paragraph": 7, "reason": "对话情绪升级"}},
...
]
}}
```
# 重要提醒
1. recommended_segments 应该在 3-15 之间
2. 每个分镜平均字数建议 80-200 字
3. scene_boundaries 标记主要的场景切换点,用于后续分组参考
4. 只输出 JSON,不要添加其他解释
"""
# Step 2: Group paragraphs based on analysis
PARAGRAPH_GROUPING_PROMPT = """# 任务定义
你是一个专业的文本分段专家。根据分析结果,将段落分组。
# 核心任务
{total_paragraphs} 个段落(编号 0 到 {max_index})分成 **{target_segments}** 个分组。
# 分析建议
{analysis_hint}
# 分组原则
1. **语义关联**:将描述同一场景、同一对话回合的段落放在一起
2. **对话完整**:一轮完整的对话(问与答)应该在同一分组
3. **场景统一**:同一时间、地点发生的事件应该在同一分组
4. **长度均衡**:每个分组的字数尽量均衡(目标 80-200 字/分组)
5. **顺序保持**:分组内段落必须连续
# 输入段落
{paragraphs_preview}
# 输出格式
返回 JSON 格式,包含每个分组的起始和结束索引(包含)。
```json
{{
"groups": [
{{"start": 0, "end": 3}},
{{"start": 4, "end": 7}},
{{"start": 8, "end": 12}}
]
}}
```
# 重要提醒
1. 必须输出正好 {target_segments} 个分组
2. 分组必须覆盖所有段落(从 0 到 {max_index})
3. 每个分组的 start 必须等于上一个 end + 1
4. 只输出 JSON,不要添加其他解释
"""
def build_paragraph_analysis_prompt(
paragraphs: List[str],
) -> str:
"""
Build prompt for analyzing text and recommending segment count
Args:
paragraphs: List of original paragraphs
Returns:
Formatted prompt for analysis
"""
# Calculate stats
total_chars = sum(len(p) for p in paragraphs)
# Estimate: ~250 chars/minute for Chinese speech
estimated_duration = int(total_chars / 250 * 60)
# Create preview for each paragraph (first 50 chars)
previews = []
for i, para in enumerate(paragraphs):
preview = para[:50].replace('\n', ' ')
char_count = len(para)
if len(para) > 50:
preview += "..."
previews.append(f"[{i}] ({char_count}字) {preview}")
paragraphs_preview = "\n".join(previews)
return PARAGRAPH_ANALYSIS_PROMPT.format(
paragraphs_preview=paragraphs_preview,
total_paragraphs=len(paragraphs),
total_chars=total_chars,
estimated_duration=estimated_duration
)
def build_paragraph_grouping_prompt(
paragraphs: List[str],
target_segments: int,
analysis_result: dict = None,
) -> str:
"""
Build prompt for grouping paragraphs based on analysis
Args:
paragraphs: List of original paragraphs
target_segments: Target number of segments (from analysis)
analysis_result: Optional analysis result for context
Returns:
Formatted prompt for grouping
"""
# Create preview with char counts
previews = []
for i, para in enumerate(paragraphs):
preview = para[:50].replace('\n', ' ')
char_count = len(para)
if len(para) > 50:
preview += "..."
previews.append(f"[{i}] ({char_count}字) {preview}")
paragraphs_preview = "\n".join(previews)
# Build analysis hint if available
analysis_hint = ""
if analysis_result:
if "reasoning" in analysis_result:
analysis_hint += f"分析理由:{analysis_result['reasoning']}\n"
if "scene_boundaries" in analysis_result:
boundaries = [str(b.get("after_paragraph", "")) for b in analysis_result["scene_boundaries"]]
analysis_hint += f"建议场景切换点(段落后):{', '.join(boundaries)}"
if not analysis_hint:
analysis_hint = "无额外分析信息"
return PARAGRAPH_GROUPING_PROMPT.format(
paragraphs_preview=paragraphs_preview,
target_segments=target_segments,
total_paragraphs=len(paragraphs),
max_index=len(paragraphs) - 1,
analysis_hint=analysis_hint
)
# Legacy support - keep original function name for backward compatibility
def build_paragraph_merging_prompt(
paragraphs: List[str],
target_segments: int = 8,
) -> str:
"""
Legacy function for backward compatibility.
Now delegates to build_paragraph_grouping_prompt.
"""
return build_paragraph_grouping_prompt(paragraphs, target_segments)

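The grouping contract spelled out in `PARAGRAPH_GROUPING_PROMPT` (exactly `target_segments` groups, full coverage, each group's `start` equal to the previous `end + 1`) is easy to verify before trusting LLM output. A minimal sketch; `validate_groups` and `merge_by_groups` are illustrative helpers, not part of this file:

```python
def validate_groups(groups, total_paragraphs, target_segments):
    """Check an LLM grouping against the prompt's rules:
    exact group count, contiguous ranges, and full coverage of 0..N-1."""
    if len(groups) != target_segments:
        return False
    expected_start = 0
    for g in groups:
        if g["start"] != expected_start or g["end"] < g["start"]:
            return False
        expected_start = g["end"] + 1
    return expected_start == total_paragraphs

def merge_by_groups(paragraphs, groups):
    """Join each group's paragraphs into one narration segment."""
    return ["\n".join(paragraphs[g["start"]:g["end"] + 1]) for g in groups]
```

On a failed validation the caller could retry the grouping prompt or fall back to the plain paragraph split mode.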
View File

@@ -0,0 +1,323 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CharacterAnalyzer - VLM-based character appearance extraction
Analyzes reference images to extract detailed character descriptions
for maintaining visual consistency across video frames.
"""
import base64
import json
import os
import re
from dataclasses import dataclass
from typing import List, Optional
from loguru import logger
from openai import AsyncOpenAI
@dataclass
class CharacterAnalysisResult:
"""Result of character image analysis"""
appearance_description: str = "" # Physical features
clothing_description: str = "" # What they're wearing
    distinctive_features: Optional[List[str]] = None  # Unique identifying features
def __post_init__(self):
if self.distinctive_features is None:
self.distinctive_features = []
def to_prompt_description(self) -> str:
"""Generate a prompt-ready character description"""
parts = []
if self.appearance_description:
parts.append(self.appearance_description)
if self.clothing_description:
parts.append(f"wearing {self.clothing_description}")
if self.distinctive_features:
features = ", ".join(self.distinctive_features)
parts.append(f"with {features}")
return ", ".join(parts) if parts else ""
def to_dict(self) -> dict:
return {
"appearance_description": self.appearance_description,
"clothing_description": self.clothing_description,
"distinctive_features": self.distinctive_features,
}
class CharacterAnalyzer:
"""
VLM-based character appearance analyzer
Analyzes reference images to extract detailed character descriptions
that can be injected into image generation prompts.
Example:
>>> analyzer = CharacterAnalyzer()
>>> result = await analyzer.analyze_reference_image("character.png")
>>> print(result.appearance_description)
"young woman with long black hair, round face, fair skin"
>>> print(result.to_prompt_description())
"young woman with long black hair, round face, fair skin, wearing blue hoodie, with round glasses"
"""
def __init__(self):
"""Initialize CharacterAnalyzer"""
pass
async def analyze_reference_image(
self,
image_path: str,
) -> CharacterAnalysisResult:
"""
Analyze a reference image to extract character appearance
Args:
image_path: Path to the reference image
Returns:
CharacterAnalysisResult with extracted descriptions
"""
logger.info(f"Analyzing character reference image: {image_path}")
# Check if file exists
if not os.path.exists(image_path):
logger.warning(f"Image not found: {image_path}")
return CharacterAnalysisResult()
try:
# Read and encode image
with open(image_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode("utf-8")
# Determine image type
ext = os.path.splitext(image_path)[1].lower()
media_type = "image/png" if ext == ".png" else "image/jpeg"
# VLM prompt for character analysis - optimized for storyboard consistency
# Focus on CONSTANT features (identity), exclude VARIABLE features (pose/expression)
analysis_prompt = """Analyze this character/person for VIDEO STORYBOARD consistency.
GOAL: Extract features that should remain CONSISTENT across different video frames.
The output will be injected into image generation prompts for multiple scenes.
Extract ONLY these CONSTANT features:
1. Identity: gender, approximate age group (child/young/middle-aged/elderly)
2. Hair: color, length, style (NOT affected by wind/movement)
3. Face: skin tone, face shape (NOT expressions)
4. Clothing: type and colors (assume same outfit throughout video)
5. Distinctive: glasses, accessories, tattoos, scars, unique marks
DO NOT include:
- Expressions (smile, frown) - changes per scene
- Poses/gestures - changes per scene
- View angle - determined by scene composition
- Lighting/shadows - scene-dependent
- Background elements
Output JSON format (simple strings for direct prompt injection):
{
"identity": "elderly man" or "young woman" etc,
"appearance": "short gray hair, light skin, round face",
"clothing": "brown sweater vest over white shirt, dark trousers",
"distinctive": ["round glasses", "silver watch"]
}
Output ONLY the JSON, no explanation."""
# Build multimodal message
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": analysis_prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:{media_type};base64,{image_data}"
}
}
]
}
]
# Get VLM configuration
# Priority: Environment variables > config.yaml > defaults
from pixelle_video.config import config_manager
# VLM config from config.yaml (now part of PixelleVideoConfig)
vlm_config = config_manager.config.vlm
# Environment variables override config.yaml
vlm_provider = os.getenv("VLM_PROVIDER") or vlm_config.provider or "qwen"
vlm_api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY") or vlm_config.api_key
vlm_base_url = os.getenv("VLM_BASE_URL") or vlm_config.base_url
vlm_model = os.getenv("VLM_MODEL") or vlm_config.model
# Configure based on provider
if vlm_provider == "qwen":
# Alibaba Tongyi Qianwen (Qwen VL)
vlm_base_url = vlm_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
vlm_model = vlm_model or "qwen-vl-plus" # or qwen-vl-max, qwen3-vl-plus
logger.info(f"Using Qwen VL: model={vlm_model}")
elif vlm_provider == "glm":
# Zhipu GLM-4V
llm_config = config_manager.config.llm
vlm_api_key = vlm_api_key or llm_config.api_key
vlm_base_url = vlm_base_url or llm_config.base_url
vlm_model = vlm_model or "glm-4v-flash"
logger.info(f"Using GLM VL: model={vlm_model}")
else: # openai or other
llm_config = config_manager.config.llm
vlm_api_key = vlm_api_key or llm_config.api_key
vlm_base_url = vlm_base_url or llm_config.base_url
vlm_model = vlm_model or llm_config.model
logger.info(f"Using {vlm_provider} VL: model={vlm_model}")
if not vlm_api_key:
logger.error("No VLM API key configured. Set VLM_API_KEY or DASHSCOPE_API_KEY environment variable.")
return CharacterAnalysisResult()
# Create OpenAI-compatible client
client = AsyncOpenAI(
api_key=vlm_api_key,
base_url=vlm_base_url
)
# Call VLM
response = await client.chat.completions.create(
model=vlm_model,
messages=messages,
temperature=0.3,
max_tokens=2000
)
vlm_response = response.choices[0].message.content if response.choices else None
if vlm_response:
logger.debug(f"VLM character analysis response: {vlm_response[:150]}...")
else:
logger.warning(f"VLM returned empty content. Full response: {response}")
# Parse response
return self._parse_response(vlm_response)
except Exception as e:
logger.error(f"Character analysis failed: {e}")
return CharacterAnalysisResult()
def _parse_response(self, response: str) -> CharacterAnalysisResult:
"""Parse VLM response into CharacterAnalysisResult"""
if not response:
logger.warning("Empty VLM response")
return CharacterAnalysisResult()
# Log full response for debugging
logger.debug(f"Full VLM response:\n{response}")
try:
# Remove markdown code blocks if present
cleaned = response.strip()
if cleaned.startswith("```json"):
cleaned = cleaned[7:]
elif cleaned.startswith("```"):
cleaned = cleaned[3:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3]
cleaned = cleaned.strip()
# Try to extract JSON from response
match = re.search(r'\{[\s\S]*\}', cleaned)
if match:
json_str = match.group()
logger.debug(f"Extracted JSON: {json_str[:200]}...")
data = json.loads(json_str)
else:
logger.warning("No JSON found in response, trying direct parse")
data = json.loads(cleaned)
# Handle nested JSON structures - flatten to strings
# New field names: identity, appearance, clothing, distinctive
identity = data.get("identity", "")
appearance = data.get("appearance", "") or data.get("appearance_description", "")
if isinstance(appearance, dict):
# Flatten nested object to descriptive string
parts = []
for key, value in appearance.items():
if isinstance(value, dict):
details = ", ".join(f"{k}: {v}" for k, v in value.items())
parts.append(f"{key} ({details})")
else:
parts.append(f"{key}: {value}")
appearance = "; ".join(parts)
# Combine identity + appearance for full description
if identity and appearance:
full_appearance = f"{identity}, {appearance}"
else:
full_appearance = identity or appearance
clothing = data.get("clothing", "") or data.get("clothing_description", "")
if isinstance(clothing, dict):
# Flatten nested clothing description
parts = []
for person, items in clothing.items():
if isinstance(items, dict):
details = ", ".join(f"{k}: {v}" for k, v in items.items())
parts.append(f"{person} ({details})")
else:
parts.append(f"{person}: {items}")
clothing = "; ".join(parts)
distinctive = data.get("distinctive", []) or data.get("distinctive_features", [])
if not isinstance(distinctive, list):
distinctive = [str(distinctive)]
result = CharacterAnalysisResult(
appearance_description=full_appearance,
clothing_description=clothing,
distinctive_features=distinctive,
)
logger.info(f"Character analysis extracted: {result.appearance_description[:80] if result.appearance_description else 'empty'}...")
return result
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"Failed to parse VLM response: {e}")
logger.debug(f"Response that failed to parse: {response[:500]}")
# Try to use the raw response as appearance description (fallback)
if response and 20 < len(response) < 500:
# Clean up the response
fallback = response.strip()
if "```" in fallback:
fallback = re.sub(r'```.*?```', '', fallback, flags=re.DOTALL).strip()
if fallback:
logger.info(f"Using raw response as appearance: {fallback[:80]}...")
return CharacterAnalysisResult(
appearance_description=fallback
)
return CharacterAnalysisResult()

View File

@@ -558,6 +558,19 @@ class CharacterMemory:
self._characters.clear()
self._name_index.clear()
logger.info("Character memory cleared")
def to_dict(self) -> dict:
"""Convert to JSON-serializable dictionary"""
return {
"characters": [char.to_dict() for char in self.characters],
"config": {
"auto_detect_characters": self.config.auto_detect_characters,
"use_llm_detection": self.config.use_llm_detection,
"inject_character_prompts": self.config.inject_character_prompts,
"use_reference_images": self.config.use_reference_images,
"enable_visual_features": self.config.enable_visual_features,
}
}
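The related pipeline fix skips non-serializable objects (callbacks, `CharacterMemory` before it gained `to_dict`) when persisting metadata. A minimal sketch of that filtering idea — the `meta` dict and `json_safe` helper are illustrative, not the project's API:

```python
import json

def json_safe(value) -> bool:
    """Return True if value survives json.dumps."""
    try:
        json.dumps(value)
        return True
    except TypeError:
        return False

meta = {
    "title": "demo",
    "on_progress": lambda event: None,  # callback: not serializable
    "memory": object(),                 # stand-in for a non-serializable service object
}
# Keep only values that can be written to the metadata JSON file
serializable = {k: v for k, v in meta.items() if json_safe(v)}
print(serializable)  # {'title': 'demo'}
```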
@property
def feature_extractor(self):

View File

@@ -164,28 +164,140 @@ class StyleGuard:
async def _extract_with_vlm(self, image_path: str) -> StyleAnchor:
"""Extract style using Vision Language Model"""
try:
- # TODO: Implement VLM call when vision-capable LLM is integrated
- # For now, return a placeholder
- logger.debug("VLM style extraction: using placeholder (VLM not yet integrated)")
if not self.llm_service:
logger.warning("No LLM service available, using basic extraction")
return self._extract_basic(image_path)
- # Placeholder extraction based on common styles
- return StyleAnchor(
- art_style="consistent artistic",
- color_palette="harmonious colors",
- lighting="balanced",
- style_prefix="maintaining visual consistency, same artistic style as previous frames",
- reference_image=image_path,
- )
import base64
import os
from openai import AsyncOpenAI
# Read and encode image
if not os.path.exists(image_path):
logger.warning(f"Image not found: {image_path}")
return self._extract_basic(image_path)
with open(image_path, "rb") as f:
image_data = base64.b64encode(f.read()).decode("utf-8")
# Determine image type
ext = os.path.splitext(image_path)[1].lower()
media_type = "image/png" if ext == ".png" else "image/jpeg"
# Style extraction prompt
style_prompt = """Analyze this image and extract its visual style characteristics.
Provide a concise style description that could be used as a prefix for image generation prompts to maintain visual consistency.
Output format (JSON):
{
"art_style": "specific art style (e.g., oil painting, digital illustration, anime, photorealistic, watercolor, line art)",
"color_palette": "dominant colors and mood (e.g., warm earth tones, vibrant neon, muted pastels)",
"lighting": "lighting style (e.g., soft natural light, dramatic shadows, studio lighting)",
"texture": "visual texture (e.g., smooth, grainy, brushstroke visible)",
"style_prefix": "A complete prompt prefix combining all elements (30-50 words)"
}
Focus on creating a specific, reproducible style_prefix that will generate visually consistent images."""
# Build multimodal message with image
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": style_prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:{media_type};base64,{image_data}"
}
}
]
}
]
# Get LLM config for VLM call
from pixelle_video.config import config_manager
llm_config = config_manager.config.llm
# Create OpenAI client directly for VLM call
client = AsyncOpenAI(
api_key=llm_config.api_key,
base_url=llm_config.base_url
)
# Call VLM with multimodal message
try:
response = await client.chat.completions.create(
model=llm_config.model,
messages=messages,
temperature=0.3,
max_tokens=500
)
vlm_response = response.choices[0].message.content
logger.debug(f"VLM style extraction response: {vlm_response[:100]}...")
except Exception as e:
logger.warning(f"VLM call failed, using basic extraction: {e}")
return self._extract_basic(image_path)
# Parse response
import json
import re
try:
# Try to extract JSON from response
match = re.search(r'\{[\s\S]*\}', vlm_response)
if match:
data = json.loads(match.group())
else:
data = json.loads(vlm_response)
anchor = StyleAnchor(
art_style=data.get("art_style", ""),
color_palette=data.get("color_palette", ""),
lighting=data.get("lighting", ""),
texture=data.get("texture", ""),
style_prefix=data.get("style_prefix", ""),
reference_image=image_path,
)
logger.info(f"VLM extracted style: {anchor.style_prefix[:80]}...")
return anchor
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"Failed to parse VLM response: {e}")
# Use the raw response as style_prefix if it looks reasonable
if 20 < len(vlm_response) < 200:
return StyleAnchor(
style_prefix=vlm_response.strip(),
reference_image=image_path,
)
return self._extract_basic(image_path)
except Exception as e:
logger.warning(f"VLM style extraction failed: {e}")
return self._extract_basic(image_path)
def _extract_basic(self, image_path: str) -> StyleAnchor:
- """Basic style extraction without VLM"""
- # Return generic style anchor
"""Basic style extraction without VLM - analyze filename for hints"""
import os
filename = os.path.basename(image_path).lower()
# Try to infer style from filename or path
style_hints = []
if "anime" in filename or "cartoon" in filename:
style_hints.append("anime style illustration")
elif "realistic" in filename or "photo" in filename:
style_hints.append("photorealistic style")
elif "sketch" in filename or "line" in filename:
style_hints.append("sketch style, clean lines")
else:
style_hints.append("consistent visual style, high quality")
return StyleAnchor(
- style_prefix="consistent visual style",
style_prefix=", ".join(style_hints),
reference_image=image_path,
)

View File

@@ -208,7 +208,9 @@ async def generate_narrations_from_content(
async def split_narration_script(
script: str,
- split_mode: Literal["paragraph", "line", "sentence"] = "paragraph",
split_mode: Literal["paragraph", "line", "sentence", "smart"] = "paragraph",
llm_service = None,
target_segments: int = 8,
) -> List[str]:
"""
Split user-provided narration script into segments
@@ -219,6 +221,9 @@ async def split_narration_script(
- "paragraph": Split by double newline (\\n\\n), preserve single newlines within paragraphs
- "line": Split by single newline (\\n), each line is a segment
- "sentence": Split by sentence-ending punctuation (。.!?)
- "smart": First split by paragraph, then use LLM to intelligently merge related paragraphs
llm_service: LLM service instance (required for "smart" mode)
target_segments: Target number of segments for "smart" mode (default: 8)
Returns:
List of narration segments
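The three non-LLM modes can be sketched directly with `re` (the sample `script` is illustrative; the regexes mirror the splitting rules described above):

```python
import re

script = (
    "Hello everyone.\n\n"
    "Tip one: focus.\nMeditate daily.\n\n"
    "Tip two: recall."
)

# "paragraph": split on blank lines; single newlines inside a paragraph survive
paragraphs = [p.strip() for p in re.split(r'\n\s*\n', script) if p.strip()]

# "line": every non-empty line becomes its own segment
lines = [l.strip() for l in script.splitlines() if l.strip()]

# "sentence": split after sentence-ending punctuation (CJK and ASCII)
sentences = [s.strip() for s in re.split(r'(?<=[。.!?!?])\s*', script) if s.strip()]

print(len(paragraphs), len(lines), len(sentences))  # 3 4 4
```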
@@ -227,7 +232,31 @@ async def split_narration_script(
narrations = []
- if split_mode == "paragraph":
if split_mode == "smart":
# Smart mode: first split by paragraph, then merge intelligently
if llm_service is None:
raise ValueError("llm_service is required for 'smart' split mode")
# Step 1: Split by paragraph first
paragraphs = re.split(r'\n\s*\n', script)
paragraphs = [p.strip() for p in paragraphs if p.strip()]
logger.info(f" Initial split: {len(paragraphs)} paragraphs")
# Step 2: Merge intelligently using LLM
# If target_segments is None, merge_paragraphs_smart will auto-analyze
if target_segments is not None and len(paragraphs) <= target_segments:
# No need to merge if already within target
logger.info(f" Paragraphs count ({len(paragraphs)}) <= target ({target_segments}), no merge needed")
narrations = paragraphs
else:
narrations = await merge_paragraphs_smart(
llm_service=llm_service,
paragraphs=paragraphs,
target_segments=target_segments # Can be None for auto-analysis
)
logger.info(f"✅ Smart split: {len(paragraphs)} paragraphs -> {len(narrations)} segments")
elif split_mode == "paragraph":
# Split by double newline (paragraph mode)
# Preserve single newlines within paragraphs
paragraphs = re.split(r'\n\s*\n', script)
@@ -266,6 +295,150 @@ async def split_narration_script(
return narrations
async def merge_paragraphs_smart(
llm_service,
paragraphs: List[str],
target_segments: int = None, # Now optional - auto-analyze if not provided
max_retries: int = 3,
) -> List[str]:
"""
Use LLM to intelligently merge paragraphs based on semantic relevance.
Two-step approach:
1. If target_segments is not provided, first analyze text to recommend optimal count
2. Then group paragraphs based on the target count
Args:
llm_service: LLM service instance
paragraphs: List of original paragraphs
target_segments: Target number of merged segments (auto-analyzed if None)
max_retries: Maximum retry attempts for each step
Returns:
List of merged paragraphs
"""
from pixelle_video.prompts import (
build_paragraph_analysis_prompt,
build_paragraph_grouping_prompt
)
# ========================================
# Step 1: Analyze and recommend segment count (if not provided)
# ========================================
if target_segments is None:
logger.info(f"Analyzing {len(paragraphs)} paragraphs to recommend segment count...")
analysis_prompt = build_paragraph_analysis_prompt(paragraphs)
analysis_result = None
for attempt in range(1, max_retries + 1):
try:
response = await llm_service(
prompt=analysis_prompt,
temperature=0.3,
max_tokens=1500
)
logger.debug(f"Analysis response length: {len(response)} chars")
result = _parse_json(response)
if "recommended_segments" not in result:
raise KeyError("Missing 'recommended_segments' in analysis")
target_segments = result["recommended_segments"]
analysis_result = result
# Validate range
if target_segments < 3:
target_segments = 3
elif target_segments > 15:
target_segments = 15
reasoning = result.get("reasoning", "N/A")
logger.info(f"✅ Analysis complete: recommended {target_segments} segments")
logger.info(f" Reasoning: {reasoning[:100]}...")
break
except Exception as e:
logger.error(f"Analysis attempt {attempt} failed: {e}")
if attempt >= max_retries:
# Fallback: use simple heuristic
target_segments = max(3, min(12, len(paragraphs) // 3))
logger.warning(f"Using fallback: {target_segments} segments (paragraphs/3)")
analysis_result = None
break
logger.info("Retrying analysis...")
else:
analysis_result = None
logger.info(f"Using provided target: {target_segments} segments")
# ========================================
# Step 2: Group paragraphs
# ========================================
logger.info(f"Grouping {len(paragraphs)} paragraphs into {target_segments} segments...")
grouping_prompt = build_paragraph_grouping_prompt(
paragraphs=paragraphs,
target_segments=target_segments,
analysis_result=analysis_result
)
for attempt in range(1, max_retries + 1):
try:
response = await llm_service(
prompt=grouping_prompt,
temperature=0.3,
max_tokens=2000
)
logger.debug(f"Grouping response length: {len(response)} chars")
result = _parse_json(response)
if "groups" not in result:
raise KeyError("Invalid response format: missing 'groups'")
groups = result["groups"]
# Validate count
if len(groups) != target_segments:
logger.warning(
f"Grouping attempt {attempt}: expected {target_segments} groups, got {len(groups)}"
)
if attempt < max_retries:
continue
logger.warning(f"Accepting {len(groups)} groups after {max_retries} attempts")
# Validate group boundaries
for i, group in enumerate(groups):
if "start" not in group or "end" not in group:
raise ValueError(f"Group {i} missing 'start' or 'end'")
if group["start"] > group["end"]:
raise ValueError(f"Group {i} has invalid range: start > end")
if group["start"] < 0 or group["end"] >= len(paragraphs):
raise ValueError(f"Group {i} has out-of-bounds indices")
# Merge paragraphs based on groups
merged = []
for group in groups:
start, end = group["start"], group["end"]
merged_text = "\n\n".join(paragraphs[start:end + 1])
merged.append(merged_text)
logger.info(f"✅ Successfully merged into {len(merged)} segments")
return merged
except Exception as e:
logger.error(f"Grouping attempt {attempt} failed: {e}")
if attempt >= max_retries:
raise
logger.info("Retrying grouping...")
# Fallback: should not reach here
return paragraphs
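For illustration, a minimal sketch of the grouping contract the LLM is asked to return — ordered groups with 0-based, inclusive `start`/`end` indices — and the same boundary checks and join that `merge_paragraphs_smart` performs (the sample data is hypothetical):

```python
paragraphs = ["Intro.", "She said hi.", "He waved back.", "Scene two begins.", "The end."]

# Shape expected from the LLM's JSON response
groups = [
    {"start": 0, "end": 0},
    {"start": 1, "end": 2},  # related dialogue lines merged into one segment
    {"start": 3, "end": 4},
]

# Validate boundaries, then join each group's paragraphs with blank lines
for g in groups:
    assert 0 <= g["start"] <= g["end"] < len(paragraphs)
merged = ["\n\n".join(paragraphs[g["start"]:g["end"] + 1]) for g in groups]
print(merged[1])  # She said hi.
                  # (blank line)
                  # He waved back.
```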
async def generate_image_prompts(
llm_service,
narrations: List[str],
@@ -489,8 +662,8 @@ def _parse_json(text: str) -> dict:
except json.JSONDecodeError:
pass
- # Try to find any JSON object in the text
- json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts")\s*:\s*\[[^\]]*\][^{}]*\}'
# Try to find any JSON object with known keys (including analysis keys)
json_pattern = r'\{[^{}]*(?:"narrations"|"image_prompts"|"video_prompts"|"merged_paragraphs"|"groups"|"recommended_segments"|"scene_boundaries")\s*:\s*[^{}]*\}'
match = re.search(json_pattern, text, re.DOTALL)
if match:
try:
@@ -498,6 +671,17 @@ def _parse_json(text: str) -> dict:
except json.JSONDecodeError:
pass
# Try to find any JSON object that looks like it contains an array
# This is a more aggressive fallback for complex nested arrays
json_start = text.find('{')
json_end = text.rfind('}')
if json_start != -1 and json_end != -1 and json_end > json_start:
potential_json = text[json_start:json_end + 1]
try:
return json.loads(potential_json)
except json.JSONDecodeError:
pass
# If all fails, raise error
raise json.JSONDecodeError("No valid JSON found", text, 0)
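As a usage sketch, the outermost-brace fallback above recovers JSON wrapped in model chatter (the sample `text` is illustrative):

```python
import json

text = (
    "Sure! Here is the grouping:\n"
    '{"groups": [{"start": 0, "end": 1}]}\n'
    "Let me know if you need changes."
)

# Take the widest {...} span and attempt a parse
start, end = text.find('{'), text.rfind('}')
data = None
if start != -1 and end > start:
    try:
        data = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        pass
print(data)  # {'groups': [{'start': 0, 'end': 1}]}
```

This is deliberately aggressive — it assumes at most one top-level JSON object in the text — which is why it runs only after the stricter key-anchored patterns fail.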

uv.lock (generated)

File diff suppressed because it is too large

View File

@@ -65,6 +65,7 @@ def render_content_input():
"paragraph": tr("split.mode_paragraph"),
"line": tr("split.mode_line"),
"sentence": tr("split.mode_sentence"),
"smart": tr("split.mode_smart"),
}
split_mode = st.selectbox(
tr("split.mode_label"),
@@ -73,8 +74,16 @@ def render_content_input():
index=0, # Default to paragraph mode
help=tr("split.mode_help")
)
# Show info for smart mode (auto-detect segment count)
if split_mode == "smart":
st.info(tr("split.smart_auto_hint"))
target_segments = None # Auto-detect
else:
target_segments = None # Not used for other modes
else:
split_mode = "paragraph" # Default for generate mode (not used)
target_segments = None
# Title input (optional for both modes)
title = st.text_input(
@@ -105,7 +114,8 @@ def render_content_input():
"text": text,
"title": title,
"n_scenes": n_scenes,
- "split_mode": split_mode
"split_mode": split_mode,
"target_segments": target_segments
}
else:

View File

@@ -26,6 +26,10 @@ from web.utils.async_helpers import run_async
from pixelle_video.models.progress import ProgressEvent
from pixelle_video.config import config_manager
# Get ports from environment
API_PORT = os.getenv("API_PORT", "8000")
EDITOR_PORT = os.getenv("EDITOR_PORT", "3000")
def render_output_preview(pixelle_video, video_params):
"""Render output preview section (right column)"""
@@ -48,6 +52,7 @@ def render_single_output(pixelle_video, video_params):
title = video_params.get("title")
n_scenes = video_params.get("n_scenes", 5)
split_mode = video_params.get("split_mode", "paragraph")
target_segments = video_params.get("target_segments", 8)
bgm_path = video_params.get("bgm_path")
bgm_volume = video_params.get("bgm_volume", 0.2)
@@ -112,6 +117,7 @@ def render_single_output(pixelle_video, video_params):
"title": title if title else None,
"n_scenes": n_scenes,
"split_mode": split_mode,
"target_segments": target_segments,
"media_workflow": workflow_key,
"frame_template": frame_template,
"prompt_prefix": prompt_prefix,
@@ -135,7 +141,7 @@ def render_single_output(pixelle_video, video_params):
# Submit to async API
response = requests.post(
- "http://localhost:8000/api/video/generate/async",
f"http://localhost:{API_PORT}/api/video/generate/async",
json=api_payload,
timeout=30
)
@@ -218,6 +224,7 @@ def render_single_output(pixelle_video, video_params):
"title": title if title else None,
"n_scenes": n_scenes,
"split_mode": split_mode,
"target_segments": target_segments,
"media_workflow": workflow_key,
"frame_template": frame_template,
"prompt_prefix": prompt_prefix,
@@ -309,7 +316,7 @@ def render_single_output(pixelle_video, video_params):
pass
if task_id:
- editor_url = f"http://localhost:3000/editor?storyboard_id={task_id}"
editor_url = f"http://localhost:{EDITOR_PORT}/editor?storyboard_id={task_id}"
st.markdown(
f'''
<a href="{editor_url}" target="_blank" style="text-decoration: none;">

View File

@@ -26,6 +26,8 @@
"split.mode_paragraph": "📄 By Paragraph (\\n\\n)",
"split.mode_line": "📝 By Line (\\n)",
"split.mode_sentence": "✂️ By Sentence (。.!?)",
"split.mode_smart": "🧠 Smart Merge (AI Grouping)",
"split.smart_auto_hint": "🤖 AI will analyze text structure, recommend optimal segment count, and intelligently merge related paragraphs (dialogues, same scene)",
"input.content": "Content",
"input.content_placeholder": "Used directly without modification (split by strategy below)\nExample:\nHello everyone, today I'll share three study tips.\n\nThe first tip is focus training, meditate for 10 minutes daily.\n\nThe second tip is active recall, review immediately after learning.",
"input.content_help": "Provide your own content for video generation",

View File

@@ -26,6 +26,8 @@
"split.mode_paragraph": "📄 按段落(\\n\\n)",
"split.mode_line": "📝 按行(\\n)",
"split.mode_sentence": "✂️ 按句号(。.!?)",
"split.mode_smart": "🧠 智能合并(AI 分组)",
"split.smart_auto_hint": "🤖 AI 将自动分析文本结构,推荐最佳分镜数量,并智能合并相关段落(对话、同一场景)",
"input.content": "内容",
"input.content_placeholder": "直接使用,不做改写(根据下方分割方式切分)\n例如:\n大家好,今天跟你分享三个学习技巧。\n\n第一个技巧是专注力训练,每天冥想10分钟。\n\n第二个技巧是主动回忆,学完立即复述。",
"input.content_help": "提供您自己的内容用于视频生成",

View File

@@ -33,6 +33,9 @@ from web.components.header import render_header
from web.i18n import tr
from web.utils.async_helpers import run_async
# Get ports from environment
EDITOR_PORT = os.getenv("EDITOR_PORT", "3000")
# Page config
st.set_page_config(
page_title="History - Pixelle-Video",
@@ -363,7 +366,7 @@ def render_task_detail_modal(task_id: str, pixelle_video):
)
# Open in Editor button
- editor_url = f"http://localhost:3000/editor?storyboard_id={task_id}"
editor_url = f"http://localhost:{EDITOR_PORT}/editor?storyboard_id={task_id}"
st.markdown(
f'''
<a href="{editor_url}" target="_blank" style="text-decoration: none;">

View File

@@ -22,6 +22,7 @@ Features:
import streamlit as st
import requests
import time
import os
from datetime import datetime
from web.i18n import tr, get_language
@@ -33,8 +34,12 @@ st.set_page_config(
layout="wide",
)
# Get ports from environment
API_PORT = os.getenv("API_PORT", "8000")
EDITOR_PORT = os.getenv("EDITOR_PORT", "3000")
# API endpoint
- API_BASE = "http://localhost:8000/api"
API_BASE = f"http://localhost:{API_PORT}/api"
def get_all_tasks():
@@ -183,7 +188,7 @@ def render_task_card(task):
with col_a:
st.success("✨ 视频生成成功")
with col_b:
- editor_url = f"http://localhost:3000/editor?storyboard_id={task_id}"
editor_url = f"http://localhost:{EDITOR_PORT}/editor?storyboard_id={task_id}"
st.markdown(
f'''
<a href="{editor_url}" target="_blank" style="text-decoration: none;">