AI-Video/pixelle_video/services/quality/character_memory.py

# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
CharacterMemory - Character consistency and memory system

Maintains consistent character appearance across video frames by:
1. Detecting and registering characters from narrations
2. Extracting visual descriptions from first appearances
3. Injecting character consistency prompts into subsequent frames
4. Supporting reference images for ComfyUI IP-Adapter/ControlNet
"""

from dataclasses import dataclass, field
from typing import List, Dict, Optional, Set, Tuple, Any
from datetime import datetime
from enum import Enum

import numpy as np
from loguru import logger


class CharacterType(Enum):
    """Type of character"""
    PERSON = "person"           # Human character
    ANIMAL = "animal"           # Animal character
    CREATURE = "creature"       # Fantasy/fictional creature
    OBJECT = "object"           # Personified object
    ABSTRACT = "abstract"       # Abstract entity


@dataclass
class Character:
    """
    Represents a character in the video narrative

    Stores visual description, reference images, and appearance history
    to maintain consistency across frames.
    """

    # Identity
    id: str                                    # Unique identifier
    name: str                                  # Character name (e.g., "小明", "the hero")
    aliases: List[str] = field(default_factory=list)  # Alternative names
    character_type: CharacterType = CharacterType.PERSON

    # Visual description (for prompt injection)
    appearance_description: str = ""           # Detailed visual description
    clothing_description: str = ""             # Clothing/outfit description
    distinctive_features: List[str] = field(default_factory=list)  # Unique features

    # Reference images (for IP-Adapter/ControlNet)
    reference_images: List[str] = field(default_factory=list)  # Paths to reference images
    primary_reference: Optional[str] = None    # Primary reference image

    # Prompt elements
    prompt_prefix: str = ""                    # Pre-built prompt prefix
    negative_prompt: str = ""                  # Negative prompt additions

    # Metadata
    is_active: bool = True                     # Whether this character is active for logic
    first_appearance_frame: int = 0            # Frame index of first appearance
    appearance_frames: List[int] = field(default_factory=list)  # All frames with this character
    created_at: Optional[datetime] = None

    # Visual features (NEW - for cross-frame consistency)
    visual_features: Optional[Any] = None      # CLIP feature vector (np.ndarray)
    feature_extraction_frame: Optional[int] = None  # Frame where features were extracted
    similarity_history: List[float] = field(default_factory=list)  # Similarity scores history
    min_similarity_threshold: float = 0.75     # Minimum similarity for consistency

    def __post_init__(self):
        if self.created_at is None:
            self.created_at = datetime.now()
        if not hasattr(self, 'is_active'):
            self.is_active = True
        if not self.prompt_prefix:
            self._build_prompt_prefix()

    def _build_prompt_prefix(self):
        """Build prompt prefix from visual descriptions"""
        elements = []

        if self.appearance_description:
            elements.append(self.appearance_description)
        if self.clothing_description:
            elements.append(f"wearing {self.clothing_description}")
        if self.distinctive_features:
            elements.append(", ".join(self.distinctive_features))

        self.prompt_prefix = ", ".join(elements) if elements else ""

    def get_prompt_injection(self) -> str:
        """Get the prompt text to inject for this character"""
        if self.prompt_prefix:
            return f"({self.name}: {self.prompt_prefix})"
        return f"({self.name})"

    def add_reference_image(self, image_path: str, set_as_primary: bool = False):
        """Add a reference image for this character"""
        if image_path not in self.reference_images:
            self.reference_images.append(image_path)
        if set_as_primary or self.primary_reference is None:
            self.primary_reference = image_path

    def matches_name(self, name: str) -> bool:
        """Check if a name matches this character"""
        name_lower = name.lower().strip()
        if self.name.lower() == name_lower:
            return True
        return any(alias.lower() == name_lower for alias in self.aliases)

    def set_visual_features(self, features: np.ndarray, frame_index: int):
        """Set visual features from reference frame"""
        self.visual_features = features
        self.feature_extraction_frame = frame_index

    def check_visual_similarity(self, other_features: np.ndarray) -> Tuple[bool, float]:
        """Check if other features match this character"""
        if self.visual_features is None:
            return True, 1.0

        similarity = float(np.dot(self.visual_features, other_features))
        similarity = (similarity + 1) / 2  # Normalize to 0-1

        self.similarity_history.append(similarity)
        is_match = similarity >= self.min_similarity_threshold
        return is_match, similarity

    def to_dict(self) -> dict:
        return {
            "id": self.id,
            "name": self.name,
            "aliases": self.aliases,
            "type": self.character_type.value,
            "appearance_description": self.appearance_description,
            "clothing_description": self.clothing_description,
            "distinctive_features": self.distinctive_features,
            "reference_images": self.reference_images,
            "primary_reference": self.primary_reference,
            "prompt_prefix": self.prompt_prefix,
            "first_appearance_frame": self.first_appearance_frame,
        }


@dataclass
class CharacterMemoryConfig:
    """Configuration for character memory system"""

    # Detection settings
    auto_detect_characters: bool = True        # Automatically detect characters from narrations
    use_llm_detection: bool = True             # Use LLM to extract character info

    # Consistency settings
    inject_character_prompts: bool = True      # Inject character descriptions into prompts
    use_reference_images: bool = True          # Use reference images for generation

    # Reference image settings
    extract_reference_from_first: bool = True  # Extract reference from first appearance
    max_reference_images: int = 3              # Max reference images per character

    # Prompt injection settings
    prompt_injection_position: str = "start"   # "start" or "end"
    include_clothing: bool = True              # Include clothing in prompts
    include_features: bool = True              # Include distinctive features

    # Visual feature settings (NEW)
    enable_visual_features: bool = True        # Enable CLIP visual features
    visual_similarity_threshold: float = 0.75  # Min similarity for consistency
    extract_features_on_first: bool = True     # Extract features on first appearance


class CharacterMemory:
    """
    Character memory and consistency manager

    Tracks characters across video frames and ensures visual consistency
    by injecting character descriptions and reference images into the
    generation pipeline.

    Example:
        >>> memory = CharacterMemory(llm_service)
        >>>
        >>> # Register a character
        >>> char = memory.register_character(
        ...     name="小明",
        ...     appearance_description="young man with short black hair",
        ...     clothing_description="blue t-shirt"
        ... )
        >>>
        >>> # Apply to prompt
        >>> enhanced_prompt = memory.apply_to_prompt(
        ...     prompt="A person walking in the park",
        ...     characters=["小明"]
        ... )
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[CharacterMemoryConfig] = None
    ):
        """
        Initialize CharacterMemory

        Args:
            llm_service: Optional LLM service for character detection
            config: Character memory configuration
        """
        self.llm_service = llm_service
        self.config = config or CharacterMemoryConfig()
        self._characters: Dict[str, Character] = {}
        self._name_index: Dict[str, str] = {}  # name -> character_id mapping
        self._feature_extractor = None  # Lazy-loaded

    def register_character(
        self,
        name: str,
        appearance_description: str = "",
        clothing_description: str = "",
        distinctive_features: Optional[List[str]] = None,
        character_type: CharacterType = CharacterType.PERSON,
        first_frame: int = 0,
    ) -> Character:
        """
        Register a new character

        Args:
            name: Character name
            appearance_description: Visual appearance description
            clothing_description: Clothing/outfit description
            distinctive_features: List of distinctive features
            character_type: Type of character
            first_frame: Frame index of first appearance

        Returns:
            Created Character object
        """
        # Generate unique ID
        char_id = f"char_{len(self._characters)}_{name.replace(' ', '_').lower()}"

        character = Character(
            id=char_id,
            name=name,
            appearance_description=appearance_description,
            clothing_description=clothing_description,
            distinctive_features=distinctive_features or [],
            character_type=character_type,
            first_appearance_frame=first_frame,
            appearance_frames=[first_frame],
        )

        self._characters[char_id] = character
        self._name_index[name.lower()] = char_id

        logger.info(f"Registered character: {name} (id={char_id})")

        return character

    def get_character(self, name: str) -> Optional[Character]:
        """Get a character by name"""
        name_lower = name.lower().strip()
        char_id = self._name_index.get(name_lower)
        if char_id:
            return self._characters.get(char_id)

        # Search aliases
        for char in self._characters.values():
            if char.matches_name(name):
                return char

        return None

    def get_character_by_id(self, char_id: str) -> Optional[Character]:
        """Get a character by ID"""
        return self._characters.get(char_id)

    @property
    def characters(self) -> List[Character]:
        """Get all registered characters"""
        return list(self._characters.values())

    async def detect_characters_from_narration(
        self,
        narration: str,
        frame_index: int = 0,
    ) -> List[Character]:
        """
        Detect and register characters mentioned in narration

        Args:
            narration: Narration text to analyze
            frame_index: Current frame index

        Returns:
            List of detected/registered characters
        """
        if not self.config.auto_detect_characters:
            return []

        detected = []

        if self.config.use_llm_detection and self.llm_service:
            detected = await self._detect_with_llm(narration, frame_index)
        else:
            detected = self._detect_basic(narration, frame_index)

        return detected

    async def _detect_with_llm(
        self,
        narration: str,
        frame_index: int,
    ) -> List[Character]:
        """Detect characters using LLM"""
        if not self.llm_service:
            return []

        try:
            prompt = f"""分析以下文案，提取其中提到的角色/人物。

文案: {narration}

请用 JSON 格式返回角色列表，每个角色包含:
- name: 角色名称或代称
- type: person/animal/creature/object
- appearance: 外貌描述（如有）
- clothing: 服装描述（如有）

如果没有明确角色，返回空列表 []。

只返回 JSON，不要其他解释。"""

            response = await self.llm_service(prompt, temperature=0.1)

            # Parse response
            import json
            import re

            # Extract JSON from response
            json_match = re.search(r'\[.*\]', response, re.DOTALL)
            if json_match:
                characters_data = json.loads(json_match.group())

                result = []
                for char_data in characters_data:
                    name = char_data.get("name", "").strip()
                    if not name:
                        continue

                    # Check if already registered
                    existing = self.get_character(name)
                    if existing:
                        existing.appearance_frames.append(frame_index)
                        result.append(existing)
                    else:
                        # Register new character
                        char_type = CharacterType.PERSON
                        type_str = char_data.get("type", "person").lower()
                        if type_str == "animal":
                            char_type = CharacterType.ANIMAL
                        elif type_str == "creature":
                            char_type = CharacterType.CREATURE

                        char = self.register_character(
                            name=name,
                            appearance_description=char_data.get("appearance", ""),
                            clothing_description=char_data.get("clothing", ""),
                            character_type=char_type,
                            first_frame=frame_index,
                        )
                        result.append(char)

                return result

            return []

        except Exception as e:
            logger.warning(f"LLM character detection failed: {e}")
            return self._detect_basic(narration, frame_index)

    def _detect_basic(
        self,
        narration: str,
        frame_index: int,
    ) -> List[Character]:
        """Basic character detection without LLM"""
        # Simple pattern matching for common character references
        import re

        patterns = [
            r'(?:他|她|它)们?',  # Chinese pronouns
            r'(?:小\w{1,2})',    # Names like 小明, 小红
            r'(?:老\w{1,2})',    # Names like 老王, 老李
        ]

        detected = []
        for pattern in patterns:
            matches = re.findall(pattern, narration)
            for match in matches:
                existing = self.get_character(match)
                if existing:
                    existing.appearance_frames.append(frame_index)
                    if existing not in detected:
                        detected.append(existing)

        return detected

    def apply_to_prompt(
        self,
        prompt: str,
        character_names: Optional[List[str]] = None,
        frame_index: Optional[int] = None,
    ) -> str:
        """
        Apply character consistency to an image prompt

        Args:
            prompt: Original image prompt
            character_names: Specific characters to include (None = auto-detect)
            frame_index: Current frame index for tracking

        Returns:
            Enhanced prompt with character consistency
        """
        if not self.config.inject_character_prompts:
            return prompt

        characters_to_include = []

        if character_names:
            for name in character_names:
                char = self.get_character(name)
                if char:
                    characters_to_include.append(char)
        else:
            # Include all characters that have appeared
            characters_to_include = self.characters

        if not characters_to_include:
            return prompt

        # Build character injection
        injections = []
        for char in characters_to_include:
            injection = char.get_prompt_injection()
            if injection:
                injections.append(injection)

            # Track appearance
            if frame_index is not None and frame_index not in char.appearance_frames:
                char.appearance_frames.append(frame_index)

        if not injections:
            return prompt

        character_prompt = ", ".join(injections)

        if self.config.prompt_injection_position == "start":
            return f"{character_prompt}, {prompt}"
        else:
            return f"{prompt}, {character_prompt}"

    def get_reference_images(
        self,
        character_names: Optional[List[str]] = None,
    ) -> List[str]:
        """
        Get reference images for specified characters

        Args:
            character_names: Character names (None = all characters)

        Returns:
            List of reference image paths
        """
        if not self.config.use_reference_images:
            return []

        images = []

        if character_names:
            for name in character_names:
                char = self.get_character(name)
                if char and char.primary_reference:
                    images.append(char.primary_reference)
        else:
            for char in self.characters:
                if char.primary_reference:
                    images.append(char.primary_reference)

        return images[:self.config.max_reference_images]

    def set_reference_image(
        self,
        character_name: str,
        image_path: str,
        set_as_primary: bool = True,
    ):
        """
        Set a reference image for a character

        Args:
            character_name: Character name
            image_path: Path to reference image
            set_as_primary: Whether to set as primary reference
        """
        char = self.get_character(character_name)
        if char:
            char.add_reference_image(image_path, set_as_primary)
            logger.debug(f"Set reference image for {character_name}: {image_path}")
        else:
            logger.warning(f"Character not found: {character_name}")

    def update_character_appearance(
        self,
        character_name: str,
        appearance_description: Optional[str] = None,
        clothing_description: Optional[str] = None,
        distinctive_features: Optional[List[str]] = None,
    ):
        """Update a character's visual description"""
        char = self.get_character(character_name)
        if char:
            if appearance_description:
                char.appearance_description = appearance_description
            if clothing_description:
                char.clothing_description = clothing_description
            if distinctive_features:
                char.distinctive_features = distinctive_features
            char._build_prompt_prefix()
            logger.debug(f"Updated appearance for {character_name}")

    def get_consistency_summary(self) -> str:
        """Get a summary of character consistency for logging"""
        if not self._characters:
            return "No characters registered"

        lines = [f"Characters ({len(self._characters)}):"]
        for char in self.characters:
            lines.append(
                f"  - {char.name}: {len(char.appearance_frames)} appearances, "
                f"ref_images={len(char.reference_images)}"
            )
        return "\n".join(lines)

    def reset(self):
        """Clear all character memory"""
        self._characters.clear()
        self._name_index.clear()
        logger.info("Character memory cleared")

    @property
    def feature_extractor(self):
        """Lazy-load feature extractor"""
        if self._feature_extractor is None and self.config.enable_visual_features:
            from pixelle_video.services.quality.feature_extractor import FeatureExtractor
            self._feature_extractor = FeatureExtractor()
        return self._feature_extractor

    async def extract_character_features(
        self,
        character_name: str,
        image_path: str,
        frame_index: int = 0
    ) -> bool:
        """Extract and store visual features for a character"""
        if not self.config.enable_visual_features:
            return False

        char = self.get_character(character_name)
        if not char:
            logger.warning(f"Character not found: {character_name}")
            return False

        extractor = self.feature_extractor
        if extractor is None or not extractor.is_available:
            logger.debug("Feature extractor not available")
            return False

        features = extractor.extract_image_features(image_path)
        if features is None:
            return False

        char.set_visual_features(features, frame_index)
        char.add_reference_image(image_path, set_as_primary=True)
        logger.info(f"Extracted visual features for {character_name}")
        return True

    async def check_character_consistency(
        self,
        character_name: str,
        image_path: str
    ) -> Tuple[bool, float]:
        """Check if image maintains character consistency"""
        char = self.get_character(character_name)
        if not char or char.visual_features is None:
            return True, 1.0

        extractor = self.feature_extractor
        if extractor is None or not extractor.is_available:
            return True, 1.0

        new_features = extractor.extract_image_features(image_path)
        if new_features is None:
            return True, 1.0

        return char.check_visual_similarity(new_features)