AI-Video/pixelle_video/services/quality/quality_gate.py
56db9bf9d2 feat: Add hybrid quality evaluation system with CLIP and VLM support
- Add FeatureExtractor for CLIP-based image/text feature extraction
- Add ObjectiveMetricsCalculator for technical quality metrics
- Add VLMEvaluator for vision language model evaluation
- Add HybridQualityGate combining objective + VLM evaluation
- Enhance CharacterMemory with visual feature support
- Add quality optional dependency (torch, ftfy, regex)
- Add unit tests for new modules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 15:56:44 +08:00


# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
QualityGate - Quality evaluation system for generated content
Evaluates images and videos based on:
- Aesthetic quality (visual appeal)
- Text-to-image matching (semantic alignment)
- Technical quality (clarity, no artifacts)
Includes HybridQualityGate for combined objective + VLM evaluation.
"""
import time
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
from loguru import logger
from pixelle_video.services.quality.models import QualityScore, QualityConfig
@dataclass
class HybridQualityConfig(QualityConfig):
"""Extended configuration for hybrid quality evaluation"""
# CLIP settings
enable_clip_score: bool = True
clip_model: str = "ViT-B/32"
clip_weight: float = 0.5
# Technical metrics settings
enable_technical_metrics: bool = True
sharpness_threshold: float = 0.3
# Smart VLM skip
enable_smart_skip: bool = True
smart_skip_threshold: float = 0.75
# Feature caching
cache_features: bool = True
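# Illustrative construction (a sketch only; the values shown are just the defaults
# above, and `my_llm_service` is a hypothetical vision-capable LLM client):
#   config = HybridQualityConfig(enable_smart_skip=True, smart_skip_threshold=0.75)
#   gate = HybridQualityGate(llm_service=my_llm_service, config=config)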
class QualityGate:
"""
Quality evaluation gate for AI-generated content
Uses VLM (Vision Language Model) or local models to evaluate:
1. Aesthetic quality - Is the image visually appealing?
2. Text matching - Does the image match the prompt/narration?
3. Technical quality - Is the image clear and free of artifacts?
Example:
>>> gate = QualityGate(llm_service, config)
>>> score = await gate.evaluate_image(
... image_path="output/frame_001.png",
... prompt="A sunset over mountains",
... narration="夕阳西下,余晖洒满山间"
... )
>>> if score.passed:
... print("Image quality approved!")
"""
def __init__(
self,
llm_service=None,
config: Optional[QualityConfig] = None
):
"""
Initialize QualityGate
Args:
llm_service: LLM service for VLM-based evaluation
config: Quality configuration
"""
self.llm_service = llm_service
self.config = config or QualityConfig()
async def evaluate_image(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""
Evaluate the quality of a generated image
Args:
image_path: Path to the image file
prompt: The prompt used to generate the image
narration: Optional narration text for context
Returns:
QualityScore with evaluation results
"""
start_time = time.time()
# Validate image exists
if not Path(image_path).exists():
return QualityScore(
passed=False,
issues=["Image file not found"],
evaluation_time_ms=(time.time() - start_time) * 1000
)
# Evaluate using VLM or fallback to basic checks
if self.config.use_vlm_evaluation and self.llm_service:
score = await self._evaluate_with_vlm(image_path, prompt, narration)
else:
score = await self._evaluate_basic(image_path, prompt)
# Set evaluation time
score.evaluation_time_ms = (time.time() - start_time) * 1000
# Determine if passed
score.passed = score.overall_score >= self.config.overall_threshold
logger.debug(
f"Quality evaluation: overall={score.overall_score:.2f}, "
f"passed={score.passed}, time={score.evaluation_time_ms:.0f}ms"
)
return score
async def evaluate_video(
self,
video_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""
Evaluate the quality of a generated video
Args:
video_path: Path to the video file
prompt: The prompt used to generate the video
narration: Optional narration text for context
Returns:
QualityScore with evaluation results
"""
start_time = time.time()
# Validate video exists
if not Path(video_path).exists():
return QualityScore(
passed=False,
issues=["Video file not found"],
evaluation_time_ms=(time.time() - start_time) * 1000
)
# For video, a fuller evaluation would sample key frames and score each one
# individually (see the sketch below). For now this method delegates to the VLM
# path (itself still a placeholder) or to the basic ffprobe checks.
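# One possible sampling approach (a sketch only, not implemented here):
#   ffmpeg -i <video> -vf fps=1 frame_%03d.png      # grab roughly one frame per second
#   then score each frame with evaluate_image() and aggregate the per-frame
#   results (e.g. mean or min) into a single QualityScore.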
if self.config.use_vlm_evaluation and self.llm_service:
score = await self._evaluate_video_with_vlm(video_path, prompt, narration)
else:
score = await self._evaluate_video_basic(video_path)
score.evaluation_time_ms = (time.time() - start_time) * 1000
score.passed = score.overall_score >= self.config.overall_threshold
return score
async def _evaluate_with_vlm(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""
Evaluate image quality using Vision Language Model
Uses the LLM with vision capability to assess:
- Visual quality and aesthetics
- Prompt-image alignment
- Technical defects
"""
evaluation_prompt = self._build_evaluation_prompt(prompt, narration)
try:
# Calling the LLM with an image requires a vision-capable model (e.g. GPT-4o, Qwen-VL)
# and an llm_service that accepts image input. Until that integration lands, this
# method always falls back to the basic heuristic evaluation below.
# TODO: Implement actual VLM call when integrating with vision-capable LLM
# response = await self.llm_service(
# prompt=evaluation_prompt,
# images=[image_path],
# response_type=ImageQualityResponse
# )
# Fallback to basic evaluation for now
logger.debug("VLM evaluation: using basic fallback (VLM integration pending)")
return await self._evaluate_basic(image_path, prompt)
except Exception as e:
logger.warning(f"VLM evaluation failed: {e}, falling back to basic")
return await self._evaluate_basic(image_path, prompt)
async def _evaluate_basic(
self,
image_path: str,
prompt: str,
) -> QualityScore:
"""
Basic image quality evaluation without VLM
Performs simple checks:
- File size and dimensions
- Image format validation
"""
issues = []
try:
# Import PIL for basic checks
from PIL import Image
with Image.open(image_path) as img:
width, height = img.size
# Check minimum dimensions
if width < 256 or height < 256:
issues.append(f"Image too small: {width}x{height}")
# Check aspect ratio (not too extreme)
aspect = max(width, height) / min(width, height)
if aspect > 4:
issues.append(f"Extreme aspect ratio: {aspect:.1f}")
# Basic scores (generous defaults when VLM not available)
aesthetic_score = 0.7 if not issues else 0.4
text_match_score = 0.7 # Can't properly evaluate without VLM
technical_score = 0.8 if not issues else 0.5
# Calculate overall
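# The three criterion weights come from QualityConfig; the weighted sum below
# assumes they add up to 1.0, keeping the overall score on the same 0-1 scale
# as its components.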
overall = (
aesthetic_score * self.config.aesthetic_weight +
text_match_score * self.config.text_match_weight +
technical_score * self.config.technical_weight
)
return QualityScore(
aesthetic_score=aesthetic_score,
text_match_score=text_match_score,
technical_score=technical_score,
overall_score=overall,
issues=issues,
)
except Exception as e:
logger.error(f"Basic evaluation failed: {e}")
return QualityScore(
overall_score=0.0,
passed=False,
issues=[f"Evaluation error: {str(e)}"]
)
async def _evaluate_video_with_vlm(
self,
video_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""Evaluate video using VLM (placeholder for future implementation)"""
# TODO: Implement video frame sampling and VLM evaluation
return await self._evaluate_video_basic(video_path)
async def _evaluate_video_basic(
self,
video_path: str,
) -> QualityScore:
"""Basic video quality evaluation"""
issues = []
try:
import subprocess
import json
# Use ffprobe to get video info
cmd = [
"ffprobe", "-v", "quiet", "-print_format", "json",
"-show_format", "-show_streams", video_path
]
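# ffprobe with -show_format/-show_streams prints JSON shaped roughly like:
#   {"streams": [{"codec_type": "video", "width": 1920, "height": 1080, ...}],
#    "format": {"duration": "12.5", ...}}
# which is exactly what the lookups below rely on.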
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
issues.append("Failed to read video metadata")
return QualityScore(overall_score=0.5, issues=issues)
info = json.loads(result.stdout)
# Check for video stream
video_stream = None
for stream in info.get("streams", []):
if stream.get("codec_type") == "video":
video_stream = stream
break
if not video_stream:
issues.append("No video stream found")
return QualityScore(overall_score=0.0, passed=False, issues=issues)
# Check dimensions
width = video_stream.get("width", 0)
height = video_stream.get("height", 0)
if width < 256 or height < 256:
issues.append(f"Video too small: {width}x{height}")
# Check duration
duration = float(info.get("format", {}).get("duration", 0))
if duration < 0.5:
issues.append(f"Video too short: {duration:.1f}s")
# Calculate scores
aesthetic_score = 0.7
text_match_score = 0.7
technical_score = 0.8 if not issues else 0.5
overall = (
aesthetic_score * self.config.aesthetic_weight +
text_match_score * self.config.text_match_weight +
technical_score * self.config.technical_weight
)
return QualityScore(
aesthetic_score=aesthetic_score,
text_match_score=text_match_score,
technical_score=technical_score,
overall_score=overall,
issues=issues,
)
except Exception as e:
logger.error(f"Video evaluation failed: {e}")
return QualityScore(
overall_score=0.5,
issues=[f"Evaluation error: {str(e)}"]
)
def _build_evaluation_prompt(
self,
prompt: str,
narration: Optional[str] = None,
) -> str:
"""Build the evaluation prompt for VLM"""
context = f"Narration: {narration}\n" if narration else ""
return f"""Evaluate this AI-generated image on the following criteria.
Rate each from 0.0 to 1.0.
Image Generation Prompt: {prompt}
{context}
Evaluation Criteria:
1. Aesthetic Quality (0.0-1.0):
- Is the image visually appealing?
- Good composition, colors, and style?
2. Prompt Matching (0.0-1.0):
- Does the image accurately represent the prompt?
- Are key elements from the prompt visible?
3. Technical Quality (0.0-1.0):
- Is the image clear and well-defined?
- Free of artifacts, distortions, or blurriness?
- Natural looking (no AI artifacts like extra fingers)?
Respond in JSON format:
{{
"aesthetic_score": 0.0,
"text_match_score": 0.0,
"technical_score": 0.0,
"issues": ["list of any problems found"]
}}
"""
class HybridQualityGate(QualityGate):
"""
Hybrid quality gate combining objective metrics with VLM evaluation
Evaluation flow:
1. Calculate technical metrics (fast, local)
2. Calculate CLIP score if enabled (local, requires CLIP)
3. If smart_skip enabled and objective score >= threshold, skip VLM
4. Otherwise, call VLM for subjective evaluation
5. Combine scores with configurable weights
Example:
>>> gate = HybridQualityGate(llm_service, config)
>>> score = await gate.evaluate_image(
... image_path="frame_001.png",
... prompt="A sunset over mountains"
... )
"""
def __init__(
self,
llm_service=None,
config: Optional[HybridQualityConfig] = None
):
parent_config = config or HybridQualityConfig()
super().__init__(llm_service, parent_config)
self.hybrid_config = parent_config
self._feature_extractor = None
self._metrics_calculator = None
self._vlm_evaluator = None
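# The heavy optional dependencies (e.g. torch/CLIP from the "quality" extra) are
# imported lazily inside the properties below, so constructing a HybridQualityGate
# stays cheap when CLIP scoring or VLM evaluation never actually runs.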
@property
def feature_extractor(self):
"""Lazy-load feature extractor"""
if self._feature_extractor is None:
from pixelle_video.services.quality.feature_extractor import (
FeatureExtractor, FeatureExtractorConfig
)
self._feature_extractor = FeatureExtractor(
FeatureExtractorConfig(
model_name=self.hybrid_config.clip_model,
cache_features=self.hybrid_config.cache_features
)
)
return self._feature_extractor
@property
def metrics_calculator(self):
"""Lazy-load metrics calculator"""
if self._metrics_calculator is None:
from pixelle_video.services.quality.objective_metrics import (
ObjectiveMetricsCalculator
)
self._metrics_calculator = ObjectiveMetricsCalculator(
sharpness_threshold=self.hybrid_config.sharpness_threshold
)
return self._metrics_calculator
@property
def vlm_evaluator(self):
"""Lazy-load VLM evaluator"""
if self._vlm_evaluator is None:
from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator
self._vlm_evaluator = VLMEvaluator(self.llm_service)
return self._vlm_evaluator
async def evaluate_image(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""Evaluate image quality using hybrid approach"""
start_time = time.time()
issues = []
if not Path(image_path).exists():
return QualityScore(
passed=False,
issues=["Image file not found"],
evaluation_time_ms=(time.time() - start_time) * 1000
)
# Step 1: Technical metrics (fast, local)
technical_score = 0.7
technical_metrics = None
if self.hybrid_config.enable_technical_metrics:
technical_metrics = self.metrics_calculator.analyze_image(image_path)
technical_score = technical_metrics.overall_technical
issues.extend(technical_metrics.issues)
# Step 2: CLIP score (if available)
clip_score = None
text_match_score = 0.7
if self.hybrid_config.enable_clip_score:
clip_score = self.feature_extractor.calculate_clip_score(
image_path, prompt
)
if clip_score is not None:
text_match_score = clip_score
# Step 3: Determine if VLM needed
objective_score = (technical_score + text_match_score) / 2
use_vlm = True
aesthetic_score = 0.7
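# If the VLM ends up skipped below, this neutral default stands in for aesthetics
# and the overall score is carried by the objective signals (technical + CLIP).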
if self.hybrid_config.enable_smart_skip:
if objective_score >= self.hybrid_config.smart_skip_threshold:
use_vlm = False
logger.debug(f"Smart skip: {objective_score:.2f} >= threshold")
# Step 4: VLM evaluation (if needed)
if use_vlm and self.config.use_vlm_evaluation and self.llm_service:
vlm_result = await self.vlm_evaluator.evaluate_image(
image_path, prompt, narration
)
aesthetic_score = vlm_result.aesthetic_score or 0.7
if clip_score is not None:
text_match_score = (
clip_score * self.hybrid_config.clip_weight +
(vlm_result.text_match_score or 0.7) * (1 - self.hybrid_config.clip_weight)
)
else:
text_match_score = vlm_result.text_match_score or 0.7
issues.extend(vlm_result.issues)
# Step 5: Calculate overall
overall = (
aesthetic_score * self.config.aesthetic_weight +
text_match_score * self.config.text_match_weight +
technical_score * self.config.technical_weight
)
score = QualityScore(
aesthetic_score=aesthetic_score,
text_match_score=text_match_score,
technical_score=technical_score,
overall_score=overall,
issues=issues,
evaluation_time_ms=(time.time() - start_time) * 1000
)
score.passed = overall >= self.config.overall_threshold
logger.debug(
f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, "
f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms"
)
return score
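# End-to-end sketch (illustrative; assumes an async caller and a configured,
# vision-capable llm_service):
#   gate = HybridQualityGate(llm_service, HybridQualityConfig())
#   score = await gate.evaluate_image("output/frame_001.png", prompt="A sunset over mountains")
#   if not score.passed:
#       logger.warning(f"Quality gate rejected frame: {score.issues}")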