feat: Add hybrid quality evaluation system with CLIP and VLM support

- Add FeatureExtractor for CLIP-based image/text feature extraction
- Add ObjectiveMetricsCalculator for technical quality metrics
- Add VLMEvaluator for vision language model evaluation
- Add HybridQualityGate combining objective + VLM evaluation
- Enhance CharacterMemory with visual feature support
- Add quality optional dependency (torch, ftfy, regex)
- Add unit tests for new modules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
empty
2026-01-05 15:56:44 +08:00
parent ca018a9b1f
commit 56db9bf9d2
12 changed files with 1230 additions and 4 deletions

View File

@@ -17,9 +17,12 @@ Evaluates images and videos based on:
- Aesthetic quality (visual appeal)
- Text-to-image matching (semantic alignment)
- Technical quality (clarity, no artifacts)
Includes HybridQualityGate for combined objective + VLM evaluation.
"""
import time
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
@@ -28,6 +31,27 @@ from loguru import logger
from pixelle_video.services.quality.models import QualityScore, QualityConfig
@dataclass
class HybridQualityConfig(QualityConfig):
    """Extended configuration for hybrid quality evaluation.

    Adds local (objective) scoring knobs on top of the base QualityConfig:
    CLIP-based text/image matching, technical-metric analysis, a smart-skip
    shortcut that avoids the VLM call when objective scores are already
    high, and feature caching for the CLIP extractor.
    """
    # CLIP settings
    enable_clip_score: bool = True  # compute CLIP text-image similarity locally
    clip_model: str = "ViT-B/32"    # CLIP checkpoint name passed to the feature extractor
    clip_weight: float = 0.5        # blend weight of CLIP vs VLM text-match score (0..1)
    # Technical metrics settings
    enable_technical_metrics: bool = True
    sharpness_threshold: float = 0.3  # forwarded to ObjectiveMetricsCalculator
    # Smart VLM skip
    enable_smart_skip: bool = True      # skip the (slow, remote) VLM call when possible
    smart_skip_threshold: float = 0.75  # min objective score needed to skip the VLM
    # Feature caching
    cache_features: bool = True  # cache extracted CLIP features between calls
class QualityGate:
"""
Quality evaluation gate for AI-generated content
@@ -361,3 +385,160 @@ Respond in JSON format:
"issues": ["list of any problems found"]
}}
"""
class HybridQualityGate(QualityGate):
    """
    Hybrid quality gate combining objective metrics with VLM evaluation.

    Evaluation flow:
    1. Calculate technical metrics (fast, local)
    2. Calculate CLIP score if enabled (local, requires CLIP)
    3. If smart_skip enabled and objective score >= threshold, skip VLM
    4. Otherwise, call VLM for subjective evaluation
    5. Combine scores with configurable weights

    Example:
        >>> gate = HybridQualityGate(llm_service, config)
        >>> score = await gate.evaluate_image(
        ...     image_path="frame_001.png",
        ...     prompt="A sunset over mountains"
        ... )
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[HybridQualityConfig] = None
    ):
        """Initialize the gate.

        Args:
            llm_service: Optional LLM/VLM service; when absent, the VLM
                step is skipped and only objective metrics are used.
            config: Hybrid configuration; defaults to HybridQualityConfig().
        """
        parent_config = config or HybridQualityConfig()
        super().__init__(llm_service, parent_config)
        self.hybrid_config = parent_config
        # Heavy components (torch/CLIP, cv-based metrics, VLM wrapper) are
        # created lazily on first property access so constructing the gate
        # stays cheap and does not import optional dependencies eagerly.
        self._feature_extractor = None
        self._metrics_calculator = None
        self._vlm_evaluator = None

    @property
    def feature_extractor(self):
        """Lazy-load the CLIP feature extractor (first access imports it)."""
        if self._feature_extractor is None:
            from pixelle_video.services.quality.feature_extractor import (
                FeatureExtractor, FeatureExtractorConfig
            )
            self._feature_extractor = FeatureExtractor(
                FeatureExtractorConfig(
                    model_name=self.hybrid_config.clip_model,
                    cache_features=self.hybrid_config.cache_features
                )
            )
        return self._feature_extractor

    @property
    def metrics_calculator(self):
        """Lazy-load the objective (technical) metrics calculator."""
        if self._metrics_calculator is None:
            from pixelle_video.services.quality.objective_metrics import (
                ObjectiveMetricsCalculator
            )
            self._metrics_calculator = ObjectiveMetricsCalculator(
                sharpness_threshold=self.hybrid_config.sharpness_threshold
            )
        return self._metrics_calculator

    @property
    def vlm_evaluator(self):
        """Lazy-load the VLM evaluator wrapping the configured LLM service."""
        if self._vlm_evaluator is None:
            from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator
            self._vlm_evaluator = VLMEvaluator(self.llm_service)
        return self._vlm_evaluator

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """Evaluate image quality using the hybrid approach.

        Args:
            image_path: Path to the image file on disk.
            prompt: Generation prompt used for text-match scoring.
            narration: Optional narration text forwarded to the VLM.

        Returns:
            QualityScore with aesthetic/text-match/technical/overall scores,
            collected issues, and wall-clock evaluation time. `passed` is set
            from `overall_score >= config.overall_threshold`.
        """
        start_time = time.time()
        issues = []

        # Fail fast on a missing file rather than letting the metric
        # calculators raise further down.
        if not Path(image_path).exists():
            return QualityScore(
                passed=False,
                issues=["Image file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000
            )

        # Step 1: Technical metrics (fast, local). 0.7 is the neutral
        # fallback used whenever a sub-score is unavailable.
        technical_score = 0.7
        technical_metrics = None
        if self.hybrid_config.enable_technical_metrics:
            technical_metrics = self.metrics_calculator.analyze_image(image_path)
            technical_score = technical_metrics.overall_technical
            issues.extend(technical_metrics.issues)

        # Step 2: CLIP score (if available). May return None when CLIP
        # is not installed or scoring fails.
        clip_score = None
        text_match_score = 0.7
        if self.hybrid_config.enable_clip_score:
            clip_score = self.feature_extractor.calculate_clip_score(
                image_path, prompt
            )
            if clip_score is not None:
                text_match_score = clip_score

        # Step 3: Determine if VLM is needed. A high objective score lets
        # us skip the slow VLM round-trip entirely.
        objective_score = (technical_score + text_match_score) / 2
        use_vlm = True
        aesthetic_score = 0.7
        if self.hybrid_config.enable_smart_skip:
            if objective_score >= self.hybrid_config.smart_skip_threshold:
                use_vlm = False
                logger.debug(f"Smart skip: {objective_score:.2f} >= threshold")

        # Step 4: VLM evaluation (if needed)
        if use_vlm and self.config.use_vlm_evaluation and self.llm_service:
            vlm_result = await self.vlm_evaluator.evaluate_image(
                image_path, prompt, narration
            )
            # Use explicit None checks rather than `or`: a legitimate 0.0
            # score from the VLM must not be silently replaced by 0.7.
            if vlm_result.aesthetic_score is not None:
                aesthetic_score = vlm_result.aesthetic_score
            vlm_text_match = vlm_result.text_match_score
            if vlm_text_match is not None:
                if clip_score is not None:
                    # Blend the local CLIP score with the VLM's judgment.
                    text_match_score = (
                        clip_score * self.hybrid_config.clip_weight +
                        vlm_text_match * (1 - self.hybrid_config.clip_weight)
                    )
                else:
                    text_match_score = vlm_text_match
            # else: keep the CLIP-or-default score already computed above.
            # (Previously a None VLM text-match combined with a present
            # clip_score raised TypeError in the weighted sum.)
            issues.extend(vlm_result.issues)

        # Step 5: Weighted overall score using the base-config weights.
        overall = (
            aesthetic_score * self.config.aesthetic_weight +
            text_match_score * self.config.text_match_weight +
            technical_score * self.config.technical_weight
        )

        score = QualityScore(
            aesthetic_score=aesthetic_score,
            text_match_score=text_match_score,
            technical_score=technical_score,
            overall_score=overall,
            issues=issues,
            evaluation_time_ms=(time.time() - start_time) * 1000
        )
        score.passed = overall >= self.config.overall_threshold

        logger.debug(
            f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, "
            f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms"
        )
        return score