feat: Add hybrid quality evaluation system with CLIP and VLM support

- Add FeatureExtractor for CLIP-based image/text feature extraction
- Add ObjectiveMetricsCalculator for technical quality metrics
- Add VLMEvaluator for vision language model evaluation
- Add HybridQualityGate combining objective + VLM evaluation
- Enhance CharacterMemory with visual feature support
- Add quality optional dependency (torch, ftfy, regex)
- Add unit tests for new modules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
empty
2026-01-05 15:56:44 +08:00
parent ca018a9b1f
commit 56db9bf9d2
12 changed files with 1230 additions and 4 deletions

View File

@@ -17,9 +17,12 @@ Evaluates images and videos based on:
- Aesthetic quality (visual appeal)
- Text-to-image matching (semantic alignment)
- Technical quality (clarity, no artifacts)
Includes HybridQualityGate for combined objective + VLM evaluation.
"""
import time
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
@@ -28,6 +31,27 @@ from loguru import logger
from pixelle_video.services.quality.models import QualityScore, QualityConfig
@dataclass
class HybridQualityConfig(QualityConfig):
    """Extended configuration for hybrid quality evaluation.

    Adds local (objective) scoring knobs on top of the base QualityConfig:
    CLIP-based text/image matching, technical-metric analysis, a smart-skip
    shortcut that avoids the VLM call when objective scores are already
    high, and feature caching for the CLIP extractor.
    """
    # CLIP settings
    enable_clip_score: bool = True  # compute CLIP text-image similarity locally
    clip_model: str = "ViT-B/32"    # CLIP checkpoint name passed to the feature extractor
    clip_weight: float = 0.5        # blend weight of CLIP vs VLM text-match score (0..1)
    # Technical metrics settings
    enable_technical_metrics: bool = True
    sharpness_threshold: float = 0.3  # forwarded to ObjectiveMetricsCalculator
    # Smart VLM skip
    enable_smart_skip: bool = True      # skip the (slow, remote) VLM call when possible
    smart_skip_threshold: float = 0.75  # min objective score needed to skip the VLM
    # Feature caching
    cache_features: bool = True  # cache extracted CLIP features between calls
class QualityGate:
"""
Quality evaluation gate for AI-generated content
@@ -361,3 +385,160 @@ Respond in JSON format:
"issues": ["list of any problems found"]
}}
"""
class HybridQualityGate(QualityGate):
    """
    Hybrid quality gate combining objective metrics with VLM evaluation.

    Evaluation flow:
    1. Calculate technical metrics (fast, local)
    2. Calculate CLIP score if enabled (local, requires CLIP)
    3. If smart_skip enabled and objective score >= threshold, skip VLM
    4. Otherwise, call VLM for subjective evaluation
    5. Combine scores with configurable weights

    Example:
        >>> gate = HybridQualityGate(llm_service, config)
        >>> score = await gate.evaluate_image(
        ...     image_path="frame_001.png",
        ...     prompt="A sunset over mountains"
        ... )
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[HybridQualityConfig] = None
    ):
        """Initialize the gate.

        Args:
            llm_service: Optional LLM/VLM service; when absent, the VLM
                step is skipped and only objective metrics are used.
            config: Hybrid configuration; defaults to HybridQualityConfig().
        """
        parent_config = config or HybridQualityConfig()
        super().__init__(llm_service, parent_config)
        self.hybrid_config = parent_config
        # Heavy components (torch/CLIP, cv-based metrics, VLM wrapper) are
        # created lazily on first property access so constructing the gate
        # stays cheap and does not import optional dependencies eagerly.
        self._feature_extractor = None
        self._metrics_calculator = None
        self._vlm_evaluator = None

    @property
    def feature_extractor(self):
        """Lazy-load the CLIP feature extractor (first access imports it)."""
        if self._feature_extractor is None:
            from pixelle_video.services.quality.feature_extractor import (
                FeatureExtractor, FeatureExtractorConfig
            )
            self._feature_extractor = FeatureExtractor(
                FeatureExtractorConfig(
                    model_name=self.hybrid_config.clip_model,
                    cache_features=self.hybrid_config.cache_features
                )
            )
        return self._feature_extractor

    @property
    def metrics_calculator(self):
        """Lazy-load the objective (technical) metrics calculator."""
        if self._metrics_calculator is None:
            from pixelle_video.services.quality.objective_metrics import (
                ObjectiveMetricsCalculator
            )
            self._metrics_calculator = ObjectiveMetricsCalculator(
                sharpness_threshold=self.hybrid_config.sharpness_threshold
            )
        return self._metrics_calculator

    @property
    def vlm_evaluator(self):
        """Lazy-load the VLM evaluator wrapping the configured LLM service."""
        if self._vlm_evaluator is None:
            from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator
            self._vlm_evaluator = VLMEvaluator(self.llm_service)
        return self._vlm_evaluator

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """Evaluate image quality using the hybrid approach.

        Args:
            image_path: Path to the image file on disk.
            prompt: Generation prompt used for text-match scoring.
            narration: Optional narration text forwarded to the VLM.

        Returns:
            QualityScore with aesthetic/text-match/technical/overall scores,
            collected issues, and wall-clock evaluation time. `passed` is set
            from `overall_score >= config.overall_threshold`.
        """
        start_time = time.time()
        issues = []

        # Fail fast on a missing file rather than letting the metric
        # calculators raise further down.
        if not Path(image_path).exists():
            return QualityScore(
                passed=False,
                issues=["Image file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000
            )

        # Step 1: Technical metrics (fast, local). 0.7 is the neutral
        # fallback used whenever a sub-score is unavailable.
        technical_score = 0.7
        technical_metrics = None
        if self.hybrid_config.enable_technical_metrics:
            technical_metrics = self.metrics_calculator.analyze_image(image_path)
            technical_score = technical_metrics.overall_technical
            issues.extend(technical_metrics.issues)

        # Step 2: CLIP score (if available). May return None when CLIP
        # is not installed or scoring fails.
        clip_score = None
        text_match_score = 0.7
        if self.hybrid_config.enable_clip_score:
            clip_score = self.feature_extractor.calculate_clip_score(
                image_path, prompt
            )
            if clip_score is not None:
                text_match_score = clip_score

        # Step 3: Determine if VLM is needed. A high objective score lets
        # us skip the slow VLM round-trip entirely.
        objective_score = (technical_score + text_match_score) / 2
        use_vlm = True
        aesthetic_score = 0.7
        if self.hybrid_config.enable_smart_skip:
            if objective_score >= self.hybrid_config.smart_skip_threshold:
                use_vlm = False
                logger.debug(f"Smart skip: {objective_score:.2f} >= threshold")

        # Step 4: VLM evaluation (if needed)
        if use_vlm and self.config.use_vlm_evaluation and self.llm_service:
            vlm_result = await self.vlm_evaluator.evaluate_image(
                image_path, prompt, narration
            )
            # Use explicit None checks rather than `or`: a legitimate 0.0
            # score from the VLM must not be silently replaced by 0.7.
            if vlm_result.aesthetic_score is not None:
                aesthetic_score = vlm_result.aesthetic_score
            vlm_text_match = vlm_result.text_match_score
            if vlm_text_match is not None:
                if clip_score is not None:
                    # Blend the local CLIP score with the VLM's judgment.
                    text_match_score = (
                        clip_score * self.hybrid_config.clip_weight +
                        vlm_text_match * (1 - self.hybrid_config.clip_weight)
                    )
                else:
                    text_match_score = vlm_text_match
            # else: keep the CLIP-or-default score already computed above.
            # (Previously a None VLM text-match combined with a present
            # clip_score raised TypeError in the weighted sum.)
            issues.extend(vlm_result.issues)

        # Step 5: Weighted overall score using the base-config weights.
        overall = (
            aesthetic_score * self.config.aesthetic_weight +
            text_match_score * self.config.text_match_weight +
            technical_score * self.config.technical_weight
        )

        score = QualityScore(
            aesthetic_score=aesthetic_score,
            text_match_score=text_match_score,
            technical_score=technical_score,
            overall_score=overall,
            issues=issues,
            evaluation_time_ms=(time.time() - start_time) * 1000
        )
        score.passed = overall >= self.config.overall_threshold

        logger.debug(
            f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, "
            f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms"
        )
        return score