feat: Add hybrid quality evaluation system with CLIP and VLM support
- Add FeatureExtractor for CLIP-based image/text feature extraction - Add ObjectiveMetricsCalculator for technical quality metrics - Add VLMEvaluator for vision language model evaluation - Add HybridQualityGate combining objective + VLM evaluation - Enhance CharacterMemory with visual feature support - Add quality optional dependency (torch, ftfy, regex) - Add unit tests for new modules 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -17,9 +17,12 @@ Evaluates images and videos based on:
|
||||
- Aesthetic quality (visual appeal)
|
||||
- Text-to-image matching (semantic alignment)
|
||||
- Technical quality (clarity, no artifacts)
|
||||
|
||||
Includes HybridQualityGate for combined objective + VLM evaluation.
|
||||
"""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
|
||||
@@ -28,6 +31,27 @@ from loguru import logger
|
||||
from pixelle_video.services.quality.models import QualityScore, QualityConfig
|
||||
|
||||
|
||||
@dataclass
class HybridQualityConfig(QualityConfig):
    """Extended configuration for hybrid quality evaluation.

    Adds settings on top of the base ``QualityConfig`` for the local CLIP
    scorer, technical-metric analysis, the smart VLM-skip heuristic, and
    feature caching, as consumed by ``HybridQualityGate``.
    """

    # CLIP settings
    enable_clip_score: bool = True  # compute a local CLIP text/image score
    clip_model: str = "ViT-B/32"  # CLIP checkpoint name passed to the feature extractor
    clip_weight: float = 0.5  # blend weight of CLIP vs. VLM text-match score

    # Technical metrics settings
    enable_technical_metrics: bool = True  # run local sharpness/artifact analysis
    sharpness_threshold: float = 0.3  # forwarded to ObjectiveMetricsCalculator

    # Smart VLM skip
    enable_smart_skip: bool = True  # skip the (slow) VLM call when objective score is high
    smart_skip_threshold: float = 0.75  # objective score at/above which the VLM is skipped

    # Feature caching
    cache_features: bool = True  # reuse extracted CLIP features across calls
|
||||
|
||||
|
||||
class QualityGate:
|
||||
"""
|
||||
Quality evaluation gate for AI-generated content
|
||||
@@ -361,3 +385,160 @@ Respond in JSON format:
|
||||
"issues": ["list of any problems found"]
|
||||
}}
|
||||
"""
|
||||
|
||||
|
||||
class HybridQualityGate(QualityGate):
    """
    Hybrid quality gate combining objective metrics with VLM evaluation

    Evaluation flow:
    1. Calculate technical metrics (fast, local)
    2. Calculate CLIP score if enabled (local, requires CLIP)
    3. If smart_skip enabled and objective score >= threshold, skip VLM
    4. Otherwise, call VLM for subjective evaluation
    5. Combine scores with configurable weights

    Example:
        >>> gate = HybridQualityGate(llm_service, config)
        >>> score = await gate.evaluate_image(
        ...     image_path="frame_001.png",
        ...     prompt="A sunset over mountains"
        ... )
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[HybridQualityConfig] = None
    ):
        """Initialize the gate.

        Args:
            llm_service: Optional LLM/VLM client used for subjective evaluation.
            config: Hybrid configuration; a default ``HybridQualityConfig``
                is created when omitted.
        """
        parent_config = config or HybridQualityConfig()
        super().__init__(llm_service, parent_config)

        self.hybrid_config = parent_config
        # Heavy components (CLIP, metrics, VLM wrapper) are created lazily
        # on first property access so importing this module stays cheap.
        self._feature_extractor = None
        self._metrics_calculator = None
        self._vlm_evaluator = None

    @property
    def feature_extractor(self):
        """Lazy-load the CLIP-backed feature extractor."""
        if self._feature_extractor is None:
            from pixelle_video.services.quality.feature_extractor import (
                FeatureExtractor, FeatureExtractorConfig
            )
            self._feature_extractor = FeatureExtractor(
                FeatureExtractorConfig(
                    model_name=self.hybrid_config.clip_model,
                    cache_features=self.hybrid_config.cache_features
                )
            )
        return self._feature_extractor

    @property
    def metrics_calculator(self):
        """Lazy-load the objective metrics calculator."""
        if self._metrics_calculator is None:
            from pixelle_video.services.quality.objective_metrics import (
                ObjectiveMetricsCalculator
            )
            self._metrics_calculator = ObjectiveMetricsCalculator(
                sharpness_threshold=self.hybrid_config.sharpness_threshold
            )
        return self._metrics_calculator

    @property
    def vlm_evaluator(self):
        """Lazy-load the VLM evaluator."""
        if self._vlm_evaluator is None:
            from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator
            self._vlm_evaluator = VLMEvaluator(self.llm_service)
        return self._vlm_evaluator

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """Evaluate image quality using the hybrid approach.

        Args:
            image_path: Path to the image file to evaluate.
            prompt: The generation prompt used for text/image alignment.
            narration: Optional narration text forwarded to the VLM.

        Returns:
            A ``QualityScore`` with aesthetic, text-match, technical and
            overall scores plus any detected issues; ``passed`` reflects
            ``overall >= self.config.overall_threshold``.
        """
        start_time = time.time()
        issues = []

        if not Path(image_path).exists():
            return QualityScore(
                passed=False,
                issues=["Image file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000
            )

        # Step 1: Technical metrics (fast, local).
        # 0.7 is the neutral fallback used whenever a sub-score is unavailable.
        technical_score = 0.7

        if self.hybrid_config.enable_technical_metrics:
            technical_metrics = self.metrics_calculator.analyze_image(image_path)
            technical_score = technical_metrics.overall_technical
            issues.extend(technical_metrics.issues)

        # Step 2: CLIP score (if available)
        clip_score = None
        text_match_score = 0.7

        if self.hybrid_config.enable_clip_score:
            clip_score = self.feature_extractor.calculate_clip_score(
                image_path, prompt
            )
            if clip_score is not None:
                text_match_score = clip_score

        # Step 3: Determine if VLM needed
        objective_score = (technical_score + text_match_score) / 2
        use_vlm = True
        aesthetic_score = 0.7

        if self.hybrid_config.enable_smart_skip:
            if objective_score >= self.hybrid_config.smart_skip_threshold:
                use_vlm = False
                logger.debug(f"Smart skip: {objective_score:.2f} >= threshold")

        # Step 4: VLM evaluation (if needed)
        if use_vlm and self.config.use_vlm_evaluation and self.llm_service:
            vlm_result = await self.vlm_evaluator.evaluate_image(
                image_path, prompt, narration
            )
            # BUGFIX: `x or 0.7` discarded a legitimate 0.0 score because
            # 0.0 is falsy; fall back only when the VLM gave no value.
            if vlm_result.aesthetic_score is not None:
                aesthetic_score = vlm_result.aesthetic_score

            vlm_text = vlm_result.text_match_score
            if clip_score is not None and vlm_text is not None:
                # Blend local CLIP alignment with the VLM judgement.
                text_match_score = (
                    clip_score * self.hybrid_config.clip_weight +
                    vlm_text * (1 - self.hybrid_config.clip_weight)
                )
            elif vlm_text is not None:
                text_match_score = vlm_text
            # else: keep the CLIP-only score (or the 0.7 default).
            # BUGFIX: the old blend multiplied a possibly-None VLM score,
            # raising TypeError when CLIP was available but the VLM was not.

            issues.extend(vlm_result.issues)

        # Step 5: Weighted overall score
        overall = (
            aesthetic_score * self.config.aesthetic_weight +
            text_match_score * self.config.text_match_weight +
            technical_score * self.config.technical_weight
        )

        score = QualityScore(
            aesthetic_score=aesthetic_score,
            text_match_score=text_match_score,
            technical_score=technical_score,
            overall_score=overall,
            issues=issues,
            evaluation_time_ms=(time.time() - start_time) * 1000
        )

        score.passed = overall >= self.config.overall_threshold

        logger.debug(
            f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, "
            f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms"
        )

        return score
|
||||
|
||||
Reference in New Issue
Block a user