AI-Video/pixelle_video/services/quality/quality_gate.py
56db9bf9d2 feat: Add hybrid quality evaluation system with CLIP and VLM support
- Add FeatureExtractor for CLIP-based image/text feature extraction
- Add ObjectiveMetricsCalculator for technical quality metrics
- Add VLMEvaluator for vision language model evaluation
- Add HybridQualityGate combining objective + VLM evaluation
- Enhance CharacterMemory with visual feature support
- Add quality optional dependency (torch, ftfy, regex)
- Add unit tests for new modules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 15:56:44 +08:00


# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
QualityGate - Quality evaluation system for generated content
Evaluates images and videos based on:
- Aesthetic quality (visual appeal)
- Text-to-image matching (semantic alignment)
- Technical quality (clarity, no artifacts)
Includes HybridQualityGate for combined objective + VLM evaluation.
"""
import time
from dataclasses import dataclass
from typing import Optional
from pathlib import Path
from loguru import logger
from pixelle_video.services.quality.models import QualityScore, QualityConfig
@dataclass
class HybridQualityConfig(QualityConfig):
"""Extended configuration for hybrid quality evaluation"""
# CLIP settings
enable_clip_score: bool = True
clip_model: str = "ViT-B/32"
clip_weight: float = 0.5
# Technical metrics settings
enable_technical_metrics: bool = True
sharpness_threshold: float = 0.3
# Smart VLM skip
enable_smart_skip: bool = True
smart_skip_threshold: float = 0.75
# Feature caching
cache_features: bool = True
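# Illustrative construction (a sketch only; the values shown are just the defaults
# above, and `my_llm_service` is a hypothetical vision-capable LLM client):
#   config = HybridQualityConfig(enable_smart_skip=True, smart_skip_threshold=0.75)
#   gate = HybridQualityGate(llm_service=my_llm_service, config=config)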
class QualityGate:
"""
Quality evaluation gate for AI-generated content
Uses VLM (Vision Language Model) or local models to evaluate:
1. Aesthetic quality - Is the image visually appealing?
2. Text matching - Does the image match the prompt/narration?
3. Technical quality - Is the image clear and free of artifacts?
Example:
>>> gate = QualityGate(llm_service, config)
>>> score = await gate.evaluate_image(
... image_path="output/frame_001.png",
... prompt="A sunset over mountains",
... narration="夕阳西下,余晖洒满山间"
... )
>>> if score.passed:
... print("Image quality approved!")
"""
def __init__(
self,
llm_service=None,
config: Optional[QualityConfig] = None
):
"""
Initialize QualityGate
Args:
llm_service: LLM service for VLM-based evaluation
config: Quality configuration
"""
self.llm_service = llm_service
self.config = config or QualityConfig()
async def evaluate_image(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""
Evaluate the quality of a generated image
Args:
image_path: Path to the image file
prompt: The prompt used to generate the image
narration: Optional narration text for context
Returns:
QualityScore with evaluation results
"""
start_time = time.time()
# Validate image exists
if not Path(image_path).exists():
return QualityScore(
passed=False,
issues=["Image file not found"],
evaluation_time_ms=(time.time() - start_time) * 1000
)
# Evaluate using VLM or fallback to basic checks
if self.config.use_vlm_evaluation and self.llm_service:
score = await self._evaluate_with_vlm(image_path, prompt, narration)
else:
score = await self._evaluate_basic(image_path, prompt)
# Set evaluation time
score.evaluation_time_ms = (time.time() - start_time) * 1000
# Determine if passed
score.passed = score.overall_score >= self.config.overall_threshold
logger.debug(
f"Quality evaluation: overall={score.overall_score:.2f}, "
f"passed={score.passed}, time={score.evaluation_time_ms:.0f}ms"
)
return score
async def evaluate_video(
self,
video_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""
Evaluate the quality of a generated video
Args:
video_path: Path to the video file
prompt: The prompt used to generate the video
narration: Optional narration text for context
Returns:
QualityScore with evaluation results
"""
start_time = time.time()
# Validate video exists
if not Path(video_path).exists():
return QualityScore(
passed=False,
issues=["Video file not found"],
evaluation_time_ms=(time.time() - start_time) * 1000
)
# For video, a fuller evaluation would sample key frames and score each one
# individually (see the sketch below). For now this method delegates to the VLM
# path (itself still a placeholder) or to the basic ffprobe checks.
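# One possible sampling approach (a sketch only, not implemented here):
#   ffmpeg -i <video> -vf fps=1 frame_%03d.png      # grab roughly one frame per second
#   then score each frame with evaluate_image() and aggregate the per-frame
#   results (e.g. mean or min) into a single QualityScore.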
if self.config.use_vlm_evaluation and self.llm_service:
score = await self._evaluate_video_with_vlm(video_path, prompt, narration)
else:
score = await self._evaluate_video_basic(video_path)
score.evaluation_time_ms = (time.time() - start_time) * 1000
score.passed = score.overall_score >= self.config.overall_threshold
return score
async def _evaluate_with_vlm(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""
Evaluate image quality using Vision Language Model
Uses the LLM with vision capability to assess:
- Visual quality and aesthetics
- Prompt-image alignment
- Technical defects
"""
evaluation_prompt = self._build_evaluation_prompt(prompt, narration)
try:
# Calling the LLM with an image requires a vision-capable model (e.g. GPT-4o, Qwen-VL)
# and an llm_service that accepts image input. Until that integration lands, this
# method always falls back to the basic heuristic evaluation below.
# TODO: Implement actual VLM call when integrating with vision-capable LLM
# response = await self.llm_service(
# prompt=evaluation_prompt,
# images=[image_path],
# response_type=ImageQualityResponse
# )
# Fallback to basic evaluation for now
logger.debug("VLM evaluation: using basic fallback (VLM integration pending)")
return await self._evaluate_basic(image_path, prompt)
except Exception as e:
logger.warning(f"VLM evaluation failed: {e}, falling back to basic")
return await self._evaluate_basic(image_path, prompt)
async def _evaluate_basic(
self,
image_path: str,
prompt: str,
) -> QualityScore:
"""
Basic image quality evaluation without VLM
Performs simple checks:
- File size and dimensions
- Image format validation
"""
issues = []
try:
# Import PIL for basic checks
from PIL import Image
with Image.open(image_path) as img:
width, height = img.size
# Check minimum dimensions
if width < 256 or height < 256:
issues.append(f"Image too small: {width}x{height}")
# Check aspect ratio (not too extreme)
aspect = max(width, height) / min(width, height)
if aspect > 4:
issues.append(f"Extreme aspect ratio: {aspect:.1f}")
# Basic scores (generous defaults when VLM not available)
aesthetic_score = 0.7 if not issues else 0.4
text_match_score = 0.7 # Can't properly evaluate without VLM
technical_score = 0.8 if not issues else 0.5
# Calculate overall
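# The three criterion weights come from QualityConfig; the weighted sum below
# assumes they add up to 1.0, keeping the overall score on the same 0-1 scale
# as its components.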
overall = (
aesthetic_score * self.config.aesthetic_weight +
text_match_score * self.config.text_match_weight +
technical_score * self.config.technical_weight
)
return QualityScore(
aesthetic_score=aesthetic_score,
text_match_score=text_match_score,
technical_score=technical_score,
overall_score=overall,
issues=issues,
)
except Exception as e:
logger.error(f"Basic evaluation failed: {e}")
return QualityScore(
overall_score=0.0,
passed=False,
issues=[f"Evaluation error: {str(e)}"]
)
async def _evaluate_video_with_vlm(
self,
video_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""Evaluate video using VLM (placeholder for future implementation)"""
# TODO: Implement video frame sampling and VLM evaluation
return await self._evaluate_video_basic(video_path)
async def _evaluate_video_basic(
self,
video_path: str,
) -> QualityScore:
"""Basic video quality evaluation"""
issues = []
try:
import subprocess
import json
# Use ffprobe to get video info
cmd = [
"ffprobe", "-v", "quiet", "-print_format", "json",
"-show_format", "-show_streams", video_path
]
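# ffprobe with -show_format/-show_streams prints JSON shaped roughly like:
#   {"streams": [{"codec_type": "video", "width": 1920, "height": 1080, ...}],
#    "format": {"duration": "12.5", ...}}
# which is exactly what the lookups below rely on.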
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
issues.append("Failed to read video metadata")
return QualityScore(overall_score=0.5, issues=issues)
info = json.loads(result.stdout)
# Check for video stream
video_stream = None
for stream in info.get("streams", []):
if stream.get("codec_type") == "video":
video_stream = stream
break
if not video_stream:
issues.append("No video stream found")
return QualityScore(overall_score=0.0, passed=False, issues=issues)
# Check dimensions
width = video_stream.get("width", 0)
height = video_stream.get("height", 0)
if width < 256 or height < 256:
issues.append(f"Video too small: {width}x{height}")
# Check duration
duration = float(info.get("format", {}).get("duration", 0))
if duration < 0.5:
issues.append(f"Video too short: {duration:.1f}s")
# Calculate scores
aesthetic_score = 0.7
text_match_score = 0.7
technical_score = 0.8 if not issues else 0.5
overall = (
aesthetic_score * self.config.aesthetic_weight +
text_match_score * self.config.text_match_weight +
technical_score * self.config.technical_weight
)
return QualityScore(
aesthetic_score=aesthetic_score,
text_match_score=text_match_score,
technical_score=technical_score,
overall_score=overall,
issues=issues,
)
except Exception as e:
logger.error(f"Video evaluation failed: {e}")
return QualityScore(
overall_score=0.5,
issues=[f"Evaluation error: {str(e)}"]
)
def _build_evaluation_prompt(
self,
prompt: str,
narration: Optional[str] = None,
) -> str:
"""Build the evaluation prompt for VLM"""
context = f"Narration: {narration}\n" if narration else ""
return f"""Evaluate this AI-generated image on the following criteria.
Rate each from 0.0 to 1.0.
Image Generation Prompt: {prompt}
{context}
Evaluation Criteria:
1. Aesthetic Quality (0.0-1.0):
- Is the image visually appealing?
- Good composition, colors, and style?
2. Prompt Matching (0.0-1.0):
- Does the image accurately represent the prompt?
- Are key elements from the prompt visible?
3. Technical Quality (0.0-1.0):
- Is the image clear and well-defined?
- Free of artifacts, distortions, or blurriness?
- Natural looking (no AI artifacts like extra fingers)?
Respond in JSON format:
{{
"aesthetic_score": 0.0,
"text_match_score": 0.0,
"technical_score": 0.0,
"issues": ["list of any problems found"]
}}
"""
class HybridQualityGate(QualityGate):
"""
Hybrid quality gate combining objective metrics with VLM evaluation
Evaluation flow:
1. Calculate technical metrics (fast, local)
2. Calculate CLIP score if enabled (local, requires CLIP)
3. If smart_skip enabled and objective score >= threshold, skip VLM
4. Otherwise, call VLM for subjective evaluation
5. Combine scores with configurable weights
Example:
>>> gate = HybridQualityGate(llm_service, config)
>>> score = await gate.evaluate_image(
... image_path="frame_001.png",
... prompt="A sunset over mountains"
... )
"""
def __init__(
self,
llm_service=None,
config: Optional[HybridQualityConfig] = None
):
parent_config = config or HybridQualityConfig()
super().__init__(llm_service, parent_config)
self.hybrid_config = parent_config
self._feature_extractor = None
self._metrics_calculator = None
self._vlm_evaluator = None
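# The heavy optional dependencies (e.g. torch/CLIP from the "quality" extra) are
# imported lazily inside the properties below, so constructing a HybridQualityGate
# stays cheap when CLIP scoring or VLM evaluation never actually runs.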
@property
def feature_extractor(self):
"""Lazy-load feature extractor"""
if self._feature_extractor is None:
from pixelle_video.services.quality.feature_extractor import (
FeatureExtractor, FeatureExtractorConfig
)
self._feature_extractor = FeatureExtractor(
FeatureExtractorConfig(
model_name=self.hybrid_config.clip_model,
cache_features=self.hybrid_config.cache_features
)
)
return self._feature_extractor
@property
def metrics_calculator(self):
"""Lazy-load metrics calculator"""
if self._metrics_calculator is None:
from pixelle_video.services.quality.objective_metrics import (
ObjectiveMetricsCalculator
)
self._metrics_calculator = ObjectiveMetricsCalculator(
sharpness_threshold=self.hybrid_config.sharpness_threshold
)
return self._metrics_calculator
@property
def vlm_evaluator(self):
"""Lazy-load VLM evaluator"""
if self._vlm_evaluator is None:
from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator
self._vlm_evaluator = VLMEvaluator(self.llm_service)
return self._vlm_evaluator
async def evaluate_image(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None,
) -> QualityScore:
"""Evaluate image quality using hybrid approach"""
start_time = time.time()
issues = []
if not Path(image_path).exists():
return QualityScore(
passed=False,
issues=["Image file not found"],
evaluation_time_ms=(time.time() - start_time) * 1000
)
# Step 1: Technical metrics (fast, local)
technical_score = 0.7
technical_metrics = None
if self.hybrid_config.enable_technical_metrics:
technical_metrics = self.metrics_calculator.analyze_image(image_path)
technical_score = technical_metrics.overall_technical
issues.extend(technical_metrics.issues)
# Step 2: CLIP score (if available)
clip_score = None
text_match_score = 0.7
if self.hybrid_config.enable_clip_score:
clip_score = self.feature_extractor.calculate_clip_score(
image_path, prompt
)
if clip_score is not None:
text_match_score = clip_score
# Step 3: Determine if VLM needed
objective_score = (technical_score + text_match_score) / 2
use_vlm = True
aesthetic_score = 0.7
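# If the VLM ends up skipped below, this neutral default stands in for aesthetics
# and the overall score is carried by the objective signals (technical + CLIP).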
if self.hybrid_config.enable_smart_skip:
if objective_score >= self.hybrid_config.smart_skip_threshold:
use_vlm = False
logger.debug(f"Smart skip: {objective_score:.2f} >= threshold")
# Step 4: VLM evaluation (if needed)
if use_vlm and self.config.use_vlm_evaluation and self.llm_service:
vlm_result = await self.vlm_evaluator.evaluate_image(
image_path, prompt, narration
)
aesthetic_score = vlm_result.aesthetic_score or 0.7
if clip_score is not None:
text_match_score = (
clip_score * self.hybrid_config.clip_weight +
(vlm_result.text_match_score or 0.7) * (1 - self.hybrid_config.clip_weight)
)
else:
text_match_score = vlm_result.text_match_score or 0.7
issues.extend(vlm_result.issues)
# Step 5: Calculate overall
overall = (
aesthetic_score * self.config.aesthetic_weight +
text_match_score * self.config.text_match_weight +
technical_score * self.config.technical_weight
)
score = QualityScore(
aesthetic_score=aesthetic_score,
text_match_score=text_match_score,
technical_score=technical_score,
overall_score=overall,
issues=issues,
evaluation_time_ms=(time.time() - start_time) * 1000
)
score.passed = overall >= self.config.overall_threshold
logger.debug(
f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, "
f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms"
)
return score
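# End-to-end sketch (illustrative; assumes an async caller and a configured,
# vision-capable llm_service):
#   gate = HybridQualityGate(llm_service, HybridQualityConfig())
#   score = await gate.evaluate_image("output/frame_001.png", prompt="A sunset over mountains")
#   if not score.passed:
#       logger.warning(f"Quality gate rejected frame: {score.issues}")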