- Add FeatureExtractor for CLIP-based image/text feature extraction
- Add ObjectiveMetricsCalculator for technical quality metrics
- Add VLMEvaluator for vision language model evaluation
- Add HybridQualityGate combining objective + VLM evaluation
- Enhance CharacterMemory with visual feature support
- Add quality optional dependency (torch, ftfy, regex)
- Add unit tests for new modules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
QualityGate - Quality evaluation system for generated content

Evaluates images and videos based on:
- Aesthetic quality (visual appeal)
- Text-to-image matching (semantic alignment)
- Technical quality (clarity, no artifacts)

Includes HybridQualityGate for combined objective + VLM evaluation.
"""

import time
from dataclasses import dataclass
from typing import Optional
from pathlib import Path

from loguru import logger

from pixelle_video.services.quality.models import QualityScore, QualityConfig


@dataclass
class HybridQualityConfig(QualityConfig):
    """Extended configuration for hybrid quality evaluation"""

    # CLIP settings
    enable_clip_score: bool = True
    clip_model: str = "ViT-B/32"
    clip_weight: float = 0.5

    # Technical metrics settings
    enable_technical_metrics: bool = True
    sharpness_threshold: float = 0.3

    # Smart VLM skip
    enable_smart_skip: bool = True
    smart_skip_threshold: float = 0.75

    # Feature caching
    cache_features: bool = True
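
    # How these knobs interact (see HybridQualityGate.evaluate_image below):
    # - clip_weight blends the CLIP score with the VLM text-match score when
    #   both are available (1.0 = trust CLIP only, 0.0 = trust the VLM only).
    # - smart_skip_threshold is compared against the mean of the technical and
    #   text-match scores; clearing it skips the slower VLM call entirely.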


class QualityGate:
    """
    Quality evaluation gate for AI-generated content

    Uses VLM (Vision Language Model) or local models to evaluate:
    1. Aesthetic quality - Is the image visually appealing?
    2. Text matching - Does the image match the prompt/narration?
    3. Technical quality - Is the image clear and free of artifacts?

    Example:
        >>> gate = QualityGate(llm_service, config)
        >>> score = await gate.evaluate_image(
        ...     image_path="output/frame_001.png",
        ...     prompt="A sunset over mountains",
        ...     narration="夕阳西下,余晖洒满山间"
        ... )
        >>> if score.passed:
        ...     print("Image quality approved!")
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[QualityConfig] = None
    ):
        """
        Initialize QualityGate

        Args:
            llm_service: LLM service for VLM-based evaluation
            config: Quality configuration
        """
        self.llm_service = llm_service
        self.config = config or QualityConfig()

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """
        Evaluate the quality of a generated image

        Args:
            image_path: Path to the image file
            prompt: The prompt used to generate the image
            narration: Optional narration text for context

        Returns:
            QualityScore with evaluation results
        """
        start_time = time.time()

        # Validate image exists
        if not Path(image_path).exists():
            return QualityScore(
                passed=False,
                issues=["Image file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000
            )

        # Evaluate using VLM or fall back to basic checks
        if self.config.use_vlm_evaluation and self.llm_service:
            score = await self._evaluate_with_vlm(image_path, prompt, narration)
        else:
            score = await self._evaluate_basic(image_path, prompt)

        # Set evaluation time
        score.evaluation_time_ms = (time.time() - start_time) * 1000

        # Determine if passed
        score.passed = score.overall_score >= self.config.overall_threshold

        logger.debug(
            f"Quality evaluation: overall={score.overall_score:.2f}, "
            f"passed={score.passed}, time={score.evaluation_time_ms:.0f}ms"
        )

        return score

    async def evaluate_video(
        self,
        video_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """
        Evaluate the quality of a generated video

        Args:
            video_path: Path to the video file
            prompt: The prompt used to generate the video
            narration: Optional narration text for context

        Returns:
            QualityScore with evaluation results
        """
        start_time = time.time()

        # Validate video exists
        if not Path(video_path).exists():
            return QualityScore(
                passed=False,
                issues=["Video file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000
            )

        # For video, we can extract key frames and evaluate
        # For now, use VLM with video input or sample frames
        if self.config.use_vlm_evaluation and self.llm_service:
            score = await self._evaluate_video_with_vlm(video_path, prompt, narration)
        else:
            score = await self._evaluate_video_basic(video_path)

        score.evaluation_time_ms = (time.time() - start_time) * 1000
        score.passed = score.overall_score >= self.config.overall_threshold

        return score

    async def _evaluate_with_vlm(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """
        Evaluate image quality using Vision Language Model

        Uses the LLM with vision capability to assess:
        - Visual quality and aesthetics
        - Prompt-image alignment
        - Technical defects
        """
        evaluation_prompt = self._build_evaluation_prompt(prompt, narration)

        try:
            # Call LLM with image (requires VLM-capable model like GPT-4o, Qwen-VL)
            # Note: This requires the LLM service to support vision input
            # For now, we'll use a basic score if VLM is not available

            # TODO: Implement actual VLM call when integrating with vision-capable LLM
            # response = await self.llm_service(
            #     prompt=evaluation_prompt,
            #     images=[image_path],
            #     response_type=ImageQualityResponse
            # )
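            #
            # ImageQualityResponse is assumed to mirror the JSON schema requested
            # by _build_evaluation_prompt (aesthetic_score, text_match_score,
            # technical_score, issues), so its fields would map directly onto
            # QualityScore once the call above is implemented.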

            # Fall back to basic evaluation for now
            logger.debug("VLM evaluation: using basic fallback (VLM integration pending)")
            return await self._evaluate_basic(image_path, prompt)

        except Exception as e:
            logger.warning(f"VLM evaluation failed: {e}, falling back to basic")
            return await self._evaluate_basic(image_path, prompt)

    async def _evaluate_basic(
        self,
        image_path: str,
        prompt: str,
    ) -> QualityScore:
        """
        Basic image quality evaluation without VLM

        Performs simple checks:
        - File size and dimensions
        - Image format validation
        """
        issues = []

        try:
            # Import PIL for basic checks
            from PIL import Image

            with Image.open(image_path) as img:
                width, height = img.size

                # Check minimum dimensions
                if width < 256 or height < 256:
                    issues.append(f"Image too small: {width}x{height}")

                # Check aspect ratio (not too extreme)
                aspect = max(width, height) / min(width, height)
                if aspect > 4:
                    issues.append(f"Extreme aspect ratio: {aspect:.1f}")

            # Basic scores (generous defaults when VLM not available)
            aesthetic_score = 0.7 if not issues else 0.4
            text_match_score = 0.7  # Can't properly evaluate without VLM
            technical_score = 0.8 if not issues else 0.5
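
            # The three weights come from QualityConfig and are assumed to sum
            # to 1.0, so the weighted overall stays within [0.0, 1.0].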
            # Calculate overall
            overall = (
                aesthetic_score * self.config.aesthetic_weight +
                text_match_score * self.config.text_match_weight +
                technical_score * self.config.technical_weight
            )

            return QualityScore(
                aesthetic_score=aesthetic_score,
                text_match_score=text_match_score,
                technical_score=technical_score,
                overall_score=overall,
                issues=issues,
            )

        except Exception as e:
            logger.error(f"Basic evaluation failed: {e}")
            return QualityScore(
                overall_score=0.0,
                passed=False,
                issues=[f"Evaluation error: {str(e)}"]
            )

    async def _evaluate_video_with_vlm(
        self,
        video_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """Evaluate video using VLM (placeholder for future implementation)"""
        # TODO: Implement video frame sampling and VLM evaluation
        return await self._evaluate_video_basic(video_path)

    async def _evaluate_video_basic(
        self,
        video_path: str,
    ) -> QualityScore:
        """Basic video quality evaluation"""
        issues = []

        try:
            import subprocess
            import json

            # Use ffprobe to get video info
            cmd = [
                "ffprobe", "-v", "quiet", "-print_format", "json",
                "-show_format", "-show_streams", video_path
            ]
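            # ffprobe ships with ffmpeg and must be on PATH; if it is missing,
            # subprocess.run raises FileNotFoundError, which is caught by the
            # except block below.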
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode != 0:
                issues.append("Failed to read video metadata")
                return QualityScore(overall_score=0.5, issues=issues)

            info = json.loads(result.stdout)

            # Check for video stream
            video_stream = None
            for stream in info.get("streams", []):
                if stream.get("codec_type") == "video":
                    video_stream = stream
                    break

            if not video_stream:
                issues.append("No video stream found")
                return QualityScore(overall_score=0.0, passed=False, issues=issues)

            # Check dimensions
            width = video_stream.get("width", 0)
            height = video_stream.get("height", 0)
            if width < 256 or height < 256:
                issues.append(f"Video too small: {width}x{height}")

            # Check duration
            duration = float(info.get("format", {}).get("duration", 0))
            if duration < 0.5:
                issues.append(f"Video too short: {duration:.1f}s")

            # Calculate scores
            aesthetic_score = 0.7
            text_match_score = 0.7
            technical_score = 0.8 if not issues else 0.5

            overall = (
                aesthetic_score * self.config.aesthetic_weight +
                text_match_score * self.config.text_match_weight +
                technical_score * self.config.technical_weight
            )

            return QualityScore(
                aesthetic_score=aesthetic_score,
                text_match_score=text_match_score,
                technical_score=technical_score,
                overall_score=overall,
                issues=issues,
            )

        except Exception as e:
            logger.error(f"Video evaluation failed: {e}")
            return QualityScore(
                overall_score=0.5,
                issues=[f"Evaluation error: {str(e)}"]
            )

    def _build_evaluation_prompt(
        self,
        prompt: str,
        narration: Optional[str] = None,
    ) -> str:
        """Build the evaluation prompt for VLM"""
        context = f"Narration: {narration}\n" if narration else ""

        return f"""Evaluate this AI-generated image on the following criteria.
Rate each from 0.0 to 1.0.

Image Generation Prompt: {prompt}
{context}
Evaluation Criteria:

1. Aesthetic Quality (0.0-1.0):
   - Is the image visually appealing?
   - Good composition, colors, and style?

2. Prompt Matching (0.0-1.0):
   - Does the image accurately represent the prompt?
   - Are key elements from the prompt visible?

3. Technical Quality (0.0-1.0):
   - Is the image clear and well-defined?
   - Free of artifacts, distortions, or blurriness?
   - Natural looking (no AI artifacts like extra fingers)?

Respond in JSON format:
{{
    "aesthetic_score": 0.0,
    "text_match_score": 0.0,
    "technical_score": 0.0,
    "issues": ["list of any problems found"]
}}
"""


class HybridQualityGate(QualityGate):
    """
    Hybrid quality gate combining objective metrics with VLM evaluation

    Evaluation flow:
    1. Calculate technical metrics (fast, local)
    2. Calculate CLIP score if enabled (local, requires CLIP)
    3. If smart_skip enabled and objective score >= threshold, skip VLM
    4. Otherwise, call VLM for subjective evaluation
    5. Combine scores with configurable weights

    Example:
        >>> gate = HybridQualityGate(llm_service, config)
        >>> score = await gate.evaluate_image(
        ...     image_path="frame_001.png",
        ...     prompt="A sunset over mountains"
        ... )
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[HybridQualityConfig] = None
    ):
        parent_config = config or HybridQualityConfig()
        super().__init__(llm_service, parent_config)

        self.hybrid_config = parent_config
        self._feature_extractor = None
        self._metrics_calculator = None
        self._vlm_evaluator = None
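
    # The three evaluator helpers below are lazy-loaded so that heavyweight
    # optional dependencies (e.g. CLIP/torch for the feature extractor) are
    # only imported when hybrid evaluation is actually used.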

    @property
    def feature_extractor(self):
        """Lazy-load feature extractor"""
        if self._feature_extractor is None:
            from pixelle_video.services.quality.feature_extractor import (
                FeatureExtractor, FeatureExtractorConfig
            )
            self._feature_extractor = FeatureExtractor(
                FeatureExtractorConfig(
                    model_name=self.hybrid_config.clip_model,
                    cache_features=self.hybrid_config.cache_features
                )
            )
        return self._feature_extractor

    @property
    def metrics_calculator(self):
        """Lazy-load metrics calculator"""
        if self._metrics_calculator is None:
            from pixelle_video.services.quality.objective_metrics import (
                ObjectiveMetricsCalculator
            )
            self._metrics_calculator = ObjectiveMetricsCalculator(
                sharpness_threshold=self.hybrid_config.sharpness_threshold
            )
        return self._metrics_calculator

    @property
    def vlm_evaluator(self):
        """Lazy-load VLM evaluator"""
        if self._vlm_evaluator is None:
            from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator
            self._vlm_evaluator = VLMEvaluator(self.llm_service)
        return self._vlm_evaluator

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """Evaluate image quality using hybrid approach"""
        start_time = time.time()
        issues = []

        if not Path(image_path).exists():
            return QualityScore(
                passed=False,
                issues=["Image file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000
            )

        # Step 1: Technical metrics (fast, local)
        technical_score = 0.7
        technical_metrics = None

        if self.hybrid_config.enable_technical_metrics:
            technical_metrics = self.metrics_calculator.analyze_image(image_path)
            technical_score = technical_metrics.overall_technical
            issues.extend(technical_metrics.issues)

        # Step 2: CLIP score (if available)
        clip_score = None
        text_match_score = 0.7
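
        # calculate_clip_score is expected to return a similarity already scaled
        # to [0, 1], or None (e.g. when the optional CLIP dependency is
        # unavailable), so it can stand in directly for the text-match score.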
        if self.hybrid_config.enable_clip_score:
            clip_score = self.feature_extractor.calculate_clip_score(
                image_path, prompt
            )
            if clip_score is not None:
                text_match_score = clip_score

        # Step 3: Determine if VLM needed
        objective_score = (technical_score + text_match_score) / 2
        use_vlm = True
        aesthetic_score = 0.7
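
        # Smart skip: when the unweighted mean of the technical and text-match
        # scores already clears the threshold, the slower VLM call is skipped
        # and aesthetic_score keeps its 0.7 default.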
        if self.hybrid_config.enable_smart_skip:
            if objective_score >= self.hybrid_config.smart_skip_threshold:
                use_vlm = False
                logger.debug(f"Smart skip: {objective_score:.2f} >= threshold")

        # Step 4: VLM evaluation (if needed)
        if use_vlm and self.config.use_vlm_evaluation and self.llm_service:
            vlm_result = await self.vlm_evaluator.evaluate_image(
                image_path, prompt, narration
            )
            aesthetic_score = vlm_result.aesthetic_score or 0.7
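
            # Blend text-match scores: clip_weight * CLIP + (1 - clip_weight) * VLM.
            # With the default clip_weight of 0.5 this is a simple average.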
            if clip_score is not None:
                # Guard against a missing VLM score (same fallback as the else branch)
                text_match_score = (
                    clip_score * self.hybrid_config.clip_weight +
                    (vlm_result.text_match_score or 0.7) * (1 - self.hybrid_config.clip_weight)
                )
            else:
                text_match_score = vlm_result.text_match_score or 0.7

            issues.extend(vlm_result.issues)

        # Step 5: Calculate overall
        overall = (
            aesthetic_score * self.config.aesthetic_weight +
            text_match_score * self.config.text_match_weight +
            technical_score * self.config.technical_weight
        )

        score = QualityScore(
            aesthetic_score=aesthetic_score,
            text_match_score=text_match_score,
            technical_score=technical_score,
            overall_score=overall,
            issues=issues,
            evaluation_time_ms=(time.time() - start_time) * 1000
        )

        score.passed = overall >= self.config.overall_threshold

        logger.debug(
            f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, "
            f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms"
        )

        return score