# Copyright (C) 2025 AIDC-AI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ QualityGate - Quality evaluation system for generated content Evaluates images and videos based on: - Aesthetic quality (visual appeal) - Text-to-image matching (semantic alignment) - Technical quality (clarity, no artifacts) Includes HybridQualityGate for combined objective + VLM evaluation. """ import time from dataclasses import dataclass from typing import Optional from pathlib import Path from loguru import logger from pixelle_video.services.quality.models import QualityScore, QualityConfig @dataclass class HybridQualityConfig(QualityConfig): """Extended configuration for hybrid quality evaluation""" # CLIP settings enable_clip_score: bool = True clip_model: str = "ViT-B/32" clip_weight: float = 0.5 # Technical metrics settings enable_technical_metrics: bool = True sharpness_threshold: float = 0.3 # Smart VLM skip enable_smart_skip: bool = True smart_skip_threshold: float = 0.75 # Feature caching cache_features: bool = True class QualityGate: """ Quality evaluation gate for AI-generated content Uses VLM (Vision Language Model) or local models to evaluate: 1. Aesthetic quality - Is the image visually appealing? 2. Text matching - Does the image match the prompt/narration? 3. Technical quality - Is the image clear and free of artifacts? Example: >>> gate = QualityGate(llm_service, config) >>> score = await gate.evaluate_image( ... image_path="output/frame_001.png", ... prompt="A sunset over mountains", ... narration="夕阳西下,余晖洒满山间" ... ) >>> if score.passed: ... print("Image quality approved!") """ def __init__( self, llm_service=None, config: Optional[QualityConfig] = None ): """ Initialize QualityGate Args: llm_service: LLM service for VLM-based evaluation config: Quality configuration """ self.llm_service = llm_service self.config = config or QualityConfig() async def evaluate_image( self, image_path: str, prompt: str, narration: Optional[str] = None, ) -> QualityScore: """ Evaluate the quality of a generated image Args: image_path: Path to the image file prompt: The prompt used to generate the image narration: Optional narration text for context Returns: QualityScore with evaluation results """ start_time = time.time() issues = [] # Validate image exists if not Path(image_path).exists(): return QualityScore( passed=False, issues=["Image file not found"], evaluation_time_ms=(time.time() - start_time) * 1000 ) # Evaluate using VLM or fallback to basic checks if self.config.use_vlm_evaluation and self.llm_service: score = await self._evaluate_with_vlm(image_path, prompt, narration) else: score = await self._evaluate_basic(image_path, prompt) # Set evaluation time score.evaluation_time_ms = (time.time() - start_time) * 1000 # Determine if passed score.passed = score.overall_score >= self.config.overall_threshold logger.debug( f"Quality evaluation: overall={score.overall_score:.2f}, " f"passed={score.passed}, time={score.evaluation_time_ms:.0f}ms" ) return score async def evaluate_video( self, video_path: str, prompt: str, narration: Optional[str] = None, ) -> QualityScore: """ Evaluate the quality of a generated video Args: video_path: Path to the video file prompt: The prompt used to generate the video narration: Optional narration text for context Returns: QualityScore with evaluation results """ start_time = time.time() # Validate video exists if not Path(video_path).exists(): return QualityScore( passed=False, issues=["Video file not found"], evaluation_time_ms=(time.time() - start_time) * 1000 ) # For video, we can extract key frames and evaluate # For now, use VLM with video input or sample frames if self.config.use_vlm_evaluation and self.llm_service: score = await self._evaluate_video_with_vlm(video_path, prompt, narration) else: score = await self._evaluate_video_basic(video_path) score.evaluation_time_ms = (time.time() - start_time) * 1000 score.passed = score.overall_score >= self.config.overall_threshold return score async def _evaluate_with_vlm( self, image_path: str, prompt: str, narration: Optional[str] = None, ) -> QualityScore: """ Evaluate image quality using Vision Language Model Uses the LLM with vision capability to assess: - Visual quality and aesthetics - Prompt-image alignment - Technical defects """ evaluation_prompt = self._build_evaluation_prompt(prompt, narration) try: # Call LLM with image (requires VLM-capable model like GPT-4o, Qwen-VL) # Note: This requires the LLM service to support vision input # For now, we'll use a basic score if VLM is not available # TODO: Implement actual VLM call when integrating with vision-capable LLM # response = await self.llm_service( # prompt=evaluation_prompt, # images=[image_path], # response_type=ImageQualityResponse # ) # Fallback to basic evaluation for now logger.debug("VLM evaluation: using basic fallback (VLM integration pending)") return await self._evaluate_basic(image_path, prompt) except Exception as e: logger.warning(f"VLM evaluation failed: {e}, falling back to basic") return await self._evaluate_basic(image_path, prompt) async def _evaluate_basic( self, image_path: str, prompt: str, ) -> QualityScore: """ Basic image quality evaluation without VLM Performs simple checks: - File size and dimensions - Image format validation """ issues = [] try: # Import PIL for basic checks from PIL import Image with Image.open(image_path) as img: width, height = img.size # Check minimum dimensions if width < 256 or height < 256: issues.append(f"Image too small: {width}x{height}") # Check aspect ratio (not too extreme) aspect = max(width, height) / min(width, height) if aspect > 4: issues.append(f"Extreme aspect ratio: {aspect:.1f}") # Basic scores (generous defaults when VLM not available) aesthetic_score = 0.7 if not issues else 0.4 text_match_score = 0.7 # Can't properly evaluate without VLM technical_score = 0.8 if not issues else 0.5 # Calculate overall overall = ( aesthetic_score * self.config.aesthetic_weight + text_match_score * self.config.text_match_weight + technical_score * self.config.technical_weight ) return QualityScore( aesthetic_score=aesthetic_score, text_match_score=text_match_score, technical_score=technical_score, overall_score=overall, issues=issues, ) except Exception as e: logger.error(f"Basic evaluation failed: {e}") return QualityScore( overall_score=0.0, passed=False, issues=[f"Evaluation error: {str(e)}"] ) async def _evaluate_video_with_vlm( self, video_path: str, prompt: str, narration: Optional[str] = None, ) -> QualityScore: """Evaluate video using VLM (placeholder for future implementation)""" # TODO: Implement video frame sampling and VLM evaluation return await self._evaluate_video_basic(video_path) async def _evaluate_video_basic( self, video_path: str, ) -> QualityScore: """Basic video quality evaluation""" issues = [] try: import subprocess import json # Use ffprobe to get video info cmd = [ "ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", "-show_streams", video_path ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: issues.append("Failed to read video metadata") return QualityScore(overall_score=0.5, issues=issues) info = json.loads(result.stdout) # Check for video stream video_stream = None for stream in info.get("streams", []): if stream.get("codec_type") == "video": video_stream = stream break if not video_stream: issues.append("No video stream found") return QualityScore(overall_score=0.0, passed=False, issues=issues) # Check dimensions width = video_stream.get("width", 0) height = video_stream.get("height", 0) if width < 256 or height < 256: issues.append(f"Video too small: {width}x{height}") # Check duration duration = float(info.get("format", {}).get("duration", 0)) if duration < 0.5: issues.append(f"Video too short: {duration:.1f}s") # Calculate scores aesthetic_score = 0.7 text_match_score = 0.7 technical_score = 0.8 if not issues else 0.5 overall = ( aesthetic_score * self.config.aesthetic_weight + text_match_score * self.config.text_match_weight + technical_score * self.config.technical_weight ) return QualityScore( aesthetic_score=aesthetic_score, text_match_score=text_match_score, technical_score=technical_score, overall_score=overall, issues=issues, ) except Exception as e: logger.error(f"Video evaluation failed: {e}") return QualityScore( overall_score=0.5, issues=[f"Evaluation error: {str(e)}"] ) def _build_evaluation_prompt( self, prompt: str, narration: Optional[str] = None, ) -> str: """Build the evaluation prompt for VLM""" context = f"Narration: {narration}\n" if narration else "" return f"""Evaluate this AI-generated image on the following criteria. Rate each from 0.0 to 1.0. Image Generation Prompt: {prompt} {context} Evaluation Criteria: 1. Aesthetic Quality (0.0-1.0): - Is the image visually appealing? - Good composition, colors, and style? 2. Prompt Matching (0.0-1.0): - Does the image accurately represent the prompt? - Are key elements from the prompt visible? 3. Technical Quality (0.0-1.0): - Is the image clear and well-defined? - Free of artifacts, distortions, or blurriness? - Natural looking (no AI artifacts like extra fingers)? Respond in JSON format: {{ "aesthetic_score": 0.0, "text_match_score": 0.0, "technical_score": 0.0, "issues": ["list of any problems found"] }} """ class HybridQualityGate(QualityGate): """ Hybrid quality gate combining objective metrics with VLM evaluation Evaluation flow: 1. Calculate technical metrics (fast, local) 2. Calculate CLIP score if enabled (local, requires CLIP) 3. If smart_skip enabled and objective score >= threshold, skip VLM 4. Otherwise, call VLM for subjective evaluation 5. Combine scores with configurable weights Example: >>> gate = HybridQualityGate(llm_service, config) >>> score = await gate.evaluate_image( ... image_path="frame_001.png", ... prompt="A sunset over mountains" ... ) """ def __init__( self, llm_service=None, config: Optional[HybridQualityConfig] = None ): parent_config = config or HybridQualityConfig() super().__init__(llm_service, parent_config) self.hybrid_config = parent_config self._feature_extractor = None self._metrics_calculator = None self._vlm_evaluator = None @property def feature_extractor(self): """Lazy-load feature extractor""" if self._feature_extractor is None: from pixelle_video.services.quality.feature_extractor import ( FeatureExtractor, FeatureExtractorConfig ) self._feature_extractor = FeatureExtractor( FeatureExtractorConfig( model_name=self.hybrid_config.clip_model, cache_features=self.hybrid_config.cache_features ) ) return self._feature_extractor @property def metrics_calculator(self): """Lazy-load metrics calculator""" if self._metrics_calculator is None: from pixelle_video.services.quality.objective_metrics import ( ObjectiveMetricsCalculator ) self._metrics_calculator = ObjectiveMetricsCalculator( sharpness_threshold=self.hybrid_config.sharpness_threshold ) return self._metrics_calculator @property def vlm_evaluator(self): """Lazy-load VLM evaluator""" if self._vlm_evaluator is None: from pixelle_video.services.quality.vlm_evaluator import VLMEvaluator self._vlm_evaluator = VLMEvaluator(self.llm_service) return self._vlm_evaluator async def evaluate_image( self, image_path: str, prompt: str, narration: Optional[str] = None, ) -> QualityScore: """Evaluate image quality using hybrid approach""" start_time = time.time() issues = [] if not Path(image_path).exists(): return QualityScore( passed=False, issues=["Image file not found"], evaluation_time_ms=(time.time() - start_time) * 1000 ) # Step 1: Technical metrics (fast, local) technical_score = 0.7 technical_metrics = None if self.hybrid_config.enable_technical_metrics: technical_metrics = self.metrics_calculator.analyze_image(image_path) technical_score = technical_metrics.overall_technical issues.extend(technical_metrics.issues) # Step 2: CLIP score (if available) clip_score = None text_match_score = 0.7 if self.hybrid_config.enable_clip_score: clip_score = self.feature_extractor.calculate_clip_score( image_path, prompt ) if clip_score is not None: text_match_score = clip_score # Step 3: Determine if VLM needed objective_score = (technical_score + text_match_score) / 2 use_vlm = True aesthetic_score = 0.7 if self.hybrid_config.enable_smart_skip: if objective_score >= self.hybrid_config.smart_skip_threshold: use_vlm = False logger.debug(f"Smart skip: {objective_score:.2f} >= threshold") # Step 4: VLM evaluation (if needed) if use_vlm and self.config.use_vlm_evaluation and self.llm_service: vlm_result = await self.vlm_evaluator.evaluate_image( image_path, prompt, narration ) aesthetic_score = vlm_result.aesthetic_score or 0.7 if clip_score is not None: text_match_score = ( clip_score * self.hybrid_config.clip_weight + vlm_result.text_match_score * (1 - self.hybrid_config.clip_weight) ) else: text_match_score = vlm_result.text_match_score or 0.7 issues.extend(vlm_result.issues) # Step 5: Calculate overall overall = ( aesthetic_score * self.config.aesthetic_weight + text_match_score * self.config.text_match_weight + technical_score * self.config.technical_weight ) score = QualityScore( aesthetic_score=aesthetic_score, text_match_score=text_match_score, technical_score=technical_score, overall_score=overall, issues=issues, evaluation_time_ms=(time.time() - start_time) * 1000 ) score.passed = overall >= self.config.overall_threshold logger.debug( f"Hybrid eval: overall={overall:.2f}, clip={clip_score}, " f"vlm_used={use_vlm}, time={score.evaluation_time_ms:.0f}ms" ) return score