# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
|
|
QualityGate - Quality evaluation system for generated content
|
|
|
|
Evaluates images and videos based on:
|
|
- Aesthetic quality (visual appeal)
|
|
- Text-to-image matching (semantic alignment)
|
|
- Technical quality (clarity, no artifacts)
|
|
"""
|
|
|
|
import time
|
|
from typing import Optional
|
|
from pathlib import Path
|
|
|
|
from loguru import logger
|
|
|
|
from pixelle_video.services.quality.models import QualityScore, QualityConfig
|
|
|
|
|
|
class QualityGate:
    """
    Quality evaluation gate for AI-generated content.

    Uses a VLM (Vision Language Model) or local models to evaluate:
    1. Aesthetic quality - Is the image visually appealing?
    2. Text matching - Does the image match the prompt/narration?
    3. Technical quality - Is the image clear and free of artifacts?

    Example:
        >>> gate = QualityGate(llm_service, config)
        >>> score = await gate.evaluate_image(
        ...     image_path="output/frame_001.png",
        ...     prompt="A sunset over mountains",
        ...     narration="The setting sun spills its afterglow over the mountains"
        ... )
        >>> if score.passed:
        ...     print("Image quality approved!")
    """

    def __init__(
        self,
        llm_service=None,
        config: Optional[QualityConfig] = None,
    ):
        """
        Initialize QualityGate.

        Args:
            llm_service: LLM service for VLM-based evaluation
            config: Quality configuration (defaults to QualityConfig())
        """
        self.llm_service = llm_service
        self.config = config or QualityConfig()

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """
        Evaluate the quality of a generated image.

        Args:
            image_path: Path to the image file
            prompt: The prompt used to generate the image
            narration: Optional narration text for context

        Returns:
            QualityScore with evaluation results
        """
        start_time = time.time()

        # Validate that the image exists
        if not Path(image_path).exists():
            return QualityScore(
                passed=False,
                issues=["Image file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000,
            )

        # Evaluate using the VLM, or fall back to basic checks
        if self.config.use_vlm_evaluation and self.llm_service:
            score = await self._evaluate_with_vlm(image_path, prompt, narration)
        else:
            score = await self._evaluate_basic(image_path, prompt)

        # Set evaluation time
        score.evaluation_time_ms = (time.time() - start_time) * 1000

        # Determine pass/fail against the configured overall threshold
        score.passed = score.overall_score >= self.config.overall_threshold

        logger.debug(
            f"Quality evaluation: overall={score.overall_score:.2f}, "
            f"passed={score.passed}, time={score.evaluation_time_ms:.0f}ms"
        )

        return score

    async def evaluate_video(
        self,
        video_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """
        Evaluate the quality of a generated video.

        Args:
            video_path: Path to the video file
            prompt: The prompt used to generate the video
            narration: Optional narration text for context

        Returns:
            QualityScore with evaluation results
        """
        start_time = time.time()

        # Validate that the video exists
        if not Path(video_path).exists():
            return QualityScore(
                passed=False,
                issues=["Video file not found"],
                evaluation_time_ms=(time.time() - start_time) * 1000,
            )

        # A full implementation would extract key frames (or pass the video
        # itself) to a VLM; the VLM path is currently a placeholder.
        if self.config.use_vlm_evaluation and self.llm_service:
            score = await self._evaluate_video_with_vlm(video_path, prompt, narration)
        else:
            score = await self._evaluate_video_basic(video_path)

        score.evaluation_time_ms = (time.time() - start_time) * 1000
        score.passed = score.overall_score >= self.config.overall_threshold

        return score

    async def _evaluate_with_vlm(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """
        Evaluate image quality using a Vision Language Model.

        Uses an LLM with vision capability to assess:
        - Visual quality and aesthetics
        - Prompt-image alignment
        - Technical defects
        """
        evaluation_prompt = self._build_evaluation_prompt(prompt, narration)

        try:
            # Calling the LLM with an image requires a VLM-capable model
            # (e.g. GPT-4o, Qwen-VL) and an llm_service that supports vision
            # input. Until that integration lands, this method always falls
            # back to the basic checks.
            #
            # TODO: Implement the actual VLM call when integrating with a
            # vision-capable LLM, roughly:
            # response = await self.llm_service(
            #     prompt=evaluation_prompt,
            #     images=[image_path],
            #     response_type=ImageQualityResponse,
            # )
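            #
            # ImageQualityResponse does not exist yet; a hypothetical sketch
            # of its shape, mirroring the JSON schema that
            # _build_evaluation_prompt requests (the name and fields are
            # assumptions, not an existing API):
            #
            # class ImageQualityResponse(BaseModel):
            #     aesthetic_score: float
            #     text_match_score: float
            #     technical_score: float
            #     issues: list[str]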

            # Fall back to basic evaluation for now
            logger.debug("VLM evaluation: using basic fallback (VLM integration pending)")
            return await self._evaluate_basic(image_path, prompt)

        except Exception as e:
            logger.warning(f"VLM evaluation failed: {e}, falling back to basic")
            return await self._evaluate_basic(image_path, prompt)

    async def _evaluate_basic(
        self,
        image_path: str,
        prompt: str,
    ) -> QualityScore:
        """
        Basic image quality evaluation without a VLM.

        Performs simple checks:
        - File size and dimensions
        - Image format validation
        """
        issues = []

        try:
            # Import PIL lazily; it is only needed for the basic checks
            from PIL import Image

            with Image.open(image_path) as img:
                width, height = img.size

                # Check minimum dimensions
                if width < 256 or height < 256:
                    issues.append(f"Image too small: {width}x{height}")

                # Check aspect ratio (flag anything too extreme)
                aspect = max(width, height) / min(width, height)
                if aspect > 4:
                    issues.append(f"Extreme aspect ratio: {aspect:.1f}")

            # Basic scores (generous defaults when a VLM is not available)
            aesthetic_score = 0.7 if not issues else 0.4
            text_match_score = 0.7  # Cannot properly evaluate without a VLM
            technical_score = 0.8 if not issues else 0.5

            # Weighted overall score (weights are assumed to sum to 1.0)
            overall = (
                aesthetic_score * self.config.aesthetic_weight
                + text_match_score * self.config.text_match_weight
                + technical_score * self.config.technical_weight
            )

            return QualityScore(
                aesthetic_score=aesthetic_score,
                text_match_score=text_match_score,
                technical_score=technical_score,
                overall_score=overall,
                issues=issues,
            )

        except Exception as e:
            logger.error(f"Basic evaluation failed: {e}")
            return QualityScore(
                overall_score=0.0,
                passed=False,
                issues=[f"Evaluation error: {str(e)}"],
            )

    async def _evaluate_video_with_vlm(
        self,
        video_path: str,
        prompt: str,
        narration: Optional[str] = None,
    ) -> QualityScore:
        """Evaluate video using a VLM (placeholder for future implementation)."""
        # TODO: Implement video frame sampling and VLM evaluation
        return await self._evaluate_video_basic(video_path)

    async def _evaluate_video_basic(
        self,
        video_path: str,
    ) -> QualityScore:
        """Basic video quality evaluation from container metadata."""
        issues = []

        try:
            import json
            import subprocess

            # Use ffprobe (part of FFmpeg, must be on PATH) to get video info
            cmd = [
                "ffprobe", "-v", "quiet", "-print_format", "json",
                "-show_format", "-show_streams", video_path,
            ]
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode != 0:
                issues.append("Failed to read video metadata")
                return QualityScore(overall_score=0.5, issues=issues)

            info = json.loads(result.stdout)

            # Find the first video stream
            video_stream = None
            for stream in info.get("streams", []):
                if stream.get("codec_type") == "video":
                    video_stream = stream
                    break

            if not video_stream:
                issues.append("No video stream found")
                return QualityScore(overall_score=0.0, passed=False, issues=issues)

            # Check dimensions
            width = video_stream.get("width", 0)
            height = video_stream.get("height", 0)
            if width < 256 or height < 256:
                issues.append(f"Video too small: {width}x{height}")

            # Check duration
            duration = float(info.get("format", {}).get("duration", 0))
            if duration < 0.5:
                issues.append(f"Video too short: {duration:.1f}s")

            # Generous default scores; only the technical score reflects the
            # metadata checks above
            aesthetic_score = 0.7
            text_match_score = 0.7
            technical_score = 0.8 if not issues else 0.5

            overall = (
                aesthetic_score * self.config.aesthetic_weight
                + text_match_score * self.config.text_match_weight
                + technical_score * self.config.technical_weight
            )

            return QualityScore(
                aesthetic_score=aesthetic_score,
                text_match_score=text_match_score,
                technical_score=technical_score,
                overall_score=overall,
                issues=issues,
            )

        except Exception as e:
            logger.error(f"Video evaluation failed: {e}")
            return QualityScore(
                overall_score=0.5,
                issues=[f"Evaluation error: {str(e)}"],
            )

    def _build_evaluation_prompt(
        self,
        prompt: str,
        narration: Optional[str] = None,
    ) -> str:
        """Build the evaluation prompt for the VLM."""
        context = f"Narration: {narration}\n" if narration else ""

        return f"""Evaluate this AI-generated image on the following criteria.
Rate each from 0.0 to 1.0.

Image Generation Prompt: {prompt}
{context}
Evaluation Criteria:

1. Aesthetic Quality (0.0-1.0):
   - Is the image visually appealing?
   - Good composition, colors, and style?

2. Prompt Matching (0.0-1.0):
   - Does the image accurately represent the prompt?
   - Are key elements from the prompt visible?

3. Technical Quality (0.0-1.0):
   - Is the image clear and well-defined?
   - Free of artifacts, distortions, or blurriness?
   - Natural looking (no AI artifacts like extra fingers)?

Respond in JSON format:
{{
    "aesthetic_score": 0.0,
    "text_match_score": 0.0,
    "technical_score": 0.0,
    "issues": ["list of any problems found"]
}}
"""
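

# ---------------------------------------------------------------------------
# Minimal usage sketch (an illustrative example, not part of the module API):
# with no llm_service configured, QualityGate runs only the basic PIL checks,
# so this exercises the fallback path. The image path is a hypothetical
# placeholder; if the file does not exist, the gate returns a failed score
# with an "Image file not found" issue rather than raising.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        gate = QualityGate()  # no VLM service -> _evaluate_basic path
        score = await gate.evaluate_image(
            image_path="output/frame_001.png",  # hypothetical test image
            prompt="A sunset over mountains",
        )
        print(f"overall={score.overall_score:.2f}, passed={score.passed}")

    asyncio.run(_demo())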