# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
VLMEvaluator - Vision Language Model based image quality evaluation
Supports multiple VLM providers:
- OpenAI: gpt-4-vision-preview, gpt-4o
- Qwen-VL: qwen-vl-max, qwen-vl-plus
- GLM-4V: via an OpenAI-compatible API
"""
import base64
import json
import re
from dataclasses import dataclass, field
from typing import Optional, List
from pathlib import Path
from loguru import logger


@dataclass
class VLMEvaluationResult:
"""Result from VLM evaluation"""
aesthetic_score: float = 0.0
text_match_score: float = 0.0
technical_score: float = 0.0
issues: List[str] = field(default_factory=list)
    raw_response: Optional[str] = None

def to_dict(self) -> dict:
return {
"aesthetic_score": self.aesthetic_score,
"text_match_score": self.text_match_score,
"technical_score": self.technical_score,
"issues": self.issues,
        }


@dataclass
class VLMEvaluatorConfig:
"""Configuration for VLM evaluator"""
provider: str = "auto" # "openai", "qwen", "auto"
model: Optional[str] = None # Auto-select if None
max_image_size: int = 1024 # Max image dimension
timeout: int = 30
temperature: float = 0.1 # Low for consistent evaluation
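
# Example (sketch): overriding the defaults for stricter, fully deterministic
# scoring of large frames; `llm_service` here stands for whatever
# vision-capable service the pipeline already provides:
#
#   config = VLMEvaluatorConfig(model="qwen-vl-max", max_image_size=768,
#                               timeout=60, temperature=0.0)
#   evaluator = VLMEvaluator(llm_service, config=config)

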
class VLMEvaluator:
"""
VLM-based image quality evaluator
Example:
>>> evaluator = VLMEvaluator(llm_service)
>>> result = await evaluator.evaluate_image(
... image_path="frame_001.png",
... prompt="A sunset over mountains"
... )
"""
EVALUATION_PROMPT = """请评估这张AI生成的图片质量。
生成提示词: {prompt}
{narration_section}
请从以下三个维度评分(0.0-1.0):
1. **美学质量** (aesthetic_score): 构图、色彩搭配、视觉吸引力
2. **图文匹配** (text_match_score): 图片与提示词的语义对齐程度
3. **技术质量** (technical_score): 清晰度、无伪影、无变形
同时列出发现的问题(如有)。
请以JSON格式返回:
```json
{{
"aesthetic_score": 0.0-1.0,
"text_match_score": 0.0-1.0,
"technical_score": 0.0-1.0,
"issues": ["问题1", "问题2"]
}}
```"""
def __init__(
self,
llm_service=None,
config: Optional[VLMEvaluatorConfig] = None
):
self.llm_service = llm_service
self.config = config or VLMEvaluatorConfig()
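
    # Wiring sketch (assumption, not part of the original module): any object
    # exposing `_get_config_value` for "base_url", "api_key" and "model" can
    # serve as `llm_service`, matching how _call_vlm() reads it below:
    #
    #   class StubLLMService:
    #       def __init__(self, cfg: dict):
    #           self._cfg = cfg
    #
    #       def _get_config_value(self, key: str):
    #           return self._cfg.get(key)
    #
    #   evaluator = VLMEvaluator(StubLLMService({
    #       "base_url": "https://api.openai.com/v1",
    #       "api_key": "sk-...",
    #       "model": "gpt-4o",
    #   }))
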
def _encode_image_base64(self, image_path: str) -> str:
"""Encode image to base64, with optional resizing"""
from PIL import Image
import io
with Image.open(image_path) as img:
# Resize if too large
max_size = self.config.max_image_size
if max(img.size) > max_size:
ratio = max_size / max(img.size)
new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
img = img.resize(new_size, Image.Resampling.LANCZOS)
# Convert to RGB if needed
if img.mode in ('RGBA', 'P'):
img = img.convert('RGB')
# Encode to base64
buffer = io.BytesIO()
img.save(buffer, format='JPEG', quality=85)
return base64.b64encode(buffer.getvalue()).decode('utf-8')
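
    # Design note: downscaling to max_image_size and re-encoding as JPEG
    # (quality=85) bounds the base64 payload sent to the VLM, keeping request
    # size and vision-token cost predictable.
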
def _parse_response(self, response: str) -> VLMEvaluationResult:
"""Parse VLM response to extract scores"""
result = VLMEvaluationResult(raw_response=response)
try:
# Try to extract JSON from response
json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response)
if json_match:
json_str = json_match.group(1)
else:
# Try to find raw JSON
brace_start = response.find('{')
brace_end = response.rfind('}')
if brace_start != -1 and brace_end > brace_start:
json_str = response[brace_start:brace_end + 1]
else:
logger.warning("No JSON found in VLM response")
return result
data = json.loads(json_str)
result.aesthetic_score = float(data.get('aesthetic_score', 0.0))
result.text_match_score = float(data.get('text_match_score', 0.0))
result.technical_score = float(data.get('technical_score', 0.0))
            result.issues = list(data.get('issues') or [])
# Clamp scores to valid range
result.aesthetic_score = max(0.0, min(1.0, result.aesthetic_score))
result.text_match_score = max(0.0, min(1.0, result.text_match_score))
result.technical_score = max(0.0, min(1.0, result.technical_score))
        except (json.JSONDecodeError, ValueError, TypeError) as e:
logger.warning(f"Failed to parse VLM response: {e}")
        return result

async def evaluate_image(
self,
image_path: str,
prompt: str,
narration: Optional[str] = None
) -> VLMEvaluationResult:
"""
Evaluate image quality using VLM
Args:
image_path: Path to image file
prompt: Generation prompt
narration: Optional narration text
Returns:
VLMEvaluationResult with scores
"""
if not Path(image_path).exists():
return VLMEvaluationResult(issues=["Image file not found"])
if not self.llm_service:
logger.warning("No LLM service provided for VLM evaluation")
return VLMEvaluationResult(issues=["No LLM service"])
try:
# Encode image
image_b64 = self._encode_image_base64(image_path)
# Build prompt
narration_section = f"旁白文案: {narration}" if narration else ""
eval_prompt = self.EVALUATION_PROMPT.format(
prompt=prompt,
narration_section=narration_section
)
# Call VLM via LLM service with vision
response = await self._call_vlm(image_b64, eval_prompt)
return self._parse_response(response)
except Exception as e:
logger.error(f"VLM evaluation failed: {e}")
return VLMEvaluationResult(issues=[f"Evaluation error: {str(e)}"])
async def _call_vlm(self, image_b64: str, prompt: str) -> str:
"""Call VLM with image and prompt"""
from openai import AsyncOpenAI
# Get config from LLM service
base_url = self.llm_service._get_config_value("base_url")
api_key = self.llm_service._get_config_value("api_key")
model = self.config.model or self.llm_service._get_config_value("model")
client = AsyncOpenAI(api_key=api_key, base_url=base_url)
# Build message with image
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_b64}"
}
},
{
"type": "text",
"text": prompt
}
]
}
]
response = await client.chat.completions.create(
model=model,
messages=messages,
temperature=self.config.temperature,
max_tokens=500,
timeout=self.config.timeout
)
        return response.choices[0].message.content or ""
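

if __name__ == "__main__":
    # Hedged smoke test (illustrative, not part of the original module): it
    # exercises only the offline parsing/clamping path, so no API key or
    # network access is needed.
    sample_response = """Model reply:
```json
{"aesthetic_score": 0.82, "text_match_score": 0.9,
 "technical_score": 1.4, "issues": ["slight banding in the sky"]}
```"""
    parsed = VLMEvaluator()._parse_response(sample_response)
    # technical_score is deliberately out of range; _parse_response clamps it to 1.0
    print(parsed.to_dict())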