# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
VLMEvaluator - Vision Language Model based image quality evaluation

Supports multiple VLM providers:
- OpenAI: gpt-4-vision-preview, gpt-4o
- Qwen-VL: qwen-vl-max, qwen-vl-plus
- GLM-4V: via OpenAI compatible API
"""

import base64
import json
import re
from dataclasses import dataclass, field
from typing import Optional, List
from pathlib import Path

try:
    from loguru import logger
except ImportError:
    # Graceful fallback: only .warning() / .error() are used below, and the
    # stdlib logger provides both, so the module stays usable without loguru.
    import logging
    logger = logging.getLogger(__name__)


@dataclass
class VLMEvaluationResult:
    """Result from VLM evaluation.

    All scores are on a 0.0-1.0 scale. ``issues`` holds human-readable
    problems reported by the model; ``raw_response`` keeps the unparsed
    VLM output for debugging and is deliberately excluded from to_dict().
    """
    aesthetic_score: float = 0.0
    text_match_score: float = 0.0
    technical_score: float = 0.0
    issues: List[str] = field(default_factory=list)
    raw_response: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize the three scores and issue list (no raw_response)."""
        return {
            "aesthetic_score": self.aesthetic_score,
            "text_match_score": self.text_match_score,
            "technical_score": self.technical_score,
            "issues": self.issues,
        }


@dataclass
class VLMEvaluatorConfig:
    """Configuration for VLM evaluator."""
    provider: str = "auto"          # "openai", "qwen", "auto"
    model: Optional[str] = None     # Auto-select if None
    max_image_size: int = 1024      # Max image dimension (px) before downscale
    timeout: int = 30               # Request timeout in seconds
    temperature: float = 0.1        # Low for consistent evaluation


class VLMEvaluator:
    """
    VLM-based image quality evaluator

    Example:
        >>> evaluator = VLMEvaluator(llm_service)
        >>> result = await evaluator.evaluate_image(
        ...     image_path="frame_001.png",
        ...     prompt="A sunset over mountains"
        ... )
    """

    EVALUATION_PROMPT = """请评估这张AI生成的图片质量。

生成提示词: {prompt}
{narration_section}

请从以下三个维度评分(0.0-1.0):
1. **美学质量** (aesthetic_score): 构图、色彩搭配、视觉吸引力
2. **图文匹配** (text_match_score): 图片与提示词的语义对齐程度
3. **技术质量** (technical_score): 清晰度、无伪影、无变形

同时列出发现的问题(如有)。

请以JSON格式返回:
```json
{{
    "aesthetic_score": 0.0-1.0,
    "text_match_score": 0.0-1.0,
    "technical_score": 0.0-1.0,
    "issues": ["问题1", "问题2"]
}}
```"""

    def __init__(
        self,
        llm_service=None,
        config: Optional[VLMEvaluatorConfig] = None
    ):
        """Store the LLM service handle and configuration (defaults if None)."""
        self.llm_service = llm_service
        self.config = config or VLMEvaluatorConfig()

    def _encode_image_base64(self, image_path: str) -> str:
        """Encode image to base64 JPEG, downscaling if it exceeds max_image_size.

        Args:
            image_path: Path to an image file readable by Pillow.

        Returns:
            Base64-encoded JPEG bytes as an ASCII string.
        """
        from PIL import Image
        import io

        with Image.open(image_path) as img:
            # Downscale proportionally so the longest edge fits max_image_size.
            max_size = self.config.max_image_size
            if max(img.size) > max_size:
                ratio = max_size / max(img.size)
                new_size = (int(img.size[0] * ratio), int(img.size[1] * ratio))
                img = img.resize(new_size, Image.Resampling.LANCZOS)

            # JPEG cannot encode alpha/palette/CMYK-style modes. The original
            # check only handled RGBA/P; normalize anything that is not RGB or
            # grayscale so modes like LA, CMYK, or I;16 don't crash save().
            if img.mode not in ('RGB', 'L'):
                img = img.convert('RGB')

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

    def _parse_response(self, response: str) -> VLMEvaluationResult:
        """Parse VLM response text into scores; defaults to zeros on failure.

        Accepts a fenced ```json block or a raw JSON object embedded in the
        text. Scores are clamped to [0.0, 1.0]; the raw text is always kept.
        """
        result = VLMEvaluationResult(raw_response=response)

        try:
            # Prefer a fenced ```json ... ``` block.
            json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Fall back to the outermost {...} span in the text.
                brace_start = response.find('{')
                brace_end = response.rfind('}')
                if brace_start != -1 and brace_end > brace_start:
                    json_str = response[brace_start:brace_end + 1]
                else:
                    logger.warning("No JSON found in VLM response")
                    return result

            data = json.loads(json_str)

            result.aesthetic_score = float(data.get('aesthetic_score', 0.0))
            result.text_match_score = float(data.get('text_match_score', 0.0))
            result.technical_score = float(data.get('technical_score', 0.0))

            # Normalize issues defensively: JSON null -> [], a bare string
            # -> single-item list, and every element coerced to str.
            issues = data.get('issues') or []
            if not isinstance(issues, list):
                issues = [issues]
            result.issues = [str(issue) for issue in issues]

            # Clamp scores to valid range
            result.aesthetic_score = max(0.0, min(1.0, result.aesthetic_score))
            result.text_match_score = max(0.0, min(1.0, result.text_match_score))
            result.technical_score = max(0.0, min(1.0, result.technical_score))

        # TypeError added: float(None) (JSON null score) raises TypeError,
        # which the original let escape past this handler.
        except (json.JSONDecodeError, TypeError, ValueError) as e:
            logger.warning(f"Failed to parse VLM response: {e}")

        return result

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None
    ) -> VLMEvaluationResult:
        """
        Evaluate image quality using VLM

        Args:
            image_path: Path to image file
            prompt: Generation prompt
            narration: Optional narration text

        Returns:
            VLMEvaluationResult with scores
        """
        if not Path(image_path).exists():
            return VLMEvaluationResult(issues=["Image file not found"])

        if not self.llm_service:
            logger.warning("No LLM service provided for VLM evaluation")
            return VLMEvaluationResult(issues=["No LLM service"])

        try:
            # Encode image
            image_b64 = self._encode_image_base64(image_path)

            # Build prompt
            narration_section = f"旁白文案: {narration}" if narration else ""
            eval_prompt = self.EVALUATION_PROMPT.format(
                prompt=prompt,
                narration_section=narration_section
            )

            # Call VLM via LLM service with vision
            response = await self._call_vlm(image_b64, eval_prompt)
            return self._parse_response(response)

        except Exception as e:
            # Boundary handler: any failure degrades to a zero-score result
            # with the error recorded, never a raised exception.
            logger.error(f"VLM evaluation failed: {e}")
            return VLMEvaluationResult(issues=[f"Evaluation error: {str(e)}"])

    async def _call_vlm(self, image_b64: str, prompt: str) -> str:
        """Call VLM with image and prompt via the OpenAI-compatible API.

        Connection details come from the injected llm_service; the model can
        be overridden by config.model. Returns the assistant message text.
        """
        from openai import AsyncOpenAI

        # Get config from LLM service
        base_url = self.llm_service._get_config_value("base_url")
        api_key = self.llm_service._get_config_value("api_key")
        model = self.config.model or self.llm_service._get_config_value("model")

        client = AsyncOpenAI(api_key=api_key, base_url=base_url)

        # Build message with image (data-URL form expected by vision models)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_b64}"
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]

        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=self.config.temperature,
            max_tokens=500,
            timeout=self.config.timeout
        )

        return response.choices[0].message.content