# AI-Video/pixelle_video/services/quality/vlm_evaluator.py

# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
VLMEvaluator - Vision Language Model based image quality evaluation

Supports multiple VLM providers:
- OpenAI: gpt-4-vision-preview, gpt-4o
- Qwen-VL: qwen-vl-max, qwen-vl-plus
- GLM-4V: via OpenAI compatible API
"""

import base64
import json
import re
from dataclasses import dataclass, field
from typing import Optional, List
from pathlib import Path

from loguru import logger


@dataclass
class VLMEvaluationResult:
    """
    Container for the scores and findings of one VLM image evaluation.

    All three scores default to 0.0 and ``issues`` defaults to a fresh
    empty list per instance. ``raw_response`` keeps the unparsed model
    reply for debugging and is deliberately left out of ``to_dict()``.
    """
    # Composition / colour / visual appeal score.
    aesthetic_score: float = 0.0
    # How well the image matches the generation prompt.
    text_match_score: float = 0.0
    # Sharpness / artifact / distortion score.
    technical_score: float = 0.0
    # Human-readable problems reported for the image.
    issues: List[str] = field(default_factory=list)
    # Unparsed text returned by the model, if any.
    raw_response: Optional[str] = None

    def to_dict(self) -> dict:
        """
        Return the scores and issues as a plain dict.

        Returns:
            Dict with keys ``aesthetic_score``, ``text_match_score``,
            ``technical_score`` and ``issues`` (in that order). The
            ``issues`` value is the instance's own list, not a copy.
        """
        exported_fields = (
            "aesthetic_score",
            "text_match_score",
            "technical_score",
            "issues",
        )
        return {name: getattr(self, name) for name in exported_fields}


@dataclass
class VLMEvaluatorConfig:
    """
    Tunable settings for the VLM evaluator.

    Every field has a default, so ``VLMEvaluatorConfig()`` is a valid
    configuration.
    """
    # Provider selector; documented values are "openai", "qwen" and "auto".
    provider: str = "auto"
    # Model name override; None means the model is chosen automatically.
    model: Optional[str] = None
    # Upper bound for the longest image side (pixels) before upload.
    max_image_size: int = 1024
    # Request timeout handed to the VLM call.
    timeout: int = 30
    # Sampling temperature; kept low so repeated evaluations stay consistent.
    temperature: float = 0.1


class VLMEvaluator:
    """
    VLM-based image quality evaluator.

    Sends an image together with a scoring prompt to an OpenAI-compatible
    chat-completions endpoint and parses the JSON reply into a
    ``VLMEvaluationResult``.

    Example:
        >>> evaluator = VLMEvaluator(llm_service)
        >>> result = await evaluator.evaluate_image(
        ...     image_path="frame_001.png",
        ...     prompt="A sunset over mountains"
        ... )
    """

    # str.format template: {prompt} and {narration_section} are substituted,
    # the doubled braces render as literal JSON braces.
    EVALUATION_PROMPT = """请评估这张AI生成的图片质量。

生成提示词: {prompt}
{narration_section}

请从以下三个维度评分(0.0-1.0):

1. **美学质量** (aesthetic_score): 构图、色彩搭配、视觉吸引力
2. **图文匹配** (text_match_score): 图片与提示词的语义对齐程度
3. **技术质量** (technical_score): 清晰度、无伪影、无变形

同时列出发现的问题(如有)。

请以JSON格式返回:
```json
{{
    "aesthetic_score": 0.0-1.0,
    "text_match_score": 0.0-1.0,
    "technical_score": 0.0-1.0,
    "issues": ["问题1", "问题2"]
}}
```"""

    def __init__(
        self,
        llm_service=None,
        config: Optional[VLMEvaluatorConfig] = None
    ):
        """
        Args:
            llm_service: Service object that supplies ``base_url``,
                ``api_key`` and ``model`` through ``_get_config_value``.
                When falsy, ``evaluate_image`` returns an error result
                instead of calling a model.
            config: Evaluator settings; a default ``VLMEvaluatorConfig``
                is created when omitted.
        """
        self.llm_service = llm_service
        self.config = config or VLMEvaluatorConfig()

    def _encode_image_base64(self, image_path: str) -> str:
        """
        Load an image, downscale it if needed and return it as base64 JPEG.

        Args:
            image_path: Path of the image file to encode.

        Returns:
            Base64 text (ASCII) of the JPEG-encoded image, quality 85.

        Raises:
            Whatever PIL raises for unreadable or unsupported files.
        """
        from PIL import Image
        import io

        with Image.open(image_path) as img:
            # Normalise the mode BEFORE resizing:
            #  - JPEG has no alpha channel or palette, so modes such as
            #    RGBA, LA, PA or P cannot be saved as-is;
            #  - Pillow forces NEAREST resampling on palette images, which
            #    would silently defeat the LANCZOS filter below.
            if img.mode not in ('RGB', 'L'):
                img = img.convert('RGB')

            # Downscale so the longest side fits max_image_size.
            max_size = self.config.max_image_size
            longest_side = max(img.size)
            if longest_side > max_size:
                ratio = max_size / longest_side
                # Keep each side >= 1 px; extreme aspect ratios would
                # otherwise truncate to 0 and make resize() fail.
                new_size = (
                    max(1, int(img.size[0] * ratio)),
                    max(1, int(img.size[1] * ratio)),
                )
                img = img.resize(new_size, Image.Resampling.LANCZOS)

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=85)
            return base64.b64encode(buffer.getvalue()).decode('utf-8')

    @staticmethod
    def _json_candidates(response: str) -> List[str]:
        """Return possible JSON payloads in priority order (fenced block, then outermost braces)."""
        candidates: List[str] = []

        fenced = re.search(r'```json\s*([\s\S]*?)\s*```', response)
        if fenced:
            candidates.append(fenced.group(1))

        brace_start = response.find('{')
        brace_end = response.rfind('}')
        if brace_start != -1 and brace_end > brace_start:
            raw_span = response[brace_start:brace_end + 1]
            if raw_span not in candidates:
                candidates.append(raw_span)

        return candidates

    @staticmethod
    def _coerce_score(value: object) -> float:
        """Convert one raw score to a float clamped to [0.0, 1.0]; invalid or NaN values become 0.0."""
        import math

        try:
            score = float(value)
        except (TypeError, ValueError, OverflowError):
            logger.warning(f"Invalid score in VLM response: {value!r}")
            return 0.0

        # min()/max() do not order NaN, so a NaN would clamp to 1.0.
        if math.isnan(score):
            logger.warning("NaN score in VLM response")
            return 0.0

        return max(0.0, min(1.0, score))

    @staticmethod
    def _coerce_issues(value: object) -> List[str]:
        """Normalise the ``issues`` field to a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    def _parse_response(self, response: str) -> VLMEvaluationResult:
        """
        Parse a VLM reply into a ``VLMEvaluationResult``.

        The reply is searched for a fenced ``json`` block first and for the
        outermost ``{...}`` span second. Each score is converted and
        clamped independently, so one malformed field cannot leave the
        others unset or out of range. This method never raises on
        malformed model output; it logs a warning and returns default
        scores instead.

        Args:
            response: Raw text returned by the model.

        Returns:
            Result with ``raw_response`` always set to ``response``.
        """
        result = VLMEvaluationResult(raw_response=response)

        if not isinstance(response, str):
            logger.warning("No JSON found in VLM response")
            return result

        candidates = self._json_candidates(response)
        if not candidates:
            logger.warning("No JSON found in VLM response")
            return result

        data = None
        last_error: Optional[Exception] = None
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                last_error = e
                continue
            if isinstance(parsed, dict):
                data = parsed
                break
            last_error = ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )

        if data is None:
            logger.warning(f"Failed to parse VLM response: {last_error}")
            return result

        result.aesthetic_score = self._coerce_score(data.get('aesthetic_score', 0.0))
        result.text_match_score = self._coerce_score(data.get('text_match_score', 0.0))
        result.technical_score = self._coerce_score(data.get('technical_score', 0.0))
        result.issues = self._coerce_issues(data.get('issues'))

        return result

    async def evaluate_image(
        self,
        image_path: str,
        prompt: str,
        narration: Optional[str] = None
    ) -> VLMEvaluationResult:
        """
        Evaluate image quality using VLM

        Failures are reported through the ``issues`` field of the returned
        result rather than raised.

        Args:
            image_path: Path to image file
            prompt: Generation prompt
            narration: Optional narration text

        Returns:
            VLMEvaluationResult with scores
        """
        # is_file() instead of exists(): a directory is not an image and
        # should be reported as missing rather than fail inside PIL.
        if not Path(image_path).is_file():
            return VLMEvaluationResult(issues=["Image file not found"])

        if not self.llm_service:
            logger.warning("No LLM service provided for VLM evaluation")
            return VLMEvaluationResult(issues=["No LLM service"])

        try:
            # Encode image
            image_b64 = self._encode_image_base64(image_path)

            # Build prompt; the narration line is left empty when absent
            narration_section = f"旁白文案: {narration}" if narration else ""
            eval_prompt = self.EVALUATION_PROMPT.format(
                prompt=prompt,
                narration_section=narration_section
            )

            # Call VLM via LLM service with vision
            response = await self._call_vlm(image_b64, eval_prompt)

            return self._parse_response(response)

        except Exception as e:
            # Boundary handler: evaluation is best-effort, so any failure
            # (I/O, network, API) becomes an error result for the caller.
            logger.error(f"VLM evaluation failed: {e}")
            return VLMEvaluationResult(issues=[f"Evaluation error: {str(e)}"])

    async def _call_vlm(self, image_b64: str, prompt: str) -> str:
        """
        Call VLM with image and prompt

        Args:
            image_b64: Base64-encoded JPEG image data.
            prompt: Text prompt sent alongside the image.

        Returns:
            Text content of the first completion choice.

        Raises:
            RuntimeError: If the reply has no choices or no text content.
            Any error raised by the OpenAI client for the request itself.
        """
        from openai import AsyncOpenAI

        # Get config from LLM service
        base_url = self.llm_service._get_config_value("base_url")
        api_key = self.llm_service._get_config_value("api_key")
        model = self.config.model or self.llm_service._get_config_value("model")

        # Build message with image (sent inline as a data URL)
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_b64}"
                        }
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]

        # A client is created per call; close it deterministically instead
        # of leaving its HTTP connections to garbage collection.
        async with AsyncOpenAI(api_key=api_key, base_url=base_url) as client:
            response = await client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=self.config.temperature,
                max_tokens=500,
                timeout=self.config.timeout
            )

        choices = getattr(response, "choices", None)
        if not choices:
            raise RuntimeError("VLM returned no choices")

        content = choices[0].message.content
        if content is None:
            raise RuntimeError("VLM returned no text content")

        return content