feat: Add hybrid quality evaluation system with CLIP and VLM support

- Add FeatureExtractor for CLIP-based image/text feature extraction
- Add ObjectiveMetricsCalculator for technical quality metrics
- Add VLMEvaluator for vision language model evaluation
- Add HybridQualityGate combining objective + VLM evaluation
- Enhance CharacterMemory with visual feature support
- Add quality optional dependency (torch, ftfy, regex)
- Add unit tests for new modules

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 15:56:44 +08:00
parent ca018a9b1f
commit 56db9bf9d2
12 changed files with 1230 additions and 4 deletions
--- a/pixelle_video/services/quality/vlm_evaluator.py
+++ b/pixelle_video/services/quality/vlm_evaluator.py
@@ -0,0 +1,243 @@
+# Copyright (C) 2025 AIDC-AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+VLMEvaluator - Vision Language Model based image quality evaluation
+
+Supports multiple VLM providers:
+- OpenAI: gpt-4-vision-preview, gpt-4o
+- Qwen-VL: qwen-vl-max, qwen-vl-plus
+- GLM-4V: via OpenAI compatible API
+"""
+
+import base64
+import json
+import re
+from dataclasses import dataclass, field
+from typing import Optional, List
+from pathlib import Path
+
+from loguru import logger
+
+
+@dataclass
+class VLMEvaluationResult:
+    """Scores and findings produced by a single VLM image evaluation.
+
+    A freshly constructed instance represents "nothing evaluated": all
+    scores are 0.0, there are no issues and no model output was captured.
+    """
+
+    # Composition, colour and visual appeal.
+    aesthetic_score: float = 0.0
+    # Semantic alignment between the image and its generation prompt.
+    text_match_score: float = 0.0
+    # Sharpness and absence of artifacts or distortion.
+    technical_score: float = 0.0
+    # Problems reported for the image; each instance gets its own list.
+    issues: List[str] = field(default_factory=list)
+    # Unparsed model output, when one was received.
+    raw_response: Optional[str] = None
+
+    def to_dict(self) -> dict:
+        """Return the three scores and the issue list as a plain dict.
+
+        ``raw_response`` is not part of the returned mapping.
+        """
+        exported = (
+            "aesthetic_score",
+            "text_match_score",
+            "technical_score",
+            "issues",
+        )
+        return {name: getattr(self, name) for name in exported}
+
+
+@dataclass
+class VLMEvaluatorConfig:
+    """Tunable settings for the VLM evaluator.
+
+    Attributes:
+        provider: Provider selector: "openai", "qwen" or "auto".
+        model: Model name to request; None means auto-select.
+        max_image_size: Maximum image dimension, in pixels.
+        timeout: Request timeout (presumably seconds; confirm against the
+            client library in use).
+        temperature: Sampling temperature; kept low for consistent
+            evaluation.
+    """
+
+    provider: str = "auto"
+    model: Optional[str] = None
+    max_image_size: int = 1024
+    timeout: int = 30
+    temperature: float = 0.1
+
+
+class VLMEvaluator:
+    """
+    VLM-based image quality evaluator
+
+    Sends a downscaled JPEG copy of an image, together with a scoring
+    prompt, to an OpenAI-compatible chat completion endpoint and parses the
+    JSON reply into a VLMEvaluationResult.
+
+    Example:
+        >>> evaluator = VLMEvaluator(llm_service)
+        >>> result = await evaluator.evaluate_image(
+        ...     image_path="frame_001.png",
+        ...     prompt="A sunset over mountains"
+        ... )
+    """
+
+    # Prompt template (Chinese) asking the model to rate the image on three
+    # 0.0-1.0 axes (aesthetics, text match, technical quality), list any
+    # issues, and reply as a fenced JSON object. Filled in with str.format,
+    # hence the doubled braces around the JSON example.
+    EVALUATION_PROMPT = """请评估这张AI生成的图片质量。
+
+生成提示词: {prompt}
+{narration_section}
+
+请从以下三个维度评分(0.0-1.0):
+
+1. **美学质量** (aesthetic_score): 构图、色彩搭配、视觉吸引力
+2. **图文匹配** (text_match_score): 图片与提示词的语义对齐程度
+3. **技术质量** (technical_score): 清晰度、无伪影、无变形
+
+同时列出发现的问题(如有)。
+
+请以JSON格式返回:
+```json
+{{
+    "aesthetic_score": 0.0-1.0,
+    "text_match_score": 0.0-1.0,
+    "technical_score": 0.0-1.0,
+    "issues": ["问题1", "问题2"]
+}}
+```"""
+
+    def __init__(
+        self,
+        llm_service=None,
+        config: Optional[VLMEvaluatorConfig] = None
+    ):
+        """
+        Args:
+            llm_service: Object exposing ``_get_config_value(name)`` for
+                "base_url", "api_key" and "model". When missing,
+                evaluate_image returns a result flagged "No LLM service".
+            config: Evaluator settings; defaults to VLMEvaluatorConfig().
+        """
+        self.llm_service = llm_service
+        self.config = config or VLMEvaluatorConfig()
+
+    def _encode_image_base64(self, image_path: str) -> str:
+        """Return the image as a base64-encoded JPEG, downscaled if needed.
+
+        Args:
+            image_path: Path of the image file to read.
+
+        Returns:
+            Base64 text (UTF-8 decoded) of a quality-85 JPEG whose longer
+            side does not exceed ``config.max_image_size``.
+        """
+        from PIL import Image
+        import io
+
+        with Image.open(image_path) as img:
+            # Normalise the mode BEFORE resizing: Pillow forces NEAREST
+            # resampling for palette ("P") and bilevel ("1") images, and
+            # modes such as LA / PA / I;16 / F cannot be written as JPEG.
+            if img.mode not in ('RGB', 'L'):
+                img = img.convert('RGB')
+
+            # Resize if too large
+            max_size = self.config.max_image_size
+            if max(img.size) > max_size:
+                ratio = max_size / max(img.size)
+                # Keep each side at least 1px so extreme aspect ratios do
+                # not collapse to a zero-sized (invalid) target.
+                new_size = (
+                    max(1, int(img.size[0] * ratio)),
+                    max(1, int(img.size[1] * ratio)),
+                )
+                img = img.resize(new_size, Image.Resampling.LANCZOS)
+
+            # Encode to base64
+            buffer = io.BytesIO()
+            img.save(buffer, format='JPEG', quality=85)
+            return base64.b64encode(buffer.getvalue()).decode('utf-8')
+
+    @staticmethod
+    def _coerce_score(value) -> float:
+        """Convert one raw score from the model into a float in [0.0, 1.0].
+
+        Missing, non-numeric and NaN values become 0.0.
+        """
+        if value is None:
+            return 0.0
+        try:
+            score = float(value)
+        except (TypeError, ValueError):
+            logger.warning(f"Ignoring non-numeric VLM score: {value!r}")
+            return 0.0
+        # NaN is the only float that differs from itself. It must be
+        # rejected explicitly because min(1.0, nan) evaluates to 1.0, which
+        # would turn garbage into a perfect score.
+        if score != score:
+            logger.warning("Ignoring NaN VLM score")
+            return 0.0
+        return max(0.0, min(1.0, score))
+
+    @staticmethod
+    def _coerce_issues(value) -> List[str]:
+        """Normalise the model's ``issues`` field to a list of strings."""
+        if value is None:
+            return []
+        if isinstance(value, str):
+            return [value] if value.strip() else []
+        if isinstance(value, (list, tuple)):
+            return [str(item) for item in value if item is not None]
+        return [str(value)]
+
+    def _parse_response(self, response: str) -> VLMEvaluationResult:
+        """Parse VLM response to extract scores.
+
+        Looks for a ```json fenced block first, then falls back to the text
+        between the first "{" and the last "}". Each score is coerced and
+        clamped independently, so one malformed field cannot leave another
+        field unclamped or abort the whole parse.
+
+        Args:
+            response: Raw text returned by the model (None is tolerated).
+
+        Returns:
+            VLMEvaluationResult carrying ``raw_response``; scores stay 0.0
+            and issues stay empty when no usable JSON object is found.
+        """
+        result = VLMEvaluationResult(raw_response=response)
+
+        if not response or not isinstance(response, str):
+            logger.warning("No JSON found in VLM response")
+            return result
+
+        # Try to extract JSON from response
+        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response)
+        if json_match:
+            json_str = json_match.group(1)
+        else:
+            # Try to find raw JSON
+            brace_start = response.find('{')
+            brace_end = response.rfind('}')
+            if brace_start == -1 or brace_end <= brace_start:
+                logger.warning("No JSON found in VLM response")
+                return result
+            json_str = response[brace_start:brace_end + 1]
+
+        try:
+            data = json.loads(json_str)
+        except (json.JSONDecodeError, ValueError) as e:
+            logger.warning(f"Failed to parse VLM response: {e}")
+            return result
+
+        # A fenced block may hold valid JSON that is not an object.
+        if not isinstance(data, dict):
+            logger.warning("Failed to parse VLM response: JSON payload is not an object")
+            return result
+
+        result.aesthetic_score = self._coerce_score(data.get('aesthetic_score'))
+        result.text_match_score = self._coerce_score(data.get('text_match_score'))
+        result.technical_score = self._coerce_score(data.get('technical_score'))
+        result.issues = self._coerce_issues(data.get('issues'))
+
+        return result
+
+    async def evaluate_image(
+        self,
+        image_path: str,
+        prompt: str,
+        narration: Optional[str] = None
+    ) -> VLMEvaluationResult:
+        """
+        Evaluate image quality using VLM
+
+        Never raises: every failure is reported through the ``issues`` list
+        of a zero-score result.
+
+        Args:
+            image_path: Path to image file
+            prompt: Generation prompt
+            narration: Optional narration text
+
+        Returns:
+            VLMEvaluationResult with scores
+        """
+        # is_file() also rejects directories, which exists() would accept
+        # only for the encoding step to fail with a less helpful message.
+        if not Path(image_path).is_file():
+            return VLMEvaluationResult(issues=["Image file not found"])
+
+        if not self.llm_service:
+            logger.warning("No LLM service provided for VLM evaluation")
+            return VLMEvaluationResult(issues=["No LLM service"])
+
+        try:
+            # Encode image
+            image_b64 = self._encode_image_base64(image_path)
+
+            # Build prompt
+            narration_section = f"旁白文案: {narration}" if narration else ""
+            eval_prompt = self.EVALUATION_PROMPT.format(
+                prompt=prompt,
+                narration_section=narration_section
+            )
+
+            # Call VLM via LLM service with vision
+            response = await self._call_vlm(image_b64, eval_prompt)
+
+            return self._parse_response(response)
+
+        except Exception as e:
+            # Deliberate best-effort boundary: evaluation problems must not
+            # propagate to the caller.
+            logger.error(f"VLM evaluation failed: {e}")
+            return VLMEvaluationResult(issues=[f"Evaluation error: {str(e)}"])
+
+    async def _call_vlm(self, image_b64: str, prompt: str) -> str:
+        """Call VLM with image and prompt.
+
+        Args:
+            image_b64: Base64-encoded JPEG data.
+            prompt: Fully formatted evaluation prompt.
+
+        Returns:
+            Text content of the first completion choice ("" when the model
+            returned no text).
+
+        Raises:
+            ValueError: If the completion contains no choices.
+        """
+        from openai import AsyncOpenAI
+
+        # Get config from LLM service
+        base_url = self.llm_service._get_config_value("base_url")
+        api_key = self.llm_service._get_config_value("api_key")
+        model = self.config.model or self.llm_service._get_config_value("model")
+
+        # Build message with image
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_b64}"
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": prompt
+                    }
+                ]
+            }
+        ]
+
+        # A client is created per call, so close it when done instead of
+        # leaving its HTTP connection pool open.
+        async with AsyncOpenAI(api_key=api_key, base_url=base_url) as client:
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=self.config.temperature,
+                max_tokens=500,
+                timeout=self.config.timeout
+            )
+
+        if not response.choices:
+            raise ValueError("VLM returned no choices")
+
+        # message.content may be None; honour the declared str return type.
+        return response.choices[0].message.content or ""