From 2be9256c486edffb65866721286ad0bfc2fb5e57 Mon Sep 17 00:00:00 2001
From: empty
Date: Wed, 7 Jan 2026 00:22:33 +0800
Subject: [PATCH] fix: Use OpenAI multimodal message format for VLM style
 extraction

---
 pixelle_video/services/quality/style_guard.py | 48 +++++++++++++++----
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/pixelle_video/services/quality/style_guard.py b/pixelle_video/services/quality/style_guard.py
index 2001a77..2109457 100644
--- a/pixelle_video/services/quality/style_guard.py
+++ b/pixelle_video/services/quality/style_guard.py
@@ -170,6 +170,7 @@ class StyleGuard:
 
         import base64
         import os
+        from openai import AsyncOpenAI
 
         # Read and encode image
         if not os.path.exists(image_path):
@@ -183,7 +184,7 @@ class StyleGuard:
         ext = os.path.splitext(image_path)[1].lower()
         media_type = "image/png" if ext == ".png" else "image/jpeg"
 
-        # Call VLM to analyze style
+        # Style extraction prompt
         style_prompt = """Analyze this image and extract its visual style characteristics.
 
 Provide a concise style description that could be used as a prefix for image generation prompts to maintain visual consistency.
@@ -199,16 +200,43 @@ Output format (JSON):
 
 Focus on creating a specific, reproducible style_prefix that will generate visually consistent images."""
 
-        # Try to call LLM with vision capability
+        # Build multimodal message with image
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": style_prompt},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:{media_type};base64,{image_data}"
+                        }
+                    }
+                ]
+            }
+        ]
+
+        # Get LLM config for VLM call
+        from pixelle_video.config import config_manager
+        llm_config = config_manager.config.llm
+
+        # Create OpenAI client directly for VLM call
+        client = AsyncOpenAI(
+            api_key=llm_config.api_key,
+            base_url=llm_config.base_url
+        )
+
+        # Call VLM with multimodal message
         try:
-            response = await self.llm_service(
-                prompt=style_prompt,
-                images=[f"data:{media_type};base64,{image_data}"],
+            response = await client.chat.completions.create(
+                model=llm_config.model,
+                messages=messages,
                 temperature=0.3,
                 max_tokens=500
             )
+            vlm_response = response.choices[0].message.content
+            logger.debug(f"VLM style extraction response: {vlm_response[:100]}...")
         except Exception as e:
-            # Fallback: try without image (text-only LLM)
             logger.warning(f"VLM call failed, using basic extraction: {e}")
             return self._extract_basic(image_path)
 
@@ -218,11 +246,11 @@ Focus on creating a specific, reproducible style_prefix that will generate visua
 
         try:
             # Try to extract JSON from response
-            match = re.search(r'\{[\s\S]*\}', response)
+            match = re.search(r'\{[\s\S]*\}', vlm_response)
             if match:
                 data = json.loads(match.group())
             else:
-                data = json.loads(response)
+                data = json.loads(vlm_response)
 
             anchor = StyleAnchor(
                 art_style=data.get("art_style", ""),
@@ -239,9 +267,9 @@ Focus on creating a specific, reproducible style_prefix that will generate visua
         except (json.JSONDecodeError, KeyError) as e:
             logger.warning(f"Failed to parse VLM response: {e}")
             # Use the raw response as style_prefix if it looks reasonable
-            if len(response) < 200 and len(response) > 20:
+            if len(vlm_response) < 200 and len(vlm_response) > 20:
                 return StyleAnchor(
-                    style_prefix=response.strip(),
+                    style_prefix=vlm_response.strip(),
                     reference_image=image_path,
                 )
             return self._extract_basic(image_path)
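
Note for reviewers: below is a minimal standalone sketch of the same OpenAI-style multimodal call, useful for checking the message shape against a given endpoint outside the service. The file name, API key, base URL, and model name are placeholders, not values from this repository.

    import asyncio
    import base64

    from openai import AsyncOpenAI


    async def main() -> None:
        # Placeholder credentials/endpoint; point these at your own VLM deployment.
        client = AsyncOpenAI(api_key="sk-...", base_url="https://api.openai.com/v1")

        # Encode a local reference image as a base64 data URL, mirroring the patch.
        with open("reference.png", "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")

        response = await client.chat.completions.create(
            model="gpt-4o-mini",  # placeholder model name
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe the visual style of this image."},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/png;base64,{image_data}"},
                        },
                    ],
                }
            ],
            temperature=0.3,
            max_tokens=500,
        )
        print(response.choices[0].message.content)


    asyncio.run(main())

If the target base_url serves an OpenAI-compatible VLM, this should return a plain-text style description, confirming the image_url content part is accepted before wiring it through StyleGuard.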