feat: Add Qwen VL support for character analysis, configurable via VLM_PROVIDER
@@ -145,29 +145,51 @@ Output ONLY the JSON object, no additional text."""
             }
         ]
 
-        # Get LLM config
-        from pixelle_video.config import config_manager
-        llm_config = config_manager.config.llm
+        # Get VLM configuration from environment, or fall back to LLM config
+        import os
+        vlm_provider = os.getenv("VLM_PROVIDER", "qwen")  # qwen, glm, openai
+        vlm_api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
+        vlm_base_url = os.getenv("VLM_BASE_URL")
+        vlm_model = os.getenv("VLM_MODEL")
 
-        # Create OpenAI client for VLM call
+        # Configure based on provider
+        if vlm_provider == "qwen":
+            # Tongyi Qianwen (Qwen VL)
+            vlm_base_url = vlm_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+            vlm_model = vlm_model or "qwen-vl-plus"  # or qwen-vl-max, qwen3-vl-plus
+            logger.info(f"Using Qwen VL: model={vlm_model}")
+        elif vlm_provider == "glm":
+            # Zhipu GLM-4V
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or "glm-4v-flash"
+            logger.info(f"Using GLM VL: model={vlm_model}")
+        else:  # openai or other
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or llm_config.model
+            logger.info(f"Using {vlm_provider} VL: model={vlm_model}")
+
+        if not vlm_api_key:
+            logger.error("No VLM API key configured. Set VLM_API_KEY or DASHSCOPE_API_KEY environment variable.")
+            return CharacterAnalysisResult()
+
+        # Create OpenAI-compatible client
         client = AsyncOpenAI(
-            api_key=llm_config.api_key,
-            base_url=llm_config.base_url
+            api_key=vlm_api_key,
+            base_url=vlm_base_url
        )
 
-        # Use vision model - GLM-4V for ZhiPu, or fall back to configured model
-        # Vision models: glm-4v, glm-4v-flash, gpt-4-vision-preview
-        vision_model = llm_config.model
-        if "glm" in llm_config.model.lower() and "v" not in llm_config.model.lower():
-            vision_model = "glm-4v-flash"  # Use GLM-4V for vision tasks
-        logger.info(f"Using vision model: {vision_model}")
-
         # Call VLM
         response = await client.chat.completions.create(
-            model=vision_model,
+            model=vlm_model,
             messages=messages,
             temperature=0.3,
-            max_tokens=2000  # Increased to avoid truncation
+            max_tokens=2000
        )
 
        vlm_response = response.choices[0].message.content if response.choices else None
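Usage note: the new code path is configured entirely through environment variables. Below is a minimal standalone sketch of the resolution order, assuming the behavior shown in the diff above; the resolve_vlm_config helper is an illustrative name, not a function in the repository, and only the VLM_* / DASHSCOPE_API_KEY variables and the Qwen defaults come from the commit itself.

    import os

    # Illustrative helper mirroring the commit's fallback chain
    # (resolve_vlm_config is a hypothetical name, not a repo function).
    def resolve_vlm_config():
        provider = os.getenv("VLM_PROVIDER", "qwen")  # qwen, glm, openai
        api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
        base_url = os.getenv("VLM_BASE_URL")
        model = os.getenv("VLM_MODEL")
        if provider == "qwen":
            # Defaults taken from the diff above
            base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
            model = model or "qwen-vl-plus"
        # For glm and other providers the commit falls back to the
        # LLM credentials held by config_manager.
        return provider, api_key, base_url, model

    # Example: select Qwen VL explicitly before running character analysis
    os.environ["VLM_PROVIDER"] = "qwen"
    os.environ["DASHSCOPE_API_KEY"] = "sk-..."  # placeholder, not a real key
    print(resolve_vlm_config())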