diff --git a/pixelle_video/services/quality/character_analyzer.py b/pixelle_video/services/quality/character_analyzer.py
index 3a38435..349559e 100644
--- a/pixelle_video/services/quality/character_analyzer.py
+++ b/pixelle_video/services/quality/character_analyzer.py
@@ -145,29 +145,51 @@
 Output ONLY the JSON object, no additional text."""
             }
         ]
-        # Get LLM config
-        from pixelle_video.config import config_manager
-        llm_config = config_manager.config.llm
+        # Get VLM configuration from environment, or fall back to LLM config
+        import os
+        vlm_provider = os.getenv("VLM_PROVIDER", "qwen")  # qwen, glm, openai
+        vlm_api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
+        vlm_base_url = os.getenv("VLM_BASE_URL")
+        vlm_model = os.getenv("VLM_MODEL")

-        # Create OpenAI client for VLM call
+        # Configure based on provider
+        if vlm_provider == "qwen":
+            # Tongyi Qianwen (Qwen VL)
+            vlm_base_url = vlm_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+            vlm_model = vlm_model or "qwen-vl-plus"  # or qwen-vl-max, qwen3-vl-plus
+            logger.info(f"Using Qwen VL: model={vlm_model}")
+        elif vlm_provider == "glm":
+            # Zhipu GLM-4V
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or "glm-4v-flash"
+            logger.info(f"Using GLM VL: model={vlm_model}")
+        else:  # openai or other
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or llm_config.model
+            logger.info(f"Using {vlm_provider} VL: model={vlm_model}")
+
+        if not vlm_api_key:
+            logger.error("No VLM API key configured. Set VLM_API_KEY or DASHSCOPE_API_KEY environment variable.")
+            return CharacterAnalysisResult()
+
+        # Create OpenAI-compatible client
         client = AsyncOpenAI(
-            api_key=llm_config.api_key,
-            base_url=llm_config.base_url
+            api_key=vlm_api_key,
+            base_url=vlm_base_url
         )

-        # Use vision model - GLM-4V for ZhiPu, or fall back to configured model
-        # Vision models: glm-4v, glm-4v-flash, gpt-4-vision-preview
-        vision_model = llm_config.model
-        if "glm" in llm_config.model.lower() and "v" not in llm_config.model.lower():
-            vision_model = "glm-4v-flash"  # Use GLM-4V for vision tasks
-        logger.info(f"Using vision model: {vision_model}")
-
         # Call VLM
         response = await client.chat.completions.create(
-            model=vision_model,
+            model=vlm_model,
             messages=messages,
             temperature=0.3,
-            max_tokens=2000  # Increased to avoid truncation
+            max_tokens=2000
         )

         vlm_response = response.choices[0].message.content if response.choices else None
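
Usage note (not part of the patch): the rewritten block reads its configuration from environment variables at call time, so a deployment can switch the vision backend without touching config files. Below is a minimal sketch, assuming the variables are set before the analyzer is invoked; only the variable names and defaults come from the diff, the key and model values are placeholders.

    import os

    # Select the Qwen VL backend for character_analyzer.py (hypothetical deployment snippet).
    os.environ["VLM_PROVIDER"] = "qwen"             # one of: qwen, glm, openai
    os.environ["VLM_API_KEY"] = "sk-placeholder"    # DASHSCOPE_API_KEY is also accepted
    # Optional overrides; when unset, the code falls back to the DashScope
    # compatible-mode endpoint and qwen-vl-plus.
    os.environ["VLM_MODEL"] = "qwen-vl-max"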