feat: Add Qwen VL support for character analysis, configurable via VLM_PROVIDER

empty
2026-01-07 09:29:43 +08:00
parent be216eacad
commit 92183b083b


@@ -145,29 +145,51 @@ Output ONLY the JSON object, no additional text."""
             }
         ]
 
-        # Get LLM config
-        from pixelle_video.config import config_manager
-        llm_config = config_manager.config.llm
+        # Get VLM configuration from environment, or fall back to LLM config
+        import os
+        vlm_provider = os.getenv("VLM_PROVIDER", "qwen")  # qwen, glm, openai
+        vlm_api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
+        vlm_base_url = os.getenv("VLM_BASE_URL")
+        vlm_model = os.getenv("VLM_MODEL")
 
-        # Create OpenAI client for VLM call
+        # Configure based on provider
+        if vlm_provider == "qwen":
+            # Tongyi Qianwen (Qwen) VL
+            vlm_base_url = vlm_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+            vlm_model = vlm_model or "qwen-vl-plus"  # or qwen-vl-max, qwen3-vl-plus
+            logger.info(f"Using Qwen VL: model={vlm_model}")
+        elif vlm_provider == "glm":
+            # ZhiPu GLM-4V
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or "glm-4v-flash"
+            logger.info(f"Using GLM VL: model={vlm_model}")
+        else:  # openai or other
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or llm_config.model
+            logger.info(f"Using {vlm_provider} VL: model={vlm_model}")
+
+        if not vlm_api_key:
+            logger.error("No VLM API key configured. Set VLM_API_KEY or DASHSCOPE_API_KEY environment variable.")
+            return CharacterAnalysisResult()
+
+        # Create OpenAI-compatible client
         client = AsyncOpenAI(
-            api_key=llm_config.api_key,
-            base_url=llm_config.base_url
+            api_key=vlm_api_key,
+            base_url=vlm_base_url
         )
 
-        # Use vision model - GLM-4V for ZhiPu, or fall back to configured model
-        # Vision models: glm-4v, glm-4v-flash, gpt-4-vision-preview
-        vision_model = llm_config.model
-        if "glm" in llm_config.model.lower() and "v" not in llm_config.model.lower():
-            vision_model = "glm-4v-flash"  # Use GLM-4V for vision tasks
-        logger.info(f"Using vision model: {vision_model}")
-
         # Call VLM
         response = await client.chat.completions.create(
-            model=vision_model,
+            model=vlm_model,
             messages=messages,
             temperature=0.3,
-            max_tokens=2000  # Increased to avoid truncation
+            max_tokens=2000
         )
 
         vlm_response = response.choices[0].message.content if response.choices else None
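
A minimal configuration sketch for the Qwen path added above, assuming the environment variables are set before this code reads them at call time; the API key value and the commented-out overrides are placeholders, not values from this commit:

    import os

    os.environ["VLM_PROVIDER"] = "qwen"          # or "glm", "openai"
    os.environ["DASHSCOPE_API_KEY"] = "sk-..."   # VLM_API_KEY is also accepted
    # Optional overrides; otherwise the Qwen defaults in the diff apply:
    # os.environ["VLM_BASE_URL"] = "https://dashscope.aliyuncs.com/compatible-mode/v1"
    # os.environ["VLM_MODEL"] = "qwen-vl-max"

With no VLM_* variables set at all, the glm and openai branches fall back to the existing config_manager LLM settings, while the qwen branch still requires VLM_API_KEY or DASHSCOPE_API_KEY and otherwise logs an error and returns an empty CharacterAnalysisResult.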