feat: Add Qwen VL support for character analysis, configurable via VLM_PROVIDER
@@ -145,29 +145,51 @@ Output ONLY the JSON object, no additional text."""
             }
         ]
 
-        # Get LLM config
-        from pixelle_video.config import config_manager
-        llm_config = config_manager.config.llm
+        # Get VLM configuration from environment, or fall back to LLM config
+        import os
+        vlm_provider = os.getenv("VLM_PROVIDER", "qwen")  # qwen, glm, openai
+        vlm_api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
+        vlm_base_url = os.getenv("VLM_BASE_URL")
+        vlm_model = os.getenv("VLM_MODEL")
 
-        # Create OpenAI client for VLM call
+        # Configure based on provider
+        if vlm_provider == "qwen":
+            # Tongyi Qianwen (Qwen VL)
+            vlm_base_url = vlm_base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
+            vlm_model = vlm_model or "qwen-vl-plus"  # or qwen-vl-max, qwen3-vl-plus
+            logger.info(f"Using Qwen VL: model={vlm_model}")
+        elif vlm_provider == "glm":
+            # Zhipu GLM-4V
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or "glm-4v-flash"
+            logger.info(f"Using GLM VL: model={vlm_model}")
+        else:  # openai or other
+            from pixelle_video.config import config_manager
+            llm_config = config_manager.config.llm
+            vlm_api_key = vlm_api_key or llm_config.api_key
+            vlm_base_url = vlm_base_url or llm_config.base_url
+            vlm_model = vlm_model or llm_config.model
+            logger.info(f"Using {vlm_provider} VL: model={vlm_model}")
+
+        if not vlm_api_key:
+            logger.error("No VLM API key configured. Set VLM_API_KEY or DASHSCOPE_API_KEY environment variable.")
+            return CharacterAnalysisResult()
+
+        # Create OpenAI-compatible client
         client = AsyncOpenAI(
-            api_key=llm_config.api_key,
-            base_url=llm_config.base_url
+            api_key=vlm_api_key,
+            base_url=vlm_base_url
        )
 
-        # Use vision model - GLM-4V for ZhiPu, or fall back to configured model
-        # Vision models: glm-4v, glm-4v-flash, gpt-4-vision-preview
-        vision_model = llm_config.model
-        if "glm" in llm_config.model.lower() and "v" not in llm_config.model.lower():
-            vision_model = "glm-4v-flash"  # Use GLM-4V for vision tasks
-        logger.info(f"Using vision model: {vision_model}")
-
         # Call VLM
         response = await client.chat.completions.create(
-            model=vision_model,
+            model=vlm_model,
             messages=messages,
             temperature=0.3,
-            max_tokens=2000  # Increased to avoid truncation
+            max_tokens=2000
        )
 
        vlm_response = response.choices[0].message.content if response.choices else None
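Usage note: the new code path is configured entirely through environment variables. Below is a minimal standalone sketch of the resolution order, assuming the behavior shown in the diff above; the resolve_vlm_config helper is an illustrative name, not a function in the repository, and only the VLM_* / DASHSCOPE_API_KEY variables and the Qwen defaults come from the commit itself.

    import os

    # Illustrative helper mirroring the commit's fallback chain
    # (resolve_vlm_config is a hypothetical name, not a repo function).
    def resolve_vlm_config():
        provider = os.getenv("VLM_PROVIDER", "qwen")  # qwen, glm, openai
        api_key = os.getenv("VLM_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
        base_url = os.getenv("VLM_BASE_URL")
        model = os.getenv("VLM_MODEL")
        if provider == "qwen":
            # Defaults taken from the diff above
            base_url = base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
            model = model or "qwen-vl-plus"
        # For glm and other providers the commit falls back to the
        # LLM credentials held by config_manager.
        return provider, api_key, base_url, model

    # Example: select Qwen VL explicitly before running character analysis
    os.environ["VLM_PROVIDER"] = "qwen"
    os.environ["DASHSCOPE_API_KEY"] = "sk-..."  # placeholder, not a real key
    print(resolve_vlm_config())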