from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
# Default: load the model on the available device(s)
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto"
)
conversation = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
            }
        ],
    },
    {
        "role": "user",
        "content": [
            # Original prompt was in Chinese: "你好,介绍一下杭州的美食!"
            {"type": "text", "text": "Hello, please introduce Hangzhou's local cuisine!"},
            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
        ],
    },
]
# Whether to use the audio track from video inputs
USE_AUDIO_IN_VIDEO = True
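For context, the rest of the pipeline is sketched below, following the official Qwen2.5-Omni usage example; the processor and generation calls mirror the Qwen/Qwen2.5-Omni-7B model card, and soundfile is assumed to be installed for writing the audio output.

import soundfile as sf

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# Build the text prompt and extract the multimodal inputs from the conversation.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = inputs.to(model.device).to(model.dtype)

# Inference: generate both the output text and the speech waveform.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

print(processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)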
Running this fails while process_mm_info tries to read the video URL:

[10:00:38] /github/workspace/src/video/video_reader.cc:83: ERROR opening: https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4, Protocol not found
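The "Protocol not found" message appears to come from decord, the video reader used by qwen_omni_utils: prebuilt decord wheels are often compiled without FFmpeg's network protocols, so they cannot open https:// URLs directly. A minimal workaround sketch, assuming the URL is reachable with an ordinary HTTP client: download the video to a local file first and point the conversation entry at that path.

import urllib.request

# Hypothetical local filename; any writable path works.
LOCAL_VIDEO = "draw.mp4"
urllib.request.urlretrieve(
    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4",
    LOCAL_VIDEO,
)

# Then replace the URL in the conversation with the local path:
# {"type": "video", "video": LOCAL_VIDEO}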