@@ -2313,7 +2313,6 @@ Usage: in(721)/out(7505)/total(0)
23132313` ` `
23142314
23152315
2316-
23172316## 多模态
23182317
23192318使用 ` dashScopeClient.GetMultimodalGenerationAsync` 和 ` dashScopeClient.GetMultimodalGenerationStreamAsync` 来访问多模态文本生成接口。
@@ -2326,59 +2325,124 @@ Usage: in(721)/out(7505)/total(0)
23262325
23272326媒体内容可以通过公网 URL 或者 ` byte[]` 传入。
23282327
2328+ 您也可以通过 ` UploadTemporaryFileAsync` 方法上传临时文件获取 ` oss://` 开头的链接。
2329+
2330+ ` ` ` csharp
2331+ await using var lenna = File.OpenRead(" Lenna.jpg" );
2332+ string ossLink = await client.UploadTemporaryFileAsync(" qwen3-vl-plus" , lenna, " lenna.jpg" );
2333+ Console.WriteLine($" File uploaded: {ossLink}" );
2334+
2335+ // 使用链接
2336+ var messages = new List<MultimodalMessage> ();
2337+ messages.Add(
2338+ MultimodalMessage.User(
2339+ [
2340+ MultimodalMessageContent.ImageContent(ossLink),
2341+ MultimodalMessageContent.TextContent(" 她是谁?" )
2342+ ]));
2343+ ` ` `
2344+
2345+ 您可以通过参数 ` EnableThinking` 控制是否开启推理(需要模型支持)。
2346+
2347+ 参数 ` VlHighResolutionImages` 控制模型读取图片的精度,开启后会增加图片/视频的 Token 使用量。
2348+
2349+ 以下是完整示例:
2350+
23292351` ` ` csharp
2330- var image = await File.ReadAllBytesAsync(" Lenna.jpg" );
2331- var response = dashScopeClient.GetMultimodalGenerationStreamAsync(
2352+ await using var lenna = File.OpenRead(" Lenna.jpg" );
2353+ var ossLink = await client.UploadTemporaryFileAsync(" qwen3-vl-plus" , lenna, " lenna.jpg" );
2354+ Console.WriteLine($" File uploaded: {ossLink}" );
2355+ var messages = new List<MultimodalMessage> ();
2356+ messages.Add(
2357+ MultimodalMessage.User(
2358+ [
2359+ MultimodalMessageContent.ImageContent(ossLink),
2360+ MultimodalMessageContent.TextContent(" 她是谁?" )
2361+ ]));
2362+ var completion = client.GetMultimodalGenerationStreamAsync(
23322363 new ModelRequest< MultimodalInput, IMultimodalParameters> ()
23332364 {
2334- Model = " qvq-plus" ,
2335- Input = new MultimodalInput ()
2365+ Model = " qwen3-vl-plus" ,
2366+ Input = new MultimodalInput () { Messages = messages },
2367+ Parameters = new MultimodalParameters ()
23362368 {
2337- Messages =
2338- [
2339- MultimodalMessage.User(
2340- [
2341- MultimodalMessageContent.ImageContent(image, " image/jpeg" ),
2342- MultimodalMessageContent.TextContent(" 她是谁?" )
2343- ])
2344- ]
2345- },
2346- Parameters = new MultimodalParameters { IncrementalOutput = true, VlHighResolutionImages = false }
2369+ IncrementalOutput = true,
2370+ EnableThinking = true,
2371+ VlHighResolutionImages = true
2372+ }
23472373 });
2348-
2349- // output
2374+ var reply = new StringBuilder ();
23502375var reasoning = false ;
2351- await foreach (var modelResponse in response)
2376+ MultimodalTokenUsage? usage = null;
2377+ await foreach (var chunk in completion)
23522378{
2353- var choice = modelResponse .Output.Choices.FirstOrDefault () ;
2354- if (choice ! = null )
2379+ var choice = chunk .Output.Choices[0] ;
2380+ if (string.IsNullOrEmpty( choice.Message.ReasoningContent) == false )
23552381 {
2356- if (choice.FinishReason ! = " null" )
2382+ // reasoning
2383+ if (reasoning == false)
23572384 {
2358- break ;
2385+ Console.Write(" Reasoning > " );
2386+ reasoning = true ;
23592387 }
23602388
2361- if (string.IsNullOrEmpty(choice.Message.ReasoningContent) == false)
2362- {
2363- if (reasoning == false)
2364- {
2365- reasoning = true ;
2366- Console.WriteLine(" <think>" );
2367- }
2368-
2369- Console.Write(choice.Message.ReasoningContent);
2370- continue ;
2371- }
2389+ Console.Write(choice.Message.ReasoningContent);
2390+ continue ;
2391+ }
23722392
2373- if (reasoning)
2374- {
2375- reasoning = false ;
2376- Console.WriteLine(" </think>" );
2377- }
2393+ if (reasoning)
2394+ {
2395+ reasoning = false ;
2396+ Console.WriteLine ();
2397+ Console.Write(" Assistant > " );
2398+ }
23782399
2379- Console.Write(choice.Message.Content[0].Text);
2400+ if (choice.Message.Content.Count == 0)
2401+ {
2402+ continue ;
23802403 }
2404+
2405+ Console.Write(choice.Message.Content[0].Text);
2406+ reply.Append(choice.Message.Content[0].Text);
2407+ usage = chunk.Usage;
23812408}
2409+
2410+ Console.WriteLine ();
2411+ messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString ())]));
2412+ if (usage != null)
2413+ {
2414+ Console.WriteLine(
2415+ $" Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/reasoning({usage.OutputTokensDetails?.ReasoningTokens})/total({usage.TotalTokens})" );
2416+ }
2417+
2418+ /*
2419+ Reasoning > 用户现在需要识别图中的人物。这张照片里的女性是Nancy Sinatra(南希·辛纳特拉),她是美国60年代著名的歌手、演员,也是Frank Sinatra的女儿。她的标志性风格包括复古装扮和独特的 音乐风格,这张照片的造型(宽檐帽、羽毛装饰)符合她那个时期的时尚风格。需要确认信息准确性,Nancy Sinatra在60年代的影像资料中常见这样的复古造型,所以判断是她。
2420+
2421+ Assistant > 图中人物是** 南希·辛纳特拉(Nancy Sinatra)** ,她是美国20世纪60年代著名的歌手、演员,也是传奇歌手弗兰克·辛纳特拉(Frank Sinatra)的女儿。她以独特的复古风格、音乐作品(如经典歌曲 * These Boots Are Made for Walkin’* )和影视表现闻名,这张照片的造型(宽檐帽搭配羽毛装饰等)也契合她标志性的时尚风格。
2422+ Usage: in(271)/out(199)/image(258)/reasoning(98)/total(470)
2423+ */
2424+ ` ` `
2425+
2426+ ### 传入视频文件
2427+
2428+ VL 系列模型支持传入视频文件,通过 Video 参数传入视频文件链接或者图片序列均可。
2429+
2430+ 传入视频文件时,您可以通过 ` fps` 参数来控制模型每隔多少秒(即 1/fps 秒)抽取一帧作为输入。
2431+
2432+ ` ` ` csharp
2433+ // 使用本地文件需要提前上传
2434+ await using var video = File.OpenRead(" sample.mp4" );
2435+ var ossLink = await client.UploadTemporaryFileAsync(" qwen3-vl-plus" , video, " sample.mp4" );
2436+ Console.WriteLine($" File uploaded: {ossLink}" );
2437+
2438+ var messages = new List<MultimodalMessage> ();
2439+ messages.Add(
2440+ MultimodalMessage.User(
2441+ [
2442+ MultimodalMessageContent.VideoContent(ossLink, fps: 2),
2443+ // MultimodalMessageContent.VideoFrames(links),
2444+ MultimodalMessageContent.TextContent(" 这段视频的内容是什么?" )
2445+ ]));
23822446` ` `
23832447
23842448# # 语音合成
0 commit comments