Commit 2b36dcc

feat: add sample for uploading video file
1 parent b537b0a · commit 2b36dcc

16 files changed (+1853 −44 lines)

README.md

Lines changed: 30 additions & 1 deletion
@@ -510,6 +510,28 @@ await dashScopeClient.DeleteFileAsync(uploadedFile.Id);
 Use `GetMultimodalGenerationAsync`/`GetMultimodalGenerationStreamAsync`
 [Official Documentation](https://help.aliyun.com/zh/model-studio/multimodal)
 
+### Upload file for multimodal usage
+
+You can upload a file to get an `oss://` link before using it in a multimodal request.
+
+```csharp
+await using var video = File.OpenRead("sample.mp4");
+var ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", video, "sample.mp4");
+Console.WriteLine($"File uploaded: {ossLink}");
+
+var messages = new List<MultimodalMessage>();
+messages.Add(
+    MultimodalMessage.User(
+    [
+        MultimodalMessageContent.VideoContent(ossLink, fps: 2),
+        // MultimodalMessageContent.VideoFrames(links),
+        // MultimodalMessageContent.ImageContent(link)
+        MultimodalMessageContent.TextContent("这段视频的内容是什么?")
+    ]));
+```
+
+### Image recognition/thinking
+
 ```csharp
 var image = await File.ReadAllBytesAsync("Lenna.jpg");
 var response = dashScopeClient.GetMultimodalGenerationStreamAsync(
@@ -527,7 +549,13 @@ var response = dashScopeClient.GetMultimodalGenerationStreamAsync(
                 ])
             ]
         },
-        Parameters = new MultimodalParameters { IncrementalOutput = true, VlHighResolutionImages = false }
+        Parameters =
+            new MultimodalParameters
+            {
+                IncrementalOutput = true,
+                // EnableThinking = true,
+                VlHighResolutionImages = false
+            }
     });
 
 // output
@@ -564,6 +592,7 @@ await foreach (var modelResponse in response)
     }
 }
 ```
+
 ## Text-to-Speech
 
 Create a speech synthesis session using `dashScopeClient.CreateSpeechSynthesizerSocketSessionAsync()`.
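
The "Upload file for multimodal usage" snippet in the README diff above stops after building the message list. As a rough sketch of the follow-up call (mirroring the `ImageUploadSample` added in this commit, so the `client` variable, model name, and parameter names are taken from that sample rather than from any new API surface), the messages could then be sent and the reply streamed like this:

```csharp
// Sketch: send the messages built in the snippet above and print the streamed reply.
var completion = client.GetMultimodalGenerationStreamAsync(
    new ModelRequest<MultimodalInput, IMultimodalParameters>()
    {
        Model = "qwen3-vl-plus",
        Input = new MultimodalInput() { Messages = messages },
        Parameters = new MultimodalParameters() { IncrementalOutput = true }
    });

await foreach (var chunk in completion)
{
    var choice = chunk.Output.Choices[0];
    if (choice.Message.Content.Count > 0)
    {
        Console.Write(choice.Message.Content[0].Text);
    }
}

Console.WriteLine();
```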

README.zh-Hans.md

Lines changed: 103 additions & 39 deletions
@@ -2313,7 +2313,6 @@ Usage: in(721)/out(7505)/total(0)
 ```
 
 
-
 ## Multimodal
 
 Use `dashScopeClient.GetMultimodalGenerationAsync` / `dashScopeClient.GetMultimodalGenerationStreamAsync` to access the multimodal text generation APIs.
@@ -2326,59 +2325,124 @@ Usage: in(721)/out(7505)/total(0)
 
 Media content can be passed in either as a public URL or as a `byte[]`.
 
+You can also upload a temporary file with the `UploadTemporaryFileAsync` method to obtain a link starting with `oss://`.
+
+```csharp
+await using var lenna = File.OpenRead("Lenna.jpg");
+string ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", lenna, "lenna.jpg");
+Console.WriteLine($"File uploaded: {ossLink}");
+
+// use the link
+var messages = new List<MultimodalMessage>();
+messages.Add(
+    MultimodalMessage.User(
+    [
+        MultimodalMessageContent.ImageContent(ossLink),
+        MultimodalMessageContent.TextContent("她是谁?")
+    ]));
+```
+
+The `EnableThinking` parameter controls whether reasoning is enabled (requires a model that supports it).
+
+The `VlHighResolutionImages` parameter controls the resolution at which the model reads images; enabling it increases the token usage for images/videos.
+
+Here is a complete example:
+
 ```csharp
-var image = await File.ReadAllBytesAsync("Lenna.jpg");
-var response = dashScopeClient.GetMultimodalGenerationStreamAsync(
+await using var lenna = File.OpenRead("Lenna.jpg");
+var ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", lenna, "lenna.jpg");
+Console.WriteLine($"File uploaded: {ossLink}");
+var messages = new List<MultimodalMessage>();
+messages.Add(
+    MultimodalMessage.User(
+    [
+        MultimodalMessageContent.ImageContent(ossLink),
+        MultimodalMessageContent.TextContent("她是谁?")
+    ]));
+var completion = client.GetMultimodalGenerationStreamAsync(
     new ModelRequest<MultimodalInput, IMultimodalParameters>()
     {
-        Model = "qvq-plus",
-        Input = new MultimodalInput()
+        Model = "qwen3-vl-plus",
+        Input = new MultimodalInput() { Messages = messages },
+        Parameters = new MultimodalParameters()
         {
-            Messages =
-            [
-                MultimodalMessage.User(
-                [
-                    MultimodalMessageContent.ImageContent(image, "image/jpeg"),
-                    MultimodalMessageContent.TextContent("她是谁?")
-                ])
-            ]
-        },
-        Parameters = new MultimodalParameters { IncrementalOutput = true, VlHighResolutionImages = false }
+            IncrementalOutput = true,
+            EnableThinking = true,
+            VlHighResolutionImages = true
+        }
     });
-
-// output
+var reply = new StringBuilder();
 var reasoning = false;
-await foreach (var modelResponse in response)
+MultimodalTokenUsage? usage = null;
+await foreach (var chunk in completion)
 {
-    var choice = modelResponse.Output.Choices.FirstOrDefault();
-    if (choice != null)
+    var choice = chunk.Output.Choices[0];
+    if (string.IsNullOrEmpty(choice.Message.ReasoningContent) == false)
     {
-        if (choice.FinishReason != "null")
+        // reasoning
+        if (reasoning == false)
         {
-            break;
+            Console.Write("Reasoning > ");
+            reasoning = true;
         }
 
-        if (string.IsNullOrEmpty(choice.Message.ReasoningContent) == false)
-        {
-            if (reasoning == false)
-            {
-                reasoning = true;
-                Console.WriteLine("<think>");
-            }
-
-            Console.Write(choice.Message.ReasoningContent);
-            continue;
-        }
+        Console.Write(choice.Message.ReasoningContent);
+        continue;
+    }
 
-        if (reasoning)
-        {
-            reasoning = false;
-            Console.WriteLine("</think>");
-        }
+    if (reasoning)
+    {
+        reasoning = false;
+        Console.WriteLine();
+        Console.Write("Assistant > ");
+    }
 
-        Console.Write(choice.Message.Content[0].Text);
+    if (choice.Message.Content.Count == 0)
+    {
+        continue;
     }
+
+    Console.Write(choice.Message.Content[0].Text);
+    reply.Append(choice.Message.Content[0].Text);
+    usage = chunk.Usage;
 }
+
+Console.WriteLine();
+messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString())]));
+if (usage != null)
+{
+    Console.WriteLine(
+        $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/reasoning({usage.OutputTokensDetails?.ReasoningTokens})/total({usage.TotalTokens})");
+}
+
+/*
+Reasoning > 用户现在需要识别图中的人物。这张照片里的女性是Nancy Sinatra(南希·辛纳特拉),她是美国60年代著名的歌手、演员,也是Frank Sinatra的女儿。她的标志性风格包括复古装扮和独特的音乐风格,这张照片的造型(宽檐帽、羽毛装饰)符合她那个时期的时尚风格。需要确认信息准确性,Nancy Sinatra在60年代的影像资料中常见这样的复古造型,所以判断是她。
+
+Assistant > 图中人物是**南希·辛纳特拉(Nancy Sinatra)**,她是美国20世纪60年代著名的歌手、演员,也是传奇歌手弗兰克·辛纳特拉(Frank Sinatra)的女儿。她以独特的复古风格、音乐作品(如经典歌曲 *These Boots Are Made for Walkin’* )和影视表现闻名,这张照片的造型(宽檐帽搭配羽毛装饰等)也契合她标志性的时尚风格。
+Usage: in(271)/out(199)/image(258)/reasoning(98)/total(470)
+*/
+```
+
+#### Passing a video file
+
+VL-series models accept video input: the Video parameter can take either a link to a video file or a sequence of images.
+
+When passing a video file, the `fps` parameter controls how frequently a frame is sampled as input (one frame every 1/fps seconds).
+
+```csharp
+// a local file must be uploaded first
+await using var video = File.OpenRead("sample.mp4");
+var ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", video, "sample.mp4");
+Console.WriteLine($"File uploaded: {ossLink}");
+
+var messages = new List<MultimodalMessage>();
+messages.Add(
+    MultimodalMessage.User(
+    [
+        MultimodalMessageContent.VideoContent(ossLink, fps: 2),
+        // MultimodalMessageContent.VideoFrames(links),
+        MultimodalMessageContent.TextContent("这段视频的内容是什么?")
+    ]));
 ```
 
 ## Text-to-Speech
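
As a quick reference for the two video inputs described above: `fps: 2` samples roughly one frame every 1/2 = 0.5 seconds, and a pre-extracted image sequence can be passed instead of a video link. The commit only shows `VideoFrames` commented out, so the exact call below, and the frame URLs, should be read as an assumption rather than confirmed usage:

```csharp
// Option 1 (shown in the commit): a video link, sampled at one frame every 1/fps = 0.5 s.
var videoContent = MultimodalMessageContent.VideoContent(ossLink, fps: 2);

// Option 2 (assumption, based on the commented-out call in the diff): a pre-extracted frame sequence.
var frameLinks = new List<string>
{
    "https://example.com/frame-001.jpg", // illustrative URLs, not from the commit
    "https://example.com/frame-002.jpg"
};
var framesContent = MultimodalMessageContent.VideoFrames(frameLinks);
```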

sample/Cnblogs.DashScope.Sample/Cnblogs.DashScope.Sample.csproj

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,9 @@
         <None Update="1024-2.txt">
             <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
         </None>
+        <None Update="sample.mp4">
+            <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+        </None>
     </ItemGroup>
 
     <ItemGroup>
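
The `PreserveNewest` entry above matters because the samples open `sample.mp4` by relative path, so the video must sit next to the build output. A minimal guard before uploading (illustrative only, not part of this commit) could look like:

```csharp
// Illustrative check: make sure the video was copied to the output directory.
if (!File.Exists("sample.mp4"))
{
    Console.WriteLine("sample.mp4 not found; check the <CopyToOutputDirectory> entry in the .csproj.");
    return;
}

await using var video = File.OpenRead("sample.mp4");
var ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", video, "sample.mp4");
```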
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+using System.Text;
+using Cnblogs.DashScope.Core;
+
+namespace Cnblogs.DashScope.Sample.Multimodal;
+
+public class ImageUploadSample : ISample
+{
+    /// <inheritdoc />
+    public string Description => "Upload image from file system";
+
+    /// <inheritdoc />
+    public async Task RunAsync(IDashScopeClient client)
+    {
+        // upload file
+        await using var lenna = File.OpenRead("Lenna.jpg");
+        var ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", lenna, "lenna.jpg");
+        Console.WriteLine($"File uploaded: {ossLink}");
+        var messages = new List<MultimodalMessage>();
+        messages.Add(
+            MultimodalMessage.User(
+            [
+                MultimodalMessageContent.ImageContent(ossLink),
+                MultimodalMessageContent.TextContent("她是谁?")
+            ]));
+        var completion = client.GetMultimodalGenerationStreamAsync(
+            new ModelRequest<MultimodalInput, IMultimodalParameters>()
+            {
+                Model = "qwen3-vl-plus",
+                Input = new MultimodalInput() { Messages = messages },
+                Parameters = new MultimodalParameters()
+                {
+                    IncrementalOutput = true,
+                    EnableThinking = true,
+                    VlHighResolutionImages = true
+                }
+            });
+        var reply = new StringBuilder();
+        var reasoning = false;
+        MultimodalTokenUsage? usage = null;
+        await foreach (var chunk in completion)
+        {
+            var choice = chunk.Output.Choices[0];
+            if (string.IsNullOrEmpty(choice.Message.ReasoningContent) == false)
+            {
+                // reasoning
+                if (reasoning == false)
+                {
+                    Console.Write("Reasoning > ");
+                    reasoning = true;
+                }
+
+                Console.Write(choice.Message.ReasoningContent);
+                continue;
+            }
+
+            if (reasoning)
+            {
+                reasoning = false;
+                Console.WriteLine();
+                Console.Write("Assistant > ");
+            }
+
+            if (choice.Message.Content.Count == 0)
+            {
+                continue;
+            }
+
+            Console.Write(choice.Message.Content[0].Text);
+            reply.Append(choice.Message.Content[0].Text);
+            usage = chunk.Usage;
+        }
+
+        Console.WriteLine();
+        messages.Add(MultimodalMessage.Assistant([MultimodalMessageContent.TextContent(reply.ToString())]));
+        if (usage != null)
+        {
+            Console.WriteLine(
+                $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/reasoning({usage.OutputTokensDetails?.ReasoningTokens})/total({usage.TotalTokens})");
+        }
+    }
+}
+
+/*
+Reasoning > 用户现在需要识别图中的人物。这张照片里的女性是Nancy Sinatra(南希·辛纳特拉),她是美国60年代著名的歌手、演员,也是Frank Sinatra的女儿。她的标志性风格包括复古装扮和独特的音乐风格,这张照片的造型(宽檐帽、羽毛装饰)符合她那个时期的时尚风格。需要确认信息准确性,Nancy Sinatra在60年代的影像资料中常见这样的复古造型,所以判断是她。
+
+Assistant > 图中人物是**南希·辛纳特拉(Nancy Sinatra)**,她是美国20世纪60年代著名的歌手、演员,也是传奇歌手弗兰克·辛纳特拉(Frank Sinatra)的女儿。她以独特的复古风格、音乐作品(如经典歌曲 *These Boots Are Made for Walkin’* )和影视表现闻名,这张照片的造型(宽檐帽搭配羽毛装饰等)也契合她标志性的时尚风格。
+Usage: in(271)/out(199)/image(258)/reasoning(98)/total(470)
+*/
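
The commit title mentions a sample for uploading a video file, but that file is not shown in this excerpt. Purely as a sketch of what it plausibly contains, the class below reuses only calls that appear elsewhere in this commit; the class name, description text, and parameter choices are guesses, not the actual file:

```csharp
using Cnblogs.DashScope.Core;

namespace Cnblogs.DashScope.Sample.Multimodal;

// Hypothetical sketch of the video sample referenced by the commit title.
public class VideoUploadSampleSketch : ISample
{
    /// <inheritdoc />
    public string Description => "Upload video from file system";

    /// <inheritdoc />
    public async Task RunAsync(IDashScopeClient client)
    {
        // Upload the local video to obtain a temporary oss:// link.
        await using var video = File.OpenRead("sample.mp4");
        var ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", video, "sample.mp4");
        Console.WriteLine($"File uploaded: {ossLink}");

        // Ask the model to describe the video, sampling frames at fps: 2.
        var messages = new List<MultimodalMessage>();
        messages.Add(
            MultimodalMessage.User(
            [
                MultimodalMessageContent.VideoContent(ossLink, fps: 2),
                MultimodalMessageContent.TextContent("这段视频的内容是什么?")
            ]));

        var completion = client.GetMultimodalGenerationStreamAsync(
            new ModelRequest<MultimodalInput, IMultimodalParameters>()
            {
                Model = "qwen3-vl-plus",
                Input = new MultimodalInput() { Messages = messages },
                Parameters = new MultimodalParameters() { IncrementalOutput = true }
            });

        // Print the streamed reply.
        await foreach (var chunk in completion)
        {
            var choice = chunk.Output.Choices[0];
            if (choice.Message.Content.Count > 0)
            {
                Console.Write(choice.Message.Content[0].Text);
            }
        }

        Console.WriteLine();
    }
}
```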
