Skip to content

Commit 65789da

Browse files
Barry SongChristoph Hellwig
Barry Song
authored and
Christoph Hellwig
committed
dma-mapping: add benchmark support for streaming DMA APIs
Nowadays, there is an increasing need to benchmark the performance of dma_map and dma_unmap, particularly while the device is attached to an IOMMU. This patch adds that support. Users can run a specified number of threads that do dma_map_page and dma_unmap_page on a specific NUMA node for a specified duration. dma_map_benchmark then calculates the average latency of map and unmap.

A difficulty for this benchmark is that the dma_map/unmap APIs must run on a particular device, and each device might have a different backend (IOMMU or non-IOMMU). So we use driver_override to bind dma_map_benchmark to a particular device:

For platform devices:
echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
echo xxx > /sys/bus/platform/drivers/xxx/unbind
echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind

For PCI devices:
echo dma_map_benchmark > /sys/bus/pci/devices/0000:00:01.0/driver_override
echo 0000:00:01.0 > /sys/bus/pci/drivers/xxx/unbind
echo 0000:00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind

Cc: Will Deacon <[email protected]>
Cc: Shuah Khan <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Marek Szyprowski <[email protected]>
Cc: Robin Murphy <[email protected]>
Signed-off-by: Barry Song <[email protected]>
[hch: folded in two fixes from Colin Ian King <[email protected]>]
Signed-off-by: Christoph Hellwig <[email protected]>
1 parent 819b70a commit 65789da

File tree

3 files changed

+371
-0
lines changed

3 files changed

+371
-0
lines changed

Diff for: kernel/dma/Kconfig

+9
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,12 @@ config DMA_API_DEBUG_SG
229229
is technically out-of-spec.
230230

231231
If unsure, say N.
232+
233+
# Streaming-DMA map/unmap latency benchmark. Declared as a bool option and
# gated on DEBUG_FS because its control file is created in debugfs.
config DMA_MAP_BENCHMARK
234+
bool "Enable benchmarking of streaming DMA mapping"
235+
depends on DEBUG_FS
236+
help
237+
Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
238+
performance of dma_(un)map_page.
239+
240+
See tools/testing/selftests/dma/dma_map_benchmark.c

Diff for: kernel/dma/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o
1010
obj-$(CONFIG_SWIOTLB) += swiotlb.o
1111
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
1212
obj-$(CONFIG_DMA_REMAP) += remap.o
13+
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o

Diff for: kernel/dma/map_benchmark.c

+361
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,361 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
/*
3+
* Copyright (C) 2020 Hisilicon Limited.
4+
*/
5+
6+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7+
8+
#include <linux/debugfs.h>
9+
#include <linux/delay.h>
10+
#include <linux/device.h>
11+
#include <linux/dma-mapping.h>
12+
#include <linux/kernel.h>
13+
#include <linux/kthread.h>
14+
#include <linux/math64.h>
15+
#include <linux/module.h>
16+
#include <linux/pci.h>
17+
#include <linux/platform_device.h>
#include <linux/sched/task.h>
18+
#include <linux/slab.h>
19+
#include <linux/timekeeping.h>
20+
21+
/* ioctl command: read/write, magic 'd', number 1, payload struct map_benchmark */
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
22+
/* upper bounds enforced on map_benchmark.threads and .seconds by the ioctl */
#define DMA_MAP_MAX_THREADS 1024
23+
#define DMA_MAP_MAX_SECONDS 300
24+
25+
/*
 * values accepted in map_benchmark.dma_dir; the ioctl translates them to
 * DMA_BIDIRECTIONAL / DMA_TO_DEVICE / DMA_FROM_DEVICE
 */
#define DMA_MAP_BIDIRECTIONAL 0
26+
#define DMA_MAP_TO_DEVICE 1
27+
#define DMA_MAP_FROM_DEVICE 2
28+
29+
/*
 * Argument block of the DMA_MAP_BENCHMARK ioctl. Userspace fills in threads,
 * seconds, node, dma_bits and dma_dir; the kernel fills in the four result
 * fields and copies the whole structure back.
 */
struct map_benchmark {
30+
__u64 avg_map_100ns; /* average map latency in 100ns */
31+
__u64 map_stddev; /* standard deviation of map latency */
32+
__u64 avg_unmap_100ns; /* as above */
33+
__u64 unmap_stddev; /* standard deviation of unmap latency, in 100ns units */
34+
__u32 threads; /* how many threads will do map/unmap in parallel */
35+
__u32 seconds; /* how long the test will last */
36+
__s32 node; /* which numa node this benchmark will run on */
37+
__u32 dma_bits; /* DMA addressing capability */
38+
__u32 dma_dir; /* DMA data direction */
39+
__u64 expansion[10]; /* For future use */
40+
};
41+
42+
/*
 * Per-device benchmark state, allocated in probe and handed to the debugfs
 * file as its private data. The atomic64 counters are accumulated by all
 * worker threads and reduced to averages/deviations once they have stopped.
 */
struct map_benchmark_data {
43+
struct map_benchmark bparam; /* parameters in, results out (ioctl payload) */
44+
struct device *dev; /* device the mappings are created on */
45+
struct dentry *debugfs; /* the dma_map_benchmark control file */
46+
enum dma_data_direction dir; /* direction derived from bparam.dma_dir */
47+
atomic64_t sum_map_100ns; /* sum of map latencies, 100ns units */
48+
atomic64_t sum_unmap_100ns; /* sum of unmap latencies, 100ns units */
49+
atomic64_t sum_sq_map; /* sum of squared map latencies */
50+
atomic64_t sum_sq_unmap; /* sum of squared unmap latencies */
51+
atomic64_t loops; /* number of completed map/unmap pairs */
52+
};
53+
54+
static int map_benchmark_thread(void *data)
55+
{
56+
void *buf;
57+
dma_addr_t dma_addr;
58+
struct map_benchmark_data *map = data;
59+
int ret = 0;
60+
61+
buf = (void *)__get_free_page(GFP_KERNEL);
62+
if (!buf)
63+
return -ENOMEM;
64+
65+
while (!kthread_should_stop()) {
66+
u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
67+
ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
68+
ktime_t map_delta, unmap_delta;
69+
70+
/*
71+
* for a non-coherent device, if we don't stain them in the
72+
* cache, this will give an underestimate of the real-world
73+
* overhead of BIDIRECTIONAL or TO_DEVICE mappings;
74+
* 66 means evertything goes well! 66 is lucky.
75+
*/
76+
if (map->dir != DMA_FROM_DEVICE)
77+
memset(buf, 0x66, PAGE_SIZE);
78+
79+
map_stime = ktime_get();
80+
dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
81+
if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
82+
pr_err("dma_map_single failed on %s\n",
83+
dev_name(map->dev));
84+
ret = -ENOMEM;
85+
goto out;
86+
}
87+
map_etime = ktime_get();
88+
map_delta = ktime_sub(map_etime, map_stime);
89+
90+
unmap_stime = ktime_get();
91+
dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
92+
unmap_etime = ktime_get();
93+
unmap_delta = ktime_sub(unmap_etime, unmap_stime);
94+
95+
/* calculate sum and sum of squares */
96+
97+
map_100ns = div64_ul(map_delta, 100);
98+
unmap_100ns = div64_ul(unmap_delta, 100);
99+
map_sq = map_100ns * map_100ns;
100+
unmap_sq = unmap_100ns * unmap_100ns;
101+
102+
atomic64_add(map_100ns, &map->sum_map_100ns);
103+
atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
104+
atomic64_add(map_sq, &map->sum_sq_map);
105+
atomic64_add(unmap_sq, &map->sum_sq_unmap);
106+
atomic64_inc(&map->loops);
107+
}
108+
109+
out:
110+
free_page((unsigned long)buf);
111+
return ret;
112+
}
113+
114+
static int do_map_benchmark(struct map_benchmark_data *map)
115+
{
116+
struct task_struct **tsk;
117+
int threads = map->bparam.threads;
118+
int node = map->bparam.node;
119+
const cpumask_t *cpu_mask = cpumask_of_node(node);
120+
u64 loops;
121+
int ret = 0;
122+
int i;
123+
124+
tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
125+
if (!tsk)
126+
return -ENOMEM;
127+
128+
get_device(map->dev);
129+
130+
for (i = 0; i < threads; i++) {
131+
tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
132+
map->bparam.node, "dma-map-benchmark/%d", i);
133+
if (IS_ERR(tsk[i])) {
134+
pr_err("create dma_map thread failed\n");
135+
ret = PTR_ERR(tsk[i]);
136+
goto out;
137+
}
138+
139+
if (node != NUMA_NO_NODE)
140+
kthread_bind_mask(tsk[i], cpu_mask);
141+
}
142+
143+
/* clear the old value in the previous benchmark */
144+
atomic64_set(&map->sum_map_100ns, 0);
145+
atomic64_set(&map->sum_unmap_100ns, 0);
146+
atomic64_set(&map->sum_sq_map, 0);
147+
atomic64_set(&map->sum_sq_unmap, 0);
148+
atomic64_set(&map->loops, 0);
149+
150+
for (i = 0; i < threads; i++)
151+
wake_up_process(tsk[i]);
152+
153+
msleep_interruptible(map->bparam.seconds * 1000);
154+
155+
/* wait for the completion of benchmark threads */
156+
for (i = 0; i < threads; i++) {
157+
ret = kthread_stop(tsk[i]);
158+
if (ret)
159+
goto out;
160+
}
161+
162+
loops = atomic64_read(&map->loops);
163+
if (likely(loops > 0)) {
164+
u64 map_variance, unmap_variance;
165+
u64 sum_map = atomic64_read(&map->sum_map_100ns);
166+
u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
167+
u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
168+
u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
169+
170+
/* average latency */
171+
map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
172+
map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
173+
174+
/* standard deviation of latency */
175+
map_variance = div64_u64(sum_sq_map, loops) -
176+
map->bparam.avg_map_100ns *
177+
map->bparam.avg_map_100ns;
178+
unmap_variance = div64_u64(sum_sq_unmap, loops) -
179+
map->bparam.avg_unmap_100ns *
180+
map->bparam.avg_unmap_100ns;
181+
map->bparam.map_stddev = int_sqrt64(map_variance);
182+
map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
183+
}
184+
185+
out:
186+
put_device(map->dev);
187+
kfree(tsk);
188+
return ret;
189+
}
190+
191+
static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
192+
unsigned long arg)
193+
{
194+
struct map_benchmark_data *map = file->private_data;
195+
void __user *argp = (void __user *)arg;
196+
u64 old_dma_mask;
197+
198+
int ret;
199+
200+
if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
201+
return -EFAULT;
202+
203+
switch (cmd) {
204+
case DMA_MAP_BENCHMARK:
205+
if (map->bparam.threads == 0 ||
206+
map->bparam.threads > DMA_MAP_MAX_THREADS) {
207+
pr_err("invalid thread number\n");
208+
return -EINVAL;
209+
}
210+
211+
if (map->bparam.seconds == 0 ||
212+
map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
213+
pr_err("invalid duration seconds\n");
214+
return -EINVAL;
215+
}
216+
217+
if (map->bparam.node != NUMA_NO_NODE &&
218+
!node_possible(map->bparam.node)) {
219+
pr_err("invalid numa node\n");
220+
return -EINVAL;
221+
}
222+
223+
switch (map->bparam.dma_dir) {
224+
case DMA_MAP_BIDIRECTIONAL:
225+
map->dir = DMA_BIDIRECTIONAL;
226+
break;
227+
case DMA_MAP_FROM_DEVICE:
228+
map->dir = DMA_FROM_DEVICE;
229+
break;
230+
case DMA_MAP_TO_DEVICE:
231+
map->dir = DMA_TO_DEVICE;
232+
break;
233+
default:
234+
pr_err("invalid DMA direction\n");
235+
return -EINVAL;
236+
}
237+
238+
old_dma_mask = dma_get_mask(map->dev);
239+
240+
ret = dma_set_mask(map->dev,
241+
DMA_BIT_MASK(map->bparam.dma_bits));
242+
if (ret) {
243+
pr_err("failed to set dma_mask on device %s\n",
244+
dev_name(map->dev));
245+
return -EINVAL;
246+
}
247+
248+
ret = do_map_benchmark(map);
249+
250+
/*
251+
* restore the original dma_mask as many devices' dma_mask are
252+
* set by architectures, acpi, busses. When we bind them back
253+
* to their original drivers, those drivers shouldn't see
254+
* dma_mask changed by benchmark
255+
*/
256+
dma_set_mask(map->dev, old_dma_mask);
257+
break;
258+
default:
259+
return -EINVAL;
260+
}
261+
262+
if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
263+
return -EFAULT;
264+
265+
return ret;
266+
}
267+
268+
/*
 * File operations of the debugfs control file: the benchmark is driven
 * entirely through the ioctl. simple_open presumably is what makes the
 * pointer passed to debugfs_create_file() available as file->private_data,
 * which the ioctl handler reads - confirm against the libfs helper.
 */
static const struct file_operations map_benchmark_fops = {
269+
.open = simple_open,
270+
.unlocked_ioctl = map_benchmark_ioctl,
271+
};
272+
273+
static void map_benchmark_remove_debugfs(void *data)
274+
{
275+
struct map_benchmark_data *map = (struct map_benchmark_data *)data;
276+
277+
debugfs_remove(map->debugfs);
278+
}
279+
280+
static int __map_benchmark_probe(struct device *dev)
281+
{
282+
struct dentry *entry;
283+
struct map_benchmark_data *map;
284+
int ret;
285+
286+
map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
287+
if (!map)
288+
return -ENOMEM;
289+
map->dev = dev;
290+
291+
ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
292+
if (ret) {
293+
pr_err("Can't add debugfs remove action\n");
294+
return ret;
295+
}
296+
297+
/*
298+
* we only permit a device bound with this driver, 2nd probe
299+
* will fail
300+
*/
301+
entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
302+
&map_benchmark_fops);
303+
if (IS_ERR(entry))
304+
return PTR_ERR(entry);
305+
map->debugfs = entry;
306+
307+
return 0;
308+
}
309+
310+
static int map_benchmark_platform_probe(struct platform_device *pdev)
311+
{
312+
return __map_benchmark_probe(&pdev->dev);
313+
}
314+
315+
/*
 * Platform front-end. Only a name and a probe hook are set here; teardown is
 * left to the devm action registered in the common probe.
 */
static struct platform_driver map_benchmark_platform_driver = {
316+
.driver = {
317+
.name = "dma_map_benchmark",
318+
},
319+
.probe = map_benchmark_platform_probe,
320+
};
321+
322+
static int
323+
map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
324+
{
325+
return __map_benchmark_probe(&pdev->dev);
326+
}
327+
328+
/*
 * PCI front-end. Only a name and a probe hook are set here (no id table);
 * teardown is left to the devm action registered in the common probe.
 */
static struct pci_driver map_benchmark_pci_driver = {
329+
.name = "dma_map_benchmark",
330+
.probe = map_benchmark_pci_probe,
331+
};
332+
333+
/*
 * Register both bus front-ends. If the platform driver cannot be registered
 * the PCI driver is unregistered again so that init fails cleanly.
 *
 * Returns 0 on success or the error of the registration that failed.
 */
static int __init map_benchmark_init(void)
{
	int err = pci_register_driver(&map_benchmark_pci_driver);

	if (err)
		return err;

	err = platform_driver_register(&map_benchmark_platform_driver);
	if (err)
		pci_unregister_driver(&map_benchmark_pci_driver);

	return err;
}
349+
350+
/* Module exit: unregister both drivers, in the reverse order of map_benchmark_init(). */
static void __exit map_benchmark_cleanup(void)
351+
{
352+
platform_driver_unregister(&map_benchmark_platform_driver);
353+
pci_unregister_driver(&map_benchmark_pci_driver);
354+
}
355+
356+
/* hook the init/cleanup functions above into module load and unload */
module_init(map_benchmark_init);
357+
module_exit(map_benchmark_cleanup);
358+
359+
/* module metadata */
MODULE_AUTHOR("Barry Song <[email protected]>");
360+
MODULE_DESCRIPTION("dma_map benchmark driver");
361+
MODULE_LICENSE("GPL");

0 commit comments

Comments
 (0)