Skip to content

Very slightly better fused delta-net #1330

Merged
ikawrakow merged 1 commit into main from
ik/slightly_better_fdn
Feb 27, 2026
Merged

Very slightly better fused delta-net #1330
ikawrakow merged 1 commit into main from
ik/slightly_better_fdn

Conversation

@ikawrakow
Copy link
Owner

~1% or less performance gain with full GPU offload.

@ubergarm
Copy link
Contributor

Yes, testing with Qwen3.5-35B-A3B MoE

sweep-bench-Qwen3.5-35B-A3B-3090TI
👈 Details

main@216f4436 default batches (-ub 512 -b 2048)

model=/models/ubergarm/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q4_0.gguf
./build/bin/llama-sweep-bench \
  --model "$model" \
  -c 98816 \
  --n-predict 128 \
  -ger \
  --merge-qkv \
  -ngl 99 \
  --threads 1 \
  --warmup-batch
PP TG N_KV T_PP s S_PP t/s T_TG s S_TG t/s
512 128 0 0.174 2936.00 1.193 107.29
512 128 512 0.172 2978.06 1.170 109.40
512 128 1024 0.172 2975.15 1.164 109.93
512 128 1536 0.177 2900.77 1.168 109.58
512 128 2048 0.176 2914.49 1.166 109.76
512 128 2560 0.177 2890.83 1.169 109.52
512 128 3072 0.177 2899.65 1.169 109.46
512 128 3584 0.178 2884.00 1.170 109.36
512 128 4096 0.179 2867.25 1.172 109.26
512 128 4608 0.178 2873.43 1.173 109.09
512 128 5120 0.179 2853.29 1.175 108.92
512 128 5632 0.181 2823.75 1.176 108.87
512 128 6144 0.182 2806.13 1.176 108.84
512 128 6656 0.183 2798.10 1.178 108.62
512 128 7168 0.183 2795.26 1.178 108.62
512 128 7680 0.183 2793.95 1.182 108.32
512 128 8192 0.184 2784.21 1.184 108.10
512 128 8704 0.185 2762.43 1.187 107.80
512 128 9216 0.186 2754.01 1.190 107.53
512 128 9728 0.185 2768.54 1.191 107.44
512 128 10240 0.186 2750.38 1.196 107.02
512 128 10752 0.188 2724.69 1.211 105.68
512 128 11264 0.189 2707.98 1.214 105.45
512 128 11776 0.188 2717.85 1.214 105.48
512 128 12288 0.190 2700.91 1.215 105.37
512 128 12800 0.190 2696.82 1.214 105.41
512 128 13312 0.191 2676.28 1.214 105.41
512 128 13824 0.192 2665.08 1.215 105.38
512 128 14336 0.191 2676.61 1.215 105.35
512 128 14848 0.193 2657.19 1.215 105.31
512 128 15360 0.194 2634.88 1.216 105.22
512 128 15872 0.195 2623.21 1.219 104.97
512 128 16384 0.194 2645.57 1.216 105.23
512 128 16896 0.195 2625.26 1.217 105.19
512 128 17408 0.197 2603.07 1.218 105.13
512 128 17920 0.197 2594.68 1.218 105.06
512 128 18432 0.196 2608.57 1.219 104.97
512 128 18944 0.197 2595.38 1.221 104.81
512 128 19456 0.198 2587.47 1.223 104.70
512 128 19968 0.199 2576.27 1.225 104.51
512 128 20480 0.199 2567.30 1.226 104.43
512 128 20992 0.201 2550.50 1.227 104.31
512 128 21504 0.201 2547.58 1.239 103.31
512 128 22016 0.203 2525.80 1.244 102.91
512 128 22528 0.204 2513.44 1.244 102.86
512 128 23040 0.204 2513.12 1.246 102.73
512 128 23552 0.204 2513.52 1.244 102.91
512 128 24064 0.205 2497.05 1.245 102.85
512 128 24576 0.204 2503.97 1.242 103.09
512 128 25088 0.206 2489.41 1.243 103.01
512 128 25600 0.206 2479.72 1.241 103.11
512 128 26112 0.207 2469.62 1.243 102.96
512 128 26624 0.207 2475.58 1.246 102.72
512 128 27136 0.208 2455.98 1.245 102.81
512 128 27648 0.209 2454.05 1.245 102.84
512 128 28160 0.210 2442.36 1.247 102.64
512 128 28672 0.212 2418.16 1.246 102.73
512 128 29184 0.211 2427.28 1.248 102.53
512 128 29696 0.212 2418.51 1.248 102.53
512 128 30208 0.212 2419.07 1.250 102.41
512 128 30720 0.213 2407.89 1.250 102.39
512 128 31232 0.212 2410.92 1.253 102.18
512 128 31744 0.214 2396.97 1.256 101.91
512 128 32256 0.215 2379.81 1.269 100.83
512 128 32768 0.215 2381.68 1.273 100.58
512 128 33280 0.215 2384.12 1.273 100.56
512 128 33792 0.216 2372.90 1.274 100.46
512 128 34304 0.218 2350.38 1.275 100.41
512 128 34816 0.217 2364.12 1.275 100.41
512 128 35328 0.218 2345.98 1.274 100.44
512 128 35840 0.219 2341.59 1.274 100.51
512 128 36352 0.219 2334.62 1.276 100.34
512 128 36864 0.220 2325.55 1.275 100.41
512 128 37376 0.221 2320.74 1.278 100.13
512 128 37888 0.220 2326.55 1.275 100.37
512 128 38400 0.222 2306.58 1.278 100.18
512 128 38912 0.222 2305.20 1.277 100.21
512 128 39424 0.223 2300.19 1.278 100.15
512 128 39936 0.224 2288.28 1.278 100.17
512 128 40448 0.224 2282.99 1.281 99.89
512 128 40960 0.225 2279.36 1.281 99.91
512 128 41472 0.226 2267.83 1.284 99.72
512 128 41984 0.227 2253.10 1.283 99.73
512 128 42496 0.227 2252.03 1.285 99.64
512 128 43008 0.227 2255.51 1.298 98.63
512 128 43520 0.229 2236.28 1.304 98.15
512 128 44032 0.229 2231.85 1.305 98.11
512 128 44544 0.230 2229.91 1.307 97.96
512 128 45056 0.230 2226.16 1.306 98.02
512 128 45568 0.231 2213.97 1.306 98.03
512 128 46080 0.233 2199.33 1.306 97.98
512 128 46592 0.232 2209.53 1.306 98.01
512 128 47104 0.233 2197.03 1.306 97.99
512 128 47616 0.234 2188.17 1.308 97.82
512 128 48128 0.234 2189.37 1.309 97.77
512 128 48640 0.234 2190.24 1.306 97.98
512 128 49152 0.236 2167.81 1.307 97.94
512 128 49664 0.236 2172.29 1.309 97.77
512 128 50176 0.237 2159.85 1.310 97.73
512 128 50688 0.239 2141.53 1.311 97.60
512 128 51200 0.238 2151.55 1.311 97.63
512 128 51712 0.240 2136.59 1.313 97.47
512 128 52224 0.238 2149.40 1.314 97.43
512 128 52736 0.240 2135.31 1.315 97.33
512 128 53248 0.240 2132.65 1.314 97.42
512 128 53760 0.241 2122.75 1.327 96.49
512 128 54272 0.242 2113.30 1.333 96.03
512 128 54784 0.243 2109.95 1.333 96.03
512 128 55296 0.244 2099.63 1.333 96.06
512 128 55808 0.243 2108.25 1.334 95.96
512 128 56320 0.244 2096.52 1.335 95.91
512 128 56832 0.245 2086.57 1.337 95.77
512 128 57344 0.245 2091.17 1.336 95.81
512 128 57856 0.246 2083.07 1.337 95.76
512 128 58368 0.246 2082.30 1.338 95.66
512 128 58880 0.246 2077.39 1.341 95.46
512 128 59392 0.247 2069.54 1.339 95.62
512 128 59904 0.247 2073.38 1.340 95.53
512 128 60416 0.249 2059.34 1.340 95.55
512 128 60928 0.250 2050.26 1.343 95.34
512 128 61440 0.252 2035.36 1.343 95.29
512 128 61952 0.252 2034.27 1.343 95.30
512 128 62464 0.252 2034.20 1.345 95.19
512 128 62976 0.253 2026.64 1.347 95.02
512 128 63488 0.253 2025.28 1.347 95.00
512 128 64000 0.253 2021.48 1.350 94.79
512 128 64512 0.253 2023.06 1.359 94.16
512 128 65024 0.254 2018.31 1.365 93.79
512 128 65536 0.255 2010.00 1.364 93.86
512 128 66048 0.255 2005.91 1.366 93.70
512 128 66560 0.258 1985.67 1.364 93.85
512 128 67072 0.257 1989.40 1.366 93.70
512 128 67584 0.259 1975.02 1.367 93.64
512 128 68096 0.258 1981.90 1.367 93.62
512 128 68608 0.259 1978.90 1.368 93.54
512 128 69120 0.260 1968.30 1.370 93.43
512 128 69632 0.260 1969.75 1.372 93.28
512 128 70144 0.261 1964.11 1.369 93.50
512 128 70656 0.262 1955.85 1.370 93.42
512 128 71168 0.262 1957.91 1.371 93.34
512 128 71680 0.264 1937.62 1.372 93.27
512 128 72192 0.264 1938.03 1.373 93.22
512 128 72704 0.264 1941.02 1.374 93.14
512 128 73216 0.266 1925.88 1.376 93.04
512 128 73728 0.265 1929.55 1.376 93.02
512 128 74240 0.266 1925.61 1.379 92.83
512 128 74752 0.266 1923.06 1.379 92.84
512 128 75264 0.266 1924.26 1.388 92.25
512 128 75776 0.267 1916.84 1.396 91.71
512 128 76288 0.268 1912.85 1.394 91.81
512 128 76800 0.270 1893.33 1.398 91.58
512 128 77312 0.271 1892.38 1.398 91.57
512 128 77824 0.271 1890.14 1.397 91.61
512 128 78336 0.270 1894.69 1.398 91.54
512 128 78848 0.272 1879.97 1.398 91.58
512 128 79360 0.272 1881.99 1.398 91.54
512 128 79872 0.273 1875.89 1.399 91.46
512 128 80384 0.273 1872.88 1.403 91.26
512 128 80896 0.273 1876.07 1.401 91.39
512 128 81408 0.274 1869.01 1.402 91.28
512 128 81920 0.276 1851.81 1.403 91.22
512 128 82432 0.277 1847.06 1.403 91.21
512 128 82944 0.279 1834.21 1.404 91.19
512 128 83456 0.277 1850.88 1.407 90.96
512 128 83968 0.278 1839.01 1.409 90.84
512 128 84480 0.277 1845.76 1.409 90.87
512 128 84992 0.281 1824.47 1.408 90.88
512 128 85504 0.281 1822.07 1.410 90.77
512 128 86016 0.280 1826.77 1.419 90.21
512 128 86528 0.282 1812.53 1.427 89.70
512 128 87040 0.282 1814.56 1.427 89.68
512 128 87552 0.281 1821.73 1.429 89.59
512 128 88064 0.283 1811.06 1.428 89.60
512 128 88576 0.283 1809.21 1.428 89.64
512 128 89088 0.284 1804.11 1.427 89.70
512 128 89600 0.285 1796.31 1.428 89.61
512 128 90112 0.285 1794.43 1.429 89.59
512 128 90624 0.287 1786.69 1.431 89.45
512 128 91136 0.285 1794.44 1.433 89.31
512 128 91648 0.286 1792.13 1.432 89.40
512 128 92160 0.288 1780.38 1.432 89.39
512 128 92672 0.287 1781.72 1.434 89.29
512 128 93184 0.289 1769.00 1.433 89.30
512 128 93696 0.291 1760.04 1.436 89.16
512 128 94208 0.292 1755.96 1.436 89.15
512 128 94720 0.291 1756.82 1.439 88.97
512 128 95232 0.290 1763.12 1.437 89.05
512 128 95744 0.292 1753.33 1.438 89.01
512 128 96256 0.294 1742.83 1.443 88.70
512 128 96768 0.295 1736.30 1.449 88.35
512 128 97280 0.294 1739.55 1.457 87.83
512 128 97792 0.296 1730.10 1.457 87.85
512 128 98304 0.296 1731.85 1.460 87.69

main@216f4436 -ub 2048 -b 2048

model=/models/ubergarm/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-Q4_0.gguf
./build/bin/llama-sweep-bench \
  --model "$model" \
  -c 36864 \
  --n-predict 128 \
  -ger \
  --merge-qkv \
  -ngl 99 \
  -ub 2048 -b 2048 \
  --threads 1 \
  --warmup-batch
PP TG N_KV T_PP s S_PP t/s T_TG s S_TG t/s
2048 128 0 0.520 3938.17 1.188 107.75
2048 128 2048 0.527 3888.98 1.171 109.33
2048 128 4096 0.535 3826.04 1.173 109.09
2048 128 6144 0.542 3780.14 1.174 108.99
2048 128 8192 0.551 3717.73 1.183 108.24
2048 128 10240 0.559 3664.62 1.193 107.29
2048 128 12288 0.568 3604.28 1.209 105.87
2048 128 14336 0.579 3539.66 1.209 105.84
2048 128 16384 0.587 3491.04 1.212 105.64
2048 128 18432 0.594 3446.04 1.216 105.23
2048 128 20480 0.603 3394.70 1.223 104.68
2048 128 22528 0.613 3340.21 1.237 103.44
2048 128 24576 0.623 3285.83 1.239 103.28
2048 128 26624 0.634 3228.28 1.244 102.93
2048 128 28672 0.644 3181.49 1.244 102.89
2048 128 30720 0.649 3156.60 1.248 102.54
2048 128 32768 0.660 3102.96 1.270 100.81
2048 128 34816 0.668 3063.68 1.272 100.60

PR1320 ik/fused_delta_net_2@0579a868 default batches (-ub 512 -b 2048) -fdn 512

./build/bin/llama-sweep-bench \
  --model "$model" \
  -c 98816 \
  --n-predict 128 \
  -fdn 512 \
  -ger \
  --merge-qkv \
  -ngl 99 \
  --threads 1 \
  --warmup-batch
PP TG N_KV T_PP s S_PP t/s T_TG s S_TG t/s
512 128 0 0.199 2575.00 0.968 132.21
512 128 512 0.198 2585.68 0.942 135.90
512 128 1024 0.199 2572.18 0.941 136.02
512 128 1536 0.202 2531.40 0.944 135.56
512 128 2048 0.202 2535.22 0.943 135.67
512 128 2560 0.203 2521.24 0.944 135.53
512 128 3072 0.203 2522.16 0.947 135.19
512 128 3584 0.204 2511.29 0.946 135.36
512 128 4096 0.205 2497.51 0.947 135.20
512 128 4608 0.204 2504.43 0.948 135.04
512 128 5120 0.205 2491.80 0.951 134.59
512 128 5632 0.208 2464.79 0.949 134.82
512 128 6144 0.209 2450.60 0.951 134.60
512 128 6656 0.209 2447.25 0.957 133.80
512 128 7168 0.210 2440.00 0.955 134.01
512 128 7680 0.209 2447.04 0.959 133.54
512 128 8192 0.211 2431.78 0.961 133.21
512 128 8704 0.211 2423.35 0.963 132.96
512 128 9216 0.213 2408.46 0.968 132.22
512 128 9728 0.212 2414.50 0.971 131.86
512 128 10240 0.212 2409.94 0.973 131.58
512 128 10752 0.214 2393.80 0.984 130.06
512 128 11264 0.215 2379.13 0.987 129.65
512 128 11776 0.215 2384.02 0.987 129.74
512 128 12288 0.216 2369.23 0.988 129.62
512 128 12800 0.216 2366.54 0.988 129.49
512 128 13312 0.217 2359.79 0.988 129.58
512 128 13824 0.218 2345.02 0.989 129.40
512 128 14336 0.219 2341.90 0.989 129.40
512 128 14848 0.219 2334.82 0.991 129.18
512 128 15360 0.220 2322.61 0.991 129.12
512 128 15872 0.221 2316.96 0.993 128.85
512 128 16384 0.220 2328.72 0.992 128.99
512 128 16896 0.222 2307.97 0.993 128.86
512 128 17408 0.222 2304.64 0.996 128.52
512 128 17920 0.223 2290.88 0.995 128.65
512 128 18432 0.223 2299.15 0.997 128.42
512 128 18944 0.224 2287.06 0.999 128.17
512 128 19456 0.224 2285.92 0.998 128.26
512 128 19968 0.225 2272.65 1.001 127.87
512 128 20480 0.226 2269.62 1.002 127.70
512 128 20992 0.226 2260.53 1.003 127.67
512 128 21504 0.227 2253.05 1.013 126.37
512 128 22016 0.229 2234.47 1.018 125.75
512 128 22528 0.230 2227.72 1.017 125.80
512 128 23040 0.230 2226.15 1.019 125.57
512 128 23552 0.230 2223.19 1.018 125.74
512 128 24064 0.231 2212.11 1.019 125.60
512 128 24576 0.232 2210.98 1.019 125.67
512 128 25088 0.232 2209.30 1.018 125.73
512 128 25600 0.233 2197.75 1.019 125.62
512 128 26112 0.235 2182.55 1.019 125.57
512 128 26624 0.233 2198.82 1.024 125.05
512 128 27136 0.235 2179.29 1.024 125.01
512 128 27648 0.235 2175.51 1.025 124.89
512 128 28160 0.236 2168.10 1.027 124.68
512 128 28672 0.238 2153.17 1.028 124.46
512 128 29184 0.238 2153.07 1.029 124.35
512 128 29696 0.238 2148.15 1.030 124.29
512 128 30208 0.238 2149.18 1.031 124.20
512 128 30720 0.239 2145.33 1.032 124.05
512 128 31232 0.239 2142.17 1.033 123.97
512 128 31744 0.240 2132.63 1.037 123.45
512 128 32256 0.241 2124.10 1.050 121.90
512 128 32768 0.241 2120.12 1.052 121.62
512 128 33280 0.242 2115.81 1.051 121.83
512 128 33792 0.243 2111.18 1.053 121.61
512 128 34304 0.244 2100.08 1.052 121.72
512 128 34816 0.243 2102.70 1.052 121.66
512 128 35328 0.245 2087.80 1.053 121.57
512 128 35840 0.244 2094.67 1.053 121.56
512 128 36352 0.246 2081.12 1.053 121.51
512 128 36864 0.247 2075.43 1.054 121.46
512 128 37376 0.247 2069.08 1.056 121.26
512 128 37888 0.246 2082.12 1.055 121.33
512 128 38400 0.249 2057.52 1.057 121.14
512 128 38912 0.249 2053.40 1.058 121.02
512 128 39424 0.249 2056.65 1.058 121.02
512 128 39936 0.251 2040.78 1.057 121.08
512 128 40448 0.251 2042.97 1.059 120.86
512 128 40960 0.251 2042.21 1.059 120.82
512 128 41472 0.251 2036.90 1.060 120.72
512 128 41984 0.254 2019.04 1.063 120.41
512 128 42496 0.253 2019.81 1.064 120.36
512 128 43008 0.254 2015.30 1.076 118.92
512 128 43520 0.256 1998.88 1.082 118.34
512 128 44032 0.256 1998.88 1.081 118.44
512 128 44544 0.256 2003.39 1.083 118.18
512 128 45056 0.256 1997.53 1.083 118.18
512 128 45568 0.257 1995.21 1.083 118.17
512 128 46080 0.259 1975.39 1.084 118.11
512 128 46592 0.258 1983.08 1.083 118.16
512 128 47104 0.260 1972.85 1.086 117.91
512 128 47616 0.261 1960.16 1.086 117.88
512 128 48128 0.261 1961.87 1.088 117.69
512 128 48640 0.260 1965.90 1.086 117.81
512 128 49152 0.263 1943.79 1.088 117.68
512 128 49664 0.263 1948.94 1.088 117.66
512 128 50176 0.263 1948.15 1.089 117.53
512 128 50688 0.265 1934.18 1.091 117.30
512 128 51200 0.264 1941.67 1.091 117.37
512 128 51712 0.266 1922.65 1.093 117.13
512 128 52224 0.265 1930.69 1.093 117.06
512 128 52736 0.267 1921.17 1.095 116.91
512 128 53248 0.267 1918.03 1.096 116.75
512 128 53760 0.267 1917.87 1.106 115.76
512 128 54272 0.268 1909.47 1.112 115.15
512 128 54784 0.269 1903.39 1.112 115.09
512 128 55296 0.270 1895.47 1.113 115.03
512 128 55808 0.269 1903.45 1.112 115.14
512 128 56320 0.270 1897.75 1.113 115.01
512 128 56832 0.273 1876.17 1.113 114.97
512 128 57344 0.271 1888.58 1.114 114.94
512 128 57856 0.272 1879.98 1.115 114.80
512 128 58368 0.272 1879.22 1.115 114.79
512 128 58880 0.274 1870.29 1.118 114.52
512 128 59392 0.275 1860.92 1.117 114.59
512 128 59904 0.274 1869.75 1.118 114.46
512 128 60416 0.276 1856.11 1.118 114.53
512 128 60928 0.277 1846.03 1.120 114.33
512 128 61440 0.277 1851.54 1.120 114.28
512 128 61952 0.277 1848.80 1.121 114.23
512 128 62464 0.277 1846.28 1.122 114.03
512 128 62976 0.280 1827.87 1.124 113.83
512 128 63488 0.279 1833.53 1.125 113.79
512 128 64000 0.279 1835.48 1.128 113.48
512 128 64512 0.280 1826.22 1.138 112.53
512 128 65024 0.280 1829.87 1.141 112.20
512 128 65536 0.281 1821.27 1.141 112.16
512 128 66048 0.283 1809.88 1.143 112.01
512 128 66560 0.284 1805.72 1.142 112.06
512 128 67072 0.284 1804.44 1.143 111.94
512 128 67584 0.286 1789.29 1.144 111.89
512 128 68096 0.285 1798.07 1.145 111.83
512 128 68608 0.285 1794.04 1.145 111.77
512 128 69120 0.286 1789.88 1.147 111.62
512 128 69632 0.287 1784.79 1.149 111.42
512 128 70144 0.288 1776.24 1.144 111.87
512 128 70656 0.288 1779.40 1.147 111.62
512 128 71168 0.289 1772.68 1.148 111.47
512 128 71680 0.290 1762.90 1.149 111.44
512 128 72192 0.291 1759.67 1.150 111.26
512 128 72704 0.290 1765.58 1.152 111.09
512 128 73216 0.292 1753.90 1.152 111.09
512 128 73728 0.292 1754.55 1.154 110.92
512 128 74240 0.292 1755.51 1.155 110.81
512 128 74752 0.292 1750.87 1.156 110.70
512 128 75264 0.294 1740.50 1.166 109.77
512 128 75776 0.294 1740.81 1.172 109.22
512 128 76288 0.295 1738.27 1.172 109.25
512 128 76800 0.296 1729.83 1.171 109.27
512 128 77312 0.297 1721.99 1.173 109.12
512 128 77824 0.296 1729.87 1.175 108.97
512 128 78336 0.298 1718.51 1.175 108.93
512 128 78848 0.298 1717.70 1.175 108.92
512 128 79360 0.298 1719.08 1.175 108.96
512 128 79872 0.299 1714.38 1.176 108.84
512 128 80384 0.298 1716.60 1.180 108.52
512 128 80896 0.300 1704.94 1.178 108.69
512 128 81408 0.301 1702.35 1.178 108.64
512 128 81920 0.305 1680.15 1.178 108.62
512 128 82432 0.303 1687.73 1.179 108.57
512 128 82944 0.305 1679.50 1.180 108.45
512 128 83456 0.304 1684.69 1.183 108.16
512 128 83968 0.305 1677.01 1.184 108.14
512 128 84480 0.305 1679.11 1.185 108.00
512 128 84992 0.306 1675.16 1.184 108.09
512 128 85504 0.306 1672.74 1.186 107.96
512 128 86016 0.307 1669.16 1.196 107.06
512 128 86528 0.309 1654.64 1.201 106.60
512 128 87040 0.310 1653.08 1.202 106.45
512 128 87552 0.308 1662.40 1.204 106.35
512 128 88064 0.309 1656.56 1.202 106.47
512 128 88576 0.311 1648.72 1.202 106.48
512 128 89088 0.309 1654.42 1.202 106.52
512 128 89600 0.311 1647.67 1.204 106.33
512 128 90112 0.311 1646.88 1.205 106.21
512 128 90624 0.312 1640.00 1.204 106.27
512 128 91136 0.311 1646.39 1.210 105.76
512 128 91648 0.312 1641.19 1.207 106.02
512 128 92160 0.314 1629.87 1.206 106.10
512 128 92672 0.316 1620.53 1.209 105.89
512 128 93184 0.315 1624.91 1.209 105.88
512 128 93696 0.315 1625.20 1.211 105.72
512 128 94208 0.319 1607.34 1.211 105.69
512 128 94720 0.317 1616.87 1.213 105.56
512 128 95232 0.319 1606.19 1.213 105.49
512 128 95744 0.317 1612.73 1.216 105.29
512 128 96256 0.320 1600.58 1.219 105.01
512 128 96768 0.318 1609.96 1.226 104.40
512 128 97280 0.322 1590.11 1.229 104.12
512 128 97792 0.323 1583.91 1.231 104.00
512 128 98304 0.323 1587.09 1.233 103.84

PR1330 ik/slightly_better_fdn@c115e185 default batches (-ub 512 -b 2048) -fdn 512

./build/bin/llama-sweep-bench \
  --model "$model" \
  -c 98816 \
  --n-predict 128 \
  -fdn 512 \
  -ger \
  --merge-qkv \
  -ngl 99 \
  --threads 1 \
  --warmup-batch
PP TG N_KV T_PP s S_PP t/s T_TG s S_TG t/s
512 128 0 0.190 2691.18 0.965 132.67
512 128 512 0.190 2695.16 0.937 136.53
512 128 1024 0.191 2680.12 0.937 136.65
512 128 1536 0.194 2635.34 0.937 136.59
512 128 2048 0.194 2642.24 0.938 136.47
512 128 2560 0.195 2621.69 0.940 136.15
512 128 3072 0.195 2632.07 0.940 136.11
512 128 3584 0.196 2616.10 0.941 135.98
512 128 4096 0.197 2603.20 0.943 135.78
512 128 4608 0.197 2599.21 0.944 135.64
512 128 5120 0.198 2589.82 0.946 135.33
512 128 5632 0.200 2563.70 0.946 135.31
512 128 6144 0.201 2547.82 0.947 135.15
512 128 6656 0.201 2550.98 0.949 134.94
512 128 7168 0.203 2519.03 0.953 134.26
512 128 7680 0.202 2540.68 0.954 134.18
512 128 8192 0.202 2535.23 0.955 134.04
512 128 8704 0.203 2519.62 0.959 133.44
512 128 9216 0.204 2509.66 0.960 133.29
512 128 9728 0.204 2513.07 0.963 132.92
512 128 10240 0.205 2501.70 0.965 132.71
512 128 10752 0.206 2487.77 0.979 130.80
512 128 11264 0.207 2471.13 0.983 130.21
512 128 11776 0.206 2480.73 0.982 130.36
512 128 12288 0.208 2461.08 0.982 130.28
512 128 12800 0.208 2459.93 0.983 130.19
512 128 13312 0.209 2451.31 0.984 130.11
512 128 13824 0.211 2429.65 0.985 129.94
512 128 14336 0.210 2437.64 0.986 129.81
512 128 14848 0.211 2422.96 0.986 129.86
512 128 15360 0.212 2412.02 0.987 129.73
512 128 15872 0.213 2403.28 0.989 129.40
512 128 16384 0.212 2411.39 0.987 129.73
512 128 16896 0.213 2406.67 0.987 129.68
512 128 17408 0.215 2384.07 0.989 129.43
512 128 17920 0.215 2385.33 0.991 129.15
512 128 18432 0.214 2396.15 0.991 129.11
512 128 18944 0.216 2369.35 0.994 128.84
512 128 19456 0.216 2374.05 0.995 128.62
512 128 19968 0.217 2359.66 0.996 128.46
512 128 20480 0.218 2351.73 0.997 128.34
512 128 20992 0.218 2344.87 0.998 128.28
512 128 21504 0.219 2341.53 1.012 126.54
512 128 22016 0.220 2325.71 1.016 126.04
512 128 22528 0.222 2311.18 1.016 126.02
512 128 23040 0.222 2310.59 1.016 125.93
512 128 23552 0.221 2311.94 1.016 125.94
512 128 24064 0.223 2298.12 1.017 125.86
512 128 24576 0.224 2285.51 1.017 125.89
512 128 25088 0.224 2286.95 1.018 125.73
512 128 25600 0.225 2279.09 1.018 125.73
512 128 26112 0.226 2263.00 1.020 125.53
512 128 26624 0.226 2264.02 1.021 125.42
512 128 27136 0.226 2260.59 1.020 125.46
512 128 27648 0.226 2261.18 1.020 125.45
512 128 28160 0.228 2248.07 1.020 125.43
512 128 28672 0.230 2226.16 1.021 125.35
512 128 29184 0.230 2229.64 1.023 125.18
512 128 29696 0.230 2228.82 1.024 125.03
512 128 30208 0.230 2225.55 1.025 124.89
512 128 30720 0.232 2207.58 1.027 124.59
512 128 31232 0.231 2215.72 1.027 124.60
512 128 31744 0.243 2104.11 1.062 120.58
512 128 32256 0.233 2195.08 1.044 122.59
512 128 32768 0.234 2191.14 1.047 122.24
512 128 33280 0.233 2195.57 1.048 122.13
512 128 33792 0.234 2188.30 1.047 122.24
512 128 34304 0.236 2173.65 1.047 122.26
512 128 34816 0.235 2180.46 1.047 122.22
512 128 35328 0.237 2162.38 1.049 121.97
512 128 35840 0.237 2157.32 1.050 121.95
512 128 36352 0.238 2152.92 1.046 122.36
512 128 36864 0.238 2151.01 1.047 122.21
512 128 37376 0.239 2142.55 1.048 122.08
512 128 37888 0.238 2151.41 1.050 121.92
512 128 38400 0.240 2130.87 1.048 122.10
512 128 38912 0.242 2118.71 1.049 122.01
512 128 39424 0.242 2119.74 1.050 121.90
512 128 39936 0.243 2110.27 1.052 121.72
512 128 40448 0.242 2114.51 1.052 121.69
512 128 40960 0.243 2109.35 1.053 121.54
512 128 41472 0.244 2100.06 1.056 121.24
512 128 41984 0.246 2084.44 1.056 121.25
512 128 42496 0.245 2088.53 1.056 121.19
512 128 43008 0.246 2085.00 1.069 119.77
512 128 43520 0.248 2063.15 1.073 119.27
512 128 44032 0.248 2067.53 1.074 119.17
512 128 44544 0.249 2058.77 1.074 119.17
512 128 45056 0.249 2058.75 1.077 118.89
512 128 45568 0.249 2059.11 1.077 118.80
512 128 46080 0.251 2037.21 1.077 118.80
512 128 46592 0.249 2055.42 1.076 118.93
512 128 47104 0.250 2044.89 1.076 118.92
512 128 47616 0.253 2023.24 1.078 118.73
512 128 48128 0.254 2018.52 1.079 118.62
512 128 48640 0.252 2028.37 1.080 118.54
512 128 49152 0.254 2012.85 1.082 118.34
512 128 49664 0.254 2013.39 1.081 118.44
512 128 50176 0.255 2009.80 1.082 118.28
512 128 50688 0.257 1995.16 1.083 118.20
512 128 51200 0.256 1997.79 1.082 118.30
512 128 51712 0.258 1984.30 1.084 118.10
512 128 52224 0.257 1991.46 1.086 117.83
512 128 52736 0.258 1982.96 1.087 117.78
512 128 53248 0.258 1981.81 1.089 117.57
512 128 53760 0.260 1971.63 1.098 116.53
512 128 54272 0.260 1971.19 1.103 116.09
512 128 54784 0.260 1972.19 1.103 116.02
512 128 55296 0.262 1954.17 1.106 115.71
512 128 55808 0.262 1956.85 1.106 115.72
512 128 56320 0.261 1962.02 1.106 115.74
512 128 56832 0.265 1933.04 1.106 115.70
512 128 57344 0.263 1947.60 1.106 115.76
512 128 57856 0.264 1939.29 1.106 115.71
512 128 58368 0.266 1927.72 1.107 115.59
512 128 58880 0.265 1932.62 1.109 115.42
512 128 59392 0.267 1919.62 1.109 115.40
512 128 59904 0.266 1922.66 1.110 115.34
512 128 60416 0.266 1922.28 1.111 115.21
512 128 60928 0.268 1913.64 1.113 114.99
512 128 61440 0.269 1903.24 1.113 114.96
512 128 61952 0.270 1898.87 1.114 114.87
512 128 62464 0.270 1898.75 1.116 114.71
512 128 62976 0.287 1786.84 1.159 110.49
512 128 63488 0.272 1881.68 1.128 113.49
512 128 64000 0.272 1882.92 1.123 114.02
512 128 64512 0.272 1884.81 1.131 113.20
512 128 65024 0.273 1877.07 1.135 112.76
512 128 65536 0.273 1877.68 1.135 112.80
512 128 66048 0.273 1874.91 1.136 112.69
512 128 66560 0.275 1858.98 1.135 112.74
512 128 67072 0.276 1857.81 1.137 112.62
512 128 67584 0.278 1842.42 1.137 112.54
512 128 68096 0.276 1852.11 1.139 112.42
512 128 68608 0.277 1847.64 1.140 112.30
512 128 69120 0.279 1834.78 1.139 112.41
512 128 69632 0.279 1838.37 1.141 112.21
512 128 70144 0.279 1832.36 1.140 112.24
512 128 70656 0.279 1833.59 1.142 112.04
512 128 71168 0.280 1826.61 1.140 112.26
512 128 71680 0.282 1818.03 1.143 112.02
512 128 72192 0.282 1816.08 1.144 111.92
512 128 72704 0.282 1815.96 1.145 111.77
512 128 73216 0.284 1804.48 1.147 111.60
512 128 73728 0.285 1798.35 1.148 111.50
512 128 74240 0.284 1804.27 1.149 111.41
512 128 74752 0.284 1805.09 1.150 111.28
512 128 75264 0.284 1800.06 1.162 110.18
512 128 75776 0.286 1790.82 1.167 109.68
512 128 76288 0.286 1792.72 1.166 109.75
512 128 76800 0.289 1773.34 1.169 109.53
512 128 77312 0.289 1770.84 1.169 109.54
512 128 77824 0.288 1775.28 1.171 109.34
512 128 78336 0.289 1770.41 1.169 109.52
512 128 78848 0.289 1768.97 1.170 109.38
512 128 79360 0.290 1762.90 1.170 109.38
512 128 79872 0.291 1760.10 1.172 109.25
512 128 80384 0.291 1756.72 1.174 109.04
512 128 80896 0.292 1753.51 1.173 109.13
512 128 81408 0.293 1748.99 1.173 109.12
512 128 81920 0.296 1730.64 1.173 109.12
512 128 82432 0.297 1724.28 1.175 108.96
512 128 82944 0.295 1735.01 1.175 108.92
512 128 83456 0.296 1727.70 1.178 108.66
512 128 83968 0.297 1723.87 1.178 108.66
512 128 84480 0.297 1722.73 1.179 108.55
512 128 84992 0.298 1717.53 1.180 108.50
512 128 85504 0.298 1717.57 1.181 108.38
512 128 86016 0.297 1723.64 1.190 107.55
512 128 86528 0.302 1698.16 1.196 107.01
512 128 87040 0.299 1709.52 1.196 106.98
512 128 87552 0.300 1704.77 1.196 107.06
512 128 88064 0.301 1700.21 1.198 106.83
512 128 88576 0.302 1695.16 1.199 106.76
512 128 89088 0.301 1699.84 1.200 106.69
512 128 89600 0.303 1690.16 1.200 106.68
512 128 90112 0.303 1692.13 1.200 106.67
512 128 90624 0.303 1687.97 1.201 106.55
512 128 91136 0.304 1683.64 1.205 106.20
512 128 91648 0.305 1678.87 1.203 106.36
512 128 92160 0.308 1664.04 1.205 106.21
512 128 92672 0.307 1668.10 1.205 106.26
512 128 93184 0.306 1673.26 1.204 106.28
512 128 93696 0.307 1666.66 1.208 105.99
512 128 94208 0.308 1661.28 1.206 106.15
512 128 94720 0.334 1535.03 1.236 103.56
512 128 95232 0.309 1654.58 1.246 102.72
512 128 95744 0.311 1646.55 1.213 105.50
512 128 96256 0.314 1629.26 1.217 105.19
512 128 96768 0.312 1643.35 1.221 104.80
512 128 97280 0.314 1629.37 1.225 104.51
512 128 97792 0.314 1628.64 1.227 104.36
512 128 98304 0.315 1626.70 1.228 104.26

@ikawrakow
Copy link
Owner Author

@ubergarm

So, on CUDA the fused delta-net is still slightly slower than the chunked implementation for batch sizes greater than 64 or so. Hence, if running on CUDA, it is best to use -fdn 64.

On the CPU (at least on my CPU) the fused delta-net is faster than the chunked implementation, so there, one can use -fdn something_greater_than_u_batch.

I have been trying to make the fused delta-net as fast as the chunked implementation for all batch sizes, but no luck so far. If I managed to get there, it would simplify things a lot!

@magikRUKKOLA
Copy link

Okay, cool, it's working. +~1% decode.

Qwen3.5 IQ2_KL, 8x3090:

Details

main: n_kv_max = 262144, n_batch = 4096, n_ubatch = 4096, flash_attn = 1, n_gpu_layers = 99, n_threads = 64, n_threads_batch = 64

PP TG N_KV T_PP s S_PP t/s T_TG s S_TG t/s
4096 1024 0 4.635 883.75 26.283 38.96
4096 1024 4096 4.758 860.81 26.306 38.93
4096 1024 8192 4.894 836.90 26.933 38.02
4096 1024 12288 5.024 815.29 27.379 37.40
4096 1024 16384 5.152 795.07 27.846 36.77
4096 1024 20480 5.277 776.24 28.125 36.41
4096 1024 24576 5.421 755.60 28.574 35.84
4096 1024 28672 5.528 740.98 28.799 35.56
4096 1024 32768 5.666 722.89 29.329 34.91
4096 1024 36864 5.793 707.01 29.634 34.55
4096 1024 40960 5.917 692.30 29.976 34.16
4096 1024 45056 6.049 677.19 30.320 33.77
4096 1024 49152 6.168 664.10 30.684 33.37
4096 1024 53248 6.297 650.44 31.045 32.98
4096 1024 57344 6.415 638.54 31.488 32.52
4096 1024 61440 6.570 623.43 31.816 32.19
4096 1024 65536 6.692 612.08 32.321 31.68
4096 1024 69632 6.817 600.81 32.681 31.33
4096 1024 73728 6.932 590.91 33.114 30.92
4096 1024 77824 7.061 580.06 33.400 30.66
4096 1024 81920 7.196 569.20 33.810 30.29
4096 1024 86016 7.335 558.43 34.118 30.01
4096 1024 90112 7.455 549.45 34.494 29.69
4096 1024 94208 7.576 540.69 35.029 29.23
4096 1024 98304 7.716 530.87 35.233 29.06
4096 1024 102400 7.851 521.74 35.601 28.76
4096 1024 106496 7.970 513.90 36.149 28.33
4096 1024 110592 8.099 505.72 36.451 28.09
4096 1024 114688 8.250 496.48 36.720 27.89
4096 1024 118784 8.346 490.77 37.086 27.61
4096 1024 122880 8.487 482.60 37.417 27.37
4096 1024 126976 8.686 471.57 37.813 27.08
4096 1024 131072 8.749 468.19 38.098 26.88
4096 1024 135168 8.870 461.79 38.442 26.64
4096 1024 139264 9.006 454.79 39.044 26.23
4096 1024 143360 9.138 448.26 39.285 26.07
4096 1024 147456 9.263 442.18 39.636 25.84
4096 1024 151552 9.397 435.90 39.925 25.65
4096 1024 155648 9.533 429.67 40.369 25.37
4096 1024 159744 9.660 424.00 40.751 25.13
4096 1024 163840 9.797 418.10 41.116 24.90
4096 1024 167936 9.929 412.53 41.723 24.54
4096 1024 172032 10.061 407.11 41.822 24.48
4096 1024 176128 10.185 402.16 42.232 24.25
4096 1024 180224 10.299 397.70 42.652 24.01
4096 1024 184320 10.457 391.70 43.048 23.79
4096 1024 188416 10.586 386.92 43.348 23.62
4096 1024 192512 10.667 383.99 43.740 23.41
4096 1024 196608 10.846 377.65 44.088 23.23
4096 1024 200704 10.959 373.77 44.413 23.06
4096 1024 204800 11.091 369.32 44.926 22.79
4096 1024 208896 11.247 364.19 45.138 22.69
4096 1024 212992 11.342 361.15 45.623 22.44
4096 1024 217088 11.496 356.30 45.762 22.38
4096 1024 221184 11.625 352.33 46.132 22.20
4096 1024 225280 11.754 348.48 46.610 21.97
4096 1024 229376 11.863 345.27 47.007 21.78
4096 1024 233472 11.995 341.49 47.240 21.68
4096 1024 237568 12.141 337.37 47.670 21.48
4096 1024 241664 12.269 333.84 48.016 21.33
4096 1024 245760 12.410 330.05 48.356 21.18
4096 1024 249856 12.537 326.70 48.951 20.92
4096 1024 253952 12.668 323.34 49.159 20.83
4096 1024 258048 12.841 318.97 49.609 20.64

refs:
#1320 (comment)

@ikawrakow ikawrakow merged commit facc8fd into main Feb 27, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants