Skip to content

Commit dddb64b

Browse files
subashab@codeaurora.org authored and
davem330 committed
net: Add sysctl to toggle early demux for tcp and udp
Certain systems process significant unconnected UDP workloads. It would be preferable to disable UDP early demux for those systems and enable it for TCP only. By disabling UDP demux, we see these slight gains on an ARM64 system: 782 -> 788 Mbps for unconnected single-stream UDPv4; 633 -> 654 Mbps for unconnected UDPv4 from different sources. The performance impact can change based on CPU architecture and cache sizes. There will not be much difference seen if the entire UDP hash table is in cache. Both sysctls are enabled by default to preserve existing behavior. v1->v2: Change function pointer instead of adding conditional as suggested by Stephen. v2->v3: Read once in callers to avoid issues due to compiler optimizations. Also update commit message with the tests. v3->v4: Store and use read once result instead of querying pointer again incorrectly. v4->v5: Refactor to avoid errors due to compilation with IPV6={m,n} Signed-off-by: Subash Abhinov Kasiviswanathan <[email protected]> Suggested-by: Eric Dumazet <[email protected]> Cc: Stephen Hemminger <[email protected]> Cc: Tom Herbert <[email protected]> Cc: David Miller <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 8fa96e3 commit dddb64b

File tree

12 files changed

+103
-14
lines changed

12 files changed

+103
-14
lines changed

Documentation/networking/ip-sysctl.txt

+10-1
Original file line numberDiff line numberDiff line change
@@ -856,12 +856,21 @@ ip_dynaddr - BOOLEAN
856856
ip_early_demux - BOOLEAN
857857
Optimize input packet processing down to one demux for
858858
certain kinds of local sockets. Currently we only do this
859-
for established TCP sockets.
859+
for established TCP and connected UDP sockets.
860860

861861
It may add an additional cost for pure routing workloads that
862862
reduces overall throughput, in such case you should disable it.
863863
Default: 1
864864

865+
tcp_early_demux - BOOLEAN
866+
Enable early demux for established TCP sockets.
867+
Default: 1
868+
869+
udp_early_demux - BOOLEAN
870+
Enable early demux for connected UDP sockets. Disable this if
871+
your system could experience more unconnected load.
872+
Default: 1
873+
865874
icmp_echo_ignore_all - BOOLEAN
866875
If set non-zero, then the kernel will ignore all ICMP ECHO
867876
requests sent to it.

include/net/netns/ipv4.h

+2
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ struct netns_ipv4 {
9595
/* Shall we try to damage output packets if routing dev changes? */
9696
int sysctl_ip_dynaddr;
9797
int sysctl_ip_early_demux;
98+
int sysctl_tcp_early_demux;
99+
int sysctl_udp_early_demux;
98100

99101
int sysctl_fwmark_reflect;
100102
int sysctl_tcp_fwmark_accept;

include/net/protocol.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
/* This is used to register protocols. */
4141
struct net_protocol {
4242
void (*early_demux)(struct sk_buff *skb);
43+
void (*early_demux_handler)(struct sk_buff *skb);
4344
int (*handler)(struct sk_buff *skb);
4445
void (*err_handler)(struct sk_buff *skb, u32 info);
4546
unsigned int no_policy:1,
@@ -54,7 +55,7 @@ struct net_protocol {
5455
#if IS_ENABLED(CONFIG_IPV6)
5556
struct inet6_protocol {
5657
void (*early_demux)(struct sk_buff *skb);
57-
58+
void (*early_demux_handler)(struct sk_buff *skb);
5859
int (*handler)(struct sk_buff *skb);
5960

6061
void (*err_handler)(struct sk_buff *skb,
@@ -92,12 +93,12 @@ struct inet_protosw {
9293
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
9394
#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */
9495

95-
extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];
96+
extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS];
9697
extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS];
9798
extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS];
9899

99100
#if IS_ENABLED(CONFIG_IPV6)
100-
extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
101+
extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];
101102
#endif
102103

103104
int inet_add_protocol(const struct net_protocol *prot, unsigned char num);

include/net/udp.h

+1
Original file line numberDiff line numberDiff line change
@@ -372,4 +372,5 @@ void udp_encap_enable(void);
372372
#if IS_ENABLED(CONFIG_IPV6)
373373
void udpv6_encap_enable(void);
374374
#endif
375+
375376
#endif /* _UDP_H */

net/ipv4/af_inet.c

+6-2
Original file line numberDiff line numberDiff line change
@@ -1599,17 +1599,19 @@ static const struct net_protocol igmp_protocol = {
15991599
};
16001600
#endif
16011601

1602-
static const struct net_protocol tcp_protocol = {
1602+
static struct net_protocol tcp_protocol = {
16031603
.early_demux = tcp_v4_early_demux,
1604+
.early_demux_handler = tcp_v4_early_demux,
16041605
.handler = tcp_v4_rcv,
16051606
.err_handler = tcp_v4_err,
16061607
.no_policy = 1,
16071608
.netns_ok = 1,
16081609
.icmp_strict_tag_validation = 1,
16091610
};
16101611

1611-
static const struct net_protocol udp_protocol = {
1612+
static struct net_protocol udp_protocol = {
16121613
.early_demux = udp_v4_early_demux,
1614+
.early_demux_handler = udp_v4_early_demux,
16131615
.handler = udp_rcv,
16141616
.err_handler = udp_err,
16151617
.no_policy = 1,
@@ -1720,6 +1722,8 @@ static __net_init int inet_init_net(struct net *net)
17201722
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
17211723
net->ipv4.sysctl_ip_dynaddr = 0;
17221724
net->ipv4.sysctl_ip_early_demux = 1;
1725+
net->ipv4.sysctl_udp_early_demux = 1;
1726+
net->ipv4.sysctl_tcp_early_demux = 1;
17231727
#ifdef CONFIG_SYSCTL
17241728
net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
17251729
#endif

net/ipv4/ip_input.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
313313
const struct iphdr *iph = ip_hdr(skb);
314314
struct rtable *rt;
315315
struct net_device *dev = skb->dev;
316+
void (*edemux)(struct sk_buff *skb);
316317

317318
/* if ingress device is enslaved to an L3 master device pass the
318319
* skb to its handler for processing
@@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
329330
int protocol = iph->protocol;
330331

331332
ipprot = rcu_dereference(inet_protos[protocol]);
332-
if (ipprot && ipprot->early_demux) {
333-
ipprot->early_demux(skb);
333+
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
334+
edemux(skb);
334335
/* must reload iph, skb->head might have changed */
335336
iph = ip_hdr(skb);
336337
}

net/ipv4/protocol.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#include <linux/spinlock.h>
2929
#include <net/protocol.h>
3030

31-
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
31+
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
3232
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
3333
EXPORT_SYMBOL(inet_offloads);
3434

net/ipv4/sysctl_net_ipv4.c

+67
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <net/cipso_ipv4.h>
2525
#include <net/inet_frag.h>
2626
#include <net/ping.h>
27+
#include <net/protocol.h>
2728

2829
static int zero;
2930
static int one = 1;
@@ -294,6 +295,58 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
294295
return ret;
295296
}
296297

298+
static void proc_configure_early_demux(int enabled, int protocol)
299+
{
300+
struct net_protocol *ipprot;
301+
#if IS_ENABLED(CONFIG_IPV6)
302+
struct inet6_protocol *ip6prot;
303+
#endif
304+
305+
ipprot = rcu_dereference(inet_protos[protocol]);
306+
if (ipprot)
307+
ipprot->early_demux = enabled ? ipprot->early_demux_handler :
308+
NULL;
309+
310+
#if IS_ENABLED(CONFIG_IPV6)
311+
ip6prot = rcu_dereference(inet6_protos[protocol]);
312+
if (ip6prot)
313+
ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
314+
NULL;
315+
#endif
316+
}
317+
318+
static int proc_tcp_early_demux(struct ctl_table *table, int write,
319+
void __user *buffer, size_t *lenp, loff_t *ppos)
320+
{
321+
int ret = 0;
322+
323+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
324+
325+
if (write && !ret) {
326+
int enabled = init_net.ipv4.sysctl_tcp_early_demux;
327+
328+
proc_configure_early_demux(enabled, IPPROTO_TCP);
329+
}
330+
331+
return ret;
332+
}
333+
334+
static int proc_udp_early_demux(struct ctl_table *table, int write,
335+
void __user *buffer, size_t *lenp, loff_t *ppos)
336+
{
337+
int ret = 0;
338+
339+
ret = proc_dointvec(table, write, buffer, lenp, ppos);
340+
341+
if (write && !ret) {
342+
int enabled = init_net.ipv4.sysctl_udp_early_demux;
343+
344+
proc_configure_early_demux(enabled, IPPROTO_UDP);
345+
}
346+
347+
return ret;
348+
}
349+
297350
static struct ctl_table ipv4_table[] = {
298351
{
299352
.procname = "tcp_timestamps",
@@ -749,6 +802,20 @@ static struct ctl_table ipv4_net_table[] = {
749802
.mode = 0644,
750803
.proc_handler = proc_dointvec
751804
},
805+
{
806+
.procname = "udp_early_demux",
807+
.data = &init_net.ipv4.sysctl_udp_early_demux,
808+
.maxlen = sizeof(int),
809+
.mode = 0644,
810+
.proc_handler = proc_udp_early_demux
811+
},
812+
{
813+
.procname = "tcp_early_demux",
814+
.data = &init_net.ipv4.sysctl_tcp_early_demux,
815+
.maxlen = sizeof(int),
816+
.mode = 0644,
817+
.proc_handler = proc_tcp_early_demux
818+
},
752819
{
753820
.procname = "ip_default_ttl",
754821
.data = &init_net.ipv4.sysctl_ip_default_ttl,

net/ipv6/ip6_input.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949

5050
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
5151
{
52+
void (*edemux)(struct sk_buff *skb);
53+
5254
/* if ingress device is enslaved to an L3 master device pass the
5355
* skb to its handler for processing
5456
*/
@@ -60,8 +62,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
6062
const struct inet6_protocol *ipprot;
6163

6264
ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
63-
if (ipprot && ipprot->early_demux)
64-
ipprot->early_demux(skb);
65+
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
66+
edemux(skb);
6567
}
6668
if (!skb_valid_dst(skb))
6769
ip6_route_input(skb);

net/ipv6/protocol.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#include <net/protocol.h>
2727

2828
#if IS_ENABLED(CONFIG_IPV6)
29-
const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
29+
struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
3030
EXPORT_SYMBOL(inet6_protos);
3131

3232
int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)

net/ipv6/tcp_ipv6.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1925,8 +1925,9 @@ struct proto tcpv6_prot = {
19251925
.diag_destroy = tcp_abort,
19261926
};
19271927

1928-
static const struct inet6_protocol tcpv6_protocol = {
1928+
static struct inet6_protocol tcpv6_protocol = {
19291929
.early_demux = tcp_v6_early_demux,
1930+
.early_demux_handler = tcp_v6_early_demux,
19301931
.handler = tcp_v6_rcv,
19311932
.err_handler = tcp_v6_err,
19321933
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,

net/ipv6/udp.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -1436,8 +1436,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
14361436
}
14371437
#endif
14381438

1439-
static const struct inet6_protocol udpv6_protocol = {
1439+
static struct inet6_protocol udpv6_protocol = {
14401440
.early_demux = udp_v6_early_demux,
1441+
.early_demux_handler = udp_v6_early_demux,
14411442
.handler = udpv6_rcv,
14421443
.err_handler = udpv6_err,
14431444
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,

0 commit comments

Comments
 (0)