Skip to content

Commit aef3a58

Browse files
author
Paolo Abeni
committed
Merge tag 'nf-24-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf
Pablo Neira Ayuso says: ==================== Netfilter fixes for net v2: with kdoc fixes per Paolo Abeni. The following patchset contains Netfilter fixes for net: Patch #1 and #2 handle an esoteric scenario: Given two tasks sending UDP packets to one another, two packets of the same flow, one in each direction, are handled by different CPUs, resulting in two conntrack objects in NEW state, where the reply packet loses the race. Then, patch #3 adds a testcase for this scenario. Series from Florian Westphal. 1) NAT engine can falsely detect a port collision if it happens to pick up a reply packet as NEW rather than ESTABLISHED. Add extra code to detect this and suppress port reallocation in this case. 2) To complete the clash resolution in the reply direction, extend conntrack logic to detect a conntrack that clashes in the reply direction with an existing entry. 3) Adds a test case. Then, an assorted list of fixes follows: 4) Add a selftest for tproxy, from Antonio Ojea. 5) Guard ctnetlink_*_size() functions under #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) From Andy Shevchenko. 6) Use -m socket --transparent in iptables tproxy documentation. From XIE Zhibang. 7) Call kfree_rcu() when releasing flowtable hooks to address race with netlink dump path, from Phil Sutter. 8) Fix compilation warning in nf_reject with CONFIG_BRIDGE_NETFILTER=n. From Simon Horman. 9) Guard ctnetlink_label_size() under CONFIG_NF_CONNTRACK_EVENTS which is its only user, to address a compilation warning. From Simon Horman. 10) Use rcu-protected list iteration over basechain hooks from netlink dump path. 11) Fix memcg accounting for nf_tables; the existing use of GFP_KERNEL_ACCOUNT is not complete. 12) Remove old nfqueue conntrack clash resolution. Instead of trying to use the same destination address consistently, which requires double DNAT, use the existing clash resolution, which allows clashing packets to go through with a different destination. 
Antonio Ojea originally reported an issue from the postrouting chain; I proposed a fix: https://lore.kernel.org/netfilter-devel/ZuwSwAqKgCB2a51-@calendula/T/ which he reported did not work for him. 13) Adds a selftest for patch 12. 14) Fixes ipvs.sh selftest. netfilter pull request 24-09-26 * tag 'nf-24-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: selftests: netfilter: Avoid hanging ipvs.sh kselftest: add test for nfqueue induced conntrack race netfilter: nfnetlink_queue: remove old clash resolution logic netfilter: nf_tables: missing objects with no memcg accounting netfilter: nf_tables: use rcu chain hook list iterator from netlink dump path netfilter: ctnetlink: compile ctnetlink_label_size with CONFIG_NF_CONNTRACK_EVENTS netfilter: nf_reject: Fix build warning when CONFIG_BRIDGE_NETFILTER=n netfilter: nf_tables: Keep deleted flowtable hooks until after RCU docs: tproxy: ignore non-transparent sockets in iptables netfilter: ctnetlink: Guard possible unused functions selftests: netfilter: nft_tproxy.sh: add tcp tests selftests: netfilter: add reverse-clash resolution test case netfilter: conntrack: add clash resolution for reverse collisions netfilter: nf_nat: don't try nat source port reallocation for reverse dir clash ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Paolo Abeni <[email protected]>
2 parents 72ef075 + fc78630 commit aef3a58

22 files changed

+1091
-132
lines changed

Documentation/networking/tproxy.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ The idea is that you identify packets with destination address matching a local
1717
socket on your box, set the packet mark to a certain value::
1818

1919
# iptables -t mangle -N DIVERT
20-
# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
20+
# iptables -t mangle -A PREROUTING -p tcp -m socket --transparent -j DIVERT
2121
# iptables -t mangle -A DIVERT -j MARK --set-mark 1
2222
# iptables -t mangle -A DIVERT -j ACCEPT
2323

include/linux/netfilter.h

-4
Original file line numberDiff line numberDiff line change
@@ -376,15 +376,11 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
376376
struct nf_conn;
377377
enum nf_nat_manip_type;
378378
struct nlattr;
379-
enum ip_conntrack_dir;
380379

381380
struct nf_nat_hook {
382381
int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
383382
const struct nlattr *attr);
384383
void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
385-
unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
386-
enum nf_nat_manip_type mtype,
387-
enum ip_conntrack_dir dir);
388384
void (*remove_nat_bysrc)(struct nf_conn *ct);
389385
};
390386

net/ipv4/netfilter/nf_reject_ipv4.c

+4-6
Original file line numberDiff line numberDiff line change
@@ -239,9 +239,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
239239
void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
240240
int hook)
241241
{
242-
struct sk_buff *nskb;
243-
struct iphdr *niph;
244242
const struct tcphdr *oth;
243+
struct sk_buff *nskb;
245244
struct tcphdr _oth;
246245

247246
oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
@@ -266,14 +265,12 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
266265
nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
267266

268267
skb_reserve(nskb, LL_MAX_HEADER);
269-
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
270-
ip4_dst_hoplimit(skb_dst(nskb)));
268+
nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
269+
ip4_dst_hoplimit(skb_dst(nskb)));
271270
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
272271
if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
273272
goto free_nskb;
274273

275-
niph = ip_hdr(nskb);
276-
277274
/* "Never happens" */
278275
if (nskb->len > dst_mtu(skb_dst(nskb)))
279276
goto free_nskb;
@@ -290,6 +287,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
290287
*/
291288
if (nf_bridge_info_exists(oldskb)) {
292289
struct ethhdr *oeth = eth_hdr(oldskb);
290+
struct iphdr *niph = ip_hdr(nskb);
293291
struct net_device *br_indev;
294292

295293
br_indev = nf_bridge_get_physindev(oldskb, net);

net/ipv6/netfilter/nf_reject_ipv6.c

+2-3
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,6 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
273273
const struct tcphdr *otcph;
274274
unsigned int otcplen, hh_len;
275275
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
276-
struct ipv6hdr *ip6h;
277276
struct dst_entry *dst = NULL;
278277
struct flowi6 fl6;
279278

@@ -329,8 +328,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
329328
nskb->mark = fl6.flowi6_mark;
330329

331330
skb_reserve(nskb, hh_len + dst->header_len);
332-
ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
333-
ip6_dst_hoplimit(dst));
331+
nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst));
334332
nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
335333

336334
nf_ct_attach(nskb, oldskb);
@@ -345,6 +343,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
345343
*/
346344
if (nf_bridge_info_exists(oldskb)) {
347345
struct ethhdr *oeth = eth_hdr(oldskb);
346+
struct ipv6hdr *ip6h = ipv6_hdr(nskb);
348347
struct net_device *br_indev;
349348

350349
br_indev = nf_bridge_get_physindev(oldskb, net);

net/netfilter/nf_conntrack_core.c

+51-90
Original file line numberDiff line numberDiff line change
@@ -988,6 +988,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
988988
tstamp->start = ktime_get_real_ns();
989989
}
990990

991+
/**
992+
* nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow
993+
* @ct1: conntrack in hash table to check against
994+
* @ct2: merge candidate
995+
*
996+
* returns true if ct1 and ct2 happen to refer to the same flow, but
997+
* in opposing directions, i.e.
998+
* ct1: a:b -> c:d
999+
* ct2: c:d -> a:b
1000+
* for both directions. If so, @ct2 should not have been created
1001+
* as the skb should have been picked up as ESTABLISHED flow.
1002+
* But ct1 was not yet committed to hash table before skb that created
1003+
* ct2 had arrived.
1004+
*
1005+
* Note we don't compare netns because ct entries in different net
1006+
* namespace cannot clash to begin with.
1007+
*
1008+
* @return: true if ct1 and ct2 are identical when swapping origin/reply.
1009+
*/
1010+
static bool
1011+
nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
1012+
{
1013+
u16 id1, id2;
1014+
1015+
if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1016+
&ct2->tuplehash[IP_CT_DIR_REPLY].tuple))
1017+
return false;
1018+
1019+
if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
1020+
&ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
1021+
return false;
1022+
1023+
id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL);
1024+
id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY);
1025+
if (id1 != id2)
1026+
return false;
1027+
1028+
id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY);
1029+
id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);
1030+
1031+
return id1 == id2;
1032+
}
1033+
1034+
static int nf_ct_can_merge(const struct nf_conn *ct,
1035+
const struct nf_conn *loser_ct)
1036+
{
1037+
return nf_ct_match(ct, loser_ct) ||
1038+
nf_ct_match_reverse(ct, loser_ct);
1039+
}
1040+
9911041
/* caller must hold locks to prevent concurrent changes */
9921042
static int __nf_ct_resolve_clash(struct sk_buff *skb,
9931043
struct nf_conntrack_tuple_hash *h)
@@ -999,11 +1049,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
9991049

10001050
loser_ct = nf_ct_get(skb, &ctinfo);
10011051

1002-
if (nf_ct_is_dying(ct))
1003-
return NF_DROP;
1004-
1005-
if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
1006-
nf_ct_match(ct, loser_ct)) {
1052+
if (nf_ct_can_merge(ct, loser_ct)) {
10071053
struct net *net = nf_ct_net(ct);
10081054

10091055
nf_conntrack_get(&ct->ct_general);
@@ -2151,80 +2197,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
21512197
nf_conntrack_get(skb_nfct(nskb));
21522198
}
21532199

2154-
static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
2155-
struct nf_conn *ct,
2156-
enum ip_conntrack_info ctinfo)
2157-
{
2158-
const struct nf_nat_hook *nat_hook;
2159-
struct nf_conntrack_tuple_hash *h;
2160-
struct nf_conntrack_tuple tuple;
2161-
unsigned int status;
2162-
int dataoff;
2163-
u16 l3num;
2164-
u8 l4num;
2165-
2166-
l3num = nf_ct_l3num(ct);
2167-
2168-
dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
2169-
if (dataoff <= 0)
2170-
return NF_DROP;
2171-
2172-
if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
2173-
l4num, net, &tuple))
2174-
return NF_DROP;
2175-
2176-
if (ct->status & IPS_SRC_NAT) {
2177-
memcpy(tuple.src.u3.all,
2178-
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
2179-
sizeof(tuple.src.u3.all));
2180-
tuple.src.u.all =
2181-
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
2182-
}
2183-
2184-
if (ct->status & IPS_DST_NAT) {
2185-
memcpy(tuple.dst.u3.all,
2186-
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
2187-
sizeof(tuple.dst.u3.all));
2188-
tuple.dst.u.all =
2189-
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
2190-
}
2191-
2192-
h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
2193-
if (!h)
2194-
return NF_ACCEPT;
2195-
2196-
/* Store status bits of the conntrack that is clashing to re-do NAT
2197-
* mangling according to what it has been done already to this packet.
2198-
*/
2199-
status = ct->status;
2200-
2201-
nf_ct_put(ct);
2202-
ct = nf_ct_tuplehash_to_ctrack(h);
2203-
nf_ct_set(skb, ct, ctinfo);
2204-
2205-
nat_hook = rcu_dereference(nf_nat_hook);
2206-
if (!nat_hook)
2207-
return NF_ACCEPT;
2208-
2209-
if (status & IPS_SRC_NAT) {
2210-
unsigned int verdict = nat_hook->manip_pkt(skb, ct,
2211-
NF_NAT_MANIP_SRC,
2212-
IP_CT_DIR_ORIGINAL);
2213-
if (verdict != NF_ACCEPT)
2214-
return verdict;
2215-
}
2216-
2217-
if (status & IPS_DST_NAT) {
2218-
unsigned int verdict = nat_hook->manip_pkt(skb, ct,
2219-
NF_NAT_MANIP_DST,
2220-
IP_CT_DIR_ORIGINAL);
2221-
if (verdict != NF_ACCEPT)
2222-
return verdict;
2223-
}
2224-
2225-
return NF_ACCEPT;
2226-
}
2227-
22282200
/* This packet is coming from userspace via nf_queue, complete the packet
22292201
* processing after the helper invocation in nf_confirm().
22302202
*/
@@ -2288,17 +2260,6 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
22882260
if (!ct)
22892261
return NF_ACCEPT;
22902262

2291-
if (!nf_ct_is_confirmed(ct)) {
2292-
int ret = __nf_conntrack_update(net, skb, ct, ctinfo);
2293-
2294-
if (ret != NF_ACCEPT)
2295-
return ret;
2296-
2297-
ct = nf_ct_get(skb, &ctinfo);
2298-
if (!ct)
2299-
return NF_ACCEPT;
2300-
}
2301-
23022263
return nf_confirm_cthelper(skb, ct, ctinfo);
23032264
}
23042265

net/netfilter/nf_conntrack_netlink.c

+3-6
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,7 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
382382
#define ctnetlink_dump_secctx(a, b) (0)
383383
#endif
384384

385-
#ifdef CONFIG_NF_CONNTRACK_LABELS
385+
#ifdef CONFIG_NF_CONNTRACK_EVENTS
386386
static inline int ctnetlink_label_size(const struct nf_conn *ct)
387387
{
388388
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -391,6 +391,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
391391
return 0;
392392
return nla_total_size(sizeof(labels->bits));
393393
}
394+
#endif
394395

395396
static int
396397
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
@@ -411,10 +412,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
411412

412413
return 0;
413414
}
414-
#else
415-
#define ctnetlink_dump_labels(a, b) (0)
416-
#define ctnetlink_label_size(a) (0)
417-
#endif
418415

419416
#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
420417

@@ -652,7 +649,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)
652649

653650
return len + len4;
654651
}
655-
#endif
656652

657653
static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
658654
{
@@ -690,6 +686,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
690686
return 0;
691687
#endif
692688
}
689+
#endif
693690

694691
#ifdef CONFIG_NF_CONNTRACK_EVENTS
695692
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)

0 commit comments

Comments
 (0)