Skip to content

Commit 7f8a436

Browse files
joestringer authored and davem330 committed
openvswitch: Add conntrack action
Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched.

Exposed connection states are OVS_CS_F_*:
- NEW (0x01) - Beginning of a new connection.
- ESTABLISHED (0x02) - Part of an existing connection.
- RELATED (0x04) - Related to an established connection.
- INVALID (0x20) - Could not track the connection for this packet.
- REPLY_DIR (0x40) - This packet is in the reply direction for the flow.
- TRACKED (0x80) - This packet has been sent through conntrack.

When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit.

When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR.

The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id.

IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output.

IP frag handling contributed by Andy Zhou.

Based on original design by Justin Pettit.
Signed-off-by: Joe Stringer <[email protected]> Signed-off-by: Justin Pettit <[email protected]> Signed-off-by: Andy Zhou <[email protected]> Acked-by: Thomas Graf <[email protected]> Acked-by: Pravin B Shelar <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent e79e259 commit 7f8a436

File tree

13 files changed

+877
-37
lines changed

13 files changed

+877
-37
lines changed

include/uapi/linux/openvswitch.h

+40
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ enum ovs_packet_cmd {
164164
* %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
165165
* output port is actually a tunnel port. Contains the output tunnel key
166166
* extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
167+
* @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
168+
* %OVS_PACKET_ATTR_USERSPACE action; specifies the maximum received fragment
169+
* size.
167170
*
168171
* These attributes follow the &struct ovs_header within the Generic Netlink
169172
* payload for %OVS_PACKET_* commands.
@@ -180,6 +183,7 @@ enum ovs_packet_attr {
180183
OVS_PACKET_ATTR_UNUSED2,
181184
OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe,
182185
error logging should be suppressed. */
186+
OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */
183187
__OVS_PACKET_ATTR_MAX
184188
};
185189

@@ -319,6 +323,8 @@ enum ovs_key_attr {
319323
OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls.
320324
* The implementation may restrict
321325
* the accepted length of the array. */
326+
OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */
327+
OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */
322328

323329
#ifdef __KERNEL__
324330
OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */
@@ -431,6 +437,15 @@ struct ovs_key_nd {
431437
__u8 nd_tll[ETH_ALEN];
432438
};
433439

440+
/* OVS_KEY_ATTR_CT_STATE flags */
441+
#define OVS_CS_F_NEW 0x01 /* Beginning of a new connection. */
442+
#define OVS_CS_F_ESTABLISHED 0x02 /* Part of an existing connection. */
443+
#define OVS_CS_F_RELATED 0x04 /* Related to an established
444+
* connection. */
445+
#define OVS_CS_F_INVALID 0x20 /* Could not track connection. */
446+
#define OVS_CS_F_REPLY_DIR 0x40 /* Flow is in the reply direction. */
447+
#define OVS_CS_F_TRACKED 0x80 /* Conntrack has occurred. */
448+
434449
/**
435450
* enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
436451
* @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -594,6 +609,28 @@ struct ovs_action_hash {
594609
uint32_t hash_basis;
595610
};
596611

612+
/**
613+
* enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action.
614+
* @OVS_CT_ATTR_FLAGS: u32 connection tracking flags.
615+
* @OVS_CT_ATTR_ZONE: u16 connection tracking zone.
616+
*/
617+
enum ovs_ct_attr {
618+
OVS_CT_ATTR_UNSPEC,
619+
OVS_CT_ATTR_FLAGS, /* u32 bitmask of OVS_CT_F_*. */
620+
OVS_CT_ATTR_ZONE, /* u16 zone id. */
621+
__OVS_CT_ATTR_MAX
622+
};
623+
624+
#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
625+
626+
/*
627+
* OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_*
628+
* @OVS_CT_F_COMMIT: Commits the flow to the conntrack table. This allows
629+
* future packets for the same connection to be identified as 'established'
630+
* or 'related'.
631+
*/
632+
#define OVS_CT_F_COMMIT 0x01
633+
597634
/**
598635
* enum ovs_action_attr - Action types.
599636
*
@@ -623,6 +660,8 @@ struct ovs_action_hash {
623660
* indicate the new packet contents. This could potentially still be
624661
* %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there
625662
* is no MPLS label stack, as determined by ethertype, no action is taken.
663+
* @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related
664+
* entries in the flow key.
626665
*
627666
* Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
628667
* fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -648,6 +687,7 @@ enum ovs_action_attr {
648687
* data immediately followed by a mask.
649688
* The data must be zero for the unmasked
650689
* bits. */
690+
OVS_ACTION_ATTR_CT, /* One nested OVS_CT_ATTR_* . */
651691

652692
__OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted
653693
* from userspace. */

net/openvswitch/Kconfig

+11
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ config OPENVSWITCH
3131

3232
If unsure, say N.
3333

34+
config OPENVSWITCH_CONNTRACK
35+
bool "Open vSwitch conntrack action support"
36+
depends on OPENVSWITCH
37+
depends on NF_CONNTRACK
38+
default OPENVSWITCH
39+
---help---
40+
If you say Y here, then Open vSwitch module will be able to pass
41+
packets through conntrack.
42+
43+
Say N to exclude this support and reduce the binary size.
44+
3445
config OPENVSWITCH_GRE
3546
tristate "Open vSwitch GRE tunneling support"
3647
depends on OPENVSWITCH

net/openvswitch/Makefile

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ openvswitch-y := \
1515
vport-internal_dev.o \
1616
vport-netdev.o
1717

18+
openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o
19+
1820
obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o
1921
obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o
2022
obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o

net/openvswitch/actions.c

+169-6
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@
2222
#include <linux/in.h>
2323
#include <linux/ip.h>
2424
#include <linux/openvswitch.h>
25+
#include <linux/netfilter_ipv6.h>
2526
#include <linux/sctp.h>
2627
#include <linux/tcp.h>
2728
#include <linux/udp.h>
2829
#include <linux/in6.h>
2930
#include <linux/if_arp.h>
3031
#include <linux/if_vlan.h>
3132

33+
#include <net/dst.h>
3234
#include <net/ip.h>
3335
#include <net/ipv6.h>
3436
#include <net/checksum.h>
@@ -38,6 +40,7 @@
3840

3941
#include "datapath.h"
4042
#include "flow.h"
43+
#include "conntrack.h"
4144
#include "vport.h"
4245

4346
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
@@ -52,6 +55,20 @@ struct deferred_action {
5255
struct sw_flow_key pkt_key;
5356
};
5457

58+
#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN)
59+
struct ovs_frag_data {
60+
unsigned long dst;
61+
struct vport *vport;
62+
struct ovs_skb_cb cb;
63+
__be16 inner_protocol;
64+
__u16 vlan_tci;
65+
__be16 vlan_proto;
66+
unsigned int l2_len;
67+
u8 l2_data[MAX_L2_LEN];
68+
};
69+
70+
static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
71+
5572
#define DEFERRED_ACTION_FIFO_SIZE 10
5673
struct action_fifo {
5774
int head;
@@ -602,14 +619,145 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
602619
return 0;
603620
}
604621

605-
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
622+
static int ovs_vport_output(struct sock *sock, struct sk_buff *skb)
623+
{
624+
struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
625+
struct vport *vport = data->vport;
626+
627+
if (skb_cow_head(skb, data->l2_len) < 0) {
628+
kfree_skb(skb);
629+
return -ENOMEM;
630+
}
631+
632+
__skb_dst_copy(skb, data->dst);
633+
*OVS_CB(skb) = data->cb;
634+
skb->inner_protocol = data->inner_protocol;
635+
skb->vlan_tci = data->vlan_tci;
636+
skb->vlan_proto = data->vlan_proto;
637+
638+
/* Reconstruct the MAC header. */
639+
skb_push(skb, data->l2_len);
640+
memcpy(skb->data, &data->l2_data, data->l2_len);
641+
ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len);
642+
skb_reset_mac_header(skb);
643+
644+
ovs_vport_send(vport, skb);
645+
return 0;
646+
}
647+
648+
static unsigned int
649+
ovs_dst_get_mtu(const struct dst_entry *dst)
650+
{
651+
return dst->dev->mtu;
652+
}
653+
654+
static struct dst_ops ovs_dst_ops = {
655+
.family = AF_UNSPEC,
656+
.mtu = ovs_dst_get_mtu,
657+
};
658+
659+
/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
660+
* ovs_vport_output(), which is called once per fragmented packet.
661+
*/
662+
static void prepare_frag(struct vport *vport, struct sk_buff *skb)
663+
{
664+
unsigned int hlen = skb_network_offset(skb);
665+
struct ovs_frag_data *data;
666+
667+
data = this_cpu_ptr(&ovs_frag_data_storage);
668+
data->dst = skb->_skb_refdst;
669+
data->vport = vport;
670+
data->cb = *OVS_CB(skb);
671+
data->inner_protocol = skb->inner_protocol;
672+
data->vlan_tci = skb->vlan_tci;
673+
data->vlan_proto = skb->vlan_proto;
674+
data->l2_len = hlen;
675+
memcpy(&data->l2_data, skb->data, hlen);
676+
677+
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
678+
skb_pull(skb, hlen);
679+
}
680+
681+
static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru,
682+
__be16 ethertype)
683+
{
684+
if (skb_network_offset(skb) > MAX_L2_LEN) {
685+
OVS_NLERR(1, "L2 header too long to fragment");
686+
return;
687+
}
688+
689+
if (ethertype == htons(ETH_P_IP)) {
690+
struct dst_entry ovs_dst;
691+
unsigned long orig_dst;
692+
693+
prepare_frag(vport, skb);
694+
dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
695+
DST_OBSOLETE_NONE, DST_NOCOUNT);
696+
ovs_dst.dev = vport->dev;
697+
698+
orig_dst = skb->_skb_refdst;
699+
skb_dst_set_noref(skb, &ovs_dst);
700+
IPCB(skb)->frag_max_size = mru;
701+
702+
ip_do_fragment(skb->sk, skb, ovs_vport_output);
703+
refdst_drop(orig_dst);
704+
} else if (ethertype == htons(ETH_P_IPV6)) {
705+
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
706+
unsigned long orig_dst;
707+
struct rt6_info ovs_rt;
708+
709+
if (!v6ops) {
710+
kfree_skb(skb);
711+
return;
712+
}
713+
714+
prepare_frag(vport, skb);
715+
memset(&ovs_rt, 0, sizeof(ovs_rt));
716+
dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
717+
DST_OBSOLETE_NONE, DST_NOCOUNT);
718+
ovs_rt.dst.dev = vport->dev;
719+
720+
orig_dst = skb->_skb_refdst;
721+
skb_dst_set_noref(skb, &ovs_rt.dst);
722+
IP6CB(skb)->frag_max_size = mru;
723+
724+
v6ops->fragment(skb->sk, skb, ovs_vport_output);
725+
refdst_drop(orig_dst);
726+
} else {
727+
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
728+
ovs_vport_name(vport), ntohs(ethertype), mru,
729+
vport->dev->mtu);
730+
kfree_skb(skb);
731+
}
732+
}
733+
734+
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
735+
struct sw_flow_key *key)
606736
{
607737
struct vport *vport = ovs_vport_rcu(dp, out_port);
608738

609-
if (likely(vport))
610-
ovs_vport_send(vport, skb);
611-
else
739+
if (likely(vport)) {
740+
u16 mru = OVS_CB(skb)->mru;
741+
742+
if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
743+
ovs_vport_send(vport, skb);
744+
} else if (mru <= vport->dev->mtu) {
745+
__be16 ethertype = key->eth.type;
746+
747+
if (!is_flow_key_valid(key)) {
748+
if (eth_p_mpls(skb->protocol))
749+
ethertype = skb->inner_protocol;
750+
else
751+
ethertype = vlan_get_protocol(skb);
752+
}
753+
754+
ovs_fragment(vport, skb, mru, ethertype);
755+
} else {
756+
kfree_skb(skb);
757+
}
758+
} else {
612759
kfree_skb(skb);
760+
}
613761
}
614762

615763
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
@@ -623,6 +771,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
623771

624772
memset(&upcall, 0, sizeof(upcall));
625773
upcall.cmd = OVS_PACKET_CMD_ACTION;
774+
upcall.mru = OVS_CB(skb)->mru;
626775

627776
for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
628777
a = nla_next(a, &rem)) {
@@ -816,6 +965,11 @@ static int execute_masked_set_action(struct sk_buff *skb,
816965
err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
817966
__be32 *));
818967
break;
968+
969+
case OVS_KEY_ATTR_CT_STATE:
970+
case OVS_KEY_ATTR_CT_ZONE:
971+
err = -EINVAL;
972+
break;
819973
}
820974

821975
return err;
@@ -885,7 +1039,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
8851039
struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
8861040

8871041
if (out_skb)
888-
do_output(dp, out_skb, prev_port);
1042+
do_output(dp, out_skb, prev_port, key);
8891043

8901044
prev_port = -1;
8911045
}
@@ -942,6 +1096,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
9421096
case OVS_ACTION_ATTR_SAMPLE:
9431097
err = sample(dp, skb, key, a, attr, len);
9441098
break;
1099+
1100+
case OVS_ACTION_ATTR_CT:
1101+
err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
1102+
nla_data(a));
1103+
1104+
/* Hide stolen IP fragments from user space. */
1105+
if (err == -EINPROGRESS)
1106+
return 0;
1107+
break;
9451108
}
9461109

9471110
if (unlikely(err)) {
@@ -951,7 +1114,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
9511114
}
9521115

9531116
if (prev_port != -1)
954-
do_output(dp, skb, prev_port);
1117+
do_output(dp, skb, prev_port, key);
9551118
else
9561119
consume_skb(skb);
9571120

0 commit comments

Comments
 (0)