Skip to content

Commit

Permalink
PSA/eBPF: Enable passing packets up to the kernel stack (#3691)
Browse files Browse the repository at this point in the history
* Enable passing packets up to kernel stack

* Push P4 program

* Use workaround with bpf_redirect to pass packets up to kernel stack

* Fix IP routing and ARP PTF test cases

* Introduce packet mark

* Use packet mark

* Fix compiler and PTF test

* Fix PTF tests

* Fix cpplint

* Some additions to docs

* Address review

* Fix cpplint

* Fix clang-format
  • Loading branch information
osinstom authored Nov 23, 2022
1 parent 9e33f88 commit 73908eb
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 13 deletions.
14 changes: 14 additions & 0 deletions backends/ebpf/psa/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,20 @@ The TC-based design of PSA for eBPF is depicted in Figure below.

## Packet paths

### NTK (Normal Packet To Kernel)

**WARNING!** The NTK packet path is a custom packet path specific to PSA-eBPF! It is not a standardized PSA packet path.

The NTK packet path allows integrating P4/PSA programs for eBPF with the standard Linux kernel stack. The main use cases are handling
ICMP/ARP requests and sending packets to a userspace process listening on a socket.

The NTK path is enforced if `drop` is set to `false` and `egress_port` is left unchanged or set to 0 (it's a special implicit port number that forwards packets to the kernel stack).
Since packets can be modified in the PSA ingress pipeline before they are sent to the kernel stack, a P4 programmer should make sure that packets use standard headers and are properly formatted.
Otherwise, the kernel stack will drop them.

**NOTE!** There is no symmetric packet path *from kernel* - once a packet enters the kernel network stack, it is further processed exclusively by the kernel.
As a consequence, all packets that have not been processed by the PSA Ingress pipeline (e.g., packets sent from a userspace application) will not be handled by the PSA Egress pipeline!

### NFP (Normal Packet From Port)

A packet arriving on an interface is intercepted in the XDP hook by the `xdp-helper` program. It performs pre-processing and
Expand Down
54 changes: 48 additions & 6 deletions backends/ebpf/psa/ebpfPipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,13 @@ void EBPFIngressPipeline::emit(CodeBuilder* builder) {
emitMetadataFromCPUMAP(builder);
builder->newline();

msgStr =
Util::printf_format("%s parser: parsing new packet, path=%%d, pkt_len=%%d", sectionName);
msgStr = Util::printf_format(
"%s parser: parsing new packet, input_port=%%d, path=%%d, "
"pkt_len=%%d",
sectionName);
varStr = Util::printf_format("%s->packet_path", compilerGlobalMetadata);
builder->target->emitTraceMessage(builder, msgStr.c_str(), 2, varStr, lengthVar.c_str());
builder->target->emitTraceMessage(builder, msgStr.c_str(), 3, ifindexVar.c_str(), varStr,
lengthVar.c_str());

// PARSER
parser->emit(builder);
Expand Down Expand Up @@ -395,6 +398,13 @@ void EBPFEgressPipeline::emit(CodeBuilder* builder) {
builder->blockStart();

emitGlobalMetadataInitializer(builder);
builder->appendFormat("if (compiler_meta__->mark != %u) ", packetMark);
builder->blockStart();
builder->emitIndent();
builder->append("return TC_ACT_OK");
builder->endOfStatement(true);
builder->blockEnd(true);

emitLocalVariables(builder);
emitUserMetadataInstance(builder);
builder->newline();
Expand All @@ -412,10 +422,13 @@ void EBPFEgressPipeline::emit(CodeBuilder* builder) {
emitPSAControlOutputMetadata(builder);
emitPSAControlInputMetadata(builder);

msgStr =
Util::printf_format("%s parser: parsing new packet, path=%%d, pkt_len=%%d", sectionName);
msgStr = Util::printf_format(
"%s parser: parsing new packet, input_port=%%d, path=%%d, "
"pkt_len=%%d",
sectionName);
varStr = Util::printf_format("%s->packet_path", compilerGlobalMetadata);
builder->target->emitTraceMessage(builder, msgStr.c_str(), 2, varStr, lengthVar.c_str());
builder->target->emitTraceMessage(builder, msgStr.c_str(), 3, ifindexVar.c_str(), varStr,
lengthVar.c_str());

// PARSER
parser->emit(builder);
Expand Down Expand Up @@ -458,12 +471,20 @@ void EBPFEgressPipeline::emit(CodeBuilder* builder) {
void TCIngressPipeline::emitGlobalMetadataInitializer(CodeBuilder* builder) {
EBPFPipeline::emitGlobalMetadataInitializer(builder);

// if Traffic Manager decided to pass packet to the kernel stack earlier, send it up immediately
builder->emitIndent();
builder->append("if (compiler_meta__->pass_to_kernel == true) return TC_ACT_OK;");
builder->newline();

// workaround to make TC protocol-independent, DO NOT REMOVE
builder->emitIndent();
// replace ether_type only if a packet comes from XDP
builder->appendFormat("if (%s->packet_path == NORMAL) ", compilerGlobalMetadata);
builder->blockStart();
builder->emitIndent();
builder->appendFormat("compiler_meta__->mark = %u", packetMark);
builder->endOfStatement(true);
builder->emitIndent();
if (options.xdp2tcMode == XDP2TC_META) {
emitTCWorkaroundUsingMeta(builder);
} else if (options.xdp2tcMode == XDP2TC_HEAD) {
Expand Down Expand Up @@ -557,6 +578,27 @@ void TCIngressPipeline::emitTrafficManager(CodeBuilder* builder) {
control->outputStandardMetadata->name.name);
builder->newline();

builder->appendFormat("if (!%s.drop && %s.egress_port == 0) ",
control->outputStandardMetadata->name.name,
control->outputStandardMetadata->name.name);
builder->blockStart();
builder->target->emitTraceMessage(builder, "IngressTM: Sending packet up to the kernel stack");
builder->emitIndent();

// Since XDP helper re-writes EtherType for packets other than IPv4 (e.g., ARP)
// we cannot simply return TC_ACT_OK to pass the packet up to the kernel stack,
// because the kernel stack would receive a malformed packet (with invalid skb->protocol).
// The workaround is to send the packet back to the same interface. If we redirect,
// the packet will be re-written back to the original format.
// At the beginning of the pipeline we check if pass_to_kernel is true and,
// if so, the program returns TC_ACT_OK.
builder->newline();
builder->append("compiler_meta__->pass_to_kernel = true;");
builder->newline();
builder->append("return bpf_redirect(skb->ifindex, BPF_F_INGRESS)");
builder->endOfStatement(true);
builder->blockEnd(true);

cstring eg_port =
Util::printf_format("%s.egress_port", control->outputStandardMetadata->name.name);
cstring cos =
Expand Down
3 changes: 3 additions & 0 deletions backends/ebpf/psa/ebpfPipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class EBPFPipeline : public EBPFProgram {
cstring compilerGlobalMetadata;
// A variable name storing "1" value. Used to access BPF array map index.
cstring oneKey;
// A unique mark used to differentiate packets processed by P4/eBPF from others.
unsigned packetMark;

EBPFControlPSA* control;
EBPFDeparserPSA* deparser;
Expand All @@ -71,6 +73,7 @@ class EBPFPipeline : public EBPFProgram {
pktInstanceVar = compilerGlobalMetadata + cstring("->instance");
priorityVar = cstring("skb->priority");
oneKey = EBPFModel::reserved("one");
packetMark = 0x99;
}

/* Check if pipeline does any processing.
Expand Down
7 changes: 2 additions & 5 deletions backends/ebpf/runtime/psa.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,10 @@ struct psa_egress_deparser_input_metadata_t {
* The size of this struct must be less than 32 bytes.
*/
struct psa_global_metadata {
MulticastGroup_t multicast_group; /// set by Ingress, read by PRE
PortId_t egress_port; /// set by Ingress, read by PRE
CloneSessionId_t clone_session_id; /// set by Ingress/Egress, read by PRE
bool clone; /// set by Ingress/Egress, read by PRE
bool drop; /// set by Ingress/Egress, read by PRE
PSA_PacketPath_t packet_path; /// set by eBPF program as helper variable, read by ingress/egress
EgressInstance_t instance; /// set by PRE, read by Egress
__u8 mark; /// packet mark set by PSA/eBPF programs; distinguishes packets processed by PSA/eBPF from all other packets.
bool pass_to_kernel; /// internal metadata; when true, forces sending the packet up to the kernel stack
} __attribute__((aligned(4)));

struct clone_session_entry {
Expand Down
100 changes: 100 additions & 0 deletions backends/ebpf/tests/p4testdata/pass-to-kernel.p4
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#include <core.p4>
#include <psa.p4>
#include "common_headers.p4"

// User metadata: intentionally empty, this test program carries no per-packet user state.
struct metadata {
}

// Parsed headers. Neither parser in this program extracts `ethernet`, so the
// header stays invalid and the packet is passed through unmodified.
// NOTE(review): ethernet_t is presumably declared in common_headers.p4 - confirm.
struct headers {
ethernet_t ethernet;
}

// Ingress parser: accepts every packet immediately without extracting any header.
parser IngressParserImpl(
packet_in buffer,
out headers parsed_hdr,
inout metadata user_meta,
in psa_ingress_parser_input_metadata_t istd,
in empty_t resubmit_meta,
in empty_t recirculate_meta)
{
state start {
// No extraction: go straight to accept.
transition accept;
}
}

// Ingress control: clears the drop flag and never assigns an egress port, which is
// the condition under which PSA-eBPF sends the packet up to the kernel stack.
control ingress(inout headers hdr,
inout metadata user_meta,
in psa_ingress_input_metadata_t istd,
inout psa_ingress_output_metadata_t ostd)
{
apply {
// drop=false combined with egress_port=0 should send the packet up to the kernel stack
ostd.drop = false;
// ostd.egress_port is deliberately left unassigned
}
}

// Egress parser: accepts every packet immediately without extracting any header.
parser EgressParserImpl(
packet_in buffer,
out headers parsed_hdr,
inout metadata user_meta,
in psa_egress_parser_input_metadata_t istd,
in metadata normal_meta,
in empty_t clone_i2e_meta,
in empty_t clone_e2e_meta)
{
state start {
// No extraction: go straight to accept.
transition accept;
}
}

// Egress control: counts every packet that reaches it in entry 0 of `eg_packets`.
// The PTF test reads this counter to check that packets handed to the kernel
// stack never run through the PSA egress pipeline.
control egress(inout headers hdr,
inout metadata user_meta,
in psa_egress_input_metadata_t istd,
inout psa_egress_output_metadata_t ostd)
{
// Packet counter with 1024 entries; only index 0 is used below.
Counter<bit<32>, bit<32>>(1024, PSA_CounterType_t.PACKETS) eg_packets;

apply {
// this counter should not be incremented if packets are not coming from PSA ingress.
eg_packets.count(0);
}
}

// Ingress deparser: the apply body is empty, so no header is emitted and none of
// the `out` metadata parameters is assigned.
control IngressDeparserImpl(
packet_out packet,
out empty_t clone_i2e_meta,
out empty_t resubmit_meta,
out metadata normal_meta,
inout headers hdr,
in metadata meta,
in psa_ingress_output_metadata_t istd)
{
apply {
}
}

// Egress deparser: the apply body is empty, so no header is emitted and none of
// the `out` metadata parameters is assigned.
control EgressDeparserImpl(
packet_out packet,
out empty_t clone_e2e_meta,
out empty_t recirculate_meta,
inout headers hdr,
in metadata meta,
in psa_egress_output_metadata_t istd,
in psa_egress_deparser_input_metadata_t edstd)
{
apply {
}
}

// Assemble the ingress pipeline from its parser, control and deparser.
IngressPipeline(IngressParserImpl(),
ingress(),
IngressDeparserImpl()) ip;

// Assemble the egress pipeline from its parser, control and deparser.
EgressPipeline(EgressParserImpl(),
egress(),
EgressDeparserImpl()) ep;

// Top-level PSA package instantiation.
PSA_Switch(ip, PacketReplicationEngine(), ep, BufferingQueueingEngine()) main;


67 changes: 65 additions & 2 deletions backends/ebpf/tests/ptf/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import copy

from scapy.fields import ShortField, IntField
from scapy.layers.l2 import Ether
from scapy.layers.inet import IP, UDP
from scapy.layers.l2 import Ether, ARP
from scapy.layers.inet import IP, UDP, ICMP
from scapy.packet import Packet, bind_layers, split_layers
from ptf.packet import MPLS
from ptf.mask import Mask
Expand Down Expand Up @@ -587,3 +587,66 @@ def runTest(self):
pkt[IP].dst = 0x11993355 # mask is 0xFF00FFFF
testutils.send_packet(self, PORT0, pkt)
testutils.verify_packet(self, pkt, PORT1)


# Verifies that packets passed up to the kernel stack by pass-to-kernel.p4 are
# handled by Linux (IP routing, ARP reply, ICMP reply) and that none of them is
# counted by the PSA egress pipeline.
class PassToKernelStackTest(P4EbpfTest):

p4_file_path = "p4testdata/pass-to-kernel.p4"

# Configure routing, addressing and static ARP entries inside the switch namespace.
def setUp(self):
super(PassToKernelStackTest, self).setUp()
# static route
self.exec_ns_cmd("ip route add 20.0.0.15/32 dev eth1")
# add IP address to interface, so that it can reply with ICMP and ARP
self.exec_ns_cmd("ifconfig eth0 10.0.0.1 up")
# static ARP
self.exec_ns_cmd("arp -s 20.0.0.15 00:00:00:00:00:aa")
self.exec_ns_cmd("arp -s 10.0.0.2 00:00:00:00:00:cc")

# Remove the static ARP entries and the route added in setUp.
def tearDown(self):
self.exec_ns_cmd("arp -d 20.0.0.15")
self.exec_ns_cmd("arp -d 10.0.0.2")
self.exec_ns_cmd("ip route del 20.0.0.15/32")

super(PassToKernelStackTest, self).tearDown()


# Runs three scenarios (routed TCP, ARP request, ICMP request); after each one the
# egress counter must still read 0.
def runTest(self):
# simple forward by Linux routing
pkt = testutils.simple_tcp_packet(eth_dst="00:00:00:00:00:01", ip_src="10.0.0.2", ip_dst="20.0.0.15")
testutils.send_packet(self, PORT0, pkt)
exp_pkt = pkt.copy()
exp_pkt[Ether].src = "00:00:00:00:00:02" # MAC of eth1
exp_pkt[Ether].dst = "00:00:00:00:00:aa"
exp_pkt[IP].ttl = 63 # routed packet
testutils.verify_packet(self, exp_pkt, PORT1)
self.counter_verify(name="egress_eg_packets", key=[0], packets=0)

# ARP handling: expect an ARP reply (op=2) with sender and target fields swapped
pkt = testutils.simple_arp_packet(pktlen=21, eth_dst="00:00:00:00:00:01", ip_snd="10.0.0.2", ip_tgt="10.0.0.1")
testutils.send_packet(self, PORT0, pkt)
exp_pkt = pkt.copy()
exp_pkt[ARP].op = 2
exp_pkt[ARP].hwsrc = "00:00:00:00:00:01"
exp_pkt[ARP].hwdst = pkt[Ether].src
exp_pkt[ARP].psrc = pkt[ARP].pdst
exp_pkt[ARP].pdst = pkt[ARP].psrc
exp_pkt[Ether].src = "00:00:00:00:00:01"
exp_pkt[Ether].dst = pkt[Ether].src
testutils.verify_packet(self, exp_pkt, PORT0)
self.counter_verify(name="egress_eg_packets", key=[0], packets=0)

# ICMP handling: expect a reply with icmp_type=0 sent back from 10.0.0.1 on PORT0
pkt = testutils.simple_icmp_packet(eth_dst="00:00:00:00:00:01", ip_src="10.0.0.2", ip_dst="10.0.0.1")
testutils.send_packet(self, PORT0, pkt)
exp_pkt = testutils.simple_icmp_packet(eth_src="00:00:00:00:00:01", # MAC of eth0
eth_dst="00:00:00:00:00:cc",
ip_src="10.0.0.1",
ip_dst="10.0.0.2",
icmp_type=0)
mask = Mask(exp_pkt)
# Linux can generate random IP identification number,
# ignore ID and checksum in the validation
mask.set_do_not_care_scapy(IP, "id")
mask.set_do_not_care_scapy(IP, "chksum")
testutils.verify_packet(self, mask, PORT0)
self.counter_verify(name="egress_eg_packets", key=[0], packets=0)
4 changes: 4 additions & 0 deletions backends/ebpf/tests/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,19 @@ ip netns exec switch ip link add name psa_cpu type dummy
ip netns exec switch ip link set dev psa_cpu up

# Normal ports
# Create one veth pair per switch port and give the switch-side interface a
# deterministic MAC address 00:00:00:00:00:NN, where NN is the 1-based port
# index in hex. Tests rely on these predictable addresses.
idx=1
for intf in "${INTERFACES[@]}" ; do
    ip link add "s1-$intf" type veth peer name "$intf" netns switch
    ip netns exec switch ip link set "$intf" up
    # Format the index as two hex digits: a plain "0${idx}" suffix would produce
    # an invalid three-digit octet once there are ten or more interfaces.
    mac=$(printf '00:00:00:00:00:%02x' "$idx")
    ip netns exec switch ifconfig "$intf" hw ether "$mac"
    ip link set dev "s1-$intf" up

    # Disable trash traffic
    sysctl -w net.ipv6.conf."s1-$intf".disable_ipv6=1
    sysctl -w net.ipv6.conf."s1-$intf".autoconf=0
    sysctl -w net.ipv6.conf."s1-$intf".accept_ra=0

    # Shell arithmetic avoids spawning an external expr process.
    idx=$((idx + 1))
done

# Disable trash traffic
Expand Down

0 comments on commit 73908eb

Please sign in to comment.