diff --git a/backends/ebpf/psa/README.md b/backends/ebpf/psa/README.md index 8f52226b81..b3416df9e3 100644 --- a/backends/ebpf/psa/README.md +++ b/backends/ebpf/psa/README.md @@ -37,6 +37,20 @@ The TC-based design of PSA for eBPF is depicted in Figure below. ## Packet paths +### NTK (Normal Packet To Kernel) + +**WARNING!** The NTK packet path is a custom packet path used by PSA-eBPF only! It is not a standardized PSA packet path. + +The NTK packet path allows integrating P4/PSA programs for eBPF with the standard Linux kernel stack. The main use cases are handling +ICMP/ARP requests and sending packets to a userspace process listening on a socket. + +The NTK path is enforced if `drop` is set to `false` and `egress_port` is left unchanged or set to 0 (a special implicit port number that forwards packets to the kernel stack). +Since packets can be modified in the PSA ingress pipeline before they are sent to the kernel stack, a P4 programmer should make sure that packets use standard headers and are properly formatted. +Otherwise, the kernel stack will drop them. + +**NOTE!** There is no symmetric packet path *from the kernel* - once a packet enters the kernel network stack, it is further processed exclusively by the kernel. +As a consequence, all packets that have not been processed by the PSA Ingress pipeline (e.g., packets sent from a userspace application) will not be handled by the PSA Egress pipeline! + ### NFP (Normal Packet From Port) Packet arriving on an interface is intercepted in the XDP hook by the `xdp-helper` program. 
It performs pre-processing and diff --git a/backends/ebpf/psa/ebpfPipeline.cpp b/backends/ebpf/psa/ebpfPipeline.cpp index a698215ad3..b2f4911fb7 100644 --- a/backends/ebpf/psa/ebpfPipeline.cpp +++ b/backends/ebpf/psa/ebpfPipeline.cpp @@ -237,10 +237,13 @@ void EBPFIngressPipeline::emit(CodeBuilder* builder) { emitMetadataFromCPUMAP(builder); builder->newline(); - msgStr = - Util::printf_format("%s parser: parsing new packet, path=%%d, pkt_len=%%d", sectionName); + msgStr = Util::printf_format( + "%s parser: parsing new packet, input_port=%%d, path=%%d, " + "pkt_len=%%d", + sectionName); varStr = Util::printf_format("%s->packet_path", compilerGlobalMetadata); - builder->target->emitTraceMessage(builder, msgStr.c_str(), 2, varStr, lengthVar.c_str()); + builder->target->emitTraceMessage(builder, msgStr.c_str(), 3, ifindexVar.c_str(), varStr, + lengthVar.c_str()); // PARSER parser->emit(builder); @@ -395,6 +398,13 @@ void EBPFEgressPipeline::emit(CodeBuilder* builder) { builder->blockStart(); emitGlobalMetadataInitializer(builder); + builder->appendFormat("if (compiler_meta__->mark != %u) ", packetMark); + builder->blockStart(); + builder->emitIndent(); + builder->append("return TC_ACT_OK"); + builder->endOfStatement(true); + builder->blockEnd(true); + emitLocalVariables(builder); emitUserMetadataInstance(builder); builder->newline(); @@ -412,10 +422,13 @@ void EBPFEgressPipeline::emit(CodeBuilder* builder) { emitPSAControlOutputMetadata(builder); emitPSAControlInputMetadata(builder); - msgStr = - Util::printf_format("%s parser: parsing new packet, path=%%d, pkt_len=%%d", sectionName); + msgStr = Util::printf_format( + "%s parser: parsing new packet, input_port=%%d, path=%%d, " + "pkt_len=%%d", + sectionName); varStr = Util::printf_format("%s->packet_path", compilerGlobalMetadata); - builder->target->emitTraceMessage(builder, msgStr.c_str(), 2, varStr, lengthVar.c_str()); + builder->target->emitTraceMessage(builder, msgStr.c_str(), 3, ifindexVar.c_str(), varStr, + 
lengthVar.c_str()); // PARSER parser->emit(builder); @@ -458,12 +471,20 @@ void EBPFEgressPipeline::emit(CodeBuilder* builder) { void TCIngressPipeline::emitGlobalMetadataInitializer(CodeBuilder* builder) { EBPFPipeline::emitGlobalMetadataInitializer(builder); + // if Traffic Manager decided to pass packet to the kernel stack earlier, send it up immediately + builder->emitIndent(); + builder->append("if (compiler_meta__->pass_to_kernel == true) return TC_ACT_OK;"); + builder->newline(); + // workaround to make TC protocol-independent, DO NOT REMOVE builder->emitIndent(); // replace ether_type only if a packet comes from XDP builder->appendFormat("if (%s->packet_path == NORMAL) ", compilerGlobalMetadata); builder->blockStart(); builder->emitIndent(); + builder->appendFormat("compiler_meta__->mark = %u", packetMark); + builder->endOfStatement(true); + builder->emitIndent(); if (options.xdp2tcMode == XDP2TC_META) { emitTCWorkaroundUsingMeta(builder); } else if (options.xdp2tcMode == XDP2TC_HEAD) { @@ -557,6 +578,27 @@ void TCIngressPipeline::emitTrafficManager(CodeBuilder* builder) { control->outputStandardMetadata->name.name); builder->newline(); + builder->appendFormat("if (!%s.drop && %s.egress_port == 0) ", + control->outputStandardMetadata->name.name, + control->outputStandardMetadata->name.name); + builder->blockStart(); + builder->target->emitTraceMessage(builder, "IngressTM: Sending packet up to the kernel stack"); + builder->emitIndent(); + + // Since XDP helper re-writes EtherType for packets other than IPv4 (e.g., ARP) + // we cannot simply return TC_ACT_OK to pass the packet up to the kernel stack, + // because the kernel stack would receive a malformed packet (with invalid skb->protocol). + // The workaround is to send the packet back to the same interface. If we redirect, + // the packet will be re-written back to the original format. + // At the beginning of the pipeline we check if pass_to_kernel is true and, + // if so, the program returns TC_ACT_OK. 
+ builder->newline(); + builder->append("compiler_meta__->pass_to_kernel = true;"); + builder->newline(); + builder->append("return bpf_redirect(skb->ifindex, BPF_F_INGRESS)"); + builder->endOfStatement(true); + builder->blockEnd(true); + cstring eg_port = Util::printf_format("%s.egress_port", control->outputStandardMetadata->name.name); cstring cos = diff --git a/backends/ebpf/psa/ebpfPipeline.h b/backends/ebpf/psa/ebpfPipeline.h index 1319735b1a..71f5e1f40f 100644 --- a/backends/ebpf/psa/ebpfPipeline.h +++ b/backends/ebpf/psa/ebpfPipeline.h @@ -47,6 +47,8 @@ class EBPFPipeline : public EBPFProgram { cstring compilerGlobalMetadata; // A variable name storing "1" value. Used to access BPF array map index. cstring oneKey; + // A unique mark used to differentiate packets processed by P4/eBPF from others. + unsigned packetMark; EBPFControlPSA* control; EBPFDeparserPSA* deparser; @@ -71,6 +73,7 @@ class EBPFPipeline : public EBPFProgram { pktInstanceVar = compilerGlobalMetadata + cstring("->instance"); priorityVar = cstring("skb->priority"); oneKey = EBPFModel::reserved("one"); + packetMark = 0x99; } /* Check if pipeline does any processing. diff --git a/backends/ebpf/runtime/psa.h b/backends/ebpf/runtime/psa.h index 3b1022f57d..ae4a85f1c9 100644 --- a/backends/ebpf/runtime/psa.h +++ b/backends/ebpf/runtime/psa.h @@ -116,13 +116,10 @@ struct psa_egress_deparser_input_metadata_t { * The size of this struct must be less than 32 bytes. */ struct psa_global_metadata { - MulticastGroup_t multicast_group; /// set by Ingress, read by PRE - PortId_t egress_port; /// set by Ingress, read by PRE - CloneSessionId_t clone_session_id; /// set by Ingress/Egress, read by PRE - bool clone; /// set by Ingress/Egress, read by PRE - bool drop; /// set by Ingress/Egress, read by PRE PSA_PacketPath_t packet_path; /// set by eBPF program as helper variable, read by ingress/egress EgressInstance_t instance; /// set by PRE, read by Egress + __u8 mark; /// packet mark set by PSA/eBPF programs. 
Used to differentiate packets processed by PSA/eBPF from other packets. + bool pass_to_kernel; /// internal metadata, forces sending packet up to kernel stack } __attribute__((aligned(4))); struct clone_session_entry { diff --git a/backends/ebpf/tests/p4testdata/pass-to-kernel.p4 b/backends/ebpf/tests/p4testdata/pass-to-kernel.p4 new file mode 100644 index 0000000000..4f83873b7d --- /dev/null +++ b/backends/ebpf/tests/p4testdata/pass-to-kernel.p4 @@ -0,0 +1,100 @@ +#include <core.p4> +#include <psa.h> +#include "common_headers.p4" + +struct metadata { +} + +struct headers { + ethernet_t ethernet; +} + +parser IngressParserImpl( + packet_in buffer, + out headers parsed_hdr, + inout metadata user_meta, + in psa_ingress_parser_input_metadata_t istd, + in empty_t resubmit_meta, + in empty_t recirculate_meta) +{ + state start { + transition accept; + } +} + +control ingress(inout headers hdr, + inout metadata user_meta, + in psa_ingress_input_metadata_t istd, + inout psa_ingress_output_metadata_t ostd) +{ + apply { + // setting drop=false and egress_port=0 should enforce sending packet up to the kernel stack + ostd.drop = false; + // ostd.egress_port left unspecified + } +} + +parser EgressParserImpl( + packet_in buffer, + out headers parsed_hdr, + inout metadata user_meta, + in psa_egress_parser_input_metadata_t istd, + in metadata normal_meta, + in empty_t clone_i2e_meta, + in empty_t clone_e2e_meta) +{ + state start { + transition accept; + } +} + +control egress(inout headers hdr, + inout metadata user_meta, + in psa_egress_input_metadata_t istd, + inout psa_egress_output_metadata_t ostd) +{ + Counter<bit<32>, bit<32>>(1024, PSA_CounterType_t.PACKETS) eg_packets; + + apply { + // this counter should not be incremented if packets are not coming from PSA ingress. 
+ eg_packets.count(0); + } +} + +control IngressDeparserImpl( + packet_out packet, + out empty_t clone_i2e_meta, + out empty_t resubmit_meta, + out metadata normal_meta, + inout headers hdr, + in metadata meta, + in psa_ingress_output_metadata_t istd) +{ + apply { + } +} + +control EgressDeparserImpl( + packet_out packet, + out empty_t clone_e2e_meta, + out empty_t recirculate_meta, + inout headers hdr, + in metadata meta, + in psa_egress_output_metadata_t istd, + in psa_egress_deparser_input_metadata_t edstd) +{ + apply { + } +} + +IngressPipeline(IngressParserImpl(), + ingress(), + IngressDeparserImpl()) ip; + +EgressPipeline(EgressParserImpl(), + egress(), + EgressDeparserImpl()) ep; + +PSA_Switch(ip, PacketReplicationEngine(), ep, BufferingQueueingEngine()) main; + + diff --git a/backends/ebpf/tests/ptf/test.py b/backends/ebpf/tests/ptf/test.py index 8f69d609c3..780ba5dd49 100644 --- a/backends/ebpf/tests/ptf/test.py +++ b/backends/ebpf/tests/ptf/test.py @@ -19,8 +19,8 @@ import copy from scapy.fields import ShortField, IntField -from scapy.layers.l2 import Ether -from scapy.layers.inet import IP, UDP +from scapy.layers.l2 import Ether, ARP +from scapy.layers.inet import IP, UDP, ICMP from scapy.packet import Packet, bind_layers, split_layers from ptf.packet import MPLS from ptf.mask import Mask @@ -587,3 +587,66 @@ def runTest(self): pkt[IP].dst = 0x11993355 # mask is 0xFF00FFFF testutils.send_packet(self, PORT0, pkt) testutils.verify_packet(self, pkt, PORT1) + + +class PassToKernelStackTest(P4EbpfTest): + + p4_file_path = "p4testdata/pass-to-kernel.p4" + + def setUp(self): + super(PassToKernelStackTest, self).setUp() + # static route + self.exec_ns_cmd("ip route add 20.0.0.15/32 dev eth1") + # add IP address to interface, so that it can reply with ICMP and ARP + self.exec_ns_cmd("ifconfig eth0 10.0.0.1 up") + # static ARP + self.exec_ns_cmd("arp -s 20.0.0.15 00:00:00:00:00:aa") + self.exec_ns_cmd("arp -s 10.0.0.2 00:00:00:00:00:cc") + + def tearDown(self): + 
self.exec_ns_cmd("arp -d 20.0.0.15") + self.exec_ns_cmd("arp -d 10.0.0.2") + self.exec_ns_cmd("ip route del 20.0.0.15/32") + + super(PassToKernelStackTest, self).tearDown() + + + def runTest(self): + # simple forward by Linux routing + pkt = testutils.simple_tcp_packet(eth_dst="00:00:00:00:00:01", ip_src="10.0.0.2", ip_dst="20.0.0.15") + testutils.send_packet(self, PORT0, pkt) + exp_pkt = pkt.copy() + exp_pkt[Ether].src = "00:00:00:00:00:02" # MAC of eth1 + exp_pkt[Ether].dst = "00:00:00:00:00:aa" + exp_pkt[IP].ttl = 63 # routed packet + testutils.verify_packet(self, exp_pkt, PORT1) + self.counter_verify(name="egress_eg_packets", key=[0], packets=0) + + # ARP handling + pkt = testutils.simple_arp_packet(pktlen=21, eth_dst="00:00:00:00:00:01", ip_snd="10.0.0.2", ip_tgt="10.0.0.1") + testutils.send_packet(self, PORT0, pkt) + exp_pkt = pkt.copy() + exp_pkt[ARP].op = 2 + exp_pkt[ARP].hwsrc = "00:00:00:00:00:01" + exp_pkt[ARP].hwdst = pkt[Ether].src + exp_pkt[ARP].psrc = pkt[ARP].pdst + exp_pkt[ARP].pdst = pkt[ARP].psrc + exp_pkt[Ether].src = "00:00:00:00:00:01" + exp_pkt[Ether].dst = pkt[Ether].src + testutils.verify_packet(self, exp_pkt, PORT0) + self.counter_verify(name="egress_eg_packets", key=[0], packets=0) + + pkt = testutils.simple_icmp_packet(eth_dst="00:00:00:00:00:01", ip_src="10.0.0.2", ip_dst="10.0.0.1") + testutils.send_packet(self, PORT0, pkt) + exp_pkt = testutils.simple_icmp_packet(eth_src="00:00:00:00:00:01", # MAC of eth0 + eth_dst="00:00:00:00:00:cc", + ip_src="10.0.0.1", + ip_dst="10.0.0.2", + icmp_type=0) + mask = Mask(exp_pkt) + # Linux can generate random IP identification number, + # ignore ID and checksum in the validation + mask.set_do_not_care_scapy(IP, "id") + mask.set_do_not_care_scapy(IP, "chksum") + testutils.verify_packet(self, mask, PORT0) + self.counter_verify(name="egress_eg_packets", key=[0], packets=0) diff --git a/backends/ebpf/tests/test.sh b/backends/ebpf/tests/test.sh index d9e9b7d204..d6b711963b 100755 --- 
a/backends/ebpf/tests/test.sh +++ b/backends/ebpf/tests/test.sh @@ -105,15 +105,19 @@ ip netns exec switch ip link add name psa_cpu type dummy ip netns exec switch ip link set dev psa_cpu up # Normal ports +idx=1 for intf in "${INTERFACES[@]}" ; do ip link add "s1-$intf" type veth peer name "$intf" netns switch ip netns exec switch ip link set "$intf" up + ip netns exec switch ifconfig "$intf" hw ether "$(printf '00:00:00:00:00:%02x' "$idx")" ip link set dev "s1-$intf" up # Disable trash traffic sysctl -w net.ipv6.conf."s1-$intf".disable_ipv6=1 sysctl -w net.ipv6.conf."s1-$intf".autoconf=0 sysctl -w net.ipv6.conf."s1-$intf".accept_ra=0 + + idx=$((idx + 1)) done # Disable trash traffic