Skip to content

Commit

Permalink
feat(gazer): created a POC scraper using ebpf
Browse files Browse the repository at this point in the history
  • Loading branch information
isala404 committed Oct 2, 2021
1 parent 74cd6e6 commit 088df5b
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 127 deletions.
133 changes: 6 additions & 127 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,129 +1,8 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# IDEA
**/.idea/**

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
# Project Proposal
Project Proposal/*
!Project Proposal/*.tex
!Project Proposal/*.bib
38 changes: 38 additions & 0 deletions gazer/bpf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include <uapi/linux/ptrace.h>
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>
#include <linux/ip.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
u64 ts_us;
u32 pid;
u32 saddr;
u32 daddr;
u64 ports;
u64 rx_b;
u64 tx_b;
u64 span_us;
char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct id_t {
u32 pid;
char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);

typedef struct backlog_key {
u32 backlog;
u32 saddr;
u16 lport;
u64 slot;
} backlog_key_t;

BPF_HASH(currsock, u32, struct sock *);

BPF_HISTOGRAM(syn_backlog, backlog_key_t);
141 changes: 141 additions & 0 deletions gazer/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# tcplife Trace the lifespan of TCP sessions and summarize.
# For Linux, uses BCC, BPF. Embedded C.
#
# USAGE: tcplife [-h] [-C] [-S] [-p PID] [-4 | -6] [interval [count]]
#
# This uses the sock:inet_sock_set_state tracepoint if it exists (added to
# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
# kernel dynamic tracing of tcp_set_state().
#
# While throughput counters are emitted, they are fetched in a low-overhead
# manner: reading members of the tcp_info struct on TCP close. ie, we do not
# trace send/receive.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# IDEA: Julia Evans
#
# 18-Oct-2016 Brendan Gregg Created this.
# 29-Dec-2017 " " Added tracepoint support.

from __future__ import print_function

import time

from bcc import BPF
import argparse
from socket import inet_ntop, AF_INET, inet_aton
from struct import pack, unpack

# arguments
examples = """examples:
./tcplife # trace all TCP connect()s
./tcplife -T # include time column (HH:MM:SS)
./tcplife -w # wider columns (fit IPv6)
./tcplife -stT # csv output, with times & timestamps
./tcplife -p 181 # only trace PID 181
./tcplife -L 80 # only trace local port 80
./tcplife -L 80,81 # only trace local ports 80 and 81
./tcplife -D 80 # only trace remote port 80
./tcplife -4 # only trace IPv4 family
./tcplife -6 # only trace IPv6 family
"""
parser = argparse.ArgumentParser(
description="Trace the lifespan of TCP sessions and summarize",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-w", "--wide", action="store_true",
help="wide column output (fits IPv6 addresses)")
parser.add_argument("-p", "--pid",
help="trace this PID only")
parser.add_argument("-a", "--addr",
help="filter for address")
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0

# define BPF program
with open('bpf.c', 'r') as f:
bpf_text = f.read()

with open('sock_state.c', 'r') as f:
bpf_text += f.read()

with open('syn_backlog.c', 'r') as f:
bpf_text += f.read()

# code substitutions
if args.pid:
bpf_text = bpf_text.replace('FILTER_PID',
'if (pid != %s) { return 0; }' % args.pid)

if args.addr:
bpf_text = bpf_text.replace('ADDRFILTER',
"""if (data4.saddr != {0} || data4.daddr != {0})
return 0;""".format(unpack("=I", inet_aton(args.addr))[0]))

bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('ADDRFILTER', '')

#
# Setup output formats
#
# Don't change the default output (next 2 lines): this fits in 80 chars. I
# know it doesn't have NS or UIDs etc. I know. If you really, really, really
# need to add columns, columns that solve real actual problems, I'd start by
# adding an extended mode (-x) to included those columns.
#
header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"


# process event
def print_ipv4_event(cpu, data, size):
event = b["ipv4_events"].event(data)
print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
"",
inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
event.tx_b, event.rx_b, float(event.span_us) / 1000))


# initialize BPF
b = BPF(text=bpf_text)

# header
print(header_string % ("PID", "COMM", "", "LADDR",
"LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))

start_ts = 0

# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
# b.attach_kprobe(event="tcp_v4_syn_recv_sock", fn_name="update_syn_backlog")
while 1:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
# exit()
break

# while True:
# try:
# time.sleep(999999)
# except KeyboardInterrupt:
# break

dist = b['syn_backlog']
print()
for item in dist.items():
print({"backlog": item[0].backlog,
"slot": item[0].slot,
"saddr": inet_ntop(AF_INET, pack("I", item[0].saddr)),
"lport": item[0].lport,
"value": item[1].value})


29 changes: 29 additions & 0 deletions gazer/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
apiVersion: v1
kind: Pod
metadata:
name: bcc
spec:
containers:
- command: [ "sh", "-c", "sleep 60d" ]
image: zlim/bcc
imagePullPolicy: IfNotPresent
name: bcc
resources: {}
securityContext:
privileged: true
volumeMounts:
- mountPath: /lib/modules
name: host-modules
readOnly: true
- mountPath: /usr/src
name: host-usr-src
readOnly: true
volumes:
- hostPath:
path: /lib/modules
type: Directory
name: host-modules
- hostPath:
path: /usr/src
type: Directory
name: host-usr-src
Loading

0 comments on commit 088df5b

Please sign in to comment.