Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
18 changes: 11 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ TUNED_COMMIT:=bc3f737a0080d38863915217c2e4482bbb77b322
TUNED_DIR:=daemon

# API-related variables
API_TYPES_DIR:=pkg/apis/tuned/v1
API_TYPES:=$(wildcard $(API_TYPES_DIR)/*_types.go)
API_TYPES_DIR:=pkg/apis
API_ZZ_GENERATED:=zz_generated.deepcopy
API_TYPES_GENERATED:=$(API_TYPES_DIR)/$(API_ZZ_GENERATED).go
API_GO_HEADER_FILE:=pkg/apis/header.go.txt

# Container image-related variables
Expand All @@ -48,21 +46,27 @@ clone-tuned:
cd $(TUNED_DIR) && git checkout $(TUNED_COMMIT) && cd .. && \
rm -rf $(TUNED_DIR)/.git)

build: $(BINDATA) pkg/generated
build: $(BINDATA) generate-deepcopy pkg/generated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps at some point add performance-addon-operator-bin target?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For sure, I will do it once I add the PAO dockerfiles.

$(GO_BUILD_RECIPE)
ln -sf $(PACKAGE_BIN) $(OUT_DIR)/openshift-tuned

$(BINDATA): $(GOBINDATA_BIN) $(ASSETS)
$(GOBINDATA_BIN) -mode 420 -modtime 1 -pkg manifests -o $(BINDATA) assets/...
gofmt -s -w $(BINDATA)

pkg/generated: $(API_TYPES)
performance-addon-operator-bin:
$(GO) build -o $(OUT_DIR)/performance-addon-operator -ldflags '-X $(PACKAGE)/version.Version=$(REV)' $(PACKAGE)/cmd/performance-addon-operator

generate-deepcopy:
$(GO) run k8s.io/code-generator/cmd/deepcopy-gen \
--input-dirs $(PACKAGE)/pkg/apis/tuned/v1 \
--input-dirs $(PACKAGE)/$(API_TYPES_DIR)/tuned/v1,$(PACKAGE)/$(API_TYPES_DIR)/performanceprofile/v1,$(PACKAGE)/$(API_TYPES_DIR)/performanceprofile/v2 \
-O $(API_ZZ_GENERATED) \
--go-header-file $(API_GO_HEADER_FILE) \
--bounding-dirs $(PACKAGE)/pkg/apis \
--output-base tmp
tar c tmp | tar x --strip-components=4

pkg/generated:
$(GO) run k8s.io/code-generator/cmd/client-gen \
--clientset-name versioned \
--input-base '' \
Expand All @@ -87,7 +91,7 @@ pkg/generated: $(API_TYPES)

crd-schema-gen:
# TODO: look into using https://github.com/openshift/build-machinery-go/ and yaml patches
$(GO) run ./vendor/sigs.k8s.io/controller-tools/cmd/controller-gen/ schemapatch:manifests=./manifests paths=./pkg/apis/tuned/v1 output:dir=./manifests
$(GO) run ./vendor/sigs.k8s.io/controller-tools/cmd/controller-gen/ schemapatch:manifests=./manifests paths=./pkg/apis/... output:dir=./manifests
yq w -i ./manifests/20-crd-tuned.yaml -s ./manifests/20-crd-tuned.yaml-patch

$(GOBINDATA_BIN):
Expand Down
11 changes: 11 additions & 0 deletions assets/performanceprofile/configs/99-low-latency-hooks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"version": "1.0.0",
"hook": {
"path": "/usr/local/bin/low-latency-hooks.sh",
"args": ["low-latency-hooks.sh", "{{.RPSMask}}"]
},
"when": {
"always": true
},
"stages": ["prestart"]
}
1 change: 1 addition & 0 deletions assets/performanceprofile/configs/99-netdev-rps.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# On "add" events in the net subsystem, for devices whose DEVPATH does not match
# /devices/virtual/net/veth*, tag the device for systemd and ask it (via
# SYSTEMD_WANTS) to start update-rps@%k.service, where %k is substituted by udev
# with the kernel name of the device.
SUBSYSTEM=="net", ACTION=="add", ENV{DEVPATH}!="/devices/virtual/net/veth*", TAG+="systemd", ENV{SYSTEMD_WANTS}="update-rps@%k.service"
20 changes: 20 additions & 0 deletions assets/performanceprofile/configs/99-runtimes.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{{if .ReservedCpus}}
[crio.runtime]
infra_ctr_cpuset = "{{.ReservedCpus}}"
{{end}}

# The default runc runtime is repeated here because this snippet overrides the whole runtimes section
[crio.runtime.runtimes.runc]
runtime_path = ""
runtime_type = "oci"
runtime_root = "/run/runc"

# CRI-O checks the allowed_annotations under the runtime handler and applies the high-performance hooks when one of
# the high-performance annotations is present under it.
# The runtime_path must be provided because this handler re-uses the runc binary and there is
# no high-performance binary in $PATH that would point to it.
[crio.runtime.runtimes.high-performance]
runtime_path = "/bin/runc"
runtime_type = "oci"
runtime_root = "/run/runc"
allowed_annotations = ["cpu-load-balancing.crio.io", "cpu-quota.crio.io", "irq-load-balancing.crio.io"]
26 changes: 26 additions & 0 deletions assets/performanceprofile/scripts/hugepages-allocation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
#
# Requests a number of huge pages on one NUMA node by writing the count to the
# node's nr_hugepages sysfs file, and keeps retrying until the file reports the
# requested value.
#
# Environment (all required; the script runs with `set -u`):
#   NUMA_NODE        NUMA node index used to build the sysfs path
#   HUGEPAGES_SIZE   huge page size in kB used to build the sysfs path
#   HUGEPAGES_COUNT  number of huge pages to request
#
# Exit status: 1 when the sysfs file does not exist or the requested count is
# still not reported once the retry budget is exhausted.

set -euo pipefail

sysfs_nodes="/sys/devices/system/node"
nr_hugepages_path="${sysfs_nodes}/node${NUMA_NODE}/hugepages/hugepages-${HUGEPAGES_SIZE}kB/nr_hugepages"

# Print an error line and terminate the script with a failure status.
fail() {
    echo "ERROR: $*"
    exit 1
}

[ -f "${nr_hugepages_path}" ] || fail "${nr_hugepages_path} does not exist"

max_wait=60      # retry budget, compared against the accumulated poll intervals
poll_interval=1  # pause between attempts, also the step added to the elapsed counter
elapsed=0

# Re-write the requested count until the value read back matches it.
while [ "$(<"${nr_hugepages_path}")" -ne "${HUGEPAGES_COUNT}" ]; do
    echo "${HUGEPAGES_COUNT}" >"${nr_hugepages_path}"

    elapsed=$((elapsed + poll_interval))
    if ((elapsed > max_wait)); then
        fail "${nr_hugepages_path} does not have the expected number of hugepages ${HUGEPAGES_COUNT}"
    fi

    sleep "${poll_interval}"
done
35 changes: 35 additions & 0 deletions assets/performanceprofile/scripts/low-latency-hooks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Container hook that applies an RPS CPU mask to the network devices that
# belong to a container.
#
# Usage: low-latency-hooks.sh <rps-mask>
#   <rps-mask>  value written as-is into the rps_cpus files
#   stdin       JSON document; its .pid field identifies the container process
#
# Two places are updated:
#   1. on the host, rx-0/rps_cpus of every veth peer of the container's links;
#   2. inside the container network namespace, every rps_cpus file found under
#      /sys/devices (remounting /sys read-write for the duration if needed).
#
# Exit status: failures are reported through logger and the script exits 0 so
# that the container still starts. The single exception is a failure to put
# /sys back to read-only, which exits 1.

mask="${1}"
[ -n "${mask}" ] || { logger "${0}: The rps-mask parameter is missing" ; exit 0; }

pid=$(jq '.pid' /dev/stdin 2>&1)
[[ $? -eq 0 && -n "${pid}" ]] || { logger "${0}: Failed to extract the pid: ${pid}"; exit 0; }

ns=$(ip netns identify "${pid}" 2>&1)
[[ $? -eq 0 && -n "${ns}" ]] || { logger "${0} Failed to identify the namespace: ${ns}"; exit 0; }

# Updates the container veth RPS mask on the node
netns_link_indexes=$(ip netns exec "${ns}" ip -j link | jq ".[] | select(.link_index != null) | .link_index")
for link_index in ${netns_link_indexes}; do
    # -r prints the interface name without JSON quotes; the index is passed as
    # a jq variable instead of being pasted into the filter text.
    container_veth=$(ip -j link | jq -r --argjson idx "${link_index}" '.[] | select(.ifindex == $idx) | .ifname')
    # No host interface with that index: nothing to update for this link.
    [ -n "${container_veth}" ] || continue
    echo "${mask}" > "/sys/devices/virtual/net/${container_veth}/queues/rx-0/rps_cpus"
done

# Updates the RPS mask for the interface inside of the container network namespace.
# Any failure of the probe (including a failing `ip netns exec`) is treated as
# "ro"; a real problem then surfaces in the remount step below, which is checked.
if ip netns exec "${ns}" [ -w /sys ]; then
    mode="rw"
else
    mode="ro"
fi

if [ "${mode}" = "ro" ]; then
    res=$(ip netns exec "${ns}" mount -o remount,rw /sys 2>&1)
    [ $? -eq 0 ] || { logger "${0}: Failed to remount /sys as rw: ${res}"; exit 0; }
fi

# /sys/class/net can't be used recursively to find the rps_cpus file, use /sys/devices instead.
# The mask and the matched path are handed to sh as positional parameters so
# neither of them is ever parsed as shell code.
res=$(ip netns exec "${ns}" find /sys/devices -type f -name rps_cpus -exec sh -c 'echo "$1" | cat > "$2"' _ "${mask}" {} \; 2>&1)
[[ $? -eq 0 && -z "${res}" ]] || logger "${0}: Failed to apply the RPS mask: ${res}"

if [ "${mode}" = "ro" ]; then
    # Error out so the pod will not start with a writable /sys
    ip netns exec "${ns}" mount -o remount,ro /sys || exit 1
fi
36 changes: 36 additions & 0 deletions assets/performanceprofile/scripts/set-rps-mask.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
#
# Writes an RPS CPU mask into every rps_cpus file below the queues directory
# of a network device.
#
# Usage: set-rps-mask.sh <device> <mask>
#   <device>  network device name, looked up as /sys/class/net/<device>
#   <mask>    value written as-is into each rps_cpus file
#
# Exit status: 1 when an argument is missing; 0 when the device directory
# cannot be found (a vanished interface is not treated as an error).

dev=$1
[ -n "${dev}" ] || { echo "The device argument is missing" >&2 ; exit 1; }

mask=$2
[ -n "${mask}" ] || { echo "The mask argument is missing" >&2 ; exit 1; }

dev_dir="/sys/class/net/${dev}"

# Looks through systemd's net device units for one whose SysFSPath ends in the
# requested device name but whose unit name carries a different device name,
# and points the global dev_dir at that new name.
function find_dev_dir {
    local systemd_devs systemd_dev dev_sysfs dev_orig_name dev_name

    systemd_devs=$(systemctl list-units -t device | grep sys-subsystem-net-devices | cut -d' ' -f1)

    for systemd_dev in ${systemd_devs}; do
        dev_sysfs=$(systemctl show "${systemd_dev}" -p SysFSPath --value)

        dev_orig_name="${dev_sysfs##*/}"
        if [ "${dev_orig_name}" = "${dev}" ]; then
            dev_name="${systemd_dev##*-}"
            dev_name="${dev_name%%.device}"
            if [ "${dev_name}" = "${dev}" ]; then # disregard the original device unit
                continue
            fi

            echo "${dev} device was renamed to $dev_name"
            dev_dir="/sys/class/net/${dev_name}"
            break
        fi
    done
}

[ -d "${dev_dir}" ] || find_dev_dir                # the net device was renamed, find the new name
[ -d "${dev_dir}" ] || { sleep 5; find_dev_dir; }  # search failed, wait a little and try again
[ -d "${dev_dir}" ] || { echo "${dev_dir}" directory not found >&2 ; exit 0; } # the interface disappeared, not an error

# The mask and the matched path are handed to sh as positional parameters so
# neither of them is ever parsed as shell code.
find "${dev_dir}/queues" -type f -name rps_cpus -exec sh -c 'echo "$1" | cat > "$2"' _ "${mask}" {} \;
111 changes: 111 additions & 0 deletions assets/performanceprofile/tuned/openshift-node-performance
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
[main]
summary=Openshift node optimized for deterministic performance at the cost of increased power consumption, focused on low latency network performance. Based on Tuned 2.11 and Cluster node tuning (oc 4.5)
include=openshift-node,cpu-partitioning

# Inheritance of base profiles legend:
# cpu-partitioning -> network-latency -> latency-performance
# https://github.com/redhat-performance/tuned/blob/master/profiles/latency-performance/tuned.conf
# https://github.com/redhat-performance/tuned/blob/master/profiles/network-latency/tuned.conf
# https://github.com/redhat-performance/tuned/blob/master/profiles/cpu-partitioning/tuned.conf

# All values are mapped with a comment where a parent profile contains them.
# Different values will override the original values in parent profiles.

[variables]
# isolated_cores take a list of ranges; e.g. isolated_cores=2,4-7
{{if .IsolatedCpus}}
isolated_cores={{.IsolatedCpus}}
{{end}}

not_isolated_cores_expanded=${f:cpulist_invert:${isolated_cores_expanded}}

[cpu]
force_latency=cstate.id:1|3 # latency-performance (override)
governor=performance # latency-performance
energy_perf_bias=performance # latency-performance
min_perf_pct=100 # latency-performance

[service]
service.stalld=start,enable

[vm]
transparent_hugepages=never # network-latency

{{if not .GloballyDisableIrqLoadBalancing}}
[irqbalance]
# Override the value set by cpu-partitioning with an empty one
banned_cpus=""
{{end}}

[scheduler]
runtime=0
group.ksoftirqd=0:f:11:*:ksoftirqd.*
group.rcuc=0:f:11:*:rcuc.*
{{if not .GloballyDisableIrqLoadBalancing}}
default_irq_smp_affinity = ignore
{{end}}

[sysctl]
kernel.hung_task_timeout_secs = 600 # cpu-partitioning #realtime
kernel.nmi_watchdog = 0 # cpu-partitioning #realtime
kernel.sched_rt_runtime_us = -1 # realtime
kernel.timer_migration = 0 # cpu-partitioning (= 1) #realtime (= 0)
kernel.numa_balancing=0 # network-latency
net.core.busy_read=50 # network-latency
net.core.busy_poll=50 # network-latency
net.ipv4.tcp_fastopen=3 # network-latency
vm.stat_interval = 10 # cpu-partitioning #realtime

# ktune sysctl settings for rhel6 servers, maximizing i/o throughput
#
# Minimal preemption granularity for CPU-bound tasks:
# (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
kernel.sched_min_granularity_ns=10000000 # latency-performance

# If a workload mostly uses anonymous memory and it hits this limit, the entire
# working set is buffered for I/O, and any more write buffering would require
# swapping, so it's time to throttle writes until I/O can catch up. Workloads
# that mostly use file mappings may be able to use even higher values.
#
# The generator of dirty data starts writeback at this percentage (system default
# is 20%)
vm.dirty_ratio=10 # latency-performance

# Start background writeback (via writeback threads) at this percentage (system
# default is 10%)
vm.dirty_background_ratio=3 # latency-performance

# The swappiness parameter controls the tendency of the kernel to move
# processes out of physical memory and onto the swap disk.
# 0 tells the kernel to avoid swapping processes out of physical memory
# for as long as possible
# 100 tells the kernel to aggressively swap processes out of physical memory
# and move them to swap cache
vm.swappiness=10 # latency-performance

# The total time the scheduler will consider a migrated process
# "cache hot" and thus less likely to be re-migrated
# (system default is 500000, i.e. 0.5 ms)
kernel.sched_migration_cost_ns=5000000 # latency-performance

[selinux]
avc_cache_threshold=8192 # Custom (atomic host)

{{if .NetDevices}}
{{.NetDevices}}
{{end}}

[bootloader]
# set empty values to disable RHEL initrd setting in cpu-partitioning
initrd_remove_dir=
initrd_dst_img=
initrd_add_dir=
# overrides cpu-partitioning cmdline
cmdline_cpu_part=+nohz=on rcu_nocbs=${isolated_cores} tuned.non_isolcpus=${not_isolated_cpumask} intel_pstate=disable nosoftlockup
{{if .StaticIsolation}}
cmdline_realtime=+tsc=nowatchdog intel_iommu=on iommu=pt isolcpus=domain,managed_irq,${isolated_cores} systemd.cpu_affinity=${not_isolated_cores_expanded}
{{else}}
cmdline_realtime=+tsc=nowatchdog intel_iommu=on iommu=pt isolcpus=managed_irq,${isolated_cores} systemd.cpu_affinity=${not_isolated_cores_expanded}
{{end}}
cmdline_hugepages=+{{if .DefaultHugepagesSize}} default_hugepagesz={{.DefaultHugepagesSize}} {{end}} {{if .Hugepages}} {{.Hugepages}} {{end}}
cmdline_additionalArg=+{{if .AdditionalArgs}} {{.AdditionalArgs}} {{end}}
Loading