-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Mellanox|FFB]: Add support for Mellanox fast-fast boot #2294
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,26 +7,29 @@ function getMountPoint() | |
|
||
function getBootType() | ||
{ | ||
local TYPE | ||
case "$(cat /proc/cmdline)" in | ||
*SONIC_BOOT_TYPE=fast*) | ||
TYPE='fast' | ||
;; | ||
*SONIC_BOOT_TYPE=warm*) | ||
local BOOT_TYPE | ||
case "$(cat /proc/cmdline | grep -o 'SONIC_BOOT_TYPE=\S*' | cut -d'=' -f2)" in | ||
warm*) | ||
TYPE='warm' | ||
;; | ||
fast-fast) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fastfast |
||
TYPE='fast-fast' | ||
;; | ||
fast*) | ||
TYPE='fast' | ||
;; | ||
*) | ||
TYPE='cold' | ||
esac | ||
echo $TYPE | ||
echo "${TYPE}" | ||
} | ||
|
||
function preStartAction() | ||
{ | ||
{%- if docker_container_name == "database" %} | ||
WARM_DIR=/host/warmboot | ||
if [[ "$BOOT_TYPE" == "warm" && -f $WARM_DIR/dump.rdb ]]; then | ||
# Load redis content from /host/warm-reboot/dump.rdb | ||
# Load redis content from /host/warmboot/dump.rdb | ||
docker cp $WARM_DIR/dump.rdb database:/var/lib/redis/dump.rdb | ||
else | ||
# Create an empty file and overwrite any RDB if already there | ||
|
@@ -46,7 +49,6 @@ function postStartAction() | |
until [[ $(/usr/bin/docker exec database redis-cli -s $REDIS_SOCK ping | grep -c PONG) -gt 0 ]]; do | ||
sleep 1; | ||
done | ||
|
||
if [[ "$BOOT_TYPE" == "warm" && -f $WARM_DIR/dump.rdb ]]; then | ||
rm -f $WARM_DIR/dump.rdb | ||
else | ||
|
@@ -59,7 +61,7 @@ function postStartAction() | |
fi | ||
{%- elif docker_container_name == "swss" %} | ||
docker exec swss rm -f /ready # remove cruft | ||
if [[ "$BOOT_TYPE" == "fast" && -d /host/fast-reboot ]]; then | ||
if [[ "$BOOT_TYPE" == "fast" || "$BOOT_TYPE" == "fast-fast" ]] && [[ -d /host/fast-reboot ]]; then | ||
test -e /host/fast-reboot/fdb.json && docker cp /host/fast-reboot/fdb.json swss:/ | ||
test -e /host/fast-reboot/arp.json && docker cp /host/fast-reboot/arp.json swss:/ | ||
test -e /host/fast-reboot/default_routes.json && docker cp /host/fast-reboot/default_routes.json swss:/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,7 +90,11 @@ start() { | |
|
||
# Don't flush DB during warm boot | ||
if [[ x"$WARM_BOOT" != x"true" ]]; then | ||
/usr/bin/docker exec database redis-cli -n 0 FLUSHDB | ||
# Don't flush APP_DB during MLNX fast-fast boot | ||
BOOT_TYPE="$(cat /proc/cmdline | grep -o 'SONIC_BOOT_TYPE=\S*' | cut -d'=' -f2)" | ||
if [[ x"$BOOT_TYPE" != x"fast-fast" ]] && [[ ! -f /var/warmboot/issu_started ]]; then | ||
/usr/bin/docker exec database redis-cli -n 0 FLUSHDB | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do not flush app_db? |
||
fi | ||
/usr/bin/docker exec database redis-cli -n 2 FLUSHDB | ||
/usr/bin/docker exec database redis-cli -n 5 FLUSHDB | ||
clean_up_tables 6 "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*'" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# mellanox fast fast boot script | ||
|
||
MLNX_FFB_SCRIPT = mlnx-ffb.sh | ||
$(MLNX_FFB_SCRIPT)_PATH = platform/mellanox/ | ||
SONIC_COPY_FILES += $(MLNX_FFB_SCRIPT) | ||
|
||
export MLNX_FFB_SCRIPT |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#!/bin/bash | ||
|
||
FFB_SUCCESS=0 | ||
FFB_FAILURE=1 | ||
|
||
# Check if ISSU is enabled on this device.
# Runs "show platform mlnx issu" and looks for the literal text "enabled"
# anywhere in its output.
# Returns FFB_SUCCESS when the text is found, FFB_FAILURE otherwise.
check_issu_enabled()
{
    ISSU_CHECK_CMD="show platform mlnx issu"
    CHECK_RESULT="${FFB_FAILURE}"

    # Substring match on the command output; any other output keeps the
    # default failure result
    case "$(${ISSU_CHECK_CMD})" in
        *enabled*)
            CHECK_RESULT="${FFB_SUCCESS}"
            ;;
    esac

    return "${CHECK_RESULT}"
}
|
||
# Check if ISSU upgrade from current SDK to next image SDK is supported.
#
# Reads the "Next" and "Current" images from "sonic_installer list". When they
# are the same image, returns FFB_SUCCESS immediately. Otherwise mounts the
# next image's fs.squashfs, reads its etc/mlnx/sdk-version file and runs
# "issu --check <version>" inside the syncd container.
#
# Returns FFB_SUCCESS when the check command succeeds, FFB_FAILURE when the
# image cannot be mounted, the SDK version file is missing, or the check fails.
check_sdk_upgrade()
{
    CHECK_RESULT="${FFB_FAILURE}"

    NEXT_SONIC_IMAGE="$(sonic_installer list | grep "Next: " | cut -f2 -d' ')"
    CURRENT_SONIC_IMAGE="$(sonic_installer list | grep "Current: " | cut -f2 -d' ')"

    FS_PATH="/host/image-${NEXT_SONIC_IMAGE#SONiC-OS-}/fs.squashfs"
    FS_MOUNTPOINT="/tmp/image-${NEXT_SONIC_IMAGE#SONiC-OS-}-fs"

    # Next image is the current image: report success without mounting anything
    if [[ "${CURRENT_SONIC_IMAGE}" == "${NEXT_SONIC_IMAGE}" ]]; then
        return "${FFB_SUCCESS}"
    fi

    # Single-pass loop: "break" jumps to the common cleanup below
    while :; do
        mkdir -p "${FS_MOUNTPOINT}"
        mount -t squashfs "${FS_PATH}" "${FS_MOUNTPOINT}" || {
            >&2 echo "Failed to mount next SONiC image"
            break
        }

        SDK_VERSION_FILE_PATH="${FS_MOUNTPOINT}/etc/mlnx/sdk-version"

        [ -f "${SDK_VERSION_FILE_PATH}" ] && {
            NEXT_SDK_VERSION="$(cat "${SDK_VERSION_FILE_PATH}")"
        } || {
            >&2 echo "No SDK version file ${SDK_VERSION_FILE_PATH}"
            break
        }

        ISSU_CHECK_CMD="docker exec -t syncd issu --check ${NEXT_SDK_VERSION}"

        # Must expand ISSU_CHECK_CMD: an unset variable here leaves only the
        # redirection, which always succeeds and would mark every upgrade as
        # supported without running the check.
        ${ISSU_CHECK_CMD} > /dev/null && CHECK_RESULT="${FFB_SUCCESS}"

        break
    done

    # Best-effort cleanup of the temporary mount, whatever the outcome
    umount -rf "${FS_MOUNTPOINT}" 2> /dev/null || true
    rm -rf "${FS_MOUNTPOINT}" 2> /dev/null || true

    return "${CHECK_RESULT}"
}
|
||
# Perform ISSU start.
# Runs "issu --start" inside the syncd container (output discarded), then
# creates the /host/warmboot/issu_started marker file regardless of the
# command's exit status.
# Returns the exit status of the "issu --start" command.
issu_start()
{
    ISSU_START_CMD="docker exec -t syncd issu --start"

    ${ISSU_START_CMD} > /dev/null
    # Capture the status right away, before "touch" overwrites $?
    EXIT_CODE=$?

    touch /host/warmboot/issu_started

    return "${EXIT_CODE}"
}
|
||
# Perform ISSU end.
# Runs "issu --end" inside the syncd container with its output discarded.
# Returns the exit status of that command.
issu_end()
{
    ISSU_END_CMD="docker exec -t syncd issu --end"

    ${ISSU_END_CMD} > /dev/null
    EXIT_CODE=$?

    return "${EXIT_CODE}"
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# issu (SONiC MLNX platform ISSU tool) Debian package | ||
|
||
MLNX_ISSU = python-mlnx-issu_1.0-1_all.deb | ||
$(MLNX_ISSU)_SRC_PATH = $(PLATFORM_PATH)/mlnx-issu | ||
SONIC_PYTHON_STDEB_DEBS += $(MLNX_ISSU) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!/usr/bin/env python | ||
""" | ||
Part of MLNX platform specific fast-fast boot implementation for warm-boot. | ||
Notifies the SYNCD process once boot is finished after warm-reboot. | ||
Once SYNCD has received such a notification, it should set the appropriate SAI attribute. | ||
Then SAI will notify SDK to end ISSU mode for the FFB. | ||
""" | ||
|
||
|
||
import time | ||
import swsssdk | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use the swsscommon Python binding, which has NotificationConsumer implemented; do not use swsssdk. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
from threading import Timer | ||
|
||
|
||
class FFB(object): | ||
"""Provides implementation for MLNX fast-fast boot""" | ||
DB_WARM_TABLE_KEY = 'WARM_RESTART_TABLE|bgp' | ||
DB_STATE_ENTRY_NAME = 'state' | ||
DB_STATE_TYPE_RECONCILED = 'reconciled' | ||
DB_CHANNEL_NAME = 'MLNX_FFB' | ||
DB_CHANNEL_MSG = '["SET","ISSU_END"]' # message should be in the following format: ["<operation>","<data>"] | ||
SUB_THREAD_TIMEOUT = 1 | ||
STOP_TIMER_TIMEOUT = 180 | ||
|
||
def __init__(self):
    """Connect to STATE_DB, subscribe to key events and prepare the timer.

    Records the current value of the warm-restart state entry, registers
    ``eventHandler`` for key events on that entry, and creates (but does
    not start) a one-shot timer that calls ``finish`` after
    ``STOP_TIMER_TIMEOUT`` seconds.
    """
    self.state_db = connector = swsssdk.SonicV2Connector()
    state_db_name = connector.STATE_DB
    connector.connect(state_db_name)

    # Remember the state seen at start-up so later events can be compared
    self.prevState = connector.get(
        state_db_name, self.DB_WARM_TABLE_KEY, self.DB_STATE_ENTRY_NAME
    )

    # Pattern-subscribe on the key with eventHandler as the callback
    key_pattern = '__key*@6__:{}'.format(self.DB_WARM_TABLE_KEY)
    self.pubSub = connector.redis_clients[state_db_name].pubsub()
    self.pubSub.psubscribe(**{key_pattern: self.eventHandler})

    self.timeoutTimer = Timer(self.STOP_TIMER_TIMEOUT, self.finish)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do not feel the finish is tied to bgp reconciliation logic. let's have an offline discussion. |
||
|
||
def run(self):
    """Start the event-listening thread and arm the timeout timer."""
    poll_interval = self.SUB_THREAD_TIMEOUT
    # Background thread that delivers subscribed events to the handler
    self.eventThread = self.pubSub.run_in_thread(sleep_time=poll_interval)
    # One-shot timer: exit path when the required event never arrives
    self.timeoutTimer.start()
|
||
def finish(self):
    """Stop listening, wait, then publish the "FFB END" message.

    Stops the event thread and cancels the timeout timer, sleeps for a
    fixed 60 seconds, and publishes ``DB_CHANNEL_MSG`` on
    ``DB_CHANNEL_NAME`` through the STATE_DB connection.
    """
    self.eventThread.stop()
    self.timeoutTimer.cancel()

    # W/A: wait until configuration is applied to HW since it takes some time
    time.sleep(60)

    # Publish "FFB END" event to SYNCD process
    connector = self.state_db
    connector.publish(
        connector.STATE_DB, self.DB_CHANNEL_NAME, self.DB_CHANNEL_MSG
    )
|
||
def eventHandler(self, msg):
    """Handle a key event and call ``finish`` on a change to reconciled.

    Events whose ``data`` is not ``'hset'`` are ignored. Otherwise the
    state entry is re-read; ``finish`` is called when the value differs
    from the previously seen one and equals ``DB_STATE_TYPE_RECONCILED``.
    In every other case the value is stored as the new previous state.
    """
    # Only "set" operations are of interest
    if msg['data'] != 'hset':
        return

    connector = self.state_db
    current = connector.get(
        connector.STATE_DB, self.DB_WARM_TABLE_KEY, self.DB_STATE_ENTRY_NAME
    )

    # Unchanged, or changed to something other than reconciled: just track it
    if current == self.prevState or current != self.DB_STATE_TYPE_RECONCILED:
        self.prevState = current
        return

    self.finish()
|
||
|
||
def main():
    """Create the FFB helper and start it."""
    ffb = FFB()
    ffb.run()
|
||
|
||
if __name__ == '__main__': | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should handle both cases for backward compatibility with 201803:
otherwise we cannot fast-reboot from 201803 into 201811.