Patch overview (free-form text ahead of the first "diff --git" header; it is
not part of any hunk):

  * docker-dhcp-relay wait_for_intf.sh.j2 and its sonic-config-engine sample
    output: wait_until_iface_ready now takes a table prefix and an interface
    name, and loops with a one-second sleep until
    `redis-cli -n 6 HGET "<prefix>|<iface>" state` returns "ok". The previous
    `ip link show` / `ip -4 addr show` polling is removed.
  * docker-orchagent: copies supervisor-proc-exit-listener and a
    critical_processes list into the image, registers the listener for
    PROCESS_STATE_EXITED events, and switches autorestart from false to
    unexpected for the program whose command is `/usr/sbin/rsyslogd -n`.
  * supervisor-proc-exit-listener (new): sends SIGTERM to its parent process
    when a process listed in /etc/supervisor/critical_processes exits with
    expected == 0.
  * systemd templates: swss gains StartLimitInterval/StartLimitBurst and
    Restart=always/RestartSec=30; dhcp_relay, radv, snmp and teamd add
    swss.service to WantedBy; teamd also adds swss.service to Requires/After.
  * build rules: every platform docker-orchagent .mk adds the listener script
    to _FILES, docker-dhcp-relay adds $(REDIS_TOOLS) to _DEPENDS, and
    rules/scripts.mk declares the listener script.

NOTE(review): leading indentation and the placement of blank context lines
were reconstructed from the hunk line counts; confirm with
`git apply --check` against the target tree before relying on this patch.

diff --git a/dockers/docker-dhcp-relay/wait_for_intf.sh.j2 b/dockers/docker-dhcp-relay/wait_for_intf.sh.j2
index 037dc66ead63..23133706cb6c 100644
--- a/dockers/docker-dhcp-relay/wait_for_intf.sh.j2
+++ b/dockers/docker-dhcp-relay/wait_for_intf.sh.j2
@@ -1,42 +1,40 @@
 #!/usr/bin/env bash
 
-function wait_until_iface_ready
-{
-    IFACE=$1
+STATE_DB_IDX="6"
 
-    echo "Waiting until interface $IFACE is up..."
-
-    # Wait for the interface to come up (i.e., 'ip link show' returns 0)
-    until ip link show dev $IFACE up > /dev/null 2>&1; do
-        sleep 1
-    done
+PORT_TABLE_PREFIX="PORT_TABLE"
+VLAN_TABLE_PREFIX="VLAN_TABLE"
+LAG_TABLE_PREFIX="LAG_TABLE"
 
-    echo "Interface $IFACE is up"
+function wait_until_iface_ready
+{
+    TABLE_PREFIX=$1
+    IFACE=$2
 
-    echo "Waiting until interface $IFACE has an IPv4 address..."
+    echo "Waiting until interface $IFACE is ready..."
 
-    # Wait until the interface gets assigned an IPv4 address
+    # Wait for the interface to come up
+    # (i.e., interface is present in STATE_DB and state is "ok")
     while true; do
-        IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)
-
-        if [ -n "$IP" ]; then
+        RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
+        if [ x"$RESULT" == x"ok" ]; then
             break
         fi
 
         sleep 1
     done
 
-    echo "Interface $IFACE is configured with IP $IP"
+    echo "Interface ${IFACE} is ready!"
 }
 
 
-# Wait for all interfaces to come up and have IPv4 addresses assigned
+# Wait for all interfaces to be up and ready
 {% for (name, prefix) in INTERFACE %}
-wait_until_iface_ready {{ name }}
+wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }}
 {% endfor %}
 {% for (name, prefix) in VLAN_INTERFACE %}
-wait_until_iface_ready {{ name }}
+wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }}
 {% endfor %}
 {% for (name, prefix) in PORTCHANNEL_INTERFACE %}
-wait_until_iface_ready {{ name }}
+wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }}
 {% endfor %}
diff --git a/dockers/docker-orchagent/Dockerfile.j2 b/dockers/docker-orchagent/Dockerfile.j2
index 1597b436c3fe..6958390ed2da 100755
--- a/dockers/docker-orchagent/Dockerfile.j2
+++ b/dockers/docker-orchagent/Dockerfile.j2
@@ -30,6 +30,8 @@ COPY ["files/arp_update", "/usr/bin"]
 COPY ["enable_counters.py", "/usr/bin"]
 COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
 COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
+COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
+COPY ["critical_processes", "/etc/supervisor/"]
 
 ## Copy all Jinja2 template files into the templates folder
 COPY ["*.j2", "/usr/share/sonic/templates/"]
diff --git a/dockers/docker-orchagent/critical_processes b/dockers/docker-orchagent/critical_processes
new file mode 100644
index 000000000000..d48eb66cda1b
--- /dev/null
+++ b/dockers/docker-orchagent/critical_processes
@@ -0,0 +1,7 @@
+orchagent
+portsyncd
+intfsyncd
+neighsyncd
+vlanmgrd
+intfmgrd
+buffermgrd
diff --git a/dockers/docker-orchagent/supervisord.conf b/dockers/docker-orchagent/supervisord.conf
index 2369a4c31b05..076c02abd289 100644
--- a/dockers/docker-orchagent/supervisord.conf
+++ b/dockers/docker-orchagent/supervisord.conf
@@ -3,6 +3,12 @@ logfile_maxbytes=1MB
 logfile_backups=2
 nodaemon=true
 
+[eventlistener:supervisor-proc-exit-listener]
+command=/usr/bin/supervisor-proc-exit-listener
+events=PROCESS_STATE_EXITED
+autostart=true
+autorestart=unexpected
+
 [program:start.sh]
 command=/usr/bin/start.sh
 priority=1
@@ -15,7 +21,7 @@ stderr_logfile=syslog
 command=/usr/sbin/rsyslogd -n
 priority=2
 autostart=false
-autorestart=false
+autorestart=unexpected
 stdout_logfile=syslog
 stderr_logfile=syslog
 
diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2
index b4d9ceb38f0c..c3edc27bea94 100644
--- a/files/build_templates/dhcp_relay.service.j2
+++ b/files/build_templates/dhcp_relay.service.j2
@@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
 ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
 
 [Install]
-WantedBy=multi-user.target teamd.service
+WantedBy=multi-user.target swss.service teamd.service
diff --git a/files/build_templates/radv.service.j2 b/files/build_templates/radv.service.j2
index 065f01906076..469cf92108ad 100644
--- a/files/build_templates/radv.service.j2
+++ b/files/build_templates/radv.service.j2
@@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh attach
 ExecStop=/usr/bin/{{ docker_container_name }}.sh stop
 
 [Install]
-WantedBy=multi-user.target
+WantedBy=multi-user.target swss.service
diff --git a/files/build_templates/snmp.service.j2 b/files/build_templates/snmp.service.j2
index f344f2e805ea..ca2648cd80e8 100644
--- a/files/build_templates/snmp.service.j2
+++ b/files/build_templates/snmp.service.j2
@@ -8,3 +8,6 @@ Before=ntp-config.service
 ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
 ExecStart=/usr/bin/{{docker_container_name}}.sh attach
 ExecStop=/usr/bin/{{docker_container_name}}.sh stop
+
+[Install]
+WantedBy=multi-user.target swss.service
diff --git a/files/build_templates/swss.service.j2 b/files/build_templates/swss.service.j2
index afd442e8c44a..c122e678c4fe 100644
--- a/files/build_templates/swss.service.j2
+++ b/files/build_templates/swss.service.j2
@@ -14,6 +14,8 @@ After=opennsl-modules-3.16.0-6-amd64.service
 After=nps-modules-3.16.0-6-amd64.service
 {% endif %}
 Before=ntp-config.service
+StartLimitInterval=1200
+StartLimitBurst=3
 
 [Service]
 User=root
@@ -52,6 +54,8 @@ ExecStopPost=/usr/bin/mst stop
 ExecStopPost=/etc/init.d/xpnet.sh stop
 ExecStopPost=/etc/init.d/xpnet.sh start
 {% endif %}
+Restart=always
+RestartSec=30
 
 [Install]
 WantedBy=multi-user.target
diff --git a/files/build_templates/teamd.service.j2 b/files/build_templates/teamd.service.j2
index 5cd36c6748b9..0255e14a34f8 100644
--- a/files/build_templates/teamd.service.j2
+++ b/files/build_templates/teamd.service.j2
@@ -1,7 +1,7 @@
 [Unit]
 Description=TEAMD container
-Requires=updategraph.service
-After=updategraph.service
+Requires=updategraph.service swss.service
+After=updategraph.service swss.service
 Before=ntp-config.service
 
 [Service]
@@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh attach
 ExecStop=/usr/bin/{{docker_container_name}}.sh stop
 
 [Install]
-WantedBy=multi-user.target
+WantedBy=multi-user.target swss.service
diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener
new file mode 100755
index 000000000000..6bc62fc400c8
--- /dev/null
+++ b/files/scripts/supervisor-proc-exit-listener
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+import os
+import signal
+import sys
+import syslog
+
+from supervisor import childutils
+
+# Contents of file should be the names of critical processes (as defined in
+# supervisor.conf file), one per line
+CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
+
+def main():
+    # Read the list of critical processes from a file
+    with open(CRITICAL_PROCESSES_FILE, 'r') as f:
+        critical_processes = [line.rstrip('\n') for line in f]
+
+    while True:
+        # Transition from ACKNOWLEDGED to READY
+        childutils.listener.ready()
+
+        line = sys.stdin.readline()
+        headers = childutils.get_headers(line)
+        payload = sys.stdin.read(int(headers['len']))
+
+        # Transition from READY to ACKNOWLEDGED
+        childutils.listener.ok()
+
+        # We only care about PROCESS_STATE_EXITED events
+        if headers['eventname'] == 'PROCESS_STATE_EXITED':
+            payload_headers, payload_data = childutils.eventdata(payload + '\n')
+
+            expected = int(payload_headers['expected'])
+            processname = payload_headers['processname']
+
+            # If a critical process exited unexpectedly, terminate supervisor
+            if expected == 0 and processname in critical_processes:
+                MSG_FORMAT_STR = "Process {} exited unexpectedly. Terminating supervisor..."
+                msg = MSG_FORMAT_STR.format(payload_headers['processname'])
+                syslog.syslog(syslog.LOG_INFO, msg)
+                os.kill(os.getppid(), signal.SIGTERM)
+
+if __name__ == "__main__":
+    main()
diff --git a/platform/broadcom/docker-orchagent-brcm.mk b/platform/broadcom/docker-orchagent-brcm.mk
index 066973967450..a3bbb3fe4193 100644
--- a/platform/broadcom/docker-orchagent-brcm.mk
+++ b/platform/broadcom/docker-orchagent-brcm.mk
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 $(DOCKER_ORCHAGENT_BRCM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
 
 $(DOCKER_ORCHAGENT_BRCM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
-$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT)
+$(DOCKER_ORCHAGENT_BRCM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/platform/cavium/docker-orchagent-cavm.mk b/platform/cavium/docker-orchagent-cavm.mk
index a171a6c801d7..684376647fa9 100644
--- a/platform/cavium/docker-orchagent-cavm.mk
+++ b/platform/cavium/docker-orchagent-cavm.mk
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 $(DOCKER_ORCHAGENT_CAVM)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
 
 $(DOCKER_ORCHAGENT_CAVM)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
-$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT)
+$(DOCKER_ORCHAGENT_CAVM)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/platform/centec/docker-orchagent-centec.mk b/platform/centec/docker-orchagent-centec.mk
index e1d7fd6cf0d6..253e6be06e9e 100644
--- a/platform/centec/docker-orchagent-centec.mk
+++ b/platform/centec/docker-orchagent-centec.mk
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 $(DOCKER_ORCHAGENT_CENTEC)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
 
 $(DOCKER_ORCHAGENT_CENTEC)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
-$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT)
+$(DOCKER_ORCHAGENT_CENTEC)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/platform/marvell/docker-orchagent-mrvl.mk b/platform/marvell/docker-orchagent-mrvl.mk
index f2cb0c997d75..6b24c5905c0b 100644
--- a/platform/marvell/docker-orchagent-mrvl.mk
+++ b/platform/marvell/docker-orchagent-mrvl.mk
@@ -15,4 +15,4 @@ $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /host/machine.conf:/host/machine.conf
 $(DOCKER_ORCHAGENT_MRVL)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 
 $(DOCKER_ORCHAGENT_MRVL)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
-$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT)
+$(DOCKER_ORCHAGENT_MRVL)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/platform/mellanox/docker-orchagent-mlnx.mk b/platform/mellanox/docker-orchagent-mlnx.mk
index 6dda3c446b06..78bf8d44cf4b 100644
--- a/platform/mellanox/docker-orchagent-mlnx.mk
+++ b/platform/mellanox/docker-orchagent-mlnx.mk
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 $(DOCKER_ORCHAGENT_MLNX)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
 
 $(DOCKER_ORCHAGENT_MLNX)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
-$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT)
+$(DOCKER_ORCHAGENT_MLNX)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/platform/nephos/docker-orchagent-nephos.mk b/platform/nephos/docker-orchagent-nephos.mk
index b21e69d6cf87..4b6ef241f536 100644
--- a/platform/nephos/docker-orchagent-nephos.mk
+++ b/platform/nephos/docker-orchagent-nephos.mk
@@ -16,4 +16,4 @@ $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
 $(DOCKER_ORCHAGENT_NEPHOS)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw
 
 $(DOCKER_ORCHAGENT_NEPHOS)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
-$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT)
+$(DOCKER_ORCHAGENT_NEPHOS)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
diff --git a/rules/docker-dhcp-relay.mk b/rules/docker-dhcp-relay.mk
index 53406ad1e15f..7f960920ec0c 100644
--- a/rules/docker-dhcp-relay.mk
+++ b/rules/docker-dhcp-relay.mk
@@ -2,7 +2,7 @@
 
 DOCKER_DHCP_RELAY = docker-dhcp-relay.gz
 $(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/docker-dhcp-relay
-$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT)
+$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(ISC_DHCP_CLIENT) $(REDIS_TOOLS)
 $(DOCKER_DHCP_RELAY)_LOAD_DOCKERS = $(DOCKER_CONFIG_ENGINE)
 SONIC_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
 SONIC_INSTALL_DOCKER_IMAGES += $(DOCKER_DHCP_RELAY)
diff --git a/rules/scripts.mk b/rules/scripts.mk
index fbefdd68d2cd..d5bcc51fd3a9 100644
--- a/rules/scripts.mk
+++ b/rules/scripts.mk
@@ -5,7 +5,11 @@ $(ARP_UPDATE_SCRIPT)_PATH = files/scripts
 
 CONFIGDB_LOAD_SCRIPT = configdb-load.sh
 $(CONFIGDB_LOAD_SCRIPT)_PATH = files/scripts
 
+SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
+$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts
+
 SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
-                    $(ARP_UPDATE_SCRIPT)
+                    $(ARP_UPDATE_SCRIPT) \
+                    $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
 
diff --git a/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh b/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh
index 3cbde972fe18..3562082647a0 100644
--- a/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh
+++ b/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh
@@ -1,43 +1,41 @@
 #!/usr/bin/env bash
 
-function wait_until_iface_ready
-{
-    IFACE=$1
+STATE_DB_IDX="6"
 
-    echo "Waiting until interface $IFACE is up..."
-
-    # Wait for the interface to come up (i.e., 'ip link show' returns 0)
-    until ip link show dev $IFACE up > /dev/null 2>&1; do
-        sleep 1
-    done
+PORT_TABLE_PREFIX="PORT_TABLE"
+VLAN_TABLE_PREFIX="VLAN_TABLE"
+LAG_TABLE_PREFIX="LAG_TABLE"
 
-    echo "Interface $IFACE is up"
+function wait_until_iface_ready
+{
+    TABLE_PREFIX=$1
+    IFACE=$2
 
-    echo "Waiting until interface $IFACE has an IPv4 address..."
+    echo "Waiting until interface $IFACE is ready..."
 
-    # Wait until the interface gets assigned an IPv4 address
+    # Wait for the interface to come up
+    # (i.e., interface is present in STATE_DB and state is "ok")
     while true; do
-        IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)
-
-        if [ -n "$IP" ]; then
+        RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
+        if [ x"$RESULT" == x"ok" ]; then
             break
         fi
 
         sleep 1
     done
 
-    echo "Interface $IFACE is configured with IP $IP"
+    echo "Interface ${IFACE} is ready!"
 }
 
 
-# Wait for all interfaces to come up and have IPv4 addresses assigned
-wait_until_iface_ready Vlan1000
-wait_until_iface_ready PortChannel04
-wait_until_iface_ready PortChannel02
-wait_until_iface_ready PortChannel03
-wait_until_iface_ready PortChannel03
-wait_until_iface_ready PortChannel01
-wait_until_iface_ready PortChannel02
-wait_until_iface_ready PortChannel04
-wait_until_iface_ready PortChannel01
+# Wait for all interfaces to be up and ready
+wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
+wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
 