Skip to content

Commit 483fc6e

Browse files
authored
[techsupport] Added a lock to avoid running techsupport in parallel (sonic-net#2065)
- What I did: Added logic to the generate_dump script to avoid parallel execution of techsupport. If a second instance of techsupport starts while one is already running, the second one exits with an appropriate error code. - Why I did it: 1. Running multiple dumps in parallel has no real use case. 2. It causes high CPU load. 3. saisdkdump is not designed to run in parallel; when it is run in parallel, logs indicating failure are seen. Signed-off-by: Vivek Reddy Karri <[email protected]>
1 parent 93384ed commit 483fc6e

File tree

1 file changed

+78
-23
lines changed

1 file changed

+78
-23
lines changed

scripts/generate_dump

+78-23
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,14 @@
66

77
set -u
88

9-
ERROR_TAR_FAILED=5
10-
ERROR_PROCFS_SAVE_FAILED=6
11-
ERROR_INVALID_ARGUMENT=10
9+
# Exit codes used by this script:
#   EXT_SUCCESS            - initial RETURN_CODE value; also used when exiting after -h usage
#   EXT_GENERAL            - assigned to RETURN_CODE by handle_error; also used for an invalid option
#   EXT_LOCKFAIL           - the lock is held by another instance whose PID is still alive
#   EXT_RECVSIG            - exit status used by handle_signal
#   EXT_RETRY              - exit status used by rm_lock_and_exit after removing the lock directory
#   EXT_TAR_FAILED         - passed to abort when a tar append operation fails
#   EXT_PROCFS_SAVE_FAILED - passed to abort when saving /proc files (or the save_file tar append) fails
#   EXT_INVALID_ARGUMENT   - invalid -s date expression, or the script is not run as root
EXT_SUCCESS=0
10+
EXT_GENERAL=1
11+
EXT_LOCKFAIL=2
12+
EXT_RECVSIG=3
13+
EXT_RETRY=4
14+
EXT_TAR_FAILED=5
15+
EXT_PROCFS_SAVE_FAILED=6
16+
EXT_INVALID_ARGUMENT=10
1217

1318
TAR=tar
1419
MKDIR=mkdir
@@ -39,21 +44,38 @@ USER=${USER:-root}
3944
TIMEOUT_MIN="5"
4045
SKIP_BCMCMD=0
4146
SAVE_STDERR=true
42-
RETURN_CODE=0
47+
RETURN_CODE=$EXT_SUCCESS
4348
DEBUG_DUMP=false
4449

50+
# Lock directory and the PID file kept inside it.
# The lock is taken by creating LOCKDIR with mkdir; the owning
# instance then writes its own PID ($$) into PIDFILE.
51+
LOCKDIR="/tmp/techsupport-lock"
52+
PIDFILE="${LOCKDIR}/PID"
53+
54+
# Remove lock directory and exit, let user decide if they want to retry.
# Globals:   RM, V (read; V is intentionally unquoted because it may be empty),
#            LOCKDIR (read), EXT_RETRY (read)
# Returns:   never returns; exits the shell with status EXT_RETRY
rm_lock_and_exit()
{
    # LOCKDIR is quoted so the path reaches rm as exactly one argument and is
    # never word-split or glob-expanded into unrelated paths.
    $RM $V -rf "${LOCKDIR}"
    exit $EXT_RETRY
}
60+
61+
# EXIT trap handler: reports the exit status on stderr and removes the lock
# directory so that a later run can acquire the lock.
# Globals:   RM, V (read; V is intentionally unquoted because it may be empty),
#            LOCKDIR (read), ECODE (written)
handle_exit()
{
    # Must be the first statement: any other command would overwrite $?.
    ECODE=$?
    echo "Removing lock. Exit: $ECODE" >&2
    # LOCKDIR is quoted so the path reaches rm as exactly one argument and is
    # never word-split or glob-expanded into unrelated paths.
    $RM $V -rf "${LOCKDIR}"
}
67+
4568
# Signal handler: reports the interrupt on stderr, removes $TARDIR and exits.
# This diff changes its exit status from 1 to $EXT_RECVSIG and installs it
# for SIGINT, SIGHUP, SIGQUIT and SIGTERM once the lock has been acquired.
handle_signal()
4669
{
4770
echo "Generate Dump received interrupt" >&2
4871
$RM $V -rf $TARDIR
49-
exit 1
72+
exit $EXT_RECVSIG
5073
}
51-
trap 'handle_signal' SIGINT
5274

5375
# ERR trap callback (main installs it as: trap 'handle_error $? $LINENO' ERR).
# Arguments: $1 - exit status of the failed command
#            $2 - line number on which the failure was observed
# When $1 is non-zero it logs the failure on stderr and records a general
# failure in RETURN_CODE; it does not exit by itself.
handle_error() {
5476
if [ "$1" != "0" ]; then
5577
echo "ERR: RC:-$1 observed on line $2" >&2
56-
RETURN_CODE=1
78+
RETURN_CODE=$EXT_GENERAL
5779
fi
5880
}
5981

@@ -103,7 +125,7 @@ save_bcmcmd() {
103125
filepath="${filepath}.gz"
104126
fi
105127
($TAR $V -rhf $TARFILE -C $DUMPDIR "$tarpath" \
106-
|| abort "${ERROR_TAR_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
128+
|| abort "${EXT_TAR_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
107129
&& $RM $V -rf "$filepath"
108130
end_t=$(date +%s%3N)
109131
echo "[ save_bcmcmd:$cmd ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
@@ -213,7 +235,7 @@ save_cmd() {
213235
fi
214236

215237
($TAR $V -rhf $TARFILE -C $DUMPDIR "$tarpath" \
216-
|| abort "${ERROR_TAR_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
238+
|| abort "${EXT_TAR_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
217239
&& $RM $V -rf "$filepath"
218240
end_t=$(date +%s%3N)
219241
echo "[ save_cmd:$cmd ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
@@ -766,7 +788,7 @@ save_file() {
766788

767789
if $do_tar_append; then
768790
($TAR $V -rhf $TARFILE -C $DUMPDIR "$tar_path" \
769-
|| abort "${ERROR_PROCFS_SAVE_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
791+
|| abort "${EXT_PROCFS_SAVE_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
770792
&& $RM $V -f "$gz_path"
771793
fi
772794
end_t=$(date +%s%3N)
@@ -975,7 +997,7 @@ save_log_files() {
975997

976998
# Append the log folder to the main tarball
977999
($TAR $V -rhf $TARFILE -C $DUMPDIR ${BASE}/log \
978-
|| abort "${ERROR_TAR_FAILED}" "tar append operation failed. Aborting for safety") \
1000+
|| abort "${EXT_TAR_FAILED}" "tar append operation failed. Aborting for safety") \
9791001
&& $RM $V -rf $TARDIR/log
9801002
end_t=$(date +%s%3N)
9811003
echo "[ TAR /var/log Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
@@ -1004,7 +1026,7 @@ save_warmboot_files() {
10041026

10051027
($TAR $V --warning=no-file-removed -rhf $TARFILE -C $DUMPDIR --mode=+rw \
10061028
$BASE/warmboot \
1007-
|| abort "${ERROR_TAR_FAILED}" "Tar append operation failed. Aborting for safety.") \
1029+
|| abort "${EXT_TAR_FAILED}" "Tar append operation failed. Aborting for safety.") \
10081030
&& $RM $V -rf $TARDIR
10091031
fi
10101032
end_t=$(date +%s%3N)
@@ -1144,11 +1166,6 @@ main() {
11441166
trap 'handle_error $? $LINENO' ERR
11451167
local start_t=0
11461168
local end_t=0
1147-
if [ `whoami` != root ] && ! $NOOP;
1148-
then
1149-
echo "$0: must be run as root (or in sudo)" >&2
1150-
exit 10
1151-
fi
11521169
NUM_ASICS=$(get_asic_count)
11531170
${CMD_PREFIX}renice +5 -p $$ >> /dev/null
11541171
${CMD_PREFIX}ionice -c 2 -n 5 -p $$ >> /dev/null
@@ -1174,7 +1191,7 @@ main() {
11741191
/proc/softirqs /proc/stat /proc/swaps /proc/sysvipc /proc/timer_list \
11751192
/proc/uptime /proc/version /proc/vmallocinfo /proc/vmstat \
11761193
/proc/zoneinfo \
1177-
|| abort "${ERROR_PROCFS_SAVE_FAILED}" "Proc saving operation failed. Aborting for safety."
1194+
|| abort "${EXT_PROCFS_SAVE_FAILED}" "Proc saving operation failed. Aborting for safety."
11781195
end_t=$(date +%s%3N)
11791196
echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
11801197

@@ -1307,7 +1324,7 @@ main() {
13071324
--exclude="*/etc/ssl/certs/*" \
13081325
--exclude="*/etc/ssl/private/*" \
13091326
$BASE/etc \
1310-
|| abort "${ERROR_TAR_FAILED}" "Tar append operation failed. Aborting for safety.") \
1327+
|| abort "${EXT_TAR_FAILED}" "Tar append operation failed. Aborting for safety.") \
13111328
&& $RM $V -rf $TARDIR
13121329
end_t=$(date +%s%3N)
13131330
echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
@@ -1473,7 +1490,7 @@ while getopts ":xnvhzas:t:r:d" opt; do
14731490
;;
14741491
h)
14751492
usage
1476-
exit 0
1493+
exit $EXT_SUCCESS
14771494
;;
14781495
v)
14791496
# echo commands about to be run to stderr
@@ -1501,7 +1518,7 @@ while getopts ":xnvhzas:t:r:d" opt; do
15011518
s)
15021519
SINCE_DATE="${OPTARG}"
15031520
# validate date expression
1504-
date --date="${SINCE_DATE}" &> /dev/null || abort "${ERROR_INVALID_ARGUMENT}" "Invalid date expression passed: '${SINCE_DATE}'"
1521+
date --date="${SINCE_DATE}" &> /dev/null || abort "${EXT_INVALID_ARGUMENT}" "Invalid date expression passed: '${SINCE_DATE}'"
15051522
;;
15061523
t)
15071524
TIMEOUT_MIN="${OPTARG}"
@@ -1514,9 +1531,47 @@ while getopts ":xnvhzas:t:r:d" opt; do
15141531
;;
15151532
/?)
15161533
echo "Invalid option: -$OPTARG" >&2
1517-
exit 1
1534+
exit $EXT_GENERAL
15181535
;;
15191536
esac
15201537
done
15211538

1522-
main
1539+
# Check permissions before proceeding further.
# The command substitution is quoted so that an empty result from whoami
# cannot turn the test into a syntax error and silently skip this check.
if [ "$(whoami)" != root ] && ! $NOOP; then
    echo "$0: must be run as root (or in sudo)" >&2
    exit $EXT_INVALID_ARGUMENT
fi

##
## Attempt Locking
##

# Creating the directory is the lock operation: mkdir fails if it already
# exists, so only one instance can get past this point at a time.
if mkdir "${LOCKDIR}" &>/dev/null; then
    # Make sure the lock is removed on every exit path from here on.
    trap 'handle_exit' EXIT
    echo "$$" > "${PIDFILE}"
    # This handler will exit the script upon receiving these interrupts
    # Trap configured on EXIT will be triggered by the exit from handle_signal function
    trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
    echo "Lock succesfully accquired and installed signal handlers"
    # Proceed with the actual code
    main
else
    # lock failed, check if the other PID is alive
    # The status of the read is tested directly instead of through a
    # separate $? check that could get detached from the assignment.
    if ! PID_PROG="$(cat "${PIDFILE}")"; then
        # Another instance is probably about to remove the lock or PIDfile doesn't exist
        # NOTE(review): this can also hit an instance that has created LOCKDIR
        # but not yet written PIDFILE; behavior is kept as in the original.
        rm_lock_and_exit
    fi

    # Only a positive decimal PID may be handed to kill: an empty value is a
    # usage error, and 0 or a negative value would address a process group
    # and wrongly report the lock owner as alive.
    if ! [[ "${PID_PROG}" =~ ^[1-9][0-9]*$ ]]; then
        echo "Removing lock with invalid PID file content '${PID_PROG}'" >&2
        rm_lock_and_exit
    fi

    if ! kill -0 "${PID_PROG}" &>/dev/null; then
        # Lock is stale
        echo "Removing stale lock of nonexistant PID ${PID_PROG}"
        rm_lock_and_exit
    else
        # Lock is valid and the other instance is active. Exit Now
        echo "Accquiring lock failed, PID ${PID_PROG} is active" >&2
        exit $EXT_LOCKFAIL
    fi
fi

0 commit comments

Comments
 (0)