From 812343737c672eb5eb8e58f3bbab45e97cbf4927 Mon Sep 17 00:00:00 2001 From: Stepan Blyshchak <38952541+stepanblyschak@users.noreply.github.com> Date: Mon, 6 Dec 2021 18:20:30 +0200 Subject: [PATCH] [pfc_detect] fix RedisReply errors (#2040) **What I did** It could be that queue counters are populated into FLEX COUNTER DB before the COUNTERS_QUEUE_INDEX_MAP is populated, which is in accordance with the PortsOrch implementation. Fixing it by swapping the order - first the COUNTERS_QUEUE_INDEX_MAP, then the FLEX COUNTER DB - might produce other issues if readers start reading the map and try to find the corresponding entry in COUNTERS DB. Such an order is also in alignment with other maps and counters. So it seems that fixing it in the reader is the more suitable fix. The errors are observed only for a short period of time at system start when PFC WD is enabled: ``` Nov 10 15:12:19.001472 tgs-sonic-n2-s1 ERR syncd#SDK: :- guard: RedisReply catches system_error: command: *135#015#012$7#015#012EVALSHA#015#012$40#015#012b62081cc93943a4cbfec30de42638b435d31197c#015#012$3#015#012128#015#012$20#015#012oid:0x15000000000290#015#012$20#015#012oid:0x15000000000291#015#012$20#015#012oid:0x150000000002b1#015#012$20#015#012oid:0x150000000002b2#015#012$20#015#012oid:0x150000000002d2#015#012$20#015#012oid:0x150000000002d3#015#012$20#015#012oid:0x150000000002f3#015#012$20#015#012oid:0x150000000002f4#015#012$20#015#012oid:0x15000000000314#015#012$20#015#012oid:0x15000000000315#015#012$20#015#012oid:0x15000000000335#015#012$20#015#012oid:0x15000000000336#015#012$20#015#012oid:0x15000000000356#015#012$20#015#012oid:0x15000000000357#015#012$20#015#012oid:0x15000000000377#015#012$20#015#012oid:0x15000000000378#015#012$20#015#012oid:0x15000000000398#015#012$20#015#012oid:0x15000000000399#015#012$20#015#012oid:0x150000000003b9#015#012$20#015#012oid:0x150000000003ba#015#012$20#015#012oid:0x150000000003da#015#012$20#015#012oid:0x150000000003db#015#012$20#015#012oid:0x150000000003fb#015#012$20#015#012oid:0x150000000003fc#0
15#012$20#015#012oid:0x1500000000041c#015#012$20#015#012oid:0x1500000000041d#015#012$20#015#012oid:0x1500000000043d#015#012$20#015#012oid:0x1500000000043e#015#012$20#015#012oid:0x1500000000045e#015#012$20#015#012oid:0x1500000000045f#015#012$20#015#012oid:0x1500000000047f#015#012$20#015#012oid:0x15000000000480#015#012$20#015#012oid:0x150000000004a0#015#012$20#015#012oid:0x150000000004a1#015#012$20#015#012oid:0x150000000004c1#015#012$20#015#012oid:0x150000000004c2#015#012$20#015#012oid:0x150000000004e2#015#012$20#015#012oid:0x150000000004e3#015#012$20#015#012oid:0x15000000000503#015#012$20#015#012oid:0x15000000000504#015#012$20#015#012oid:0x15000000000524#015#012$20#015#012oid:0x15000000000525#015#012$20#015#012oid:0x15000000000545#015#012$20#015#012oid:0x15000000000546#015#012$20#015#012oid:0x15000000000566#015#012$20#015#012oid:0x15000000000567#015#012$20#015#012oid:0x15000000000587#015#012$20#015#012oid:0x15000000000588#015#012$20#015#012oid:0x150000000005a8#015#012$20#015#012oid:0x150000000005a9#015#012$20#015#012oid:0x150000000005c9#015#012$20#015#012oid:0x150000000005ca#015#012$20#015#012oid:0x150000000005ea#015#012$20#015#012oid:0x150000000005eb#015#012$20#015#012oid:0x1500000000060b#015#012$20#015#012oid:0x1500000000060c#015#012$20#015#012oid:0x1500000000062c#015#012$20#015#012oid:0x1500000000062d#015#012$20#015#012oid:0x1500000000064d#015#012$20#015#012oid:0x1500000000064e#015#012$20#015#012oid:0x1500000000066e#015#012$20#015#012oid:0x1500000000066f#015#012$20#015#012oid:0x1500000000068f#015#012$20#015#012oid:0x15000000000690#015#012$20#015#012oid:0x150000000006b0#015#012$20#015#012oid:0x150000000006b1#015#012$20#015#012oid:0x150000000006d1#015#012$20#015#012oid:0x150000000006d2#015#012$20#015#012oid:0x150000000006f2#015#012$20#015#012oid:0x150000000006f3#015#012$20#015#012oid:0x15000000000713#015#012$20#015#012oid:0x15000000000714#015#012$20#015#012oid:0x15000000000734#015#012$20#015#012oid:0x15000000000735#015#012$20#015#012oid:0x15000000000755#015#012$20#0
15#012oid:0x15000000000756#015#012$20#015#012oid:0x15000000000776#015#012$20#015#012oid:0x15000000000777#015#012$20#015#012oid:0x15000000000797#015#012$20#015#012oid:0x15000000000798#015#012$20#015#012oid:0x150000000007b8#015#012$20#015#012oid:0x150000000007b9#015#012$20#015#012oid:0x150000000007d9#015#012$20#015#012oid:0x150000000007da#015#012$20#015#012oid:0x150000000007fa#015#012$20#015#012oid:0x150000000007fb#015#012$20#015#012oid:0x1500000000081b#015#012$20#015#012oid:0x1500000000081c#015#012$20#015#012oid:0x1500000000083c#015#012$20#015#012oid:0x1500000000083d#015#012$20#015#012oid:0x1500000000085d#015#012$20#015#012oid:0x1500000000085e#015#012$20#015#012oid:0x1500000000087e#015#012$20#015#012oid:0x1500000000087f#015#012$20#015#012oid:0x1500000000089f#015#012$20#015#012oid:0x150000000008a0#015#012$20#015#012oid:0x150000000008c0#015#012$20#015#012oid:0x150000000008c1#015#012$20#015#012oid:0x150000000008e1#015#012$20#015#012oid:0x150000000008e2#015#012$20#015#012oid:0x15000000000902#015#012$20#015#012oid:0x15000000000903#015#012$20#015#012oid:0x15000000000923#015#012$20#015#012oid:0x15000000000924#015#012$20#015#012oid:0x15000000000944#015#012$20#015#012oid:0x15000000000945#015#012$20#015#012oid:0x15000000000965#015#012$20#015#012oid:0x15000000000966#015#012$20#015#012oid:0x15000000000986#015#012$20#015#012oid:0x15000000000987#015#012$20#015#012oid:0x150000000009a7#015#012$20#015#012oid:0x150000000009a8#015#012$20#015#012oid:0x150000000009c8#015#012$20#015#012oid:0x150000000009c9#015#012$20#015#012oid:0x150000000009e9#015#012$20#015#012oid:0x150000000009ea#015#012$20#015#012oid:0x15000000000a0a#015#012$20#015#012oid:0x15000000000a0b#015#012$20#015#012oid:0x15000000000a2b#015#012$20#015#012oid:0x15000000000a2c#015#012$20#015#012oid:0x15000000000a4c#015#012$20#015#012oid:0x15000000000a4d#015#012$20#015#012oid:0x15000000000a6d#015#012$20#015#012oid:0x15000000000a6e#015#012$20#015#012oid:0x15000000000a8e#015#012$20#015#012oid:0x15000000000a8f#015#012$20#015#012oid:0
x15000000000aaf#015#012$20#015#012oid:0x15000000000ab0#015#012$1#015#0122#015#012$8#015#012COUNTERS#015#012$3#015#012100#015#012$2#015#012''#015#012, reason: ERR Error running script (call to f_b62081cc93943a4cbfec30de42638b435d31197c): @user_script:39: user_script:39: attempt to concatenate local 'queue_index' (a boolean value) : Input/output error ``` **Why I did it** If queue_index or port_id is not defined don't run the rest of the LUA script logic. **How I verified it** Running it on the switch and verify no errors. --- orchagent/pfc_detect_barefoot.lua | 101 ++++++++++++----------- orchagent/pfc_detect_broadcom.lua | 95 +++++++++++----------- orchagent/pfc_detect_innovium.lua | 129 ++++++++++++++++-------------- orchagent/pfc_detect_mellanox.lua | 103 ++++++++++++------------ orchagent/pfc_detect_nephos.lua | 99 ++++++++++++----------- orchagent/pfc_restore.lua | 53 ++++++------ 6 files changed, 305 insertions(+), 275 deletions(-) diff --git a/orchagent/pfc_detect_barefoot.lua b/orchagent/pfc_detect_barefoot.lua index b270549a29bd..c413c5999cdc 100644 --- a/orchagent/pfc_detect_barefoot.lua +++ b/orchagent/pfc_detect_barefoot.lua @@ -36,63 +36,68 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. 
':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. 
'_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) - -- Check actual condition of queue being in PFC storm - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - if time_left <= poll_time then - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. 
+ (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + if time_left <= poll_time then + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - if is_deadlock == false then - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + if is_deadlock == false then + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. 
'_last', pfc_duration) + end end end end diff --git a/orchagent/pfc_detect_broadcom.lua b/orchagent/pfc_detect_broadcom.lua index 4f82b933176f..29ed2d163393 100644 --- a/orchagent/pfc_detect_broadcom.lua +++ b/orchagent/pfc_detect_broadcom.lua @@ -35,61 +35,66 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_on2off_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_ON2OFF_RX_PKTS' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_on2off_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_ON2OFF_RX_PKTS' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_on2off = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key) - local queue_pause_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS') + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_on2off = redis.call('HGET', counters_table_name .. ':' .. 
port_id, pfc_on2off_key) + local queue_pause_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS') - if occupancy_bytes and packets and pfc_rx_packets and pfc_on2off and queue_pause_status then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_on2off = tonumber(pfc_on2off) + if occupancy_bytes and packets and pfc_rx_packets and pfc_on2off and queue_pause_status then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_on2off = tonumber(pfc_on2off) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_on2off_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last') - local queue_pause_status_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last') + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_on2off_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last') + local queue_pause_status_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. 
- -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_on2off_last and queue_pause_status_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_on2off_last = tonumber(pfc_on2off_last) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_on2off_last and queue_pause_status_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_on2off_last = tonumber(pfc_on2off_last) - -- Check actual condition of queue being in PFC storm - if (pfc_rx_packets - pfc_rx_packets_last > 0 and pfc_on2off - pfc_on2off_last == 0 and queue_pause_status_last == 'true' and queue_pause_status == 'true') or - (debug_storm == "enabled") then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (pfc_rx_packets - pfc_rx_packets_last > 0 and pfc_on2off - pfc_on2off_last == 0 and queue_pause_status_last == 'true' and queue_pause_status == 'true') or + (debug_storm == "enabled") then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. 
KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last', queue_pause_status) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last', pfc_on2off) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last', queue_pause_status) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last', pfc_on2off) + end end end end diff --git a/orchagent/pfc_detect_innovium.lua b/orchagent/pfc_detect_innovium.lua index cedd51baa327..8deedeaa4f4f 100644 --- a/orchagent/pfc_detect_innovium.lua +++ b/orchagent/pfc_detect_innovium.lua @@ -36,72 +36,77 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' - - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. 
':' .. port_id, pfc_duration_key) - - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) - - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. - - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) - - -- Check actual condition of queue being in PFC storm - -- if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) then - -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_1', 'YES') - - -- if (debug_storm == "enabled") then - -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_2', 'YES') - - -- if (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_3', 'YES') - - - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. 
- (occupancy_bytes == 0 and pfc_rx_packets - pfc_rx_packets_last > 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) + + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. 
+ + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) + + -- Check actual condition of queue being in PFC storm + -- if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) then + -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_1', 'YES') + + -- if (debug_storm == "enabled") then + -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_2', 'YES') + + -- if (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_3', 'YES') + + + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and pfc_rx_packets - pfc_rx_packets_last > 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. 
KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - if is_deadlock == false then - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + if is_deadlock == false then + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end diff --git a/orchagent/pfc_detect_mellanox.lua b/orchagent/pfc_detect_mellanox.lua index 6df16241e91e..e805ad9cff1e 100644 --- a/orchagent/pfc_detect_mellanox.lua +++ b/orchagent/pfc_detect_mellanox.lua @@ -36,64 +36,69 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. 
'_RX_PAUSE_DURATION_US' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + local packets_last = redis.call('HGET', counters_table_name .. ':' .. 
KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) - local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) + local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8) - -- Check actual condition of queue being in PFC storm - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and packets - packets_last == 0 and storm_condition) then - if time_left <= poll_time then - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. 
'","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and packets - packets_last == 0 and storm_condition) then + if time_left <= poll_time then + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - if is_deadlock == false then - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + if is_deadlock == false then + redis.call('HSET', counters_table_name .. ':' .. 
port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end diff --git a/orchagent/pfc_detect_nephos.lua b/orchagent/pfc_detect_nephos.lua index d152fc5f8c76..648904e17a55 100644 --- a/orchagent/pfc_detect_nephos.lua +++ b/orchagent/pfc_detect_nephos.lua @@ -35,65 +35,70 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. 
port_id, pfc_duration_key) - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. 
- -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) - -- Check actual condition of queue being in PFC storm - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. 
KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end end return rets - + diff --git a/orchagent/pfc_restore.lua b/orchagent/pfc_restore.lua index 7b137a40d348..4c278526876e 100644 --- a/orchagent/pfc_restore.lua +++ b/orchagent/pfc_restore.lua @@ -32,36 +32,41 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. 
'_RX_PKTS' - local pfc_rx_packets = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)) - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. - if pfc_rx_packets_last then - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + local pfc_rx_packets = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)) + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. + if pfc_rx_packets_last then + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - -- Check actual condition of queue being restored from PFC storm - if (pfc_rx_packets - pfc_rx_packets_last == 0) - -- DEBUG CODE START. Uncomment to enable - and (debug_storm ~= "enabled") - -- DEBUG CODE END. - then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') - time_left = restoration_time + -- Check actual condition of queue being restored from PFC storm + if (pfc_rx_packets - pfc_rx_packets_last == 0) + -- DEBUG CODE START. Uncomment to enable + and (debug_storm ~= "enabled") + -- DEBUG CODE END. + then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + time_left = restoration_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time + time_left = restoration_time end - else - time_left = restoration_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. 
KEYS[i], 'PFC_WD_RESTORATION_TIME_LEFT', time_left) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_RESTORATION_TIME_LEFT', time_left) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + end end end