Skip to content

Commit

Permalink
[BFN] Reworked BFN platform thermals plugin (#11723)
Browse files Browse the repository at this point in the history
* [BFN] Updated platform.json for wedge100bf_65x

Signed-off-by: Andriy Kokhan <andriyx.kokhan@intel.com>

* Reworked BFN platform thermal logic

* Implemented PSU thermal APIs

* Updated platform.json for accton_wedge100bf_32x

Signed-off-by: Andriy Kokhan <andriyx.kokhan@intel.com>

* Updated BFN platform plugins initialization flow

Signed-off-by: Andriy Kokhan <andriyx.kokhan@intel.com>

Signed-off-by: Andriy Kokhan <andriyx.kokhan@intel.com>
Andriy Kokhan authored Oct 11, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
1 parent df93a1b commit 304c6c8
Showing 10 changed files with 851 additions and 175 deletions.
42 changes: 22 additions & 20 deletions device/barefoot/x86_64-accton_as9516_32d-r0/platform.json
Original file line number Diff line number Diff line change
@@ -133,11 +133,31 @@
"psus": [
{
"name": "psu-1",
"temperature": false
"thermals": [
{
"name": "psu_driver-i2c-7-5a:psu1-temp1"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp2"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp3"
}
]
},
{
"name": "psu-2",
"temperature": false
"thermals": [
{
"name": "psu_driver-i2c-7-59:psu2-temp1"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp2"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp3"
}
]
}
],
"thermals": [
@@ -147,24 +167,6 @@
{
"name": "com_e_driver-i2c-4-33:memory-temp"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp1"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp2"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp3"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp1"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp2"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp3"
},
{
"name": "tmp75-i2c-3-48:chip-temp"
},
132 changes: 100 additions & 32 deletions device/barefoot/x86_64-accton_wedge100bf_32x-r0/platform.json
Original file line number Diff line number Diff line change
@@ -9,51 +9,131 @@
"name": "BMC"
}
],
"thermal_manager": false,
"fans": [
{
"name": "counter-rotating-fan-1"
},
{
"name": "counter-rotating-fan-2"
},
{
"name": "counter-rotating-fan-3"
},
{
"name": "counter-rotating-fan-4"
},
{
"name": "counter-rotating-fan-5"
"name": "counter-rotating-fan-1",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-2",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-3",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-4",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-5",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
}
],
"fan_drawers": [
{
"name": "fantray-1",
"status_led": {
"controllable": false
},
"fans": [
{
"name": "counter-rotating-fan-1"
"name": "counter-rotating-fan-1",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-2"
"name": "counter-rotating-fan-2",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-3"
"name": "counter-rotating-fan-3",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-4"
"name": "counter-rotating-fan-4",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
},
{
"name": "counter-rotating-fan-5"
"name": "counter-rotating-fan-5",
"status_led": {
"controllable": false
},
"speed": {
"controllable": false
}
}
]
}
],
"psus": [
{
"name": "psu-1"
"name": "psu-1",
"thermals": [
{
"name": "psu_driver-i2c-7-5a:psu1-temp1"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp2"
}
]
},
{
"name": "psu-2"
"name": "psu-2",
"thermals": [
{
"name": "psu_driver-i2c-7-59:psu2-temp1"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp2"
}
]
}
],
"thermals": [
@@ -63,18 +143,6 @@
{
"name": "com_e_driver-i2c-4-33:memory-temp"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp1"
},
{
"name": "psu_driver-i2c-7-59:psu2-temp2"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp1"
},
{
"name": "psu_driver-i2c-7-5a:psu1-temp2"
},
{
"name": "tmp75-i2c-3-48:chip-temp"
},
471 changes: 442 additions & 29 deletions device/barefoot/x86_64-accton_wedge100bf_65x-r0/platform.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -2,7 +2,9 @@
"chassis": {
"Wedge100BF-65X-O-AC-F-BF": {
"component": {
"BIOS": { },
"BMC": { }
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
{
"thermals": [
{
"com_e_driver-i2c-4-33:cpu-temp" : [99.0, 89.0, 11.0, 1.0]
},
{
"com_e_driver-i2c-4-33:memory-temp" : [85.0, 75.0, 11.0, 1.0]
},
{
"coretemp-isa-0000:core-0" : [99.0, 89.0, 11.0, 1.0]
},
{
"coretemp-isa-0000:core-1" : [99.0, 89.0, 11.0, 1.0]
},
{
"coretemp-isa-0000:core-2" : [99.0, 89.0, 11.0, 1.0]
},
{
"coretemp-isa-0000:core-3" : [99.0, 89.0, 11.0, 1.0]
},
{
"coretemp-isa-0000:package-id-0" : [80.0, 70.0, 11.0, 1.0]
},
{
"max6658-i2c-9-4c:come-board-temp" : [85.0, 70.0, 11.0, 1.0]
},
{
"max6658-i2c-9-4c:max6658-chip-temp" : [85.0, 70.0, 11.0, 1.0]
},
{
"pch_haswell-virtual-0:temp1" : [80.0, 70.0, 11.0, 1.0]
},
{
"psu_driver-i2c-7-5a:psu1-temp1" : [60.0, 40.0, 11.0, 1.0]
},
{
"psu_driver-i2c-7-5a:psu1-temp2" : [80.0, 60.0, 11.0, 1.0]
},
{
"psu_driver-i2c-7-59:psu2-temp1" : [60.0, 40.0, 11.0, 1.0]
},
{
"psu_driver-i2c-7-59:psu2-temp2" : [80.0, 60.0, 11.0, 1.0]
},
{
"tmp75-i2c-3-4a:exhaust-temp" : [90.0, 80.0, 11.0, 1.0]
},
{
"tmp75-i2c-3-4b:intake-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-3-4c:intake2-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-3-48:chip-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-3-49:exhaust2-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-8-48:fan-board-outlet-right-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-8-49:fan-board-outlet-left-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-9-4a:upper-board-intake-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-9-4b:upper-board-tofino-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-9-48:upper-board-intake2-temp" : [90.0, 70.0, 11.0, 1.0]
},
{
"tmp75-i2c-9-49:server-board-temp" : [90.0, 70.0, 11.0, 1.0]
}
]
}
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@
from sonic_platform.sfp import Sfp
from sonic_platform.psu import psu_list_get
from sonic_platform.fan_drawer import fan_drawer_list_get
from sonic_platform.thermal import thermal_list_get
from sonic_platform.thermal import chassis_thermals_list_get
from sonic_platform.platform_utils import file_create
from sonic_platform.eeprom import Eeprom

@@ -40,9 +40,9 @@ class Chassis(ChassisBase):
def __init__(self):
ChassisBase.__init__(self)

self._eeprom = Eeprom()
self.__tlv_bin_eeprom = self._eeprom.get_raw_data()
self.__tlv_dict_eeprom = self._eeprom.get_data()
self.__eeprom = None
self.__tlv_bin_eeprom = None
self.__tlv_dict_eeprom = None

self.__fan_drawers = None
self.__fan_list = None
@@ -60,6 +60,28 @@ def __init__(self):
file_create(config_dict['handlers']['file']['filename'], '646')
logging.config.dictConfig(config_dict)

@property
def _eeprom(self):
    """Return the EEPROM accessor, constructing it on first access.

    The ``Eeprom`` object is created lazily and stored on the instance so
    that later reads reuse the same object.
    """
    if self.__eeprom is not None:
        return self.__eeprom
    self.__eeprom = Eeprom()
    return self.__eeprom


@_eeprom.setter
def _eeprom(self, value):
    """Discard any value assigned to ``_eeprom``.

    Without a setter, assigning to this property would raise
    AttributeError; the assigned value is intentionally ignored.
    """
    return None

@property
def _tlv_bin_eeprom(self):
    """Return the raw EEPROM data, reading it on first access.

    The value comes from ``self._eeprom.get_raw_data()`` and is kept on the
    instance; a ``None`` result is not cached and is retried next time.
    """
    raw = self.__tlv_bin_eeprom
    if raw is None:
        raw = self._eeprom.get_raw_data()
        self.__tlv_bin_eeprom = raw
    return raw

@property
def _tlv_dict_eeprom(self):
    """Return the decoded EEPROM data, reading it on first access.

    The value comes from ``self._eeprom.get_data()`` and is kept on the
    instance; a ``None`` result is not cached and is retried next time.
    """
    decoded = self.__tlv_dict_eeprom
    if decoded is None:
        decoded = self._eeprom.get_data()
        self.__tlv_dict_eeprom = decoded
    return decoded

@property
def _fan_drawer_list(self):
if self.__fan_drawers is None:
@@ -85,7 +107,7 @@ def _fan_list(self, value):
@property
def _thermal_list(self):
if self.__thermals is None:
self.__thermals = thermal_list_get()
self.__thermals = chassis_thermals_list_get()
return self.__thermals

@_thermal_list.setter
@@ -145,7 +167,7 @@ def get_name(self):
Returns:
string: The name of the chassis
"""
return self._eeprom.modelstr(self.__tlv_bin_eeprom)
return self._eeprom.modelstr(self._tlv_bin_eeprom)

def get_presence(self):
"""
@@ -161,23 +183,23 @@ def get_model(self):
Returns:
string: Model/part number of chassis
"""
return self._eeprom.part_number_str(self.__tlv_bin_eeprom)
return self._eeprom.part_number_str(self._tlv_bin_eeprom)

def get_serial(self):
"""
Retrieves the serial number of the chassis (Service tag)
Returns:
string: Serial number of chassis
"""
return self._eeprom.serial_number_str(self.__tlv_bin_eeprom)
return self._eeprom.serial_number_str(self._tlv_bin_eeprom)

def get_revision(self):
"""
Retrieves the revision number of the chassis (Service tag)
Returns:
string: Revision number of chassis
"""
return self.__tlv_dict_eeprom.get(
return self._tlv_dict_eeprom.get(
"0x{:X}".format(Eeprom._TLV_CODE_LABEL_REVISION), 'N/A')

def get_sfp(self, index):
@@ -219,7 +241,7 @@ def get_base_mac(self):
A string containing the MAC address in the format
'XX:XX:XX:XX:XX:XX'
"""
return self._eeprom.base_mac_addr(self.__tlv_bin_eeprom)
return self._eeprom.base_mac_addr(self._tlv_bin_eeprom)

def get_system_eeprom_info(self):
"""
@@ -230,7 +252,7 @@ def get_system_eeprom_info(self):
OCP ONIE TlvInfo EEPROM format and values are their corresponding
values.
"""
return self.__tlv_dict_eeprom
return self._tlv_dict_eeprom

def __get_transceiver_change_event(self, timeout=0):
forever = False
Original file line number Diff line number Diff line change
@@ -182,12 +182,10 @@ def __init__(self, component_index=0):
self.name = self.bpcp.get_components_list()[self.index]
except IndexError as e:
print("Error: No components found in plaform_components.json")

if (self.name == "BMC"):
self.version = get_bmc_version()
self.description = "Chassis BMC"
elif (self.name == "BIOS"):
self.version = get_bios_version()
self.description = "Chassis BIOS"

def get_name(self):
@@ -214,6 +212,12 @@ def get_firmware_version(self):
Returns:
A string containing the firmware version of the component
"""
if self.version == "N/A":
if (self.name == "BMC"):
self.version = get_bmc_version()
elif (self.name == "BIOS"):
self.version = get_bios_version()

return self.version

def install_firmware(self, image_path):
Original file line number Diff line number Diff line change
@@ -12,8 +12,8 @@
from .platform_thrift_client import thrift_try

from sonic_platform_base.psu_base import PsuBase
from sonic_platform.thermal import psu_thermals_list_get
from platform_utils import cancel_on_sigterm

except ImportError as e:
raise ImportError (str(e) + "- required module not found")

@@ -27,6 +27,7 @@ class Psu(PsuBase):
def __init__(self, index):
PsuBase.__init__(self)
self.__index = index
self.__thermals = None
self.__info = None
self.__ts = 0
# STUB IMPLEMENTATION
@@ -225,6 +226,34 @@ def get_position_in_parent(self):
"""
return self.__index

def get_temperature(self):
    """
    Retrieves current temperature reading from PSU

    The reading is taken from the first thermal sensor of this PSU.

    Returns:
        A float number of current temperature in Celsius up to nearest thousandth
        of one degree Celsius, e.g. 30.125, or None when the PSU has no
        thermal sensor at index 0.
    """
    thermal = self.get_thermal(0)
    # A PSU without configured thermals has nothing at index 0; report
    # "no reading" instead of failing with AttributeError on None.
    if thermal is None:
        return None
    return thermal.get_temperature()

def get_temperature_high_threshold(self):
    """
    Retrieves the high threshold temperature of PSU

    The threshold is taken from the first thermal sensor of this PSU.

    Returns:
        A float number, the high threshold temperature of PSU in Celsius
        up to nearest thousandth of one degree Celsius, e.g. 30.125, or
        None when the PSU has no thermal sensor at index 0.
    """
    thermal = self.get_thermal(0)
    # A PSU without configured thermals has nothing at index 0; report
    # "no threshold" instead of failing with AttributeError on None.
    if thermal is None:
        return None
    return thermal.get_high_threshold()

@property
def _thermal_list(self):
    """Return this PSU's thermal objects, building the list on first access.

    The list is produced by ``psu_thermals_list_get`` for this PSU's name
    and then kept on the instance.
    """
    thermals = self.__thermals
    if thermals is None:
        thermals = psu_thermals_list_get(self.get_name())
        self.__thermals = thermals
    return thermals


@_thermal_list.setter
def _thermal_list(self, value):
    """Discard any value assigned to ``_thermal_list``.

    Without a setter, assigning to this property would raise
    AttributeError; the assigned value is intentionally ignored.
    """
    return None

def psu_list_get():
psu_list = []
for i in range(1, Psu.get_num_psus() + 1):
Original file line number Diff line number Diff line change
@@ -13,15 +13,6 @@
QSFP_DD_TYPE = "QSFP_DD"
EEPROM_PAGE_SIZE = 128

# Probe whether the platform manager answers the cached QSFP EEPROM read call
# and record the outcome in EEPROM_CACHED_API_SUPPORT.
# NOTE(review): this appears to sit at module level, so the thrift call would
# run at import time — confirm against the full file.
try:
from thrift.Thrift import TApplicationException

# Issues one cached-read request; only whether it raises matters here.
def cached_num_bytes_get(client):
return client.pltfm_mgr.pltfm_mgr_qsfp_cached_num_bytes_get(1, 0, 0, 0)
thrift_try(cached_num_bytes_get, 1)
EEPROM_CACHED_API_SUPPORT = True
except TApplicationException as e:
# NOTE(review): presumably raised when the server does not implement the
# cached call — confirm. Any other exception type propagates.
EEPROM_CACHED_API_SUPPORT = False

class Sfp(SfpOptoeBase):
"""
@@ -34,15 +25,27 @@ def __init__(self, port_num):
self.port_num = port_num
self.sfp_type = QSFP_TYPE
self.SFP_EEPROM_PATH = "/var/run/platform/sfp/"

if not EEPROM_CACHED_API_SUPPORT:
if not os.path.exists(self.SFP_EEPROM_PATH):
try:
os.makedirs(self.SFP_EEPROM_PATH)
except OSError as e:
if e.errno != errno.EEXIST:
raise
self.eeprom_path = self.SFP_EEPROM_PATH + "sfp{}-eeprom-cache".format(self.index)
self.eeprom_path = None
self.__cached_api_supported = None

@property
def _cached_api_supported(self):
    """Tell whether the cached QSFP EEPROM thrift call can be used.

    The probe runs only on first access; its outcome is stored on the
    instance. When the probe raises, the result is False, the EEPROM cache
    directory is created if missing, and ``self.eeprom_path`` is set to
    this port's cache file inside it.
    """
    if self.__cached_api_supported is not None:
        return self.__cached_api_supported

    def cached_num_bytes_get(client):
        return client.pltfm_mgr.pltfm_mgr_qsfp_cached_num_bytes_get(1, 0, 0, 0)

    try:
        thrift_try(cached_num_bytes_get, 1)
        self.__cached_api_supported = True
    except Exception:
        self.__cached_api_supported = False
        cache_dir = self.SFP_EEPROM_PATH
        if not os.path.exists(cache_dir):
            try:
                os.makedirs(cache_dir)
            except OSError as err:
                # Another process may have created the directory in between.
                if err.errno != errno.EEXIST:
                    raise
        self.eeprom_path = cache_dir + "sfp{}-eeprom-cache".format(self.index)
    return self.__cached_api_supported

def get_presence(self):
"""
@@ -94,7 +97,7 @@ def read_eeprom(self, offset, num_bytes):
if not self.get_presence():
return None

if not EEPROM_CACHED_API_SUPPORT:
if not self._cached_api_supported:
return super().read_eeprom(offset, num_bytes)

def cached_num_bytes_get(page, offset, num_bytes):
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
try:
import subprocess
import time
import threading
from collections import namedtuple
import json
from bfn_extensions.platform_sensors import platform_sensors_get
@@ -23,56 +25,12 @@
'''
Threshold = namedtuple('Threshold', ['crit', 'max', 'min', 'alarm'])

def _sensors_chip_parsed(data: str):
    """Parse one chip section of ``sensors -A -u`` style output.

    The first line is the chip name. Each following line that does not
    start with a space opens a new sensor; each line that does start with a
    space is an ``attribute: value`` pair of the most recent sensor.

    Returns:
        A ``(chip, sensors)`` tuple where ``sensors`` maps a sensor label to
        a dict of attribute name -> value (values stay strings).

    Raises:
        RuntimeError: an attribute line appears before any sensor label.
    """
    def split_attr(text):
        fields = [piece.strip(': ') for piece in text.split(':') if piece]
        # Pad so that a line without a value still yields an empty string.
        name, value, *_ = fields + ['']
        return name, value

    chip, *lines = data.strip().split('\n')
    chip = chip.strip(': ')

    parsed = {}
    current = None
    for line in lines:
        if line.startswith(' '):
            if current is None:
                raise RuntimeError(f'invalid data to parse: {lines}')
            attr, value = split_attr(line)
            current[attr] = value
        else:
            # A repeated label replaces the earlier entry, as dict() would.
            current = {}
            parsed[line.strip(': ')] = current

    return chip, parsed

'''
Example of returned dict:
{
'coretemp-isa-0000': {
'Core 1': { "temp1_input": 40, ... },
'Core 2': { ... }
}
}
'''
def _sensors_get() -> dict:
    """Collect and parse sensor readings from both sensor sources.

    Output of ``platform_sensors_get(['-A', '-u'])`` (empty string if it
    returns a falsy value) is concatenated with the output of
    ``/usr/bin/sensors -A -u``; the text is split on blank lines and each
    non-empty section is parsed by ``_sensors_chip_parsed``.

    Returns:
        A dict mapping chip name to its parsed sensors.
    """
    platform_text = platform_sensors_get(['-A', '-u']) or ''
    system_text = subprocess.check_output("/usr/bin/sensors -A -u",
                                          shell=True, text=True)
    sections = (platform_text + system_text).split('\n\n')
    return dict(_sensors_chip_parsed(section) for section in sections if section)

def _value_get(d: dict, key_prefix, key_suffix=''):
    """Return the first value whose key has the given prefix and suffix.

    Keys are examined in the dict's iteration order; None is returned when
    no key matches.
    """
    matches = (value for key, value in d.items()
               if key.startswith(key_prefix) and key.endswith(key_suffix))
    return next(matches, None)

# Thermal -> ThermalBase -> DeviceBase
class Thermal(ThermalBase):
__sensors_info = None
__timestamp = 0
__lock = threading.Lock()
_thresholds = dict()
_max_temperature = 100.0
_min_temperature = 0.0
@@ -96,6 +54,84 @@ def __init__(self, chip, label, index = 0):
if f is not None:
self.__get_thresholds(f)

@staticmethod
def __sensors_chip_parsed(data: str):
    """Parse one chip section of ``sensors -A -u`` style output.

    The first line is the chip name. A following line that does not start
    with a space is a sensor label; a line that starts with a space is an
    ``attribute: value`` pair belonging to the latest sensor label.

    Returns:
        A ``(chip, sensors)`` tuple where ``sensors`` maps a sensor label to
        a dict of attribute name -> value (values stay strings).

    Raises:
        RuntimeError: an attribute line appears before any sensor label.
    """
    def attr_pair(text):
        tokens = [token.strip(': ') for token in text.split(':') if token]
        # The trailing '' supplies an empty value when none is present.
        key, value, *_ = tokens + ['']
        return key, value

    header, *body = data.strip().split('\n')
    chip_name = header.strip(': ')

    by_label = {}
    latest = None
    for row in body:
        if not row.startswith(' '):
            # A repeated label replaces the earlier entry, as dict() would.
            latest = {}
            by_label[row.strip(': ')] = latest
            continue

        if latest is None:
            raise RuntimeError(f'invalid data to parse: {body}')

        key, value = attr_pair(row)
        latest[key] = value

    return chip_name, by_label

@classmethod
def __sensors_get(cls, cached=True) -> dict:
    """Return parsed sensor readings, refreshing the shared cache when stale.

    The cache is class-wide and refreshed at most once per 15 seconds. A
    refresh combines ``platform_sensors_get(['-A', '-u'])`` with the output
    of ``/usr/bin/sensors -A -u`` and parses each blank-line-separated
    section. If a refresh fails, a warning is logged and the previous
    data (if any) is returned.

    Args:
        cached: currently unused; kept for interface compatibility.

    Returns:
        A dict mapping chip name to its parsed sensors; an empty dict when
        no refresh has succeeded yet.
    """
    # The context manager releases the lock on every exit path, including
    # exceptions that the handler below does not catch.
    with cls.__lock:
        if time.time() > cls.__timestamp + 15:
            # Update cache once per 15 seconds
            try:
                data = platform_sensors_get(['-A', '-u']) or ''
                # Fixed argument list: no shell is needed to run the binary.
                data += subprocess.check_output(
                    ["/usr/bin/sensors", "-A", "-u"], text=True)
                chips = [cls.__sensors_chip_parsed(chip_data)
                         for chip_data in data.split('\n\n') if chip_data]
                cls.__sensors_info = dict(chips)
                cls.__timestamp = time.time()
            except Exception as e:
                logging.warning("Failed to update sensors cache: " + str(e))
        info = cls.__sensors_info
    # Callers chain .get() on the result, so never hand back None.
    return info if info is not None else {}

@staticmethod
def __sensor_value_get(d: dict, key_prefix, key_suffix=''):
    """Return the first value whose key has the given prefix and suffix.

    Keys are examined in the dict's iteration order; None is returned when
    no key matches.
    """
    for name in d:
        if not name.startswith(key_prefix):
            continue
        if name.endswith(key_suffix):
            return d[name]
    return None

@staticmethod
def __get_platform_json():
    """Load and return the parsed ``platform.json`` of the current platform.

    The file is looked up in the directory returned by
    ``device_info.get_path_to_platform_dir()``.

    Raises:
        OSError: the file cannot be opened.
        json.JSONDecodeError: the file is not valid JSON.
    """
    # Despite the name, this holds the platform directory path.
    hwsku_path = device_info.get_path_to_platform_dir()
    platform_json_path = "/".join([hwsku_path, "platform.json"])
    # Close the handle deterministically instead of leaking it.
    with open(platform_json_path) as f:
        return json.load(f)

@staticmethod
def get_chassis_thermals():
    """Return the ``chassis.thermals`` entries from ``platform.json``.

    Returns:
        The value stored under ``["chassis"]["thermals"]``, or None if
        loading the file or reading those keys fails (the failure is logged
        with a traceback).
    """
    try:
        thermals = Thermal.__get_platform_json()["chassis"]["thermals"]
    except Exception as err:
        logging.exception("Failed to collect chassis thermals: " + str(err))
        return None
    return thermals

@staticmethod
def get_psu_thermals(psu_name):
    """Return the ``thermals`` entries of one PSU from ``platform.json``.

    Args:
        psu_name: value of the PSU's ``name`` field under
            ``["chassis"]["psus"]``.

    Returns:
        The PSU's ``thermals`` value, or None when no PSU has that name or
        when loading/reading the file fails (the failure is logged with a
        traceback).
    """
    try:
        platform_json = Thermal.__get_platform_json()
        for psu in platform_json["chassis"]["psus"]:
            if psu["name"] == psu_name:
                return psu["thermals"]
    except Exception as e:
        # The message names PSU thermals so the log points at the right lookup.
        logging.exception("Failed to collect PSU thermals: " + str(e))
    return None

def __get_thresholds(self, f):
def_threshold_json = json.load(f)
all_data = def_threshold_json["thermals"]
@@ -119,8 +155,18 @@ def check_high_threshold(self, temperature, attr_suffix):
return check_range

def __get(self, attr_prefix, attr_suffix):
sensor_data = _sensors_get().get(self.__chip, {}).get(self.__label, {})
value = _value_get(sensor_data, attr_prefix, attr_suffix)
chip_data = Thermal.__sensors_get().get(self.__chip, {})
sensor_data = {}
for sensor, data in chip_data.items():
if sensor.lower().replace(' ', '-') == self.__label:
sensor_data = data
break
value = Thermal.__sensor_value_get(sensor_data, attr_prefix, attr_suffix)

# Can be float value or None
if attr_prefix == 'temp' and attr_suffix == 'input':
return value

if value is not None and self.check_in_range(value) and self.check_high_threshold(value, attr_suffix):
return value
elif self.__name in self._thresholds and attr_prefix == 'temp':
@@ -146,6 +192,8 @@ def __get(self, attr_prefix, attr_suffix):
# ThermalBase interface methods:
def get_temperature(self) -> float:
temp = self.__get('temp', 'input')
if temp is None:
return None
self.__collect_temp.append(float(temp))
self.__collect_temp.sort()
if len(self.__collect_temp) == 3:
@@ -214,13 +262,19 @@ def set_low_threshold(self, temperature):
return True
return False

def thermal_list_get():
    """Build a ``Thermal`` object for every temperature sensor reported.

    A sensor counts as a temperature sensor when it has at least one
    attribute whose name starts with ``temp``. Indexes are assigned
    consecutively from 0 in discovery order.
    """
    found = []
    for chip, chip_data in _sensors_get().items():
        for sensor, sensor_data in chip_data.items():
            # add only temperature sensors
            if _value_get(sensor_data, "temp") is None:
                continue
            # The next index always equals the number collected so far.
            found.append(Thermal(chip, sensor, len(found)))
    return found

def chassis_thermals_list_get():
    """Build ``Thermal`` objects for the chassis thermals in ``platform.json``.

    Each entry's ``name`` has the form ``<chip>:<label>``; the two parts are
    passed to ``Thermal`` together with the entry's position in the list.

    Returns:
        A list of ``Thermal`` objects; empty when the chassis thermals could
        not be read.
    """
    # get_chassis_thermals() returns None on failure; treat that as "none".
    thermals = Thermal.get_chassis_thermals() or []
    thermal_list = []
    for index, entry in enumerate(thermals):
        parts = entry["name"].split(':')
        thermal_list.append(Thermal(parts[0], parts[1], index))
    return thermal_list

def psu_thermals_list_get(psu_name):
    """Build ``Thermal`` objects for one PSU's thermals in ``platform.json``.

    Each entry's ``name`` has the form ``<chip>:<label>``; the two parts are
    passed to ``Thermal`` together with the entry's position in the list.

    Args:
        psu_name: name of the PSU whose thermals are wanted.

    Returns:
        A list of ``Thermal`` objects; empty when the PSU is unknown, has no
        thermals entry, or the file could not be read.
    """
    # get_psu_thermals() returns None in those cases; treat that as "none".
    thermals = Thermal.get_psu_thermals(psu_name) or []
    thermal_list = []
    for index, entry in enumerate(thermals):
        parts = entry["name"].split(':')
        thermal_list.append(Thermal(parts[0], parts[1], index))
    return thermal_list

0 comments on commit 304c6c8

Please sign in to comment.