Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox]Fix issues found in thermal control #19

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,22 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False):
self.fan_speed_set_path = "fan{}_speed_set".format(self.index)
self.fan_presence_path = "fan{}_status".format(self.drawer_index)
self.fan_max_speed_path = "fan{}_max".format(self.index)
self._name = "fan{}".format(fan_index + 1)
else:
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_max_speed_path = "psu{}_max".format(self.index)
self._name = "fan(PSU{})".format(fan_index)
self.fan_max_speed_path = None
self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index)
self.fan_orange_led_path = "led_fan{}_orange".format(self.drawer_index)
self.fan_pwm_path = "pwm1"
self.fan_led_cap_path = "led_fan{}_capability".format(self.drawer_index)

def get_name(self):
    """
    Retrieves the name of the fan

    Returns:
        string: the name held in self._name, which the constructor sets to
        "fan<N>" for a system fan or "fan(PSU<N>)" for a PSU fan
    """
    return self._name

def get_status(self):
"""
Retrieves the operational status of fan
Expand Down Expand Up @@ -123,7 +128,11 @@ def get_speed(self):
speed_in_rpm = int(fan_curr_speed.read())
except (ValueError, IOError):
speed_in_rpm = 0


if self.fan_max_speed_path is None:
# If the max speed is not supported, just return the speed in RPM.
return speed_in_rpm

max_speed_in_rpm = self._get_max_speed_in_rpm()
speed = 100*speed_in_rpm/max_speed_in_rpm

Expand All @@ -136,11 +145,10 @@ def get_target_speed(self):
Returns:
int: percentage of the max fan speed
"""
speed = 0

if self.is_psu_fan:
# Unlike a system fan, the PSU fan speed cannot be modified, so the target speed is N/A
return speed
return self.get_speed()

try:
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm:
pwm = int(fan_pwm.read())
Expand Down Expand Up @@ -243,4 +251,4 @@ def get_speed_tolerance(self):
considered tolerable
"""
# The tolerance value is fixed at 20% for all Mellanox platforms
return 20
return 20
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(self):
self._chassis.initialize_psu()
self._chassis.initialize_fan()
self._chassis.initialize_eeprom()
self._chassis.initialize_thermals()

def _is_host(self):
"""
Expand Down
6 changes: 5 additions & 1 deletion platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def __init__(self, psu_index, sku):
psu_oper_status = "thermal/psu{}_pwr_status".format(self.index)
#psu_oper_status should always be present for all SKUs
self.psu_oper_status = os.path.join(self.psu_path, psu_oper_status)
self._name = "PSU{}".format(psu_index + 1)

if sku in hwsku_dict_psu:
filemap = psu_profile_list[hwsku_dict_psu[sku]]
Expand Down Expand Up @@ -92,7 +93,10 @@ def __init__(self, psu_index, sku):

fan = Fan(psu_index, psu_index, True)
if fan.get_presence():
self._fan = fan
self._fan_list.append(fan)

def get_name(self):
    """
    Retrieves the name of the PSU

    Returns:
        string: the name held in self._name, which the constructor sets to
        "PSU<N>"
    """
    return self._name

def _read_generic_file(self, filename, len):
"""
Expand Down
55 changes: 49 additions & 6 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,28 +36,36 @@

# Names of the thermal APIs. They are used as keys in the per-category
# handler tables to look up the file name that backs each API.
THERMAL_API_GET_TEMPERATURE = "get_temperature"
THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold"
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD = "get_high_critical_threshold"

# Raw value treated as "invalid" for module sensors: a module reading equal
# to this value is reported as None instead of being converted.
THERMAL_API_INVALID_HIGH_THRESHOLD = 0.0

# Directory that the per-sensor file names are joined onto.
HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/"

thermal_api_handler_cpu_core = {
THERMAL_API_GET_TEMPERATURE:"cpu_core{}",
THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max"
THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max",
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"cpu_core{}_crit"
}
thermal_api_handler_cpu_pack = {
THERMAL_API_GET_TEMPERATURE:"cpu_pack",
THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max"
THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max",
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"cpu_pack_crit"
}
thermal_api_handler_module = {
THERMAL_API_GET_TEMPERATURE:"module{}_temp_input",
THERMAL_API_GET_HIGH_THRESHOLD:"module{}_temp_crit"
THERMAL_API_GET_HIGH_THRESHOLD:"module{}_temp_crit",
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"module{}_temp_emergency"
}
thermal_api_handler_psu = {
THERMAL_API_GET_TEMPERATURE:"psu{}_temp",
THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max"
THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max",
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
}
thermal_api_handler_gearbox = {
THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}",
THERMAL_API_GET_HIGH_THRESHOLD:None
THERMAL_API_GET_HIGH_THRESHOLD:None,
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
}
thermal_ambient_apis = {
THERMAL_DEV_ASIC_AMBIENT : "asic",
Expand Down Expand Up @@ -233,6 +241,7 @@
},
]


def initialize_thermals(sku, thermal_list, psu_list):
# create thermal objects for all categories of sensors
tp_index = hwsku_dict_thermal[sku]
Expand Down Expand Up @@ -262,6 +271,8 @@ def initialize_thermals(sku, thermal_list, psu_list):
thermal = Thermal(category, start + index, True)
thermal_list.append(thermal)



class Thermal(ThermalBase):
def __init__(self, category, index, has_index, dependency = None, hint = None):
"""
Expand All @@ -280,9 +291,11 @@ def __init__(self, category, index, has_index, dependency = None, hint = None):
self.category = category
self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE)
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
self.dependency = dependency
self.dependent_hint = hint


def get_name(self):
"""
Retrieves the name of the device
Expand All @@ -292,6 +305,7 @@ def get_name(self):
"""
return self.name


def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
Expand All @@ -304,6 +318,7 @@ def _read_generic_file(self, filename, len):
logger.log_info("Fail to read file {} due to {}".format(filename, repr(e)))
return result


def _get_file_from_api(self, api_name):
if self.category == THERMAL_DEV_CATEGORY_AMBIENT:
if api_name == THERMAL_API_GET_TEMPERATURE:
Expand All @@ -315,9 +330,13 @@ def _get_file_from_api(self, api_name):
if self.category in thermal_device_categories_singleton:
filename = handler
else:
filename = handler.format(self.index)
if handler:
filename = handler.format(self.index)
else:
return None
return join(HW_MGMT_THERMAL_ROOT, filename)


def get_temperature(self):
"""
Retrieves current temperature reading from thermal
Expand All @@ -337,8 +356,11 @@ def get_temperature(self):
if value_str is None:
return None
value_float = float(value_str)
if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD:
return None
return value_float / 1000.0


def get_high_threshold(self):
"""
Retrieves the high threshold temperature of thermal
Expand All @@ -353,4 +375,25 @@ def get_high_threshold(self):
if value_str is None:
return None
value_float = float(value_str)
if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD:
return None
return value_float / 1000.0


def get_high_critical_threshold(self):
    """
    Retrieves the high critical threshold temperature of thermal

    Returns:
        A float number, the high critical threshold temperature of thermal in Celsius
        up to nearest thousandth of one degree Celsius, e.g. 30.125.
        None is returned when this thermal has no high critical threshold
        file, when reading the file yields None, when the content is not a
        number, or (module sensors only) when the raw value equals
        THERMAL_API_INVALID_HIGH_THRESHOLD.
    """
    if self.high_critical_threshold is None:
        return None
    value_str = self._read_generic_file(self.high_critical_threshold, 0)
    if value_str is None:
        return None
    try:
        value_float = float(value_str)
    except ValueError:
        # Empty or malformed file content: report the threshold as
        # unavailable instead of letting the conversion error escape.
        return None
    if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD:
        return None
    # The raw reading is scaled down by 1000 to produce the returned value.
    return value_float / 1000.0