commit 1f8148346fcb26c638250fa05b1d4155873648e4 Author: Domen Tabernik Date: Fri Mar 8 19:24:59 2019 +0000 Initial commit (working version). diff --git a/LICENCE b/LICENCE new file mode 100644 index 0000000..24881cf --- /dev/null +++ b/LICENCE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Chris Putnam + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..72c118e --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# NVIDIA GPU-based FAN controller for SUPERMICRO server + +This controller enables automatic adjustments of FANs in SUPERMICRO servers based on GPU temperature. Only NVIDIA GPUs are supported since the tool uses nvidia-smi to parse the GPU temperature. FANs are controlled through the IPMI tool (`ipmitool`) using the modified superfans (https://github.com/putnam/superfans/blob/master/superfans) script. 
+ +# Requirements + +* Linux based only +* Python 2.7 +* nvidia drivers/tools (nvidia-smi) +* IPMI tool (ipmitool) with the kernel IPMI module loaded + + +# Usage + +```bash +python superfans_gpu_controller.py +``` + diff --git a/superfans.py b/superfans.py new file mode 100644 index 0000000..5beca1c --- /dev/null +++ b/superfans.py @@ -0,0 +1,233 @@ +# superfans +# https://github.com/putnam/superfans +# +# 2019: modified by Domen Tabernik +# + +import os, sys, subprocess, time, shutil, shlex + +# list of FAN preset settings +FAN_PRESET_STANDARD=0 +FAN_PRESET_FULL=1 +FAN_PRESET_OPTIMAL=2 +FAN_PRESET_HEAVYIO=4 +FAN_PRESETS=[FAN_PRESET_STANDARD, FAN_PRESET_FULL, FAN_PRESET_OPTIMAL, FAN_PRESET_HEAVYIO] +#FAN_PRESETS_STR={ +# 'standard' : FAN_PRESET_STANDARD, +# 'full' : FAN_PRESET_FULL, +# 'optimal' : FAN_PRESET_OPTIMAL, +# 'heavyio' : FAN_PRESET_HEAVYIO +#} +FAN_PRESETS_DESC={ + FAN_PRESET_STANDARD : "Standard (Temp controlled, target 50%)", + FAN_PRESET_FULL : "Full (All fans at 100%)", + FAN_PRESET_OPTIMAL : "Optimal (Temp controlled, target 30%)", + FAN_PRESET_HEAVYIO : "Heavy IO (Temp controlled, CPU target 50%; Peripherals target 75%" +} + +# list of FAN zones +FAN_ZONE_CPU1=0 # marked as FAN10 for CPU1 (right one) +FAN_ZONE_CPU2=1 # marked as FAN9 for CPU2 (left one) +FAN_ZONE_SYS2=2 # marked as FAN1-4 (right ones) +FAN_ZONE_SYS1=3 # marked as FAN5-8 (left ones) +FAN_ZONES=[FAN_ZONE_CPU1, FAN_ZONE_CPU2, FAN_ZONE_SYS2, FAN_ZONE_SYS1] +FAN_ZONES_STR={ + FAN_ZONE_CPU1:'cpu1', + FAN_ZONE_CPU2:'cpu2', + FAN_ZONE_SYS2:'system2', + FAN_ZONE_SYS1:'system1', +} + +# list of FANs and zone member association +FAN1 ='FAN1' +FAN2 ='FAN2' +FAN3 ='FAN3' +FAN4 ='FAN4' +FAN5 ='FAN5' +FAN6 ='FAN6' +FAN7 ='FAN7' +FAN8 ='FAN8' +FAN9 ='FAN9' +FAN10 ='FAN10' + +FAN_ZONES_MEMBERS= { + FAN_ZONE_CPU1:FAN10, + FAN_ZONE_CPU2:FAN9, + FAN_ZONE_SYS2:[FAN1,FAN2,FAN3,FAN4], + FAN_ZONE_SYS1:[FAN5,FAN6,FAN7,FAN8], +} + +# based on observations on SUPERMICRO_4029GP_TRT2 the +# SYS1 and SYS2 fans use the following 
linear equations to +# convert from RPM to % value +def SUPERMICRO_4029GP_TRT2_RPM_to_percent(rpm): + return rpm * 0.0098 - 11.5479 + +def set_fan_with_full_preset(config, speed, zone): + """ + Set fan speed to a fixed %. + Some chassis implement separate fan "zones" named CPU and Peripheral. To target specific zones, use the --zone option. + """ + + # Make sure fans are on Full setting, or else this won't stick for long + s = get_preset(config) + if s is False: + print(time.ctime() +": Unable to get current fan status; exiting") + return False + + if s != FAN_PRESET_FULL: + print(time.ctime() +": The fan controller is currently not set to Full mode (required for manual fan settings, which will otherwise be adjusted by the BMC within minutes); setting it now.") + set_preset(config, preset='full') + print(time.ctime() +": Waiting 5 seconds to let fans spin up...") + time.sleep(5) + + ok = True + if zone == 'all' or zone == 'cpu': + ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x00 0x%02x' % speed, **config) + if ok and (zone == 'all' or zone == 'periph'): + ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x01 0x%02x' % speed, **config) + + if ok: + print(time.ctime() +": Set %s fans on %s to %d%%." % (zone, config['hostname'], speed)) + return True + else: + print(time.ctime() +": Unable to update fans.") + return False + +def set_fan(config, speed, zone): + """ + Set fan speed to a fixed %. + Will be changed by Server if not in FULL preset (need to periodically call this) + """ + + ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x%02x 0x%02x' % (zone, speed), **config) + + if ok: + print(time.ctime() +": Set %s fans on %s to %d%%." % (FAN_ZONES_STR[zone], config['hostname'], speed)) + return True + else: + print(time.ctime() +": Unable to update fans.") + return False + +def get_fan(config, fan): + """ + Get fan speed in % (for one or more fans). 
+ """ + + fan_status_list = ipmi_fan_status(**config) + + if type(fan) == list: + return_list = {} + for f in fan: + if fan_status_list.has_key(f): + return_list[f] = SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_status_list[f]) + return return_list + elif fan_status_list.has_key(fan): + return SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_status_list[fan]) + else: + return False + + +def _set_preset(config): + """ + Retrieves fan controller preset & fan speed. + """ + status = get_preset(config) + if status is False: + return False + if status in FAN_PRESETS: + s = FAN_PRESETS_DESC[status] + else: + s = "Unknown status code %d" % status + # manual fan ctl get(0)/set(1) cpu(0)/periph(1) duty(0-0x64) + # 0x30 0x70 0x66 0x00 0x00 0x64 + fan_speed = ipmi_raw_cmd('0x30 0x70 0x66 0x00 0x00', **config) + if fan_speed is False: + return False + fan_speed2 = ipmi_raw_cmd('0x30 0x70 0x66 0x00 0x01', **config) + if fan_speed2 is False: + return False + + print(time.ctime() +": Preset: %s" % s) + print(time.ctime() +": Current fan speed (CPU Zone): %d%%" % int(fan_speed, 16)) + print(time.ctime() +": Current fan speed (Peripheral zone): %d%%" % int(fan_speed2, 16)) + return True + + +def set_preset(config, preset): + if preset not in FAN_PRESETS: + return False + + if ipmi_raw_cmd("0x30 0x45 0x01 0x0%d" % preset, **config): + print(time.ctime() +": Updated preset on %s." 
% config['hostname']) + return True + + return False + +def ipmi_raw_cmd(raw_cmd, hostname = 'localhost', username=None, password=None, use_env=False): + + if hostname == 'localhost': + if os.geteuid() != 0: + print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.") + sys.exit(1) + cmd = 'ipmitool raw %s' % raw_cmd + else: + if use_env: + cmd_pass = '-E' + else: + cmd_pass = '-P %s' % shlex.quote(password) + cmd = 'ipmitool -I lanplus -U %s %s -H %s raw %s' % (shlex.quote(username), cmd_pass, hostname, raw_cmd) + + try: + s = subprocess.check_output(cmd + " 2>&1", shell=True) + except subprocess.CalledProcessError, ex: + print(time.ctime() +": Error: Problem running ipmitool") + print(time.ctime() +": Command: %s" % cmd) + print(time.ctime() +": Return code: %d" % ex) + return False + + out = s.strip() + if out: + return out + else: + return True + +def ipmi_fan_status(hostname = 'localhost', username=None, password=None, use_env=False): + cmd = 'ipmitool sensor | grep FAN' + + if hostname == 'localhost': + if os.geteuid() != 0: + print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.") + sys.exit(1) + cmd = 'ipmitool sensor | grep FAN ' + else: + if use_env: + cmd_pass = '-E' + else: + cmd_pass = '-P %s' % shlex.quote(password) + cmd = 'ipmitool -I lanplus -U %s %s -H %s sensor | grep FAN' % (shlex.quote(username), cmd_pass, hostname) + try: + s = subprocess.check_output(cmd + " 2>&1", shell=True) + except subprocess.CalledProcessError, ex: + print(time.ctime() +": Error: Problem running ipmitool") + print(time.ctime() +": Command: %s" % cmd) + print(time.ctime() +": Return code: %d" % ex) + return False + + fan_status_return = {} + for fan_str in s.split("\n"): + if len(fan_str.strip()) > 0: + fan_stat = fan_str.split("|") + fan_name = fan_stat[0].strip() + fan_rpm = float(fan_stat[1].strip()) + fan_status_return[fan_name] = fan_rpm + return fan_status_return + +def 
get_preset(config): + try: + s = ipmi_raw_cmd('0x30 0x45 0x00', **config) + if s is False: + return False + return int(s) + except: + return False diff --git a/superfans_gpu_controller.py b/superfans_gpu_controller.py new file mode 100644 index 0000000..a223fd2 --- /dev/null +++ b/superfans_gpu_controller.py @@ -0,0 +1,130 @@ +# Superfans GPU controller +# +# author: Domen Tabernik +# 2019 + +import time, superfans, subprocess + +def retrieve_nvidia_gpu_temperature(): + cmd = 'nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader' + + s = subprocess.check_output(cmd + " 2>&1", shell=True) + if len(s) <= 0: + return False + + out = [int(x.strip()) for x in s.split("\n") if len(x.strip()) > 0] + if out: + return out + else: + return False + +def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=1.0): + """ + Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`. + After the loop the default preset is restored. + + :param fan_settings: dictionary that maps the temperature in deg C to % of fan speed + :param FAN_INCREASED_MIN_TIME: minimal time before a fan speed is again reduced (based on previous change) default=120 + :param sleep_sec: loop sleep time (default=2 sec) + :param gpu_moving_avg_num: moving average for GPU i.e. the number of last measurements that are averaged (default=5) + :param fan_target_eps: tolerance of fan target w.r.t. 
the the actual value in deg C (default=1.0) + :return: + """ + superfan_config = dict(hostname= 'localhost') + + # save default present before changing anything + default_preset = superfans.get_preset(superfan_config) + print(time.ctime() + ': Started fan control using GPU temperature.') + print(time.ctime() + ': Using settings:') + for k in sorted(fan_settings.keys()): + print(time.ctime() + ': \t%d C = %d ' % (k, fan_settings[k]) + "%") + print(time.ctime() + ':') + try: + + FAN_MEMBERS = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1] + \ + superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2] + + # GPU moving average + previous_target_fan = None + previous_update_time = None + + prev_GPU_temp = [] + + while True: + + # get GPU temperature + GPU_temp = retrieve_nvidia_gpu_temperature() + + prev_GPU_temp.append(GPU_temp) + + # continue until we have enough sampels for moving average + if len(prev_GPU_temp) < gpu_moving_avg_num: + continue + + # retain last 5 mesurements + prev_GPU_temp = prev_GPU_temp[-gpu_moving_avg_num:] + mean_GPU_temp = prev_GPU_temp[0] + for gpu_temp in prev_GPU_temp[1:]: + mean_GPU_temp = [x+y for x,y in zip(gpu_temp, mean_GPU_temp)] + + mean_GPU_temp = [x/len(prev_GPU_temp) for x in mean_GPU_temp] + + max_gpu_temp = max(mean_GPU_temp) + + for key_temp in sorted(fan_settings.keys())[::-1]: + if key_temp <= max_gpu_temp: + target_fan = fan_settings[key_temp] + break + + + current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS) + current_update_time = time.time() + diff_sys1_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1]] + diff_sys2_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2]] + + disbale_update = False + + if previous_update_time is not None and previous_target_fan is not None: + has_enough_time_elapsed = current_update_time - previous_update_time > FAN_INCREASED_MIN_TIME + 
is_level_down_change = target_fan < previous_target_fan + disbale_update = True if is_level_down_change and not has_enough_time_elapsed else False + + if not disbale_update: + # Allow for 1% difference in target + update_sys1_fan = any([d > fan_target_eps for d in diff_sys1_fan]) + update_sys2_fan = any([d > fan_target_eps for d in diff_sys2_fan]) + if update_sys1_fan: + superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1) + + if update_sys2_fan: + superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2) + + if update_sys1_fan or update_sys2_fan: + print(time.ctime() + ': \tCurrent GPU measurements: %s' % ','.join(map(str,GPU_temp))) + print(time.ctime() + ': \tMoving average GPU measurements: %s' % ','.join(map(str,mean_GPU_temp))) + print(time.ctime() + ': \tTarget difference: SYS1 fan = %f; SYS2 fan = %f' % (max(diff_sys1_fan), max(diff_sys2_fan))) + print(time.ctime() + ':') + + + previous_target_fan = target_fan + previous_update_time = current_update_time + + time.sleep(sleep_sec) + finally: + # revert back to default preset before finishing + superfans.set_preset(superfan_config, default_preset) + print(time.ctime() + ': Reverted back to system default.') + + + + +if __name__ == "__main__": + # fan settings = {[in deg C]: [% fan], ...} + fan_settings = {0: 25, + 60: 30, + 70: 36, + 80: 40, + 85: 45, + 90: 50} + + superfans_gpu_controller(fan_settings) \ No newline at end of file