Initial commit (working version).

This commit is contained in:
Domen Tabernik 2019-03-08 19:24:59 +00:00
commit 1f8148346f
4 changed files with 402 additions and 0 deletions

21
LICENCE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Chris Putnam
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

18
README.md Normal file
View File

@ -0,0 +1,18 @@
# NVIDIA GPU-based FAN controller for SUPERMICRO server
This controller automatically adjusts the fans in SUPERMICRO servers based on GPU temperature. Only NVIDIA GPUs are supported, since the tool uses `nvidia-smi` to read the GPU temperature. Fans are controlled through the IPMI tool (`ipmitool`) using a modified version of the superfans script (https://github.com/putnam/superfans/blob/master/superfans).
# Requirements
* Linux only
* Python 2.7
* nvidia drivers/tools (nvidia-smi)
* IPMI tool (`ipmitool`) with the kernel IPMI module loaded (local access requires running as root)
# Usage
```bash
python superfans_gpu_controller.py
```

233
superfans.py Normal file
View File

@ -0,0 +1,233 @@
# superfans
# https://github.com/putnam/superfans
#
# 2019: modified by Domen Tabernik
#
import os, sys, subprocess, time, shutil, shlex
# Fan controller presets. The integer value is the preset byte that
# set_preset() sends to the BMC and that get_preset() reads back.
FAN_PRESET_STANDARD = 0
FAN_PRESET_FULL = 1
FAN_PRESET_OPTIMAL = 2
FAN_PRESET_HEAVYIO = 4

# All preset codes known to this module (used for validation).
FAN_PRESETS = [FAN_PRESET_STANDARD, FAN_PRESET_FULL, FAN_PRESET_OPTIMAL, FAN_PRESET_HEAVYIO]

# Name -> preset mapping, currently disabled; callers must pass the integer
# constants above.
#FAN_PRESETS_STR={
# 'standard' : FAN_PRESET_STANDARD,
# 'full' : FAN_PRESET_FULL,
# 'optimal' : FAN_PRESET_OPTIMAL,
# 'heavyio' : FAN_PRESET_HEAVYIO
#}

# Human-readable description of every preset (used when printing status).
FAN_PRESETS_DESC = {
    FAN_PRESET_STANDARD: "Standard (Temp controlled, target 50%)",
    FAN_PRESET_FULL: "Full (All fans at 100%)",
    FAN_PRESET_OPTIMAL: "Optimal (Temp controlled, target 30%)",
    # The closing parenthesis was missing from this description.
    FAN_PRESET_HEAVYIO: "Heavy IO (Temp controlled, CPU target 50%; Peripherals target 75%)",
}
# Fan zones. The numeric value is the zone byte that set_fan() places
# into the raw IPMI command.
FAN_ZONE_CPU1 = 0  # marked as FAN10 for CPU1 (right one)
FAN_ZONE_CPU2 = 1  # marked as FAN9 for CPU2 (left one)
FAN_ZONE_SYS2 = 2  # marked as FAN1-4 (right ones)
FAN_ZONE_SYS1 = 3  # marked as FAN5-8 (left ones)

FAN_ZONES = [FAN_ZONE_CPU1, FAN_ZONE_CPU2, FAN_ZONE_SYS2, FAN_ZONE_SYS1]

# Zone id -> short name used in log messages.
FAN_ZONES_STR = dict(zip(FAN_ZONES, ['cpu1', 'cpu2', 'system2', 'system1']))

# Sensor names of the individual fans ('FAN1' ... 'FAN10'); these are the
# keys looked up in the dictionary returned by ipmi_fan_status().
(FAN1, FAN2, FAN3, FAN4, FAN5,
 FAN6, FAN7, FAN8, FAN9, FAN10) = tuple('FAN%d' % number for number in range(1, 11))

# Zone id -> fan(s) belonging to that zone. The CPU zones map to a single
# fan name (a string); the system zones map to a list of fan names.
FAN_ZONES_MEMBERS = {
    FAN_ZONE_CPU1: FAN10,
    FAN_ZONE_CPU2: FAN9,
    FAN_ZONE_SYS2: [FAN1, FAN2, FAN3, FAN4],
    FAN_ZONE_SYS1: [FAN5, FAN6, FAN7, FAN8],
}
# Empirical RPM -> duty-cycle conversion for the SYS1 and SYS2 fans,
# fitted as a straight line from observations on a SUPERMICRO_4029GP_TRT2.
def SUPERMICRO_4029GP_TRT2_RPM_to_percent(rpm):
    """
    Convert a fan speed in RPM to a percentage using the observed linear fit.

    :param rpm: fan speed in revolutions per minute
    :return: corresponding fan level in % (result of the linear equation; not clamped)
    """
    slope = 0.0098
    offset = 11.5479
    return rpm * slope - offset
def set_fan_with_full_preset(config, speed, zone):
    """
    Set fan speed to a fixed %, switching the controller to the Full preset first.

    Manual fan settings only stick while the fan controller is in Full mode,
    so the preset is checked and, if needed, changed before the speed is set.

    :param config: keyword arguments forwarded to ipmi_raw_cmd (must contain 'hostname')
    :param speed: target fan duty in % (integer)
    :param zone: 'cpu', 'periph' or 'all'
    :return: True when every requested zone was updated, False otherwise
    """
    if zone not in ('all', 'cpu', 'periph'):
        # Previously an unknown zone sent nothing and still reported success.
        print(time.ctime() + ": Unknown fan zone %r (expected 'all', 'cpu' or 'periph')." % (zone,))
        return False

    # Make sure fans are on Full setting, or else this won't stick for long
    current_preset = get_preset(config)
    if current_preset is False:
        print(time.ctime() +": Unable to get current fan status; exiting")
        return False

    if current_preset != FAN_PRESET_FULL:
        print(time.ctime() +": The fan controller is currently not set to Full mode (required for manual fan settings, which will otherwise be adjusted by the BMC within minutes); setting it now.")
        # set_preset() only accepts the integer preset constants; the string
        # 'full' used here before was silently rejected.
        if not set_preset(config, preset=FAN_PRESET_FULL):
            print(time.ctime() + ": Unable to switch the fan controller to Full mode.")
            return False
        print(time.ctime() +": Waiting 5 seconds to let fans spin up...")
        time.sleep(5)

    ok = True
    if zone == 'all' or zone == 'cpu':
        ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x00 0x%02x' % speed, **config)
    # Only touch the peripheral zone if the CPU zone (when requested) succeeded.
    if ok and (zone == 'all' or zone == 'periph'):
        ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x01 0x%02x' % speed, **config)

    if ok:
        print(time.ctime() +": Set %s fans on %s to %d%%." % (zone, config['hostname'], speed))
        return True

    print(time.ctime() +": Unable to update fans.")
    return False
def set_fan(config, speed, zone):
    """
    Request a fixed fan duty (in %) for a single zone.

    The preset is not changed here; outside of the FULL preset the server
    will override the value again, so this has to be called periodically.

    :param config: keyword arguments forwarded to ipmi_raw_cmd (must contain 'hostname')
    :param speed: target fan duty in % (integer)
    :param zone: one of the FAN_ZONE_* ids
    :return: True on success, False when the IPMI command failed
    """
    raw_request = '0x30 0x70 0x66 0x01 0x%02x 0x%02x' % (zone, speed)

    if not ipmi_raw_cmd(raw_request, **config):
        print(time.ctime() +": Unable to update fans.")
        return False

    print(time.ctime() +": Set %s fans on %s to %d%%." % (FAN_ZONES_STR[zone], config['hostname'], speed))
    return True
def get_fan(config, fan):
    """
    Get fan speed in % (for one or more fans).

    :param config: keyword arguments forwarded to ipmi_fan_status
    :param fan: a single fan name (e.g. 'FAN1') or a list/tuple of fan names
    :return: for a list/tuple, a dict {fan name: %} containing only the fans
             that were reported; for a single name, its % value or False when
             the fan was not reported. False is also returned when the fan
             status could not be read at all.
    """
    fan_status_list = ipmi_fan_status(**config)
    if fan_status_list is False:
        # ipmi_fan_status signals failure with False; do not try to index it.
        return False

    if isinstance(fan, (list, tuple)):
        # `in` works on both Python 2 and 3 (dict.has_key is Python 2 only).
        return dict(
            (name, SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_status_list[name]))
            for name in fan
            if name in fan_status_list
        )

    if fan in fan_status_list:
        return SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_status_list[fan])

    return False
def _set_preset(config):
    """
    Print the current fan controller preset and the fan duty of both zones.

    Despite its name this function changes nothing; it only queries and reports.

    :param config: keyword arguments forwarded to ipmi_raw_cmd
    :return: True when everything was read and printed, False on any failure
    """
    preset_code = get_preset(config)
    if preset_code is False:
        return False

    description = FAN_PRESETS_DESC[preset_code] if preset_code in FAN_PRESETS \
        else "Unknown status code %d" % preset_code

    # manual fan ctl get(0)/set(1) cpu(0)/periph(1) duty(0-0x64)
    # 0x30 0x70 0x66 0x00 0x00 0x64
    zone_replies = []
    for zone_byte in ('0x00', '0x01'):
        reply = ipmi_raw_cmd('0x30 0x70 0x66 0x00 ' + zone_byte, **config)
        if reply is False:
            return False
        zone_replies.append(reply)
    cpu_reply, periph_reply = zone_replies

    print(time.ctime() +": Preset: %s" % description)
    # the replies are parsed as hexadecimal numbers
    print(time.ctime() +": Current fan speed (CPU Zone): %d%%" % int(cpu_reply, 16))
    print(time.ctime() +": Current fan speed (Peripheral zone): %d%%" % int(periph_reply, 16))
    return True
def set_preset(config, preset):
    """
    Switch the fan controller to one of the known presets.

    :param config: keyword arguments forwarded to ipmi_raw_cmd (must contain 'hostname')
    :param preset: one of the FAN_PRESET_* codes listed in FAN_PRESETS
    :return: True when the preset was updated, False for an unknown preset
             or a failed IPMI command
    """
    if preset not in FAN_PRESETS:
        return False

    updated = ipmi_raw_cmd("0x30 0x45 0x01 0x0%d" % preset, **config)
    if not updated:
        return False

    print(time.ctime() +": Updated preset on %s." % config['hostname'])
    return True
def ipmi_raw_cmd(raw_cmd, hostname = 'localhost', username=None, password=None, use_env=False):
    """
    Run `ipmitool raw <raw_cmd>` locally or against a remote host.

    The command is executed without a shell (argument list), so the hostname
    and credentials need no quoting and cannot be interpreted by a shell.

    :param raw_cmd: whitespace-separated raw bytes, e.g. '0x30 0x45 0x00'
    :param hostname: 'localhost' to use the local interface (requires root; the
                     process exits with status 1 otherwise), anything else is
                     contacted through the lanplus interface
    :param username: remote user name (passed with -U when given)
    :param password: remote password (passed with -P when given and use_env is False)
    :param use_env: pass -E to ipmitool instead of -P
                    (presumably the password is then taken from the environment - confirm in the ipmitool manual)
    :return: the stripped command output as a string, True when the command
             succeeded without output, False when running ipmitool failed
    """
    if hostname == 'localhost':
        if os.geteuid() != 0:
            print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.")
            sys.exit(1)
        args = ['ipmitool']
    else:
        args = ['ipmitool', '-I', 'lanplus']
        if username is not None:
            args += ['-U', str(username)]
        if use_env:
            args.append('-E')
        elif password is not None:
            args += ['-P', str(password)]
        args += ['-H', str(hostname)]
    args += ['raw'] + raw_cmd.split()

    try:
        # stderr is merged into stdout, as the previous "2>&1" did
        s = subprocess.check_output(args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as ex:
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(args))
        # the exit status lives in .returncode; formatting the exception itself with %d fails
        print(time.ctime() +": Return code: %d" % ex.returncode)
        return False
    except OSError as ex:
        # without a shell, a missing or non-executable ipmitool surfaces as OSError
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(args))
        print(time.ctime() +": Reason: %s" % ex)
        return False

    if not isinstance(s, str):
        # Python 3 returns bytes; hand callers a native string on every version
        s = s.decode('utf-8', 'replace')

    out = s.strip()
    if out:
        return out
    return True
def ipmi_fan_status(hostname = 'localhost', username=None, password=None, use_env=False):
    """
    Read the fan sensors through `ipmitool sensor`.

    Only output lines containing 'FAN' are used (previously done by piping
    through grep); the first '|'-separated column is taken as the fan name and
    the second as its reading.

    :param hostname: 'localhost' to use the local interface (requires root; the
                     process exits with status 1 otherwise), anything else is
                     contacted through the lanplus interface
    :param username: remote user name (passed with -U when given)
    :param password: remote password (passed with -P when given and use_env is False)
    :param use_env: pass -E to ipmitool instead of -P
    :return: dict {fan name: reading as float}; fans whose reading is not a
             number are left out. False when ipmitool failed or reported no
             FAN sensor at all.
    """
    if hostname == 'localhost':
        if os.geteuid() != 0:
            print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.")
            sys.exit(1)
        args = ['ipmitool']
    else:
        args = ['ipmitool', '-I', 'lanplus']
        if username is not None:
            args += ['-U', str(username)]
        if use_env:
            args.append('-E')
        elif password is not None:
            args += ['-P', str(password)]
        args += ['-H', str(hostname)]
    args.append('sensor')

    try:
        # no shell involved; stderr is merged into stdout like the old "2>&1"
        s = subprocess.check_output(args, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as ex:
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(args))
        # the exit status lives in .returncode; formatting the exception itself with %d fails
        print(time.ctime() +": Return code: %d" % ex.returncode)
        return False
    except OSError as ex:
        # without a shell, a missing or non-executable ipmitool surfaces as OSError
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(args))
        print(time.ctime() +": Reason: %s" % ex)
        return False

    if not isinstance(s, str):
        # Python 3 returns bytes
        s = s.decode('utf-8', 'replace')

    fan_lines = [line for line in s.splitlines() if 'FAN' in line]
    if not fan_lines:
        # the former `grep FAN` exited non-zero here, which was reported as a failure
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(args))
        print(time.ctime() +": Reason: no FAN sensors reported")
        return False

    fan_status_return = {}
    for fan_str in fan_lines:
        fan_stat = fan_str.split("|")
        if len(fan_stat) < 2:
            continue
        try:
            fan_rpm = float(fan_stat[1].strip())
        except ValueError:
            # reading is not a number; skip this fan instead of crashing
            continue
        fan_status_return[fan_stat[0].strip()] = fan_rpm
    return fan_status_return
def get_preset(config):
    """
    Read the current fan controller preset.

    :param config: keyword arguments forwarded to ipmi_raw_cmd
    :return: the preset code as int (compare with the FAN_PRESET_* constants),
             or False when it could not be read or parsed
    """
    s = ipmi_raw_cmd('0x30 0x45 0x00', **config)
    # ipmi_raw_cmd returns False on failure and True when the command printed
    # nothing; int(True) would otherwise be misread as preset 1 (Full).
    if s is False or s is True:
        return False
    try:
        return int(s)
    except (TypeError, ValueError):
        # Only parsing problems are turned into False. The bare `except:` used
        # here before also swallowed the SystemExit raised by ipmi_raw_cmd's
        # root check.
        return False

130
superfans_gpu_controller.py Normal file
View File

@ -0,0 +1,130 @@
# Superfans GPU controller
#
# author: Domen Tabernik
# 2019
import time, superfans, subprocess
def retrieve_nvidia_gpu_temperature():
    """
    Query the temperature of every GPU through nvidia-smi.

    :return: list of int temperatures, one entry per non-empty output line,
             or False when nvidia-smi could not be run, failed, or printed
             something that is not a number
    """
    args = ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']
    try:
        # run without a shell; stderr is merged into stdout like the old "2>&1"
        s = subprocess.check_output(args, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError) as ex:
        print(time.ctime() + ': Error: Problem running nvidia-smi (%s)' % ex)
        return False

    if not isinstance(s, str):
        # Python 3 returns bytes
        s = s.decode('utf-8', 'replace')

    try:
        out = [int(line.strip()) for line in s.splitlines() if line.strip()]
    except ValueError:
        # stderr is merged in, so a message instead of numbers ends up here
        print(time.ctime() + ': Error: Unexpected nvidia-smi output: %s' % s.strip())
        return False

    if out:
        return out
    return False
def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=1.0):
    """
    Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`.
    When the loop ends (error or interrupt) the preset that was active at start is restored.

    :param fan_settings: dictionary that maps the temperature in deg C to % of fan speed;
                         the entry with the highest temperature not above the measured one is used
    :param FAN_INCREASED_MIN_TIME: minimal time in seconds before a fan speed is again reduced (based on previous change) default=120
    :param sleep_sec: loop sleep time (default=2 sec)
    :param gpu_moving_avg_num: moving average for GPU i.e. the number of last measurements that are averaged (default=5)
    :param fan_target_eps: tolerance between the target and the measured fan level, in % of fan speed (default=1.0)
    :raises ValueError: when `fan_settings` is empty
    :raises RuntimeError: when the GPU temperature cannot be read several times in a row
    :return: None (the loop only ends through an exception)
    """
    if not fan_settings:
        raise ValueError('fan_settings must contain at least one temperature threshold')

    superfan_config = dict(hostname= 'localhost')

    # save default preset before changing anything
    default_preset = superfans.get_preset(superfan_config)

    print(time.ctime() + ': Started fan control using GPU temperature.')
    print(time.ctime() + ': Using settings:')
    for k in sorted(fan_settings.keys()):
        print(time.ctime() + ': \t%d C = %d ' % (k, fan_settings[k]) + "%")
    print(time.ctime() + ':')

    # give up (and restore the preset) after this many consecutive failed GPU reads
    max_failed_reads = 5

    try:
        sys1_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1]
        sys2_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2]
        FAN_MEMBERS = sys1_fans + sys2_fans

        # thresholds from hottest to coldest; invariant across iterations
        thresholds_desc = sorted(fan_settings.keys(), reverse=True)
        window = max(1, int(gpu_moving_avg_num))

        previous_target_fan = None
        previous_update_time = None
        failed_reads = 0

        # GPU moving average
        prev_GPU_temp = []

        while True:
            # get GPU temperature
            GPU_temp = retrieve_nvidia_gpu_temperature()
            if not GPU_temp:
                # a failed read must not enter the moving average; tolerate
                # short glitches but stop if the sensor stays unavailable
                failed_reads += 1
                if failed_reads >= max_failed_reads:
                    raise RuntimeError('Unable to read GPU temperature %d times in a row' % failed_reads)
                print(time.ctime() + ': Unable to read GPU temperature; retrying.')
                time.sleep(sleep_sec)
                continue
            failed_reads = 0

            prev_GPU_temp.append(GPU_temp)

            # continue until we have enough samples for moving average
            if len(prev_GPU_temp) < window:
                continue

            # retain only the last `window` measurements
            prev_GPU_temp = prev_GPU_temp[-window:]

            # per-GPU mean over the window (true division on Python 2 as well)
            mean_GPU_temp = [sum(samples) / float(len(prev_GPU_temp)) for samples in zip(*prev_GPU_temp)]
            max_gpu_temp = max(mean_GPU_temp)

            target_fan = None
            for key_temp in thresholds_desc:
                if key_temp <= max_gpu_temp:
                    target_fan = fan_settings[key_temp]
                    break
            if target_fan is None:
                # colder than every configured threshold: use the lowest one
                # instead of an unbound/stale value
                target_fan = fan_settings[thresholds_desc[-1]]

            current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS) or {}
            current_update_time = time.time()

            # a fan without a reading counts as "off target" so it still gets updated
            diff_sys1_fan = [abs(current_fan_levels[FAN] - target_fan) if FAN in current_fan_levels else float('inf')
                             for FAN in sys1_fans]
            diff_sys2_fan = [abs(current_fan_levels[FAN] - target_fan) if FAN in current_fan_levels else float('inf')
                             for FAN in sys2_fans]

            disable_update = False
            if previous_update_time is not None and previous_target_fan is not None:
                has_enough_time_elapsed = current_update_time - previous_update_time > FAN_INCREASED_MIN_TIME
                is_level_down_change = target_fan < previous_target_fan
                # hold the higher speed until the minimal time has passed
                disable_update = is_level_down_change and not has_enough_time_elapsed

            if not disable_update:
                # Allow for `fan_target_eps` difference in target
                update_sys1_fan = any(d > fan_target_eps for d in diff_sys1_fan)
                update_sys2_fan = any(d > fan_target_eps for d in diff_sys2_fan)

                if update_sys1_fan:
                    superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1)
                if update_sys2_fan:
                    superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2)

                if update_sys1_fan or update_sys2_fan:
                    print(time.ctime() + ': \tCurrent GPU measurements: %s' % ','.join(map(str,GPU_temp)))
                    print(time.ctime() + ': \tMoving average GPU measurements: %s' % ','.join(map(str,mean_GPU_temp)))
                    print(time.ctime() + ': \tTarget difference: SYS1 fan = %f; SYS2 fan = %f' % (max(diff_sys1_fan), max(diff_sys2_fan)))
                    print(time.ctime() + ':')

                # NOTE(review): the original indentation of these two lines was
                # lost; they are assumed to run whenever an update was allowed,
                # so a pending decrease keeps the earlier reference - confirm.
                previous_target_fan = target_fan
                previous_update_time = current_update_time

            time.sleep(sleep_sec)
    finally:
        # revert back to default preset before finishing
        if default_preset is False:
            # the initial preset could not be read, so there is nothing to restore
            print(time.ctime() + ': Initial preset unknown; fan preset left unchanged.')
        elif superfans.set_preset(superfan_config, default_preset):
            print(time.ctime() + ': Reverted back to system default.')
        else:
            print(time.ctime() + ': Unable to revert back to system default.')
if __name__ == "__main__":
    # Fan curve as (GPU temperature in deg C, fan speed in %) pairs
    fan_settings = dict([
        (0, 25),
        (60, 30),
        (70, 36),
        (80, 40),
        (85, 45),
        (90, 50),
    ])
    superfans_gpu_controller(fan_settings)