Initial commit (working version).
This commit is contained in:
commit
1f8148346f
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2018 Chris Putnam
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
|
@ -0,0 +1,18 @@
|
||||||
|
# NVIDIA GPU-based FAN controller for SUPERMICRO server
|
||||||
|
|
||||||
|
This controller automatically adjusts the fans of SUPERMICRO servers based on GPU temperature. Only NVIDIA GPUs are supported, since the tool uses `nvidia-smi` to read the GPU temperature. The fans are controlled through the IPMI tool (`ipmitool`) using a modified version of the superfans script (https://github.com/putnam/superfans/blob/master/superfans).
|
||||||
|
|
||||||
|
# Requirements
|
||||||
|
|
||||||
|
* Linux-based systems only
|
||||||
|
* Python 2.7
|
||||||
|
* nvidia drivers/tools (nvidia-smi)
|
||||||
|
* IPMI tool (`ipmitool`) with the kernel IPMI module loaded
|
||||||
|
|
||||||
|
|
||||||
|
# Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python superfans_gpu_controller.py
|
||||||
|
```
|
||||||
|
|
|
@ -0,0 +1,233 @@
|
||||||
|
# superfans
|
||||||
|
# https://github.com/putnam/superfans
|
||||||
|
#
|
||||||
|
# 2019: modified by Domen Tabernik
|
||||||
|
#
|
||||||
|
|
||||||
|
import os, sys, subprocess, time, shutil, shlex
|
||||||
|
|
||||||
|
# Fan-mode preset codes. set_preset() sends the code as the last byte of the
# `0x30 0x45 0x01` raw IPMI command and get_preset() returns it as an int.
FAN_PRESET_STANDARD = 0
FAN_PRESET_FULL = 1
FAN_PRESET_OPTIMAL = 2
FAN_PRESET_HEAVYIO = 4

# Every preset code this module accepts.
FAN_PRESETS = [
    FAN_PRESET_STANDARD,
    FAN_PRESET_FULL,
    FAN_PRESET_OPTIMAL,
    FAN_PRESET_HEAVYIO,
]

# Name -> code lookup (currently disabled):
#FAN_PRESETS_STR={
#    'standard' : FAN_PRESET_STANDARD,
#    'full' : FAN_PRESET_FULL,
#    'optimal' : FAN_PRESET_OPTIMAL,
#    'heavyio' : FAN_PRESET_HEAVYIO
#}

# Human-readable description of each preset code.
FAN_PRESETS_DESC = {
    FAN_PRESET_STANDARD: "Standard (Temp controlled, target 50%)",
    FAN_PRESET_FULL: "Full (All fans at 100%)",
    FAN_PRESET_OPTIMAL: "Optimal (Temp controlled, target 30%)",
    FAN_PRESET_HEAVYIO: "Heavy IO (Temp controlled, CPU target 50%; Peripherals target 75%",
}
|
||||||
|
|
||||||
|
# Fan zone codes; set_fan() sends the code as the zone byte of the raw command.
FAN_ZONE_CPU1 = 0  # marked as FAN10 for CPU1 (right one)
FAN_ZONE_CPU2 = 1  # marked as FAN9 for CPU2 (left one)
FAN_ZONE_SYS2 = 2  # marked as FAN1-4 (right ones)
FAN_ZONE_SYS1 = 3  # marked as FAN5-8 (left ones)

FAN_ZONES = [FAN_ZONE_CPU1, FAN_ZONE_CPU2, FAN_ZONE_SYS2, FAN_ZONE_SYS1]

# Short zone names used in log messages.
FAN_ZONES_STR = {
    FAN_ZONE_CPU1: 'cpu1',
    FAN_ZONE_CPU2: 'cpu2',
    FAN_ZONE_SYS2: 'system2',
    FAN_ZONE_SYS1: 'system1',
}

# Fan names; ipmi_fan_status() uses the same strings as keys of its result.
(FAN1, FAN2, FAN3, FAN4, FAN5,
 FAN6, FAN7, FAN8, FAN9, FAN10) = ('FAN%d' % number for number in range(1, 11))

# Fans belonging to each zone: a single name for the CPU zones, a list of
# names for the system zones.
FAN_ZONES_MEMBERS = {
    FAN_ZONE_CPU1: FAN10,
    FAN_ZONE_CPU2: FAN9,
    FAN_ZONE_SYS2: [FAN1, FAN2, FAN3, FAN4],
    FAN_ZONE_SYS1: [FAN5, FAN6, FAN7, FAN8],
}
|
||||||
|
|
||||||
|
# Linear RPM -> % model for the SYS1 and SYS2 fans, based on observations on
# a SUPERMICRO_4029GP_TRT2.
def SUPERMICRO_4029GP_TRT2_RPM_to_percent(rpm):
    """
    Convert a fan speed in RPM to a fan level in % using the observed
    linear relation (slope 0.0098 % per RPM, offset -11.5479 %).
    """
    percent_per_rpm = 0.0098
    percent_at_zero_rpm = -11.5479
    return rpm * percent_per_rpm + percent_at_zero_rpm
|
||||||
|
|
||||||
|
def set_fan_with_full_preset(config, speed, zone):
    """
    Set fan speed to a fixed %, switching the fan controller to the Full preset first.

    :param config: keyword arguments forwarded to ipmi_raw_cmd (must contain 'hostname')
    :param speed: integer fan duty in % (sent as one hex byte)
    :param zone: 'cpu', 'periph' or 'all' (both zones)
    :return: True when every requested zone was updated, False otherwise
    """
    if zone not in ('all', 'cpu', 'periph'):
        # Previously an unknown zone sent nothing but was still reported as a success.
        print(time.ctime() +": Unknown fan zone %r; expected 'all', 'cpu' or 'periph'." % (zone,))
        return False

    # Make sure fans are on Full setting, or else this won't stick for long
    current_preset = get_preset(config)
    if current_preset is False:
        print(time.ctime() +": Unable to get current fan status; exiting")
        return False

    if current_preset != FAN_PRESET_FULL:
        print(time.ctime() +": The fan controller is currently not set to Full mode (required for manual fan settings, which will otherwise be adjusted by the BMC within minutes); setting it now.")
        # set_preset() only accepts the integer preset codes; passing the
        # string 'full' was always rejected, so the mode was never switched.
        if not set_preset(config, preset=FAN_PRESET_FULL):
            print(time.ctime() +": Unable to update fans.")
            return False
        print(time.ctime() +": Waiting 5 seconds to let fans spin up...")
        time.sleep(5)

    ok = True
    if zone == 'all' or zone == 'cpu':
        ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x00 0x%02x' % speed, **config)
    # The peripheral zone is only attempted when the CPU zone (if requested) succeeded.
    if ok and (zone == 'all' or zone == 'periph'):
        ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x01 0x%02x' % speed, **config)

    if ok:
        print(time.ctime() +": Set %s fans on %s to %d%%." % (zone, config['hostname'], speed))
        return True

    print(time.ctime() +": Unable to update fans.")
    return False
|
||||||
|
|
||||||
|
def set_fan(config, speed, zone):
    """
    Set the fans of one zone to a fixed %.

    The server will change the value again unless it is in the FULL preset,
    so this needs to be called periodically.

    :param config: keyword arguments forwarded to ipmi_raw_cmd (must contain 'hostname')
    :param speed: integer fan duty in % (sent as one hex byte)
    :param zone: one of the FAN_ZONE_* codes (sent as one hex byte)
    :return: True on success, False when the IPMI command failed
    """
    raw_command = '0x30 0x70 0x66 0x01 0x%02x 0x%02x' % (zone, speed)

    if not ipmi_raw_cmd(raw_command, **config):
        print(time.ctime() +": Unable to update fans.")
        return False

    print(time.ctime() +": Set %s fans on %s to %d%%." % (FAN_ZONES_STR[zone], config['hostname'], speed))
    return True
|
||||||
|
|
||||||
|
def get_fan(config, fan):
    """
    Get fan speed in % (for one or more fans).

    :param config: keyword arguments forwarded to ipmi_fan_status
    :param fan: a single fan name (e.g. 'FAN1') or a list/tuple of fan names
    :return: for a single name, its level in %, or False when that fan is not
             reported; for a list/tuple, a dict {name: level in %} holding only
             the fans that are reported; False when the fan status could not
             be read at all
    """
    fan_status = ipmi_fan_status(**config)
    if fan_status is False:
        # ipmitool failed; there is nothing to look up.
        return False

    if isinstance(fan, (list, tuple)):
        return dict(
            (name, SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_status[name]))
            for name in fan
            if name in fan_status
        )

    if fan in fan_status:
        return SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_status[fan])

    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _set_preset(config):
    """
    Print the current fan controller preset and the fan duty of both zones.

    Despite its name this function changes nothing; it only queries and prints.

    :param config: keyword arguments forwarded to ipmi_raw_cmd
    :return: True when everything was read and printed, False when a query failed
    """
    preset_code = get_preset(config)
    if preset_code is False:
        return False

    # manual fan ctl get(0)/set(1) cpu(0)/periph(1) duty(0-0x64)
    # 0x30 0x70 0x66 0x00 0x00 0x64
    cpu_zone_duty = ipmi_raw_cmd('0x30 0x70 0x66 0x00 0x00', **config)
    if cpu_zone_duty is False:
        return False

    periph_zone_duty = ipmi_raw_cmd('0x30 0x70 0x66 0x00 0x01', **config)
    if periph_zone_duty is False:
        return False

    if preset_code in FAN_PRESETS:
        description = FAN_PRESETS_DESC[preset_code]
    else:
        description = "Unknown status code %d" % preset_code

    # The raw command output is parsed as a hexadecimal duty value.
    print(time.ctime() +": Preset: %s" % description)
    print(time.ctime() +": Current fan speed (CPU Zone): %d%%" % int(cpu_zone_duty, 16))
    print(time.ctime() +": Current fan speed (Peripheral zone): %d%%" % int(periph_zone_duty, 16))
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def set_preset(config, preset):
    """
    Switch the fan controller to one of the FAN_PRESET_* modes.

    :param config: keyword arguments forwarded to ipmi_raw_cmd (must contain 'hostname')
    :param preset: one of the integer codes listed in FAN_PRESETS
    :return: True when the preset was applied, False for an unknown preset
             or a failed IPMI command
    """
    if preset in FAN_PRESETS:
        applied = ipmi_raw_cmd("0x30 0x45 0x01 0x0%d" % preset, **config)
        if applied:
            print(time.ctime() +": Updated preset on %s." % config['hostname'])
            return True

    return False
|
||||||
|
|
||||||
|
def ipmi_raw_cmd(raw_cmd, hostname='localhost', username=None, password=None, use_env=False):
    """
    Run `ipmitool raw <raw_cmd>` locally or against a remote BMC.

    :param raw_cmd: space-separated raw bytes, e.g. '0x30 0x45 0x00'
    :param hostname: 'localhost' to use the local interface (requires root;
                     the process exits with status 1 otherwise), anything else
                     to use the lanplus interface against that host
    :param username: remote user name (remote mode only)
    :param password: remote password, passed with -P unless use_env is set
    :param use_env: pass -E to ipmitool instead of -P <password>
    :return: the stripped command output, True when the command succeeded
             without output, or False when ipmitool returned an error
    """
    # shlex.quote only exists on Python 3; Python 2 offers the same function as pipes.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if hostname == 'localhost':
        if os.geteuid() != 0:
            print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.")
            sys.exit(1)
        cmd = 'ipmitool raw %s' % raw_cmd
    else:
        if use_env:
            cmd_pass = '-E'
        else:
            cmd_pass = '-P %s' % quote(password)
        # The host name ends up in a shell command line as well, so quote it too.
        cmd = 'ipmitool -I lanplus -U %s %s -H %s raw %s' % (quote(username), cmd_pass, quote(hostname), raw_cmd)

    try:
        # universal_newlines=True yields text (not bytes) on Python 3 as well.
        s = subprocess.check_output(cmd + " 2>&1", shell=True, universal_newlines=True)
    except subprocess.CalledProcessError as ex:
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % cmd)
        # Format the numeric exit status; the exception object itself is not a number.
        print(time.ctime() +": Return code: %d" % ex.returncode)
        return False

    out = s.strip()
    return out if out else True
|
||||||
|
|
||||||
|
def _parse_fan_sensor_lines(sensor_output):
    """Map each fan name to its RPM reading, skipping rows without a numeric reading."""
    readings = {}
    for row in sensor_output.splitlines():
        columns = row.split("|")
        if len(columns) < 2:
            continue
        try:
            readings[columns[0].strip()] = float(columns[1].strip())
        except ValueError:
            # Non-numeric reading column (such as `na`): this fan has no usable value.
            continue
    return readings


def ipmi_fan_status(hostname='localhost', username=None, password=None, use_env=False):
    """
    Read the fan sensors through `ipmitool sensor | grep FAN`.

    :param hostname: 'localhost' to use the local interface (requires root;
                     the process exits with status 1 otherwise), anything else
                     to use the lanplus interface against that host
    :param username: remote user name (remote mode only)
    :param password: remote password, passed with -P unless use_env is set
    :param use_env: pass -E to ipmitool instead of -P <password>
    :return: dict {fan name: RPM as float} for every row with a numeric
             reading, or False when the command pipeline failed (which
             includes grep finding no FAN row)
    """
    # shlex.quote only exists on Python 3; Python 2 offers the same function as pipes.quote.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if hostname == 'localhost':
        if os.geteuid() != 0:
            print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.")
            sys.exit(1)
        cmd = 'ipmitool sensor | grep FAN '
    else:
        if use_env:
            cmd_pass = '-E'
        else:
            cmd_pass = '-P %s' % quote(password)
        # The host name ends up in a shell command line as well, so quote it too.
        cmd = 'ipmitool -I lanplus -U %s %s -H %s sensor | grep FAN' % (quote(username), cmd_pass, quote(hostname))

    try:
        # universal_newlines=True yields text (not bytes) on Python 3 as well.
        s = subprocess.check_output(cmd + " 2>&1", shell=True, universal_newlines=True)
    except subprocess.CalledProcessError as ex:
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % cmd)
        # Format the numeric exit status; the exception object itself is not a number.
        print(time.ctime() +": Return code: %d" % ex.returncode)
        return False

    return _parse_fan_sensor_lines(s)
|
||||||
|
|
||||||
|
def get_preset(config):
    """
    Read the current fan preset code from the fan controller.

    :param config: keyword arguments forwarded to ipmi_raw_cmd
    :return: the preset code as int, or False when the query failed or its
             output could not be parsed
    """
    s = ipmi_raw_cmd('0x30 0x45 0x00', **config)

    # ipmi_raw_cmd returns False on error and True when the command printed
    # nothing. int(True) is 1, which used to be misreported as the Full preset.
    if s is False or s is True:
        return False

    try:
        return int(s)
    except (TypeError, ValueError):
        # Catch parse failures only; a bare `except` also swallowed the
        # SystemExit raised by ipmi_raw_cmd when not running as root.
        return False
|
|
@ -0,0 +1,130 @@
|
||||||
|
# Superfans GPU controller
|
||||||
|
#
|
||||||
|
# author: Domen Tabernik
|
||||||
|
# 2019
|
||||||
|
|
||||||
|
import time, superfans, subprocess
|
||||||
|
|
||||||
|
def retrieve_nvidia_gpu_temperature():
    """
    Query the temperature of every GPU through nvidia-smi.

    :return: list with one integer temperature per output line of nvidia-smi,
             or False when nvidia-smi printed nothing
    :raises subprocess.CalledProcessError: when nvidia-smi exits with an error
    :raises ValueError: when an output line is not an integer
    """
    cmd = 'nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader'

    # universal_newlines=True yields text on Python 3 as well; splitting the
    # raw bytes output by a str separator raises TypeError there.
    output = subprocess.check_output(cmd + " 2>&1", shell=True, universal_newlines=True)

    temperatures = [int(line.strip()) for line in output.splitlines() if line.strip()]
    return temperatures if temperatures else False
|
||||||
|
|
||||||
|
def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=1.0):
    """
    Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`.
    After the loop the default preset is restored.

    :param fan_settings: dictionary that maps the temperature in deg C to % of fan speed; the entry with the
                         highest temperature not above the hottest averaged GPU is used (the entry with the
                         lowest temperature when the GPU is colder than all of them)
    :param FAN_INCREASED_MIN_TIME: minimal time in seconds since the last change of the fan target before the
                                   target may be reduced again (default=120)
    :param sleep_sec: loop sleep time (default=2 sec)
    :param gpu_moving_avg_num: moving average for GPU i.e. the number of last measurements that are averaged (default=5)
    :param fan_target_eps: tolerated difference between the measured fan level and the fan target, in % (default=1.0)
    :return: does not return normally; the loop runs until an exception (e.g. KeyboardInterrupt) ends it
    :raises ValueError: when `fan_settings` is empty
    """
    if not fan_settings:
        raise ValueError('fan_settings must contain at least one temperature threshold')

    superfan_config = dict(hostname='localhost')

    # save default preset before changing anything
    default_preset = superfans.get_preset(superfan_config)

    thresholds = sorted(fan_settings.keys())

    print(time.ctime() + ': Started fan control using GPU temperature.')
    print(time.ctime() + ': Using settings:')
    for k in thresholds:
        print(time.ctime() + ': \t%d C = %d ' % (k, fan_settings[k]) + "%")
    print(time.ctime() + ':')

    try:
        sys1_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1]
        sys2_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2]
        FAN_MEMBERS = sys1_fans + sys2_fans

        # Last accepted fan target and the time at which it last changed.
        previous_target_fan = None
        previous_update_time = None

        # GPU moving average window (one list of per-GPU temperatures per sample)
        prev_GPU_temp = []

        while True:
            # get GPU temperature
            GPU_temp = retrieve_nvidia_gpu_temperature()
            if GPU_temp is False:
                # No reading this round; try again instead of averaging a non-list.
                time.sleep(sleep_sec)
                continue

            prev_GPU_temp.append(GPU_temp)

            # continue until we have enough samples for moving average
            if len(prev_GPU_temp) < gpu_moving_avg_num:
                # Sleep here as well, so the warm-up samples are spread over time
                # instead of being collected in a tight loop.
                time.sleep(sleep_sec)
                continue

            # retain the last `gpu_moving_avg_num` measurements
            prev_GPU_temp = prev_GPU_temp[-gpu_moving_avg_num:]

            # Per-GPU mean over the window. float() keeps the fractional part
            # on Python 2, where int / int truncates.
            mean_GPU_temp = [sum(samples) / float(len(samples)) for samples in zip(*prev_GPU_temp)]
            max_gpu_temp = max(mean_GPU_temp)

            # Highest threshold that is not above the hottest GPU. Fall back to
            # the lowest threshold, so target_fan is always defined.
            target_fan = fan_settings[thresholds[0]]
            for key_temp in reversed(thresholds):
                if key_temp <= max_gpu_temp:
                    target_fan = fan_settings[key_temp]
                    break

            current_update_time = time.time()

            # Hold-down: keep the previous (higher) target until enough time has
            # passed since the target last changed.
            if previous_target_fan is not None and target_fan < previous_target_fan:
                has_enough_time_elapsed = current_update_time - previous_update_time > FAN_INCREASED_MIN_TIME
                if not has_enough_time_elapsed:
                    target_fan = previous_target_fan

            current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS)
            if not isinstance(current_fan_levels, dict):
                # Fan levels could not be read; retry on the next round.
                time.sleep(sleep_sec)
                continue

            # Only fans that actually reported a level take part in the comparison.
            diff_sys1_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in sys1_fans if FAN in current_fan_levels]
            diff_sys2_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in sys2_fans if FAN in current_fan_levels]

            # Allow for `fan_target_eps` difference in target
            update_sys1_fan = any(d > fan_target_eps for d in diff_sys1_fan)
            update_sys2_fan = any(d > fan_target_eps for d in diff_sys2_fan)

            if update_sys1_fan:
                superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1)

            if update_sys2_fan:
                superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2)

            if update_sys1_fan or update_sys2_fan:
                max_diff_sys1 = max(diff_sys1_fan) if diff_sys1_fan else 0.0
                max_diff_sys2 = max(diff_sys2_fan) if diff_sys2_fan else 0.0
                print(time.ctime() + ': \tCurrent GPU measurements: %s' % ','.join(map(str,GPU_temp)))
                print(time.ctime() + ': \tMoving average GPU measurements: %s' % ','.join(map(str,mean_GPU_temp)))
                print(time.ctime() + ': \tTarget difference: SYS1 fan = %f; SYS2 fan = %f' % (max_diff_sys1, max_diff_sys2))
                print(time.ctime() + ':')

            # Restart the hold-down timer only when the target really changed.
            # Refreshing it on every pass made the hold-down last a single loop iteration.
            if target_fan != previous_target_fan:
                previous_target_fan = target_fan
                previous_update_time = current_update_time

            time.sleep(sleep_sec)
    finally:
        # revert back to default preset before finishing
        superfans.set_preset(superfan_config, default_preset)
        print(time.ctime() + ': Reverted back to system default.')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # (GPU temperature threshold in deg C, fan speed in %) pairs
    temperature_to_fan_percent = [
        (0, 25),
        (60, 30),
        (70, 36),
        (80, 40),
        (85, 45),
        (90, 50),
    ]

    # fan settings = {[in deg C]: [% fan], ...}
    fan_settings = dict(temperature_to_fan_percent)

    superfans_gpu_controller(fan_settings)
|
Loading…
Reference in New Issue