Initial commit (working version).
This commit is contained in:
commit
1f8148346f
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2018 Chris Putnam
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,18 @@
|
|||
# NVIDIA GPU-based FAN controller for SUPERMICRO server
|
||||
|
||||
This controller automatically adjusts the fans in SUPERMICRO servers based on GPU temperature. Only NVIDIA GPUs are supported, since the tool uses nvidia-smi to read the GPU temperature. Fans are controlled through the IPMI tool (`ipmitool`) using a modified version of the superfans script (https://github.com/putnam/superfans/blob/master/superfans).
|
||||
|
||||
# Requirements
|
||||
|
||||
* Linux based only
|
||||
* Python 2.7
|
||||
* nvidia drivers/tools (nvidia-smi)
|
||||
* IPMI tool (ipmitool) with the kernel's IPMI module loaded
|
||||
|
||||
|
||||
# Usage
|
||||
|
||||
```bash
|
||||
python superfans_gpu_controller.py
|
||||
```
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
# superfans
|
||||
# https://github.com/putnam/superfans
|
||||
#
|
||||
# 2019: modified by Domen Tabernik
|
||||
#
|
||||
|
||||
import os, sys, subprocess, time, shutil, shlex
|
||||
|
||||
# Fan presets. The numeric value is the preset code that set_preset() sends as
# the last byte of the `0x30 0x45 0x01` raw IPMI command.
FAN_PRESET_STANDARD = 0
FAN_PRESET_FULL = 1
FAN_PRESET_OPTIMAL = 2
FAN_PRESET_HEAVYIO = 4
FAN_PRESETS = [FAN_PRESET_STANDARD, FAN_PRESET_FULL, FAN_PRESET_OPTIMAL, FAN_PRESET_HEAVYIO]

# Human-readable description of every preset (printed when reporting status).
FAN_PRESETS_DESC = {
    FAN_PRESET_STANDARD: "Standard (Temp controlled, target 50%)",
    FAN_PRESET_FULL: "Full (All fans at 100%)",
    FAN_PRESET_OPTIMAL: "Optimal (Temp controlled, target 30%)",
    FAN_PRESET_HEAVYIO: "Heavy IO (Temp controlled, CPU target 50%; Peripherals target 75%)",
}

# Fan zones. The numeric value is the zone byte used by set_fan().
FAN_ZONE_CPU1 = 0  # marked as FAN10 for CPU1 (right one)
FAN_ZONE_CPU2 = 1  # marked as FAN9 for CPU2 (left one)
FAN_ZONE_SYS2 = 2  # marked as FAN1-4 (right ones)
FAN_ZONE_SYS1 = 3  # marked as FAN5-8 (left ones)
FAN_ZONES = [FAN_ZONE_CPU1, FAN_ZONE_CPU2, FAN_ZONE_SYS2, FAN_ZONE_SYS1]

# Short zone names used in log messages.
FAN_ZONES_STR = {
    FAN_ZONE_CPU1: 'cpu1',
    FAN_ZONE_CPU2: 'cpu2',
    FAN_ZONE_SYS2: 'system2',
    FAN_ZONE_SYS1: 'system1',
}

# Sensor names of the individual fans, as looked up in the ipmi_fan_status() result.
FAN1 = 'FAN1'
FAN2 = 'FAN2'
FAN3 = 'FAN3'
FAN4 = 'FAN4'
FAN5 = 'FAN5'
FAN6 = 'FAN6'
FAN7 = 'FAN7'
FAN8 = 'FAN8'
FAN9 = 'FAN9'
FAN10 = 'FAN10'

# Zone -> member fan(s). CPU zones hold a single fan name (a string), system
# zones hold a list of names; get_fan() accepts both forms.
FAN_ZONES_MEMBERS = {
    FAN_ZONE_CPU1: FAN10,
    FAN_ZONE_CPU2: FAN9,
    FAN_ZONE_SYS2: [FAN1, FAN2, FAN3, FAN4],
    FAN_ZONE_SYS1: [FAN5, FAN6, FAN7, FAN8],
}
|
||||
|
||||
# based on observations on SUPERMICRO_4029GP_TRT2 the
|
||||
# SYS1 and SYS2 fans use the following linear equations to
|
||||
# convert from RPM to % value
|
||||
def SUPERMICRO_4029GP_TRT2_RPM_to_percent(rpm):
    """
    Convert a fan speed in RPM to an approximate duty-cycle percentage.

    Applies a fixed linear mapping (slope 0.0098 %/RPM, offset -11.5479 %).
    The result is not clamped, so very low RPM values yield negative numbers.

    :param rpm: fan speed in revolutions per minute
    :return: estimated fan level in percent
    """
    percent_per_rpm = 0.0098
    offset = 11.5479
    scaled = rpm * percent_per_rpm
    return scaled - offset
|
||||
|
||||
def set_fan_with_full_preset(config, speed, zone):
    """
    Set fan speed to a fixed %, switching the controller to the Full preset first.

    Some chassis implement separate fan "zones" named CPU and Peripheral; `zone`
    selects which of them is changed.

    :param config: keyword arguments for `ipmi_raw_cmd`; must contain 'hostname'
    :param speed: integer duty cycle in percent
    :param zone: 'cpu', 'periph' or 'all'
    :return: True if every required command succeeded, False otherwise
    """
    if zone not in ('all', 'cpu', 'periph'):
        # Previously an unknown zone sent nothing but still reported success.
        print(time.ctime() + ": Unknown fan zone %r (expected 'all', 'cpu' or 'periph')." % (zone,))
        return False

    # Make sure fans are on Full setting, or else this won't stick for long
    current_preset = get_preset(config)
    if current_preset is False:
        print(time.ctime() +": Unable to get current fan status; exiting")
        return False

    if current_preset != FAN_PRESET_FULL:
        print(time.ctime() +": The fan controller is currently not set to Full mode (required for manual fan settings, which will otherwise be adjusted by the BMC within minutes); setting it now.")
        # set_preset() only accepts the numeric preset codes, not their names.
        if not set_preset(config, preset=FAN_PRESET_FULL):
            print(time.ctime() + ": Unable to switch the fan controller to Full mode.")
            return False
        print(time.ctime() +": Waiting 5 seconds to let fans spin up...")
        time.sleep(5)

    ok = True
    if zone in ('all', 'cpu'):
        ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x00 0x%02x' % speed, **config)
    # Skip the peripheral zone when the CPU zone command already failed.
    if ok and zone in ('all', 'periph'):
        ok = ipmi_raw_cmd('0x30 0x70 0x66 0x01 0x01 0x%02x' % speed, **config)

    if not ok:
        print(time.ctime() +": Unable to update fans.")
        return False

    print(time.ctime() +": Set %s fans on %s to %d%%." % (zone, config['hostname'], speed))
    return True
|
||||
|
||||
def set_fan(config, speed, zone):
    """
    Request a fixed duty cycle (in %) for a single fan zone.

    The server overrides the value again unless the FULL preset is active, so
    callers have to invoke this periodically.

    :param config: keyword arguments for `ipmi_raw_cmd`; must contain 'hostname'
    :param speed: integer duty cycle in percent
    :param zone: numeric zone id (a key of FAN_ZONES_STR)
    :return: True on success, False if the IPMI command failed
    """
    raw_request = '0x30 0x70 0x66 0x01 0x%02x 0x%02x' % (zone, speed)

    if not ipmi_raw_cmd(raw_request, **config):
        print(time.ctime() +": Unable to update fans.")
        return False

    zone_name = FAN_ZONES_STR[zone]
    print(time.ctime() +": Set %s fans on %s to %d%%." % (zone_name, config['hostname'], speed))
    return True
|
||||
|
||||
def get_fan(config, fan):
    """
    Get fan speed in % (for one or more fans).

    :param config: keyword arguments forwarded to `ipmi_fan_status`
    :param fan: a single fan name, or a list of fan names
    :return: for a list, a dict {fan name: percent} holding only the fans that
             were reported; for a single name, its percent value, or False if
             that fan was not reported. False if the fan status could not be
             read at all.
    """
    fan_rpm_by_name = ipmi_fan_status(**config)
    if fan_rpm_by_name is False:
        # ipmi_fan_status() signals failure with False; pass that on instead
        # of failing on a membership test against a bool.
        return False

    if isinstance(fan, list):
        return dict(
            (name, SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_rpm_by_name[name]))
            for name in fan
            if name in fan_rpm_by_name
        )

    if fan in fan_rpm_by_name:
        return SUPERMICRO_4029GP_TRT2_RPM_to_percent(fan_rpm_by_name[fan])

    return False
|
||||
|
||||
|
||||
def _set_preset(config):
    """
    Retrieve and print the fan controller preset and the zone fan speeds.

    NOTE(review): despite its name this function changes nothing; it only
    reads the preset and the CPU / peripheral duty cycles and prints them.

    :param config: keyword arguments forwarded to `ipmi_raw_cmd`
    :return: True if all values were read and printed, False otherwise
    """
    status = get_preset(config)
    if status is False:
        return False

    if status in FAN_PRESETS:
        description = FAN_PRESETS_DESC[status]
    else:
        description = "Unknown status code %d" % status

    # manual fan ctl get(0)/set(1) cpu(0)/periph(1) duty(0-0x64)
    # 0x30 0x70 0x66 0x00 0x00 0x64
    duty_cycles = []
    for zone_byte in ('0x00', '0x01'):
        reply = ipmi_raw_cmd('0x30 0x70 0x66 0x00 %s' % zone_byte, **config)
        if reply is False:
            return False
        try:
            duty_cycles.append(int(reply, 16))
        except (TypeError, ValueError):
            # ipmi_raw_cmd() returns True for empty output, and the reply may
            # not be a single hex byte; neither can be reported as a speed.
            print(time.ctime() + ": Unexpected reply from ipmitool: %r" % (reply,))
            return False

    print(time.ctime() +": Preset: %s" % description)
    print(time.ctime() +": Current fan speed (CPU Zone): %d%%" % duty_cycles[0])
    print(time.ctime() +": Current fan speed (Peripheral zone): %d%%" % duty_cycles[1])
    return True
|
||||
|
||||
|
||||
def set_preset(config, preset):
    """
    Activate one of the fan presets listed in FAN_PRESETS.

    :param config: keyword arguments for `ipmi_raw_cmd`; must contain 'hostname'
    :param preset: numeric preset code (one of the FAN_PRESET_* values)
    :return: True if the preset was applied, False for an unknown preset or a
             failed IPMI command
    """
    if preset in FAN_PRESETS:
        raw_request = "0x30 0x45 0x01 0x0%d" % preset
        if ipmi_raw_cmd(raw_request, **config):
            print(time.ctime() +": Updated preset on %s." % config['hostname'])
            return True

    return False
|
||||
|
||||
def ipmi_raw_cmd(raw_cmd, hostname = 'localhost', username=None, password=None, use_env=False):
    """
    Run `ipmitool raw <raw_cmd>` locally or against a remote BMC.

    The command is executed without a shell, so host name and credentials are
    passed as separate arguments and need no quoting.

    :param raw_cmd: whitespace-separated raw bytes, e.g. '0x30 0x45 0x00'
    :param hostname: 'localhost' for the local IPMI device (requires root; the
                     process exits with status 1 otherwise), else the host
                     passed to ipmitool's lanplus interface
    :param username: user name for a remote host
    :param password: password for a remote host (ignored when `use_env` is set)
    :param use_env: pass `-E` so that ipmitool takes the password from its
                    environment instead of the command line
    :return: the stripped command output, True if the command printed nothing,
             or False if ipmitool could not be run or reported an error
    """
    argv = ['ipmitool']

    if hostname == 'localhost':
        if os.geteuid() != 0:
            print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.")
            sys.exit(1)
    else:
        if username is None or (password is None and not use_env):
            print(time.ctime() + ": Error: a username and a password (or use_env) are required for remote hosts")
            return False
        argv += ['-I', 'lanplus', '-U', username]
        argv += ['-E'] if use_env else ['-P', password]
        argv += ['-H', hostname]

    argv.append('raw')
    argv += shlex.split(raw_cmd)

    try:
        # stderr is merged into stdout, as the former "2>&1" redirection did.
        output = subprocess.check_output(argv, stderr=subprocess.STDOUT, universal_newlines=True)
    except subprocess.CalledProcessError as ex:
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(argv))
        print(time.ctime() +": Return code: %d" % ex.returncode)
        return False
    except OSError as ex:
        # Raised when the ipmitool executable itself cannot be started.
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(argv))
        print(time.ctime() +": Reason: %s" % ex)
        return False

    out = output.strip()
    return out if out else True
|
||||
|
||||
def ipmi_fan_status(hostname = 'localhost', username=None, password=None, use_env=False):
    """
    Read the current reading of every FAN sensor through `ipmitool sensor`.

    :param hostname: 'localhost' for the local IPMI device (requires root; the
                     process exits with status 1 otherwise), else the host
                     passed to ipmitool's lanplus interface
    :param username: user name for a remote host
    :param password: password for a remote host (ignored when `use_env` is set)
    :param use_env: pass `-E` so that ipmitool takes the password from its
                    environment instead of the command line
    :return: dict {sensor name: reading as float} for all FAN sensor lines with
             a numeric reading, or False if ipmitool could not be run or its
             output contained no FAN line
    """
    argv = ['ipmitool']

    if hostname == 'localhost':
        if os.geteuid() != 0:
            print(time.ctime() +": In order to communicate with the kernel's IPMI module, you must be root.")
            sys.exit(1)
    else:
        if username is None or (password is None and not use_env):
            print(time.ctime() + ": Error: a username and a password (or use_env) are required for remote hosts")
            return False
        argv += ['-I', 'lanplus', '-U', username]
        argv += ['-E'] if use_env else ['-P', password]
        argv += ['-H', hostname]

    argv.append('sensor')

    try:
        output = subprocess.check_output(argv, stderr=subprocess.STDOUT, universal_newlines=True)
    except subprocess.CalledProcessError as ex:
        # The former "ipmitool ... | grep FAN" pipeline only looked at grep's
        # exit status, so keep using whatever ipmitool managed to print.
        output = ex.output or ''
    except OSError as ex:
        # Raised when the ipmitool executable itself cannot be started.
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(argv))
        print(time.ctime() +": Reason: %s" % ex)
        return False

    # Python-side replacement for "| grep FAN".
    fan_lines = [line for line in output.splitlines() if 'FAN' in line]
    if not fan_lines:
        print(time.ctime() +": Error: Problem running ipmitool")
        print(time.ctime() +": Command: %s" % ' '.join(argv))
        print(time.ctime() +": No FAN sensor lines found in the output")
        return False

    fan_status_return = {}
    for line in fan_lines:
        fields = line.split("|")
        if len(fields) < 2:
            continue
        try:
            reading = float(fields[1].strip())
        except ValueError:
            # Non-numeric reading (e.g. "na"); there is no value to report.
            continue
        fan_status_return[fields[0].strip()] = reading
    return fan_status_return
|
||||
|
||||
def get_preset(config):
    """
    Query the currently active fan preset.

    The reply is parsed as a hexadecimal byte, the same way `_set_preset`
    parses the replies of its raw queries.

    Only parsing and argument errors are converted into a False result; a
    SystemExit raised by `ipmi_raw_cmd` (not running as root) is no longer
    swallowed.

    :param config: keyword arguments forwarded to `ipmi_raw_cmd`
    :return: the numeric preset code, or False if it could not be determined
             (compare with `is False`, since preset code 0 is a valid result)
    """
    try:
        reply = ipmi_raw_cmd('0x30 0x45 0x00', **config)
        if reply is False:
            return False
        # An empty reply comes back as True; int(True, 16) raises TypeError,
        # so it is reported as a failure rather than as preset 1.
        return int(reply, 16)
    except (TypeError, ValueError):
        return False
|
|
@ -0,0 +1,130 @@
|
|||
# Superfans GPU controller
|
||||
#
|
||||
# author: Domen Tabernik
|
||||
# 2019
|
||||
|
||||
import time, superfans, subprocess
|
||||
|
||||
def retrieve_nvidia_gpu_temperature():
    """
    Read the temperature of every NVIDIA GPU through nvidia-smi.

    :return: list of integer temperatures, one per numeric output line, or
             False if nvidia-smi could not be run, reported an error, or
             printed no numeric reading
    """
    argv = ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']

    try:
        # universal_newlines makes the output text on Python 3 as well.
        output = subprocess.check_output(argv, stderr=subprocess.STDOUT, universal_newlines=True)
    except (OSError, subprocess.CalledProcessError):
        # OSError: nvidia-smi is not installed; CalledProcessError: it failed.
        return False

    temperatures = []
    for line in output.splitlines():
        value = line.strip()
        if not value:
            continue
        try:
            temperatures.append(int(value))
        except ValueError:
            # Skip lines that are not a plain integer reading.
            continue

    return temperatures if temperatures else False
|
||||
|
||||
def _select_fan_target(fan_settings, temperature):
    """Return the fan % of the highest threshold <= temperature (the lowest threshold's value if none)."""
    thresholds = sorted(fan_settings)
    target = fan_settings[thresholds[0]]
    for threshold in thresholds:
        if threshold > temperature:
            break
        target = fan_settings[threshold]
    return target


def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=1.0):
    """
    Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`.
    After the loop the default preset is restored.

    Failed GPU or fan readings are logged and retried on the next iteration.

    :param fan_settings: dictionary that maps the temperature in deg C to % of fan speed; a temperature below the
                         lowest key uses the lowest key's fan speed
    :param FAN_INCREASED_MIN_TIME: minimal time in seconds before a fan speed is again reduced (based on previous change) default=120
    :param sleep_sec: loop sleep time (default=2 sec)
    :param gpu_moving_avg_num: moving average for GPU i.e. the number of last measurements that are averaged (default=5)
    :param fan_target_eps: tolerated difference between the fan target and the measured fan level, in % (default=1.0)
    :return: None (the loop only ends through an exception, e.g. KeyboardInterrupt)
    :raises ValueError: if `fan_settings` is empty
    """
    if not fan_settings:
        raise ValueError("fan_settings must map at least one temperature to a fan speed")

    superfan_config = dict(hostname='localhost')

    # save default preset before changing anything
    default_preset = superfans.get_preset(superfan_config)
    print(time.ctime() + ': Started fan control using GPU temperature.')
    print(time.ctime() + ': Using settings:')
    for k in sorted(fan_settings):
        print(time.ctime() + ': \t%d C = %d ' % (k, fan_settings[k]) + "%")
    print(time.ctime() + ':')

    try:
        sys1_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1]
        sys2_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2]
        fan_members = sys1_fans + sys2_fans
        window = max(1, int(gpu_moving_avg_num))

        previous_target_fan = None
        previous_update_time = None

        # most recent GPU measurements (each one a list with a value per GPU)
        samples = []

        while True:
            # get GPU temperature
            gpu_temp = retrieve_nvidia_gpu_temperature()
            if not gpu_temp:
                print(time.ctime() + ': Unable to read GPU temperature; retrying.')
                time.sleep(sleep_sec)
                continue

            samples.append(gpu_temp)

            # continue until we have enough samples for the moving average;
            # sleep so that the samples are spread over time
            if len(samples) < window:
                time.sleep(sleep_sec)
                continue

            # retain only the measurements inside the averaging window
            samples = samples[-window:]

            # per-GPU mean over the window (float division on Python 2 too)
            mean_gpu_temp = [sum(readings) / float(len(readings)) for readings in zip(*samples)]
            max_gpu_temp = max(mean_gpu_temp)
            target_fan = _select_fan_target(fan_settings, max_gpu_temp)

            current_fan_levels = superfans.get_fan(superfan_config, fan_members)
            if not current_fan_levels:
                print(time.ctime() + ': Unable to read fan speeds; retrying.')
                time.sleep(sleep_sec)
                continue

            current_update_time = time.time()

            # fans missing from the reading are ignored
            diff_sys1_fan = [abs(current_fan_levels[fan] - target_fan) for fan in sys1_fans if fan in current_fan_levels]
            diff_sys2_fan = [abs(current_fan_levels[fan] - target_fan) for fan in sys2_fans if fan in current_fan_levels]

            # hold back a reduction of the fan level until enough time has
            # passed since the previous change
            disable_update = False
            if previous_update_time is not None and previous_target_fan is not None:
                has_enough_time_elapsed = current_update_time - previous_update_time > FAN_INCREASED_MIN_TIME
                is_level_down_change = target_fan < previous_target_fan
                disable_update = is_level_down_change and not has_enough_time_elapsed

            if not disable_update:
                # allow for a difference of up to fan_target_eps % from the target
                update_sys1_fan = any(d > fan_target_eps for d in diff_sys1_fan)
                update_sys2_fan = any(d > fan_target_eps for d in diff_sys2_fan)

                if update_sys1_fan:
                    superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1)

                if update_sys2_fan:
                    superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2)

                if update_sys1_fan or update_sys2_fan:
                    max_diff_sys1 = max(diff_sys1_fan) if diff_sys1_fan else 0.0
                    max_diff_sys2 = max(diff_sys2_fan) if diff_sys2_fan else 0.0
                    print(time.ctime() + ': \tCurrent GPU measurements: %s' % ','.join(map(str, gpu_temp)))
                    print(time.ctime() + ': \tMoving average GPU measurements: %s' % ','.join(map(str, mean_gpu_temp)))
                    print(time.ctime() + ': \tTarget difference: SYS1 fan = %f; SYS2 fan = %f' % (max_diff_sys1, max_diff_sys2))
                    print(time.ctime() + ':')

                    # NOTE(review): the bookkeeping is refreshed only when a fan
                    # command was actually issued, matching the "based on
                    # previous change" wording of FAN_INCREASED_MIN_TIME --
                    # confirm against the intended behavior.
                    previous_target_fan = target_fan
                    previous_update_time = current_update_time

            time.sleep(sleep_sec)
    finally:
        # revert back to default preset before finishing; if the initial
        # preset could not be read, fall back to the Standard preset
        if default_preset is False:
            restore_preset = superfans.FAN_PRESET_STANDARD
        else:
            restore_preset = default_preset

        if superfans.set_preset(superfan_config, restore_preset):
            print(time.ctime() + ': Reverted back to system default.')
        else:
            print(time.ctime() + ': Unable to revert back to system default.')
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: start the controller with the built-in settings.
    # Each pair is (GPU temperature threshold in deg C, fan speed in %).
    fan_settings = dict([
        (0, 25),
        (60, 30),
        (70, 36),
        (80, 40),
        (85, 45),
        (90, 50),
    ])

    superfans_gpu_controller(fan_settings)
|
Loading…
Reference in New Issue