Compare commits

...

1 Commits
v0.1 ... master

Author SHA1 Message Date
Nicolas 601943451c cpu fan controll added, fitted to chassie 2022-03-18 23:44:06 +00:00
3 changed files with 47 additions and 36 deletions

View File

@ -1,8 +1,13 @@
{ {
"fan_settings" : {"0": 20, "fan_settings" : {"0": 10,
"60": 25, "30": 15,
"70": 30, "35": 20,
"80": 35, "40": 25,
"87": 40, "50": 30,
"90": 43} "60": 35,
"70": 40,
"75": 45,
"80": 55,
"83": 65,
"87": 80}
} }

View File

@ -51,10 +51,7 @@ FAN9 ='FAN9'
FAN10 ='FAN10' FAN10 ='FAN10'
FAN_ZONES_MEMBERS= { FAN_ZONES_MEMBERS= {
FAN_ZONE_CPU1:[FAN10], FAN_ZONE_CPU1:[FAN1,FAN2,FAN4,FAN6,FAN7,FAN8],
FAN_ZONE_CPU2:[FAN9],
FAN_ZONE_SYS2:[FAN1,FAN2,FAN3,FAN4],
FAN_ZONE_SYS1:[FAN5,FAN6,FAN7,FAN8],
} }
# based on observations on SUPERMICRO_4029GP_TRT2 the # based on observations on SUPERMICRO_4029GP_TRT2 the

View File

@ -6,13 +6,13 @@
import time, superfans, subprocess, signal, sys, json import time, superfans, subprocess, signal, sys, json
class GracefulKiller: class GracefulKiller:
kill_now = False kill_now = False
def __init__(self): def __init__(self):
signal.signal(signal.SIGINT, self.exit_gracefully) signal.signal(signal.SIGINT, self.exit_gracefully)
signal.signal(signal.SIGTERM, self.exit_gracefully) signal.signal(signal.SIGTERM, self.exit_gracefully)
def exit_gracefully(self,signum, frame): def exit_gracefully(self,signum, frame):
self.kill_now = True self.kill_now = True
def enable_persistance_nvidia(): def enable_persistance_nvidia():
cmd = 'nvidia-smi -pm 1' cmd = 'nvidia-smi -pm 1'
@ -32,7 +32,7 @@ def retrieve_nvidia_gpu_temperature():
else: else:
return False return False
def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=2.0): def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=4, fan_target_eps=2.0):
""" """
Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`. Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`.
After the loop the default preset is restored. After the loop the default preset is restored.
@ -57,15 +57,18 @@ def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec
print('\t%d C = %d ' % (k, fan_settings[k]) + "%") print('\t%d C = %d ' % (k, fan_settings[k]) + "%")
print('\n') print('\n')
#open file for cpu temperature
cpu_t_f = open("/sys/class/thermal/thermal_zone0/temp", "r")
# put GPUs into persistance mode so that nvidia-smi will retern immediately # put GPUs into persistance mode so that nvidia-smi will retern immediately
enable_persistance_nvidia() enable_persistance_nvidia()
try: #try:
FAN_MEMBERS = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1] + \ for gege in [0]:
superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2] FAN_MEMBERS = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1]
# GPU moving average # GPU moving average
previous_target_fan = None previous_target_fan = 0
previous_update_time = None previous_update_time = None
prev_GPU_temp = [] prev_GPU_temp = []
@ -93,16 +96,24 @@ def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec
max_gpu_temp = max(mean_GPU_temp) max_gpu_temp = max(mean_GPU_temp)
#read cpu temperature
cpu_t_f.seek(0,0)
cpu_temp = int(cpu_t_f.read()) / 1000 #°C
max_temp = max(max_gpu_temp, cpu_temp)
for key_temp in sorted(fan_settings.keys())[::-1]: for key_temp in sorted(fan_settings.keys())[::-1]:
if key_temp <= max_gpu_temp: if key_temp <= max_temp:
target_fan = fan_settings[key_temp] target_fan = fan_settings[key_temp]
break break
current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS) current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS)
current_update_time = time.time() current_update_time = time.time()
diff_sys1_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1] if FAN in current_fan_levels and current_fan_levels[FAN] > 0] #diff_cpu1_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1] if FAN in current_fan_levels and current_fan_levels[FAN] > 0]
diff_sys2_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2] if FAN in current_fan_levels and current_fan_levels[FAN] > 0]
#print(list(superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1]))
#print(list(current_fan_levels[FAN] for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1]))
# TODO: ignore outlier FANs in case they are faulty # TODO: ignore outlier FANs in case they are faulty
@ -115,28 +126,26 @@ def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec
if not disbale_update: if not disbale_update:
# Allow for 1% difference in target # Allow for 1% difference in target
update_sys1_fan = any([d > fan_target_eps for d in diff_sys1_fan]) update_cpu1_fan = abs(previous_target_fan - target_fan) > fan_target_eps
update_sys2_fan = any([d > fan_target_eps for d in diff_sys2_fan])
if update_sys1_fan:
superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1)
if update_sys2_fan: if update_cpu1_fan:
superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2) superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_CPU1)
if update_sys1_fan or update_sys2_fan: if update_cpu1_fan:
print('\tCurrent GPU measurements (in C): %s' % ','.join(map(str,GPU_temp))) print('\tCurrent GPU measurements (in C): %s' % ','.join(map(str,GPU_temp)))
print('\tMoving average GPU measurements (in C): %s (max=%d)' % (','.join(map(str,mean_GPU_temp)),max_gpu_temp)) print('\tMoving average GPU measurements (in C): %s (max=%d)' % (','.join(map(str,mean_GPU_temp)),max_gpu_temp))
print('\tTarget FAN speed: %d C => FAN %d %% (difference: SYS1 fan = %.2f; SYS2 fan = %.2f)' % (max_gpu_temp, target_fan, max(diff_sys1_fan), max(diff_sys2_fan))) print('\tCPU measurement (in C): %.1f ' % cpu_temp)
print('\tTarget FAN speed: %d C => FAN %d %% (difference: CPU1 fan = %.2f)' % (max_temp, target_fan, -1)) #max(diff_cpu1_fan)
print('\n\n') print('\n\n')
previous_target_fan = target_fan previous_target_fan = target_fan
previous_update_time = current_update_time previous_update_time = current_update_time
time.sleep(sleep_sec) time.sleep(sleep_sec)
finally: #finally:
# revert back to default preset before finishing # revert back to default preset before finishing
superfans.set_preset(superfan_config, default_preset) # superfans.set_preset(superfan_config, default_preset)
print('Reverted back to system default.') # print('Reverted back to system default.')
def main(): def main():
if len(sys.argv) != 2: if len(sys.argv) != 2: