Compare commits

..

No commits in common. "master" and "v0.1" have entirely different histories.
master ... v0.1

3 changed files with 36 additions and 47 deletions

View File

@ -1,13 +1,8 @@
{ {
"fan_settings" : {"0": 10, "fan_settings" : {"0": 20,
"30": 15, "60": 25,
"35": 20, "70": 30,
"40": 25, "80": 35,
"50": 30, "87": 40,
"60": 35, "90": 43}
"70": 40,
"75": 45,
"80": 55,
"83": 65,
"87": 80}
} }

View File

@ -51,7 +51,10 @@ FAN9 ='FAN9'
FAN10 ='FAN10' FAN10 ='FAN10'
FAN_ZONES_MEMBERS= { FAN_ZONES_MEMBERS= {
FAN_ZONE_CPU1:[FAN1,FAN2,FAN4,FAN6,FAN7,FAN8], FAN_ZONE_CPU1:[FAN10],
FAN_ZONE_CPU2:[FAN9],
FAN_ZONE_SYS2:[FAN1,FAN2,FAN3,FAN4],
FAN_ZONE_SYS1:[FAN5,FAN6,FAN7,FAN8],
} }
# based on observations on SUPERMICRO_4029GP_TRT2 the # based on observations on SUPERMICRO_4029GP_TRT2 the

View File

@ -6,13 +6,13 @@
import time, superfans, subprocess, signal, sys, json import time, superfans, subprocess, signal, sys, json
class GracefulKiller: class GracefulKiller:
kill_now = False kill_now = False
def __init__(self): def __init__(self):
signal.signal(signal.SIGINT, self.exit_gracefully) signal.signal(signal.SIGINT, self.exit_gracefully)
signal.signal(signal.SIGTERM, self.exit_gracefully) signal.signal(signal.SIGTERM, self.exit_gracefully)
def exit_gracefully(self,signum, frame): def exit_gracefully(self,signum, frame):
self.kill_now = True self.kill_now = True
def enable_persistance_nvidia(): def enable_persistance_nvidia():
cmd = 'nvidia-smi -pm 1' cmd = 'nvidia-smi -pm 1'
@ -32,7 +32,7 @@ def retrieve_nvidia_gpu_temperature():
else: else:
return False return False
def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=4, fan_target_eps=2.0): def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=2.0):
""" """
Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`. Controller function that monitors GPU temperature in constant loop and adjusts FAN speeds based on provided `fan_settings`.
After the loop the default preset is restored. After the loop the default preset is restored.
@ -57,18 +57,15 @@ def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec
print('\t%d C = %d ' % (k, fan_settings[k]) + "%") print('\t%d C = %d ' % (k, fan_settings[k]) + "%")
print('\n') print('\n')
#open file for cpu temperature
cpu_t_f = open("/sys/class/thermal/thermal_zone0/temp", "r")
# put GPUs into persistence mode so that nvidia-smi will return immediately # put GPUs into persistence mode so that nvidia-smi will return immediately
enable_persistance_nvidia() enable_persistance_nvidia()
#try: try:
for gege in [0]: FAN_MEMBERS = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1] + \
FAN_MEMBERS = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1] superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2]
# GPU moving average # GPU moving average
previous_target_fan = 0 previous_target_fan = None
previous_update_time = None previous_update_time = None
prev_GPU_temp = [] prev_GPU_temp = []
@ -96,24 +93,16 @@ def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec
max_gpu_temp = max(mean_GPU_temp) max_gpu_temp = max(mean_GPU_temp)
#read cpu temperature
cpu_t_f.seek(0,0)
cpu_temp = int(cpu_t_f.read()) / 1000 #°C
max_temp = max(max_gpu_temp, cpu_temp)
for key_temp in sorted(fan_settings.keys())[::-1]: for key_temp in sorted(fan_settings.keys())[::-1]:
if key_temp <= max_temp: if key_temp <= max_gpu_temp:
target_fan = fan_settings[key_temp] target_fan = fan_settings[key_temp]
break break
current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS) current_fan_levels = superfans.get_fan(superfan_config, FAN_MEMBERS)
current_update_time = time.time() current_update_time = time.time()
#diff_cpu1_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1] if FAN in current_fan_levels and current_fan_levels[FAN] > 0] diff_sys1_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1] if FAN in current_fan_levels and current_fan_levels[FAN] > 0]
diff_sys2_fan = [abs(current_fan_levels[FAN] - target_fan) for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2] if FAN in current_fan_levels and current_fan_levels[FAN] > 0]
#print(list(superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1]))
#print(list(current_fan_levels[FAN] for FAN in superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_CPU1]))
# TODO: ignore outlier FANs in case they are faulty # TODO: ignore outlier FANs in case they are faulty
@ -126,26 +115,28 @@ def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec
if not disbale_update: if not disbale_update:
# Allow for 1% difference in target # Allow for 1% difference in target
update_cpu1_fan = abs(previous_target_fan - target_fan) > fan_target_eps update_sys1_fan = any([d > fan_target_eps for d in diff_sys1_fan])
update_sys2_fan = any([d > fan_target_eps for d in diff_sys2_fan])
if update_sys1_fan:
superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1)
if update_cpu1_fan: if update_sys2_fan:
superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_CPU1) superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2)
if update_cpu1_fan: if update_sys1_fan or update_sys2_fan:
print('\tCurrent GPU measurements (in C): %s' % ','.join(map(str,GPU_temp))) print('\tCurrent GPU measurements (in C): %s' % ','.join(map(str,GPU_temp)))
print('\tMoving average GPU measurements (in C): %s (max=%d)' % (','.join(map(str,mean_GPU_temp)),max_gpu_temp)) print('\tMoving average GPU measurements (in C): %s (max=%d)' % (','.join(map(str,mean_GPU_temp)),max_gpu_temp))
print('\tCPU measurement (in C): %.1f ' % cpu_temp) print('\tTarget FAN speed: %d C => FAN %d %% (difference: SYS1 fan = %.2f; SYS2 fan = %.2f)' % (max_gpu_temp, target_fan, max(diff_sys1_fan), max(diff_sys2_fan)))
print('\tTarget FAN speed: %d C => FAN %d %% (difference: CPU1 fan = %.2f)' % (max_temp, target_fan, -1)) #max(diff_cpu1_fan)
print('\n\n') print('\n\n')
previous_target_fan = target_fan previous_target_fan = target_fan
previous_update_time = current_update_time previous_update_time = current_update_time
time.sleep(sleep_sec) time.sleep(sleep_sec)
#finally: finally:
# revert back to default preset before finishing # revert back to default preset before finishing
# superfans.set_preset(superfan_config, default_preset) superfans.set_preset(superfan_config, default_preset)
# print('Reverted back to system default.') print('Reverted back to system default.')
def main(): def main():
if len(sys.argv) != 2: if len(sys.argv) != 2: