2019-03-08 19:24:59 +00:00
# Superfans GPU controller
#
# author: Domen Tabernik
# 2019
2020-07-27 14:56:37 +00:00
import time , superfans , subprocess , signal , sys , json
2019-03-08 21:27:41 +00:00
class GracefulKiller:
    """
    Shutdown flag that is raised when the process receives SIGINT or SIGTERM.

    Creating an instance installs its handler for both signals; a polling loop
    can then test ``kill_now`` to leave cleanly instead of being interrupted.
    """

    # Class-level default; the handler overrides it on the instance that got the signal
    kill_now = False

    def __init__(self):
        # Both termination signals are routed to the same handler
        for sig in (signal.SIGINT, signal.SIGTERM):
            signal.signal(sig, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        """Signal handler: only records that termination was requested."""
        self.kill_now = True
2019-03-08 19:24:59 +00:00
2019-12-10 16:09:49 +00:00
def enable_persistance_nvidia():
    """
    Enable persistence mode on the NVIDIA GPUs by running ``nvidia-smi -pm 1``.

    The command output (stdout and stderr) is captured and discarded.

    :raises subprocess.CalledProcessError: if nvidia-smi exits with a non-zero status
        (the captured output is available on the exception)
    :raises FileNotFoundError: if the nvidia-smi executable cannot be found
    """
    # Argument list instead of a shell string: no shell is spawned and stderr is
    # merged into the captured output explicitly rather than through "2>&1".
    subprocess.check_output(['nvidia-smi', '-pm', '1'], stderr=subprocess.STDOUT)
2019-03-08 19:24:59 +00:00
def retrieve_nvidia_gpu_temperature():
    """
    Read the current GPU temperatures reported by ``nvidia-smi``.

    Runs ``nvidia-smi --query-gpu=temperature.gpu --format=csv,noheader`` and
    parses every non-blank output line as an integer.

    :return: list of integers (one per non-blank output line), or False when
        nvidia-smi produced no usable output
    :raises subprocess.CalledProcessError: if nvidia-smi exits with a non-zero status
    :raises FileNotFoundError: if the nvidia-smi executable cannot be found
    :raises ValueError: if an output line is not an integer
    """
    cmd = ['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader']

    # stderr is captured separately: if it were merged into stdout, any warning
    # printed by the driver would end up in the lines parsed as temperatures.
    raw = subprocess.check_output(cmd, stderr=subprocess.PIPE)
    if not raw:
        return False

    fields = (line.strip() for line in raw.decode('ascii').split("\n"))
    temperatures = [int(field) for field in fields if field]

    # Callers test the result for falsiness, so "nothing parsed" is reported as False
    return temperatures if temperatures else False
2019-03-08 22:28:13 +00:00
def _mean_gpu_temperatures(samples):
    # Element-wise average of the per-GPU readings over all retained samples.
    return [sum(readings) / len(samples) for readings in zip(*samples)]


def _select_target_fan(fan_settings, temperature):
    # Fan level of the highest configured threshold that `temperature` has reached.
    thresholds = sorted(fan_settings)
    for threshold in reversed(thresholds):
        if threshold <= temperature:
            return fan_settings[threshold]
    # Cooler than every configured threshold: fall back to the lowest configured step
    return fan_settings[thresholds[0]]


def _fan_deviations(fan_levels, fans, target_fan):
    # Distance from the target for each listed fan that reports a level above zero.
    return [abs(fan_levels[fan] - target_fan)
            for fan in fans
            if fan in fan_levels and fan_levels[fan] > 0]


def superfans_gpu_controller(fan_settings, FAN_INCREASED_MIN_TIME=120, sleep_sec=2, gpu_moving_avg_num=5, fan_target_eps=2.0):
    """
    Controller function that monitors GPU temperature in a constant loop and adjusts FAN speeds
    based on the provided `fan_settings`. The loop runs until SIGINT or SIGTERM is received;
    afterwards (also on error) the preset that was active at start is restored.

    The hottest GPU (by moving average) selects the target: the fan level of the highest
    threshold in `fan_settings` that is not above that temperature, or the level of the lowest
    threshold when the temperature is below all of them.

    :param fan_settings: non-empty dictionary that maps the temperature in deg C to % of fan speed
                         (keys may be strings, e.g. when loaded from JSON; they are converted to int)
    :param FAN_INCREASED_MIN_TIME: minimal time in seconds that a lower target must be pending before
                                   the fan speed is reduced again (default=120)
    :param sleep_sec: loop sleep time (default=2 sec)
    :param gpu_moving_avg_num: moving average for GPU, i.e. the number of last measurements that are
                               averaged (default=5)
    :param fan_target_eps: tolerance between the fan target and the level reported by a fan;
                           a zone is only updated when a fan deviates by more than this (default=2.0)
    :return: None
    :raises ValueError: if `fan_settings` is empty
    """
    if not fan_settings:
        raise ValueError('fan_settings must contain at least one temperature threshold')

    superfan_config = dict(hostname='localhost')

    # convert fan_settings keys from string to int so that thresholds compare numerically
    fan_settings = {int(temp): fan for temp, fan in fan_settings.items()}

    # save default preset before changing anything
    default_preset = superfans.get_preset(superfan_config)

    print('Started fan control using GPU temperature.')
    print('Using settings:')
    for temp in sorted(fan_settings):
        print('\t%d C = %d' % (temp, fan_settings[temp]) + "%")
    print('\n')

    # put GPUs into persistence mode so that nvidia-smi will return immediately
    enable_persistance_nvidia()

    try:
        sys1_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS1]
        sys2_fans = superfans.FAN_ZONES_MEMBERS[superfans.FAN_ZONE_SYS2]
        fan_members = sys1_fans + sys2_fans

        previous_target_fan = None
        previous_update_time = None

        # last measurements used for the GPU moving average
        samples = []

        # ensure correct ending when SIGINT and SIGTERM are received
        killer = GracefulKiller()

        while not killer.kill_now:
            # get GPU temperature
            gpu_temp = retrieve_nvidia_gpu_temperature()
            if not gpu_temp:
                # No reading this round (False is returned on empty output): keep the
                # window free of non-list entries and retry after the usual pause.
                time.sleep(sleep_sec)
                continue

            samples.append(gpu_temp)

            # continue until we have enough samples for the moving average
            if len(samples) < gpu_moving_avg_num:
                continue

            # retain only the last `gpu_moving_avg_num` measurements
            samples = samples[-gpu_moving_avg_num:]

            mean_gpu_temp = _mean_gpu_temperatures(samples)
            max_gpu_temp = max(mean_gpu_temp)
            target_fan = _select_target_fan(fan_settings, max_gpu_temp)

            current_fan_levels = superfans.get_fan(superfan_config, fan_members)

            # monotonic clock: the hold-off interval must not be affected by wall-clock adjustments
            current_update_time = time.monotonic()

            diff_sys1_fan = _fan_deviations(current_fan_levels, sys1_fans, target_fan)
            diff_sys2_fan = _fan_deviations(current_fan_levels, sys2_fans, target_fan)

            # TODO: ignore outlier FANs in case they are faulty

            # A reduction of the fan speed is held back until FAN_INCREASED_MIN_TIME has elapsed
            # since the last iteration that was allowed to apply its target.
            hold_update = False
            if previous_update_time is not None and previous_target_fan is not None:
                has_enough_time_elapsed = current_update_time - previous_update_time > FAN_INCREASED_MIN_TIME
                is_level_down_change = target_fan < previous_target_fan
                hold_update = is_level_down_change and not has_enough_time_elapsed

            if not hold_update:
                # only touch a zone when one of its fans is outside the tolerance
                update_sys1_fan = any(d > fan_target_eps for d in diff_sys1_fan)
                update_sys2_fan = any(d > fan_target_eps for d in diff_sys2_fan)

                if update_sys1_fan:
                    superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS1)

                if update_sys2_fan:
                    superfans.set_fan(superfan_config, target_fan, superfans.FAN_ZONE_SYS2)

                if update_sys1_fan or update_sys2_fan:
                    print('\tCurrent GPU measurements (in C): %s' % ','.join(map(str, gpu_temp)))
                    print('\tMoving average GPU measurements (in C): %s (max=%d)' % (','.join(map(str, mean_gpu_temp)), max_gpu_temp))
                    # default=0.0: a zone without any reporting fan has no deviation to show
                    print('\tTarget FAN speed: %d C => FAN %d %% (difference: SYS1 fan = %.2f; SYS2 fan = %.2f)' % (
                        max_gpu_temp, target_fan,
                        max(diff_sys1_fan, default=0.0), max(diff_sys2_fan, default=0.0)))
                    print('\n\n')

                # remember the reference point only when the update was not held back
                previous_target_fan = target_fan
                previous_update_time = current_update_time

            time.sleep(sleep_sec)
    finally:
        # revert back to default preset before finishing
        superfans.set_preset(superfan_config, default_preset)
        print('Reverted back to system default.')
2019-03-08 19:24:59 +00:00
2020-07-27 14:56:37 +00:00
def main():
    """
    Command-line entry point.

    Expects exactly one argument: the path to a JSON configuration file whose
    "fan_settings" entry is passed to `superfans_gpu_controller`. With any other
    number of arguments a usage message is printed and the process exits with
    status 1.

    :raises KeyError: if the configuration has no "fan_settings" entry
    """
    if len(sys.argv) != 2:
        print('Invalid number of arguments: missing configuration file!!')
        print('')
        print('Usage: %s [PATH_TO_JSON_CONFIG]' % sys.argv[0])
        print('')
        print('Configuration file in JSON format should include "fan_settings" = { [in deg C]: [%fan], ...}')
        print('')
        # Wrong invocation is an error: report it through a non-zero exit status.
        # sys.exit is used because the bare `exit` helper is only provided by the site module.
        sys.exit(1)

    with open(sys.argv[1]) as cfg_file:
        cfg = json.load(cfg_file)

    superfans_gpu_controller(cfg['fan_settings'])
2019-03-08 19:24:59 +00:00
# Start the controller only when executed as a script, not when imported
if __name__ == "__main__":
    main()