Программный вотчдог для майнинг-рига на картах Nvidia написан на bash.
Предварительно нужно включить возможность управления вентиляторами (после выполнения команды требуется перезагрузка):
sudo nvidia-xconfig --cool-bits=31 --allow-empty-initial-configuration --enable-all-gpus --registry-dwords=PerfLevelSrc=0x2222
sudo reboot
Код вотчдога писался для серии 10хх
#!/bin/bash
# ========================================================
# Watchdog for Nvidia cards
# v 2.7
# written by kofesutra
# https://kofesutra.ru
# ========================================================
# Use the script at your own risk; donations are welcome :)
# ETH 0x0ca2616532eeD5ddbC952d429a92098E5d050CfF
# ========================================================
# Settings (adjust to your rig):
# Worker name
worker_name="1070Ti"
# Memory overclock (offset passed to nvidia-settings)
mem=1200
# GPU core overclock (offset passed to nvidia-settings)
gpu=200
# Power limit in watts
pl=115
# Low GPU activity threshold in percent
low_gpu_activity=30
# Low temperature threshold in degrees
low_temp=35
# High temperature threshold in degrees
high_temp=70
# Number of cards in the rig
cards=3
# Silent mode (no e-mail reports): On=1, Off=0
silence=1
# Which miner is used (process name):
# EWBF: miner, Claymores: ethdcrminer64, Ccminer: ccminer, Xmrig: xmrig-nvidia, Gminer: miner
miner_type="miner"
# Path to the miner start script
miner_path="$HOME/Dropbox/Miners_SH_1070Ti/eth_nicehash_gminer_1070Ti.sh"
# Path to the log file (the date is evaluated once, when the script starts)
miner_log="$HOME/Dropbox/Miners_SH_1070Ti/${worker_name}_WD_log_$(date +"%y-%m-%d").txt"
# How long to wait for the miner to load, in seconds
# (for example, so that the DAG is generated for every card)
miner_pause=60
# Router IP (used for the network check)
router_adress="192.168.0.1"
# E-mail address and password.
# The published script had a website's anti-spam placeholder sentence here,
# which is not a valid assignment and left $email unset; put your address in.
email="your_address@example.com"
email_pwd="passwordformail"
# Root password of the rig
# NOTE(review): both passwords are stored in plain text in this file; keep
# the file readable by its owner only.
rootpass="passwordforrig"
# ========================================================
# Do not change these variables; they are declared for use further below
email_subj="none"; email_body="none"; num_cards=0; gpu_power=0; gpu_util=0; gpu_temp=0; fan_speed=0; fan_speed_check=0
# Value "1" - restart the miner, value "2" - reboot the rig
err_code=0
# ========================================================
# Функции
# Overclocking: apply the memory/core clock offsets, PowerMizer mode 1,
# manual fan control and a 60% fan target to every card.
# Globals:  cards, mem, gpu, miner_log (read); mem and gpu are exported
# Outputs:  progress lines on stdout, one timestamped line in the log file
overclocking() {
  local card attr settings_cmd
  local -a attrs
  echo "*Overclocking"
  # Kept from the original: both values stay exported to child processes.
  export mem
  export gpu
  for (( card = 0; card < cards; card++ )); do
    attrs=(
      "[gpu:${card}]/GPUMemoryTransferRateOffset[1]=${mem}"
      "[gpu:${card}]/GPUMemoryTransferRateOffset[2]=${mem}"
      "[gpu:${card}]/GPUMemoryTransferRateOffset[3]=${mem}"
      "[gpu:${card}]/GPUGraphicsClockOffset[1]=${gpu}"
      "[gpu:${card}]/GPUGraphicsClockOffset[2]=${gpu}"
      "[gpu:${card}]/GPUGraphicsClockOffset[3]=${gpu}"
      "[gpu:${card}]/GPUPowerMizerMode=1"
      "[gpu:${card}]/GPUFanControlState=1"
      "[fan:${card}]/GPUTargetFanSpeed=60"
    )
    # Build one command list; every attribute is single-quoted so that the
    # square brackets are never treated as a glob pattern.
    settings_cmd=""
    for attr in "${attrs[@]}"; do
      settings_cmd+="nvidia-settings -a '${attr}'; "
    done
    # The list is handed over as ONE argument, exactly like the original did.
    # NOTE(review): this relies on xterm running a single -e argument through
    # a shell - confirm on the xterm build in use.
    xterm -e "$settings_cmd"
  done
  # Per-card adjustments (example)
  #xterm -e 'nvidia-settings -a '[gpu:2]/GPUGraphicsClockOffset[1]=250';
  #nvidia-settings -a '[gpu:3]/GPUGraphicsClockOffset[1]=250''
  echo "_Overclocking done"
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Overclocking done" >> "$miner_log"
}
# Powerlimit: enable persistence mode and set the power limit (watts).
# Globals:  pl, rootpass, miner_log (read); pl and rootpass are exported
# Outputs:  progress lines on stdout, one timestamped line in the log file
powerlimit() {
  echo "*Powerlimit"
  # Exported because the command below is expanded by the shell that runs
  # inside xterm, not by this script; that also keeps the password out of
  # xterm's argument list.
  export pl
  export rootpass
  # The command list is one single-quoted argument.
  # NOTE(review): this relies on xterm running a single -e argument through
  # a shell - confirm on the xterm build in use.
  # The first sudo reads the password from stdin (-S); the second call is
  # expected to reuse the cached credentials. The original "sudo -S sudo"
  # was redundant and is reduced to a single sudo.
  xterm -e 'printf "%s\n" "$rootpass" | sudo -S nvidia-smi -pm 1 && sudo nvidia-smi -pl "$pl"'
  echo "_Powerlimit done"
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Powerlimit done" >> "$miner_log"
}
# Send a report by e-mail; does nothing in silent mode (silence=1).
# Globals:  silence, email, email_pwd, worker_name, email_subj, email_body (read)
# Returns:  0 in silent mode, otherwise the exit status of sendEmail
sendmyemails() {
  if [[ "$silence" == "1" ]]; then
    return 0
  fi
  # Every value is quoted so that an address or password containing spaces
  # or glob characters reaches sendEmail as a single argument.
  sendEmail -f "$email" -t "$email" \
    -u "$worker_name: $email_subj" \
    -m "$worker_name: $email_body" \
    -s smtp.gmail.com:587 \
    -xu "$email" -xp "$email_pwd" \
    -o tls=yes -q
}
# Network check at boot: ping the router until it answers; after every
# failed ping restart NetworkManager and wait a minute before retrying.
# Globals:  router_adress, rootpass, miner_log (read)
# Outputs:  progress lines on stdout, timestamped lines in the log file
network() {
  echo "*Network check"
  # Only the exit status of ping matters, its normal output is discarded.
  until ping -c 1 -W 1 "$router_adress" > /dev/null; do
    echo "-------------------"
    echo "*Network error, NetworkManager reboot"
    echo "$(date +"%y-%m-%d %T") *Network error, NetworkManager reboot" >> "$miner_log"
    # printf with a quoted value passes the password to sudo unchanged
    # (an unquoted echo would collapse whitespace and expand globs).
    printf '%s\n' "$rootpass" | sudo -S systemctl restart NetworkManager.service
    sleep 60s
  done
  echo "_Network is OK"
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Network is OK" >> "$miner_log"
}
# Network check, short version for the main loop: silent while the router
# answers; otherwise restart NetworkManager, wait a minute and try again.
# Globals:  router_adress, rootpass, miner_log (read)
# Outputs:  on failure only - lines on stdout and in the log file
network_2() {
  # Only the exit status of ping matters, its normal output is discarded.
  until ping -c 1 -W 1 "$router_adress" > /dev/null; do
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") *Network error, NetworkManager reboot"
    echo "$(date +"%y-%m-%d %T") *Network error, NetworkManager reboot" >> "$miner_log"
    # printf with a quoted value passes the password to sudo unchanged.
    printf '%s\n' "$rootpass" | sudo -S systemctl restart NetworkManager.service
    sleep 60s
    echo "_NetworkManager rebooted"
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _NetworkManager rebooted" >> "$miner_log"
  done
}
# Check the number of working cards at boot; reboot the rig when it differs
# from the configured number, otherwise report a successful boot.
# Globals:  cards, miner_log (read); num_cards, email_subj, email_body (written)
# Outputs:  progress lines on stdout, one timestamped line in the log file
numofcards() {
  local count_lines
  echo "*Checking number of the cards"
  # Ask for the count without selecting a card: the original "-i 1" needs a
  # second GPU to exist, so a one-card rig could never pass this check.
  # Without -i the count is printed once per card; the first line is enough.
  count_lines=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits)
  num_cards=${count_lines%%$'\n'*}
  if [[ "$num_cards" != "$cards" ]]; then
    email_subj="Too small cards"
    email_body="Booted, but number of the cards is not $cards, rig will rebooted"
    sendmyemails
    echo "_Number of the cards is not $cards, rig will rebooted"
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _Number of the cards is not $cards, rig will rebooted" >> "$miner_log"
    sleep 5s
    rebootrig
  else
    echo "_Number of the cards is $cards"
    email_subj="boot complete"
    email_body="booted sucsessfully"
    sendmyemails
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _Number of the cards is $cards" >> "$miner_log"
  fi
}
# Check the number of working cards, short version for the main loop:
# silent when the number matches, otherwise log it and reboot the rig.
# Globals:  cards, miner_log (read); num_cards, email_subj, email_body (written)
numofcards_2() {
  local count_lines
  # Ask for the count without selecting a card: the original "-i 1" needs a
  # second GPU to exist, so a one-card rig could never pass this check.
  # Without -i the count is printed once per card; the first line is enough.
  count_lines=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits)
  num_cards=${count_lines%%$'\n'*}
  if [[ "$num_cards" == "$cards" ]]; then
    return 0
  fi
  email_subj="Not $cards cards"
  email_body="Number of the cards is not $cards, rig will rebooted"
  # sendmyemails
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Number of the cards is not $cards, rig will rebooted"
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Number of the cards is not $cards, rig will rebooted" >> "$miner_log"
  sleep 5s
  rebootrig
}
# Restart the miner: up to ten attempts to stop it with killall, then either
# start it again or, if it is still running, reboot the rig.
# Globals:  miner_type, miner_path, miner_pause, worker_name, miner_log (read);
#           email_subj, email_body (written)
# Outputs:  progress lines on stdout (pgrep also prints the PIDs it finds),
#           lines in the log file
restartminer() {
  local attempt
  echo "-------------------"
  echo "Miner reboot"
  echo "Miner reboot" >> "$miner_log"
  # Ten attempts to kill the miner, 15 seconds apart
  for (( attempt = 1; attempt <= 10; attempt++ )); do
    if ! pgrep "$miner_type"; then
      echo "_Done"
      break
    fi
    echo "_Attempt no $attempt"
    echo "_Attempt no $attempt" >> "$miner_log"
    killall "$miner_type"
    sleep 15s
  done
  echo "-------------------"
  # If the miner is still alive (could not be killed), reboot the rig
  if pgrep "$miner_type"; then
    echo "_Cant kill $miner_type, $worker_name will rebooted"
    email_subj="Cant kill $miner_type"
    email_body="_Cant kill $miner_type, $worker_name will rebooted"
    sendmyemails
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _Cant kill $miner_type, $worker_name will rebooted" >> "$miner_log"
    rebootrig
  else
    # Start the miner; the path is quoted so that a path containing spaces
    # stays a single xterm argument.
    echo "*Starting miner"
    xterm -e "$miner_path" &
    sleep "$miner_pause"
    echo "_Miner started"
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _Miner started" >> "$miner_log"
  fi
}
# Reboot the rig: log the event, then run "shutdown -r now" through sudo.
# Globals:  rootpass, miner_log (read)
rebootrig() {
  echo "_Rig reboot"
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Rig reboot" >> "$miner_log"
  # printf with a quoted value passes the password to sudo -S unchanged
  # (an unquoted echo would collapse whitespace and expand globs).
  # The original "sudo -S sudo" was redundant and is reduced to one sudo.
  printf '%s\n' "$rootpass" | sudo -S shutdown -r now
}
# Start the miner at boot; when a miner process already exists, go through
# the full restart procedure instead.
# Globals:  miner_type, miner_path, miner_pause, miner_log (read);
#           number_of_miners (written)
minerstart() {
  echo "*Starting miner"
  number_of_miners=$(pgrep -c "$miner_type")
  if [[ "$number_of_miners" -ne 0 ]]; then
    restartminer
    return
  fi
  # The path is quoted so that a path containing spaces stays one argument.
  xterm -e "$miner_path" &
  sleep "$miner_pause"
  echo "_Miner started"
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") _Miner started" >> "$miner_log"
}
# Dropbox: start the Dropbox client when no dropbox process is running.
# Globals:  miner_log (read); number_of_dropbox (written)
# Outputs:  progress lines on stdout, one timestamped line in the log file
check_dropbox() {
  echo "*Checking Dropbox"
  number_of_dropbox=$(pgrep -c dropbox)
  if [[ "$number_of_dropbox" -eq 0 ]]; then
    # Everything after -e is the command xterm runs. The original had a
    # stray "-x" there (a gnome-terminal flag), so xterm was asked to execute
    # a program called "-x" and Dropbox was never started.
    xterm -e dbus-launch dropbox start -i
    echo "_Dropbox loaded"
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _Dropbox loaded" >> "$miner_log"
  else
    echo "_Dropbox is OK"
    echo "-------------------"
    echo "$(date +"%y-%m-%d %T") _Dropbox is OK" >> "$miner_log"
  fi
}
# Fan speed table: translate the current temperature into a fan target.
# Globals:  gpu_temp (read), fan_speed (written)
# Each upper bound (inclusive, degrees) pairs with the fan speed (percent)
# at the same position; 70 degrees and above means full speed.
fanspeedtable() {
  local -a upper_bounds=(35 40 45 50 55 57 60 65 67 69)
  local -a speeds=(30 35 40 45 50 55 60 70 80 90)
  local pos
  for pos in "${!upper_bounds[@]}"; do
    if [ "$gpu_temp" -le "${upper_bounds[pos]}" ]; then
      fan_speed=${speeds[pos]}
      return 0
    fi
  done
  # Checked explicitly so that a value which is not a number leaves
  # fan_speed untouched.
  if [ "$gpu_temp" -ge 70 ]; then
    fan_speed=100
  fi
}
# ========================================================
# Start-up: announce the watchdog on stdout and in the log, then run the
# boot-time checks and start the miner.
echo "-------------------"
echo "$(date +"%y-%m-%d %T") *Watchdog $worker_name ready ;)"
echo "-------------------"
# The log path is quoted so that a path containing spaces is still one file.
{
  echo "-------------------"
  echo "$(date +"%y-%m-%d %T") *Watchdog $worker_name ready ;)"
  echo "-------------------"
} >> "$miner_log"
network
check_dropbox
numofcards
powerlimit
overclocking
minerstart
echo "_Monitoring started"
echo "-------------------"
echo "$(date +"%y-%m-%d %T") _Monitoring started" >> "$miner_log"
# Start-up finished
# ========================================================
# Watchdog in an endless loop: one pass per minute over the network, the
# miner process and every card.

# Report one watchdog event: two console lines, one log line ("_" + body),
# the e-mail subject/body globals and a separator line.
# Arguments: $1 console message, $2 console action line, $3 e-mail subject,
#            $4 e-mail body, $5 "mail" to actually send the e-mail
_wd_report() {
  echo "$(date +"%y-%m-%d %T") $1"
  echo "$2"
  echo "$(date +"%y-%m-%d %T") _$4" >> "$miner_log"
  email_subj=$3
  email_body=$4
  if [[ "${5:-}" == "mail" ]]; then
    sendmyemails
  fi
  echo "-------------------"
}

# Run every check for one card and adjust its fan.
# Arguments: $1 card index
# Globals:   gpu_power, gpu_util, fan_speed_check, gpu_temp, fan_speed,
#            err_code (written); thresholds from the settings (read)
# Returns:   0 when the card is fine; 1 when a problem was found, in which
#            case err_code is 1 (restart the miner) or 2 (reboot the rig)
_wd_check_card() {
  local card=$1

  # Power draw, printed like "131.12 W"
  if ! gpu_power=$(nvidia-smi -i "$card" --query-gpu=power.draw --format=csv,noheader); then
    _wd_report "_Info request error of card number $card" "_Rig will rebooted" \
      "Info request error" \
      "Info request error of card number $card, rig will rebooted" mail
    err_code=2
    return 1
  fi
  # Keep whole watts only (bash arithmetic is integer-only): drop whitespace,
  # the unit and the fraction. This replaces the former call to bc.
  gpu_power=${gpu_power//[[:space:]]/}
  gpu_power=${gpu_power%W}
  gpu_power=${gpu_power%%.*}
  # A reading that is not a number is treated like a low reading, which is
  # what the original comparison did with an empty value.
  if [[ ! "$gpu_power" =~ ^[0-9]+$ ]] || (( 10#$gpu_power < 10 )); then
    _wd_report "_Measured power on card number $card is low" "_Rig will rebooted" \
      "Measured power is low" \
      "Measured power on card number $card is low, rig will rebooted" mail
    err_code=2
    return 1
  fi

  # GPU utilisation in percent; below the threshold the miner is restarted
  # (no e-mail is sent for this event)
  gpu_util=$(nvidia-smi -i "$card" --query-gpu=utilization.gpu --format=csv,noheader,nounits)
  if [[ "$gpu_util" -lt "$low_gpu_activity" ]]; then
    _wd_report "_Inactive card number $card" "_Restarting miner" \
      "Inactive card" \
      "Inactive card number $card. Restarting miner"
    err_code=1
    return 1
  fi

  # Current fan speed
  if ! fan_speed_check=$(nvidia-smi -i "$card" --query-gpu=fan.speed --format=csv,noheader,nounits); then
    _wd_report "_Fan speed request error of card number $card" "_Rig will rebooted" \
      "Fan speed request error" \
      "Fan speed request error of card number $card, rig will rebooted" mail
    err_code=2
    return 1
  fi

  # Card temperature
  if ! gpu_temp=$(nvidia-smi -i "$card" --query-gpu=temperature.gpu --format=csv,noheader,nounits); then
    _wd_report "_Temperature request error of card number $card" "_Rig will rebooted" \
      "Temperature request error" \
      "Temperature request error of card number $card, rig will rebooted" mail
    err_code=2
    return 1
  fi

  # Fan control
  if [[ "$gpu_temp" -gt "$low_temp" && "$gpu_temp" -lt "$high_temp" ]]; then
    fanspeedtable
  elif [[ "$gpu_temp" -le "$low_temp" ]]; then
    _wd_report "_Temperature is low ($gpu_temp) on card $card" "_Restarting miner" \
      "Temperature is low on card $card" \
      "Temperature is low ($gpu_temp) card $card. Restarting miner" mail
    err_code=1
    return 1
  elif [[ "$gpu_temp" -ge "$high_temp" ]]; then
    # Full fan speed first, then reboot. The attribute is quoted so that the
    # square brackets are not treated as a glob pattern.
    fan_speed=100
    nvidia-settings -a "[fan:${card}]/GPUTargetFanSpeed=100"
    _wd_report "_Temperature is high ($gpu_temp) on card $card" "_Rig will rebooted" \
      "Temperature is high on card $card" \
      "Temperature is high ($gpu_temp) card $card. Rig will rebooted" mail
    err_code=2
    return 1
  fi
  # Touch the fan only when the target differs from the measured speed
  if [[ "$fan_speed" -ne "$fan_speed_check" ]]; then
    nvidia-settings -a "[fan:${card}]/GPUTargetFanSpeed=${fan_speed}" > /dev/null
  fi
  return 0
}

while true; do
  # Check the network
  network_2
  # Number of running miner processes
  number_of_miners=$(pgrep -c "$miner_type")
  # Exactly one miner process is expected
  if [ "$number_of_miners" -ne 1 ]; then
    _wd_report "_Number of miners is not equal one" "_Restarting miner" \
      "Number of miners is not equal one" \
      "Number of miners is not equal one. Restarting miner" mail
    restartminer
  fi
  # Number of working cards (reboots the rig on a mismatch)
  numofcards_2
  # Check the cards one by one; stop at the first problem
  for (( v = 0; v < num_cards; v++ )); do
    _wd_check_card "$v" || break
  done
  # Act on a problem registered while checking the cards
  case "$err_code" in
    1)
      restartminer
      err_code=0
      ;;
    2)
      rebootrig
      err_code=0
      ;;
  esac
  # The loop repeats every minute
  sleep 60s
done
# End of watchdog