%PDF- %PDF-
Direktori : /lib64/nagios/plugins/nccustom/ |
Current File : //lib64/nagios/plugins/nccustom/check_smart_logs.sh |
#!/bin/bash
#####################################################
#  check smartctl output for errors in selftest logs
#
#  Created by Bogdan Kukharskiy
#  Namecheap
#####################################################
# This script checks whether there are errors or failures in the SMART
# self-test logs of the drives listed in smartd.conf, with the possibility
# to exclude drives from the check (by serial number).
#
# Usage: check_smart_logs.sh [-c path_to_smartd.conf]
#                            [-x serial[,serial...]]   exclude drives
#                            [-s]                      include short tests
#                            [-p percent]              NVME wear-out threshold
#                            [-l count]                tolerated NVME error log entries
#                            [-v]                      verbose output
#
# Returns the nagios native status codes:
#   0 = OK       (all drives have completed the smartctl self-test without errors)
#   1 = WARNING  (one or more drives have no self-test logged, or an NVME
#                 drive is above the wear-out threshold)
#   2 = CRITICAL (one or more drives have a failed self-test, or an NVME drive
#                 reports bad health / too many error log entries)
#   3 = UNKNOWN  (wrong usage)
#
# Now with NVME support
#
# NOTE: 'set -e' / 'pipefail' are intentionally not enabled: the script relies
# on grep returning non-zero for "no match" inside pipelines.

declare -a EXCLUDE_ARRAY=()
RXSAS="Background long"                             # default grep pattern selecting tests (SAS disks)
RXATA="Extended offline"                            # default grep pattern selecting tests (ATA disks)
OKSAS="Completed|test in progress"                  # regexp for a passed or running self-test (SAS disks)
OKATA="Completed without error|routine in progress" # regexp for a passed or running self-test (ATA disks)
NVME_HEALTH="PASSED"                                # regexp for a passed health check (NVME disks)
NVME_PERCENT=98                                     # default NVME wear-out threshold; may exceed 100, up to 255
# Tolerated number of NVME error log entries. The original left this unset,
# which '[[ -gt ]]' evaluated as 0; the default is now explicit. A value
# inherited from the environment is still honoured, as before.
ERR_LOG_ENTRIES_param=${ERR_LOG_ENTRIES_param:-0}
# Path to smartd.conf; an inherited environment value is honoured, as before.
SMARTDPATH=${SMARTDPATH:-}
verbose=0
LOGTYPE="selftest" # smartctl log type; switched to 'xselftest' when 'selftest' is unsupported

## USAGE MESSAGE
# Prints the help text to stdout.
usage() {
  cat << EOF
usage: $0 options

Now with NVME support

This script is for checking if there are errors or failures in smart selftest logs, with the possibility to exclude drives from check (by serial number)

OPTIONS:
   -h      Show this message
   -x      exclude drives by serial number (can be as list separated by ',')
   -c      path_to_smartd.conf, default '/etc/smartmontools/smartd.conf' and '/etc/smartd.conf'
   -s      include short tests
   -p      percent of NVME usage (wear-out), it can be more than 100, up to 255 (default 98)
   -l      quantity ERR LOG ENTRIES
   -v      be more verbose
EOF
}

# Checks whether a value is present in a list.
# Arguments: $1..$(n-1) - list elements, $n (last argument) - value to look for
# Outputs:   'y' when found, 'n' when not found
# Returns:   0 when found, 1 otherwise
function contains() {
  local n=$#
  local i
  if ((n == 0)); then
    echo "n"
    return 1
  fi
  local value=${!n}
  for ((i = 1; i < n; i++)); do
    if [[ "${!i}" == "${value}" ]]; then
      echo "y"
      return 0
    fi
  done
  echo "n"
  return 1
}

## FETCH ARGUMENTS
while getopts "hvsx:c:p:l:" OPTION; do
  case "${OPTION}" in
    h)
      usage
      exit 3
      ;;
    x)
      # IFS is set for this single 'read' only, so nothing has to be restored.
      IFS=, read -r -a EXCLUDE_ARRAY <<< "${OPTARG}"
      ;;
    c)
      SMARTDPATH=${OPTARG}
      ;;
    p)
      NVME_PERCENT=${OPTARG}
      ;;
    l)
      # An empty argument keeps meaning "0", as it did before.
      ERR_LOG_ENTRIES_param=${OPTARG:-0}
      ;;
    s)
      # '\|' is the alternation operator of the basic regexps used by 'grep -e'.
      RXSAS="${RXSAS}\|Background short"
      RXATA="${RXATA}\|Short offline"
      ;;
    v)
      verbose=1
      ;;
    \?)
      echo "No reasonable options found!"
      exit 3
      ;;
  esac
done

## CHECK ARGUMENTS
if [[ -z ${SMARTDPATH} ]]; then
  if [[ -f /etc/smartmontools/smartd.conf ]]; then
    SMARTDPATH="/etc/smartmontools/smartd.conf"
  else
    SMARTDPATH="/etc/smartd.conf"
  fi
fi

if [[ ! -e ${SMARTDPATH} ]]; then
  echo "Error! File '${SMARTDPATH}' does not exists"
  exit 3
fi

if [[ ! -f ${SMARTDPATH} ]]; then
  echo "Error! '${SMARTDPATH}' is not a file"
  exit 3
fi

if [[ ! -r ${SMARTDPATH} ]]; then
  echo "Error! File '${SMARTDPATH}' is not readable"
  exit 3
fi

# Accept plain decimal digits only, so the value is never evaluated as an
# arithmetic expression. The accepted range is 1..255 inclusive.
if ! [[ ${NVME_PERCENT} =~ ^[0-9]+$ ]] || ((10#${NVME_PERCENT} < 1 || 10#${NVME_PERCENT} > 255)); then
  echo "Error! Treshold for percent of NVME usage (wear-out) should be > 0 and < 255; ${NVME_PERCENT}"
  exit 3
fi
NVME_PERCENT=$((10#${NVME_PERCENT})) # drop leading zeros (would be read as octal)

if ! [[ ${ERR_LOG_ENTRIES_param} =~ ^[0-9]+$ ]]; then
  echo "Error! Quantity of ERR LOG ENTRIES should be a non-negative integer; ${ERR_LOG_ENTRIES_param}"
  exit 3
fi
ERR_LOG_ENTRIES_param=$((10#${ERR_LOG_ENTRIES_param}))

## MAIN ROUTINE
declare -a WARNING_ARRAY=()
declare -a CRITICAL_ARRAY=()
declare -a CONF_FIELDS=()
declare -a DRIVE_ARGS=()

# The config is read on fd 3 so that sudo/smartctl can never consume it via
# stdin. '|| [[ -n ... ]]' also processes a last line without trailing newline.
# NOTE(review): comment lines are deliberately NOT filtered out. The original
# took the NVME device from the SECOND field of a line, which suggests the
# expected file layout is not plain smartd.conf syntax - confirm before
# tightening the parsing.
while IFS= read -r -u 3 FLINE || [[ -n ${FLINE} ]]; do
  read -r -a CONF_FIELDS <<< "${FLINE}"

  # Blank lines used to run smartctl without a device and to produce a
  # spurious WARNING with an empty serial number.
  if ((${#CONF_FIELDS[@]} == 0)); then
    continue
  fi

  # The first three whitespace separated fields are handed to smartctl
  # (typically: device plus a '-d TYPE' pair).
  DRIVE_ARGS=("${CONF_FIELDS[@]:0:3}")
  IS_NVME=0
  if [[ "${DRIVE_ARGS[*]}" == *"nvme"* ]]; then
    # Historic behaviour: the NVME device is the second field.
    NVME_DEVICE=${CONF_FIELDS[1]:-}
    if [[ ${NVME_DEVICE} != *"nvme"* ]]; then
      # Fallback for lines such as '/dev/nvme0 -d nvme', where the second
      # field is an option rather than the device.
      for FIELD in "${DRIVE_ARGS[@]}"; do
        if [[ ${FIELD} == /dev/*nvme* ]]; then
          NVME_DEVICE=${FIELD}
          break
        fi
      done
    fi
    if [[ -n ${NVME_DEVICE} ]]; then
      DRIVE_ARGS=("${NVME_DEVICE}")
    else
      DRIVE_ARGS=()
    fi
    if [[ ${NVME_DEVICE} == *"nvme"* ]]; then
      IS_NVME=1
    fi
  fi

  SERIAL=$(sudo smartctl -i "${DRIVE_ARGS[@]}" | grep -i "Serial number" | awk '{print$3}')
  # Name used in the final report; falls back to the device arguments so a
  # drive whose serial number cannot be read is still identifiable.
  DRIVE_LABEL=${SERIAL:-${DRIVE_ARGS[*]}}

  # Skip drives whose serial number is in the exclude list.
  if contains "${EXCLUDE_ARRAY[@]}" "${SERIAL}" > /dev/null; then
    continue
  fi

  if ((IS_NVME == 1)); then
    # NVME device found: 'smartctl -a' is run once and parsed twice.
    SMART_ALL=$(sudo smartctl -a "${DRIVE_ARGS[@]}")
    PERCENT_USED=$(grep -i "Percentage Used" <<< "${SMART_ALL}" | head -n 1 | awk '{print$3}' | cut -d"%" -f1)
    OVERALL_HEALTH=$(sudo smartctl -H "${DRIVE_ARGS[@]}" | grep "SMART overall-health self-assessment test result" | cut -d":" -f2 | tr -d " ")
    ERR_LOG_ENTRIES=$(grep -i "Error Information Log Entries" <<< "${SMART_ALL}" | head -n 1 | awk '{print$5}')

    # smartctl may print large counters with thousands separators (e.g.
    # '1,000'), which an arithmetic test would evaluate as a comma
    # expression. Keep digits only and treat a missing value as 0.
    WEAR_PERCENT=${PERCENT_USED//[!0-9]/}
    WEAR_PERCENT=${WEAR_PERCENT:-0}
    ERR_LOG_COUNT=${ERR_LOG_ENTRIES//[!0-9]/}
    ERR_LOG_COUNT=${ERR_LOG_COUNT:-0}

    if [[ ${verbose} == 1 ]]; then
      echo "FOR ${DRIVE_ARGS[*]} |Serial: ${SERIAL}| Health-check: ${OVERALL_HEALTH}| Wear-out,%: ${PERCENT_USED}| ErrLogEntries: ${ERR_LOG_ENTRIES}"
    fi

    # '10#' forces base 10 so values with leading zeros are not read as octal.
    if ((10#${ERR_LOG_COUNT} > ERR_LOG_ENTRIES_param)) || ! [[ ${OVERALL_HEALTH} =~ ${NVME_HEALTH} ]]; then
      CRITICAL_ARRAY+=("${DRIVE_LABEL}")
    elif ((10#${WEAR_PERCENT} > NVME_PERCENT)); then
      WARNING_ARRAY+=("${DRIVE_LABEL}")
    fi
  else
    # Non-NVME device: use the plain self-test log when the device supports
    # it, otherwise the extended one. The log is fetched once and reused.
    SELFTEST_LOG=$(sudo smartctl -l selftest "${DRIVE_ARGS[@]}")
    if grep -q -i "SMART Self-test Log not supported" <<< "${SELFTEST_LOG}"; then
      LOGTYPE="xselftest"
      SELFTEST_LOG=$(sudo smartctl -l "${LOGTYPE}" "${DRIVE_ARGS[@]}")
    else
      LOGTYPE="selftest"
    fi

    # Only the first (topmost) matching log entry of each kind is evaluated.
    RESULTSTRATA=$(grep -e "${RXATA}" <<< "${SELFTEST_LOG}" | head -n 1)
    RESULTSTRSAS=$(grep -e "${RXSAS}" <<< "${SELFTEST_LOG}" | head -n 1)

    if [[ ${verbose} == 1 ]]; then
      echo "FOR ${DRIVE_ARGS[*]} |${SERIAL}| : ${RESULTSTRATA} ${RESULTSTRSAS}"
    fi

    if [[ -z ${RESULTSTRATA} && -z ${RESULTSTRSAS} ]]; then
      # No self-test of the requested kind has been logged.
      WARNING_ARRAY+=("${DRIVE_LABEL}")
    elif ! [[ ${RESULTSTRATA} =~ ${OKATA} ]] && ! [[ ${RESULTSTRSAS} =~ ${OKSAS} ]]; then
      # Neither the ATA nor the SAS entry reports success / in progress.
      CRITICAL_ARRAY+=("${DRIVE_LABEL}")
    fi
  fi
done 3< "${SMARTDPATH}"

## REPORT
if ((${#CRITICAL_ARRAY[@]} > 0)); then
  echo "CRITICAL! Following drives have failed SMART selftest: ${CRITICAL_ARRAY[*]}"
  exit 2
elif ((${#WARNING_ARRAY[@]} > 0)); then
  echo "WARNING! Following drives have no SMART selftest logged or NVME devices close to end of usage: ${WARNING_ARRAY[*]}"
  exit 1
else
  echo "OK! All drives passed SMART selftest succesfully"
  if ((${#EXCLUDE_ARRAY[@]} > 0)); then
    echo "Drives excluded: ${EXCLUDE_ARRAY[*]}"
  fi
  exit 0
fi