#! /usr/bin/ksh 
# ===========================================================
#                                                           =
#  gstat2430d                          (Get c2430d STATus)  =
#                                                           =
#  HP disk array monitor daemon script                      =
#                                                           =
#  takes array device files (contents of the array file)    =
#                                                           =
#  calls scripts that                                       =
#        examine status of drives                           =
#        examine spindle sync status of drives              =
#                                                           =
#  syntax:                                                  =
#      gstat2430d <vendor> <prod_id> <array> <arr_status>   =
#                 <prog_status> <sync_status>               =
#                                                           =
#  calls:                                                   =
#     utilities                                             =
#            $UTIL_DIR/dsp -p (display status of drives)    =
#            $UTIL_DIR/sss -d (sync status of drives)       =
#                                                           =
#     utility programs                                      =
#            $DAEMON_DIR/arraydrv.awk                       =
#            $DAEMON_DIR/aml                                =
#            $DAEMON_DIR/arraycomp                          =
#                                                           =
#     system utilities                                      =
#            $UXTOOL_DIR/awk                                =
#            $CMNDS_DIR/cat                                 =
#            $CMNDS_DIR/cp                                  =
#            $CMNDS_DIR/echo                                =
#            $CMNDS_DIR/expr                                =
#            $CMNDS_DIR/grep                                =
#            $CMNDS_DIR/mktemp                              =
#            $CMNDS_DIR/ps                                  =
#            $CMNDS_DIR/rm                                  =
#            $CMNDS_DIR/sort                                =
#            $CMNDS_DIR/uname                               =
#            $CMNDS_DIR/wc                                  =
#                                                           =
#   Revision history                                        =
#     09-15-94 DCP Renamed this file.                       =
#                  Increase number of parms passed in       =
#                  via the command line to 6.               =
#                  Delete all references to updated_file,   =
#                  which was moved to arraymond.            =
#                                                           =
#  (c)Copyright 1993, 1995 Hewlett-Packard Company          =
#     All rights reserved.                                  =
#                                                           =
# ===========================================================

# Identification string in SCCS what(1) / RCS keyword form.
# NOTE(review): inside double quotes the shell expands $Header as a
# variable, so the value stored in hdr lacks the keyword itself.  The
# quoting is left untouched in case sourced code reads hdr.
hdr="@(#) $Header: gstat2430d,v 78.3 96/03/26 19:25:02 ssa Exp $"

# Load the shared daemon settings.  The directory, limit and message
# variables that this script uses but never assigns are presumably
# defined there -- confirm against arraymon.hdr.
.  /usr/lbin/hpC2400/arraymon.hdr

# Refuse to run with fewer than DEVS_ENTRY_CNT command-line arguments.
   if [ $# -lt $DEVS_ENTRY_CNT ]; then
      exit 2
   fi

   sendmail=$FALSE              # first argument given to $NOTIFY on every notification

#------------------------------------------------------------
# identifiers passed to aml to select text from the message
# catalog file: C2430set goes with -s, all the others with -m
#------------------------------------------------------------

   C2430set=3

   # drive status report: opening message, then one message per
   # drive whose status word is FAILED, WARNING, PARM or WRONG
   DrvErr=1 DrvFail=2 DrvWarn=3 MisMatch=4 WrongDrv=5

   # critical spindle sync report: opening message, then one
   # message per drive line in the sss output
   SyncErr=6
   NoSync=7                     # UNSYNC
   SlSync=8    NoSlSync=9       # SLAVE whose 9th word is "sync" / anything else
   MCSync=10   NoMCSync=11      # MASTER CONTROL whose 9th word is "in" / anything else
   MSync=12    NoMSync=13       # MASTER followed by "in" / by "not"

   # persistent loss of sync report: opening message, then one
   # message per drive carrying the minutes spent out of sync
   SyncLoss=14 SyncTime=15

#------------------------------------------------------------
# =====  MAIN  =====               go through array list file
#------------------------------------------------------------

   # Command-line arguments, in order: two identification fields, the
   # array device file, then the three counters carried between runs.
   vendor=$1 prod_id=$2 array=$3
   arr_status=$4 prog_status=$5 sync_status=$6

   # Two scratch files created in ${TMP_DIR}; they are reused for
   # different intermediate results throughout the script.
   out_one=$(${CMNDS_DIR}/mktemp -d ${TMP_DIR} -c)
   out_two=$(${CMNDS_DIR}/mktemp -d ${TMP_DIR} -c)

#------------------------------------------------------------
# don't continue if pscan's running
#------------------------------------------------------------

   # Count the process-table lines that mention pscan, ignoring the
   # grep commands of this very pipeline.
   pscan_cnt=$(${CMNDS_DIR}/ps -ef |
               ${CMNDS_DIR}/grep -v grep |
               ${CMNDS_DIR}/grep pscan |
               ${CMNDS_DIR}/wc -l)

   # Any hit: drop the scratch files and leave with status 3 without
   # touching the array.
   if [ $pscan_cnt -ne 0 ]; then
      for scratch in $out_one $out_two; do
         ${CMNDS_DIR}/rm $scratch 2> /dev/null
      done
      exit 3
   fi

#------------------------------------------------------------
# get the host system node name
#------------------------------------------------------------
   host_node=$(${CMNDS_DIR}/uname -n)

#------------------------------------------------------------
#  get LUN status by examining drive status
#
#  dsp writes its listing to $out_one; a non-zero exit status
#  from dsp adds one to prog_status, the count of failed
#  attempts to talk to the array
#
#  the system admin is notified only while that count lies
#  between MIN_PROG_ERR and MAX_PROG_ERR (both inclusive), so
#  an unreachable array does not generate mail forever
#------------------------------------------------------------

   ${UTIL_DIR}/dsp -p -h $array > $out_one 2> /dev/null

   if [ $? -ne 0 ]
   then
      # same arithmetic form the rest of the script uses for its
      # counters; avoids forking expr
      ((prog_status = prog_status + 1))

      if [ $prog_status -ge $MIN_PROG_ERR ] && [ $prog_status -le $MAX_PROG_ERR ]
      then
         ${DAEMON_DIR}/aml -s $MonSet -m $NoResponse -p $array -n $host_node -f $msgFile

         # aml is given $msgFile with -f; mail it only if it is non-empty
         if [ -s $msgFile ]
         then
            # The recipient list is read from the file $NOTIFY_WHOM and is
            # deliberately left unquoted so each name becomes one argument.
            # cat is called through ${CMNDS_DIR} like every other system
            # command here, so it does not depend on the daemon's PATH.
            $NOTIFY $sendmail $msgFile $(${CMNDS_DIR}/cat $NOTIFY_WHOM) > /dev/null 2>&1
            ${CMNDS_DIR}/rm $msgFile 2> /dev/null
         fi
      fi
   else
      # dsp answered: clear the failure count and let the
      # $AWK_BAD_DRIVES program digest the listing into $out_two
      prog_status=0
      ${UXTOOL_DIR}/awk -f $AWK_BAD_DRIVES $out_one > $out_two
   fi

#------------------------------------------------------------
#  grep the output file for any status indicating drive trouble
#                                       
#  other status are: AOK, nonexistent, UNCONFIGURED and CONTROLLER
#                                          
#  problem?  notify system admin, but only 3 times, max
#
#  since "FORMATTING" and "REPLACED" status indicate that
#  a problem has been addressed, don't report them as
#  problems
#------------------------------------------------------------

   if [ $prog_status -eq 0 ]
   then

#------------------------------------------------------------
#     prog_status is normally 0 here only when the dsp query
#     worked, in which case $out_two holds the $AWK_BAD_DRIVES
#     digest of its listing
#
#     gather every line carrying a trouble keyword into
#     $out_one
#------------------------------------------------------------

      ${CMNDS_DIR}/grep FAILED     $out_two >  $out_one
      ${CMNDS_DIR}/grep WARNING    $out_two >> $out_one
      ${CMNDS_DIR}/grep MISMATCH   $out_two >> $out_one
      ${CMNDS_DIR}/grep WRONG      $out_two >> $out_one

#------------------------------------------------------------
#     "wc -l file" prints the count followed by the file
#     name; awk keeps only the count
#------------------------------------------------------------

      lines=$(${CMNDS_DIR}/wc -l $out_one)

      if [[ $(${CMNDS_DIR}/echo $lines | ${UXTOOL_DIR}/awk '{print $1}') -gt 0 ]]
      then
         ((arr_status = arr_status + 1))

         # NOTE(review): the cap on drive-status notifications is
         # MIN_PROG_ERR, not a MAX_* limit -- confirm this is intended.
         if [ $arr_status -le $MIN_PROG_ERR ]
         then

            ${DAEMON_DIR}/aml -s $C2430set -m $DrvErr -p $array -n $host_node -f $msgFile

            # one message per trouble line: word 2 is used as the
            # channel, word 4 as the ID and word 5 as the status
            while read line
            do
              # "set --" rather than "set": with a blank line a bare
              # set would print every shell variable on stdout (which
              # carries this script's status record) and keep the
              # previous line's fields; a leading "-" would be taken
              # as shell options
              set -- $line
              ch=$2
              id=$4
              stat=$5
              case $stat in
                 "FAILED") ${DAEMON_DIR}/aml -h -s $C2430set -m $DrvFail -L $ch -L $id -f $msgFile;;
                 "WARNING") ${DAEMON_DIR}/aml -h -s $C2430set -m $DrvWarn -L $ch -L $id -f $msgFile;;
                 "PARM") ${DAEMON_DIR}/aml -h -s $C2430set -m $MisMatch -L $ch -L $id -f $msgFile;;
                 "WRONG") ${DAEMON_DIR}/aml -h -s $C2430set -m $WrongDrv -L $ch -L $id -f $msgFile;;
              esac

            done < $out_one

         fi
      else
         # no trouble lines: the drive error count starts over
         arr_status=0
      fi

      # mail whatever was collected in $msgFile; the recipient list
      # file is read through ${CMNDS_DIR}/cat like every other system
      # command and left unquoted so each name is its own argument
      if [ -s $msgFile ]
      then
        $NOTIFY $sendmail $msgFile $(${CMNDS_DIR}/cat $NOTIFY_WHOM) > /dev/null 2>&1
        ${CMNDS_DIR}/rm $msgFile 2> /dev/null
      fi

#------------------------------------------------------------
#     get drive spindle sync status
#
#     a non-zero exit status from sss is handled like a dsp
#     failure: bump prog_status and notify the system admin
#     while the count lies between MIN_PROG_ERR and
#     MAX_PROG_ERR
#------------------------------------------------------------

      ${UTIL_DIR}/sss -d $array > $out_one 2> /dev/null

      if [ $? -ne 0 ]
      then
         ((prog_status = prog_status + 1))
         if [ $prog_status -ge $MIN_PROG_ERR ]
         then
            if [ $prog_status -le $MAX_PROG_ERR ]
            then
               ${DAEMON_DIR}/aml -s $MonSet -m $NoResponse -p $array -n $host_node -f $msgFile

               if [ -s $msgFile ]
               then
                  $NOTIFY $sendmail $msgFile $(${CMNDS_DIR}/cat $NOTIFY_WHOM) > /dev/null 2>&1
                  ${CMNDS_DIR}/rm $msgFile 2> /dev/null
               fi
            fi
         fi
      else

#------------------------------------------------------------
#     got sync status, now process output file
#
#     first, are there any critical sync problems?
#
#     lacking a master and having a drive that is neither
#     master nor slave is a critical sync problem (if
#     there is an "extra" out there, it had better be
#     sync'd) and should be reported with some urgency
#
#     local_sync_status counts the critical findings of this
#     run; sync_status counts the runs in a row that had any,
#     and notification stops once it exceeds MAX_SYNC_ERR
#------------------------------------------------------------

         prog_status=0
         local_sync_status=0

#------------------------------------------------------------
#        check for only one master
#        ("MASTER " vs. "MASTER_CTRL")
#
#        no matching line at all, or a line count other than
#        1, is a critical finding
#------------------------------------------------------------

         ${CMNDS_DIR}/grep "MASTER " $out_one > $out_two 2> /dev/null

         if [ $? -ne 0 ]
         then
            ((local_sync_status = local_sync_status + 1))
         else
            lines=$(${CMNDS_DIR}/wc -l $out_two)
            if [[ $(${CMNDS_DIR}/echo $lines | ${UXTOOL_DIR}/awk '{print $1}') -ne 1 ]]
            then
               ((local_sync_status = local_sync_status + 1))
            fi
         fi

#------------------------------------------------------------
#        check for drives which are neither master nor slave
#
#        any line containing UNSYNC or MASTER_CTRL is a
#        critical finding
#------------------------------------------------------------

         ${CMNDS_DIR}/grep "UNSYNC" $out_one > $out_two 2> /dev/null

         if [ $? -eq 0 ]
         then
            ((local_sync_status = local_sync_status + 1))
         fi

         ${CMNDS_DIR}/grep "MASTER_CTRL" $out_one > $out_two 2> /dev/null

         if [ $? -eq 0 ]
         then
            ((local_sync_status = local_sync_status + 1))
         fi

         if [ $local_sync_status -ne 0 ]
         then
            ((sync_status = sync_status + 1))
            if [ $sync_status -le $MAX_SYNC_ERR ]
            then
               ${DAEMON_DIR}/aml -s $C2430set -m $SyncErr -p $array -n $host_node -f $msgFile

               # walk the complete sss listing: word 4 is used as the
               # channel, word 6 as the ID, words 7-9 as the sync state
               while read line
               do
                 # "set --": the raw listing may contain blank lines; a
                 # bare set would dump all variables to stdout and the
                 # previous drive would be reported a second time
                 set -- $line
                 ch=$4
                 id=$6
                 str1=$7
                 str2=$8
                 str3=$9

                 case $str1 in
                    "UNSYNC") ${DAEMON_DIR}/aml -h -s $C2430set -m $NoSync -L $ch -L $id -f $msgFile;;

                    "SLAVE") if [[ $str3 = "sync" ]]
                             then
                                ${DAEMON_DIR}/aml -h -s $C2430set -m $SlSync -L $ch -L $id -f $msgFile
                             else
                                ${DAEMON_DIR}/aml -h -s $C2430set -m $NoSlSync -L $ch -L $id -f $msgFile
                             fi;;

                    "MASTER") if [[ $str2 = "CONTROL" ]]
                              then
                                 if [[ $str3 = "in" ]]
                                 then
                                    ${DAEMON_DIR}/aml -h -s $C2430set -m $MCSync -L $ch -L $id -f $msgFile
                                 else
                                    ${DAEMON_DIR}/aml -h -s $C2430set -m $NoMCSync -L $ch -L $id -f $msgFile
                                 fi

                              elif [[ $str2 = "in" ]]
                              then
                                 ${DAEMON_DIR}/aml -h -s $C2430set -m $MSync -L $ch -L $id -f $msgFile

                              elif [[ $str2 = "not" ]]
                              then
                                 ${DAEMON_DIR}/aml -h -s $C2430set -m $NoMSync -L $ch -L $id -f $msgFile
                              fi;;
                 esac

               done < $out_one

               if [ -s $msgFile ]
               then
                  $NOTIFY $sendmail $msgFile $(${CMNDS_DIR}/cat $NOTIFY_WHOM) > /dev/null 2>&1
                  ${CMNDS_DIR}/rm $msgFile 2> /dev/null
               fi
            fi
         fi

#------------------------------------------------------------
#        if there are no critical sync errors, check for
#        drives out of sync
#
#        this is only a big deal if it persists over several
#        times - since the daemon only runs every XXX minutes,
#        it could get unlucky and find one or more drives
#        unsync'd each time
#
#        a drive is reported once its count of runs in a row
#        out of sync reaches MIN_SYNC_COUNT, and no longer
#        once the count reaches MAX_SYNC_COUNT
#------------------------------------------------------------


         if [ $local_sync_status -eq 0 ]
         then

            sync_status=0
            ${CMNDS_DIR}/grep "not" $out_one > $out_two 2> /dev/null
            if [ $? -eq 0 ]
            then

#------------------------------------------------------------
#           create a sorted file whose entries are of the
#           same form as those of the existing sync error
#           file (if any):
#               3      0        1
#           (chan #) (ID #) (# times)
#
#           first awk pass: keep words 4 and 6 and append a
#           count of 1; second pass: turn any ":" in the ID
#           column into a blank
#------------------------------------------------------------

                  temp=$(${CMNDS_DIR}/mktemp -d ${TMP_DIR} -c)
                  ${CMNDS_DIR}/cat $out_two | ${UXTOOL_DIR}/awk '{ print $4 "   " $6 "   1"}' > $temp
                  ${CMNDS_DIR}/cat $temp | ${UXTOOL_DIR}/awk '{ gsub(":"," ",$2) ; print $0 }' > $out_two
                  ${CMNDS_DIR}/sort $out_two > $temp
                  ${CMNDS_DIR}/cp $temp $out_two

#------------------------------------------------------------
#                 the permanent list lives in
#                 ${MISC_DIR}/<name>.nosync, where <name> is
#                 characters 11-16 of the array path
#                 (presumably the device name after a
#                 10-character prefix such as /dev/rdsk/ --
#                 confirm)
#
#                 if no file exists, make the new one "it"
#------------------------------------------------------------

                  badsyncfile=$(${CMNDS_DIR}/echo $array | ${UXTOOL_DIR}/awk '{ z = substr($1, 11, 6) ; print z }')
                  badsyncfile="${MISC_DIR}/"$badsyncfile".nosync"
                  if [ ! -s $badsyncfile ]
                  then
                     ${CMNDS_DIR}/cat $out_two > $badsyncfile
                  else

#------------------------------------------------------------
#                 bad sync file exists; make a new one from
#                 the permanent one and the current one and
#                 put it in "temp"
#
#                 if the file compare fails for some reason,
#                 just make the current file the permanent one
#                 send notification if the drive has been out
#                 of sync for ~1 hour, as Cascade does
#
#                 notifications stop once the count reaches
#                 MAX_SYNC_COUNT
#------------------------------------------------------------

                     $COMP_SYNC_LIST $out_two $badsyncfile > $temp 2> /dev/null
                     if [ $? -ne 0 ]
                     then
                        ${CMNDS_DIR}/cat $out_two > $badsyncfile
                     else
                        ${CMNDS_DIR}/cat $temp > $badsyncfile

#------------------------------------------------------------
# ##### IMPORTANT NOTICE #####
# NOTICE: If daemon's period changes, the MIN_SYNC_COUNT and
#         MAX_SYNC_COUNT variables in arraymon.hdr for the if
#         statement immediately below needs to be adjusted so
#         that the first notification occurs after an hour.
#         The quantity ($3) represents the number of times
#         in a row that the daemon has found the drive in
#         the unsync'd state.
#------------------------------------------------------------

                        # length of one daemon period in minutes
                        ((minutes = SLEEP_SECS / 60))
                        # number of drives reported so far in this pass
                        exerr=0

                        while read line
                        do
                          # "set --" keeps a blank or "-"-led line from
                          # being taken as a bare set or as options
                          set -- $line
                          ch=$1
                          id=$2
                          ntime=$3

                          if (( (ntime >= MIN_SYNC_COUNT) && (ntime < MAX_SYNC_COUNT) ))
                          then
                            ((exerr = exerr + 1))
                            # time out of sync = periods seen * period length
                            ((mins = minutes * ntime))

                            # the opening message is written once, before
                            # the first drive line
                            if [ $exerr -eq 1 ]
                            then
                               ${DAEMON_DIR}/aml -s $C2430set -m $SyncLoss -p $array -n $host_node -f $msgFile
                            fi

                            ${DAEMON_DIR}/aml -h -s $C2430set -m $SyncTime -L $ch -L $id -L $mins -f $msgFile
                          fi

                        done < $badsyncfile

                        if [ -s $msgFile ]
                        then
                           $NOTIFY $sendmail $msgFile $(${CMNDS_DIR}/cat $NOTIFY_WHOM) > /dev/null 2>&1
                           ${CMNDS_DIR}/rm $msgFile 2> /dev/null
                        fi
                     fi
                  fi
                  ${CMNDS_DIR}/rm $temp 2> /dev/null

            else

#------------------------------------------------------------
#           no drives out of sync - if a sync error file
#           exists for the array, remove it
#------------------------------------------------------------

               badsyncfile=$(${CMNDS_DIR}/echo $array | ${UXTOOL_DIR}/awk '{ z = substr($1, 11, 6) ; print z }')
               badsyncfile="${MISC_DIR}/"$badsyncfile".nosync"
               if [ -s $badsyncfile ]
               then
                  ${CMNDS_DIR}/rm $badsyncfile 2> /dev/null
               fi
            fi
         fi
      fi
   fi

   # Write the status record to stdout: the three identification fields
   # taken from the command line followed by the three updated counters.
   ${CMNDS_DIR}/echo $vendor "  " $prod_id "  " $array "  " \
                     $arr_status "  " $prog_status "  " $sync_status

   # Discard both scratch files.
   for scratch in $out_one $out_two
   do
      ${CMNDS_DIR}/rm $scratch 2> /dev/null
   done

#------------------------------------------------------------
# the end of gstat2430d
#------------------------------------------------------------
