#! /usr/bin/ksh 
# ===========================================================
#                                                           =
#  garraystat                           (Get array STATus)  =
#                                                           =
#  HP disk array monitor daemon script                      =
#                                                           =
#  takes array device files (contents of the array file)    =
#                                                           =
#  calls scripts that                                       =
#        examine status of drives                           =
#        examine spindle sync status of drives              =
#                                                           =
#  syntax:                                                  =
#      garraystat <array> <arr_status> <prog_status>        =
#                 <sync_status> ...                         =
#      (four words per array; group may repeat)             =
#                                                           =
#  calls:                                                   =
#     utilities                                             =
#            $UTIL_DIR/dsp -p (display status of drives)    =
#            $UTIL_DIR/sss -d (sync status of drives)       =
#                                                           =
#     utility programs                                      =
#            $DAEMON_DIR/arraydrv.awk                       =
#            $UTIL_DIR/dfileinfo                            =
#            $UTIL_DIR/arraycomp                            =
#                                                           =
#     system utilities                                      =
#            /usr/bin/awk                                   =
#            /usr/bin/cat                                   =
#            /usr/bin/cp                                    =
#            /usr/bin/date                                  =
#            /usr/bin/echo                                  =
#            /usr/bin/expr                                  =
#            /usr/bin/grep                                  =
#            /usr/bin/mktemp                                =
#            /usr/bin/rm                                    =
#            /usr/bin/sort                                  =
#            /usr/bin/wc                                    =
#                                                           =
#  (c) Copyright 1993 Hewlett-Packard Company               =
#      All rights reserved.                                 =
#                                                           =
# ===========================================================

# Revision identification string for this script; nothing in the
# visible part of the script reads $hdr.
hdr="@(#) $Header: garraystat,v 72.3 94/11/18 12:09:05 ssa Exp $"

# Shared daemon settings.  Presumably this is where UTIL_DIR, NOTIFY,
# NOTIFY_WHOM, MISC_DIR, DEVS_LIST, SLEEP_SECS and the other upper-case
# names used below come from (none is assigned in this script) --
# TODO confirm against arraymon.hdr.
.  /usr/lbin/hpC2400/arraymon.hdr

# Every array is described by four words (device file plus three
# counters), so anything shorter than one complete group is an error.
   [ $# -ge 4 ] || exit 1

#------------------------------------------------------------
# =====  MAIN  =====               go through array list file
#------------------------------------------------------------

   # Scratch files shared by the whole run:
   #   updated_file  collects the refreshed counter line of every array
   #   out_one       utility output / lines quoted in notifications
   #   out_two       filtered or reformatted copy of out_one
   # NOTE(review): the mktemp options are the HP-UX ones (-d names the
   # directory, -c creates the file) -- confirm against mktemp(1).
   updated_file=`/usr/bin/mktemp -d /var/tmp -c`
   out_one=`/usr/bin/mktemp -d /var/tmp -c`
   out_two=`/usr/bin/mktemp -d /var/tmp -c`

   # Remove the scratch files on every way out of the script, including
   # the "exit 1" taken for a malformed argument list and termination
   # by a signal.  -f keeps this quiet when a file is already gone; the
   # names are left unquoted so that an empty one simply disappears.
   trap '/usr/bin/rm -f $updated_file $out_one $out_two 2> /dev/null' EXIT
   trap 'exit 1' HUP INT TERM

   # Without all three scratch files every later redirection would
   # fail, so give up at once (same exit status as the other errors).
   if [ -z "$updated_file" ] || [ -z "$out_one" ] || [ -z "$out_two" ]
   then
      exit 1
   fi

   #---------------------------------------------------------
   # send_notice <detail_file> <headline word> ...
   #
   # Builds a notification in a scratch file and passes that
   # file, followed by the contents of the file named by
   # $NOTIFY_WHOM, as arguments to $NOTIFY.
   #
   #   <detail_file>     file appended below the headline, or
   #                     "" when there is nothing to append
   #   <headline word>   remaining arguments, echoed on one
   #                     line separated by single blanks
   #---------------------------------------------------------
   send_notice()
   {
      sn_detail=$1
      shift

      sn_file=`/usr/bin/mktemp -d /var/tmp -c`
      {
         /usr/bin/echo "=================================================="
         /usr/bin/date
         /usr/bin/echo "array monitor daemon"
         /usr/bin/echo "=================================================="
         /usr/bin/echo "$@"
         if [ -n "$sn_detail" ]
         then
            /usr/bin/cat $sn_detail
         fi
         /usr/bin/echo ""
      } > $sn_file

      # the substitution is deliberately unquoted: every word of the
      # $NOTIFY_WHOM file becomes a separate argument
      $NOTIFY $sn_file `/usr/bin/cat $NOTIFY_WHOM` > /dev/null 2>&1
      /usr/bin/rm $sn_file
   }

   #---------------------------------------------------------
   # note_access_failure
   #
   # Counts one more failed attempt to query $array (global
   # prog_status) and reports it on the 3rd, 4th and 5th
   # failure in a row, i.e. three notifications at most.
   #---------------------------------------------------------
   note_access_failure()
   {
      prog_status=`/usr/bin/expr $prog_status + 1`
      if [ $prog_status -ge 3 ] && [ $prog_status -le 5 ]
      then
         send_notice "" "access error: " $array " not responding"
      fi
   }

   #---------------------------------------------------------
   # badsync_path <array device file>
   #
   # Prints the name of the file that records how often each
   # drive of the array has been found out of sync.  Both the
   # code that writes the record and the code that removes it
   # use this one derivation, so they always refer to the
   # same file.
   # NOTE(review): takes 5 characters starting at position 11
   # of the device file name -- confirm that this identifies
   # an array uniquely for the device naming scheme in use.
   #---------------------------------------------------------
   badsync_path()
   {
      bs_name=`/usr/bin/echo $1 | /usr/bin/awk '{ z = substr( $1, 11, 5) ; print z }'`
      /usr/bin/echo "$MISC_DIR/${bs_name}.nosync"
   }

   #---------------------------------------------------------
   # =====  one pass per array  =====
   #
   # arguments come in groups of four:
   #    <array> <arr_status> <prog_status> <sync_status>
   # the counters say how many runs in a row had a drive
   # problem / an access failure / a sync configuration error
   #---------------------------------------------------------
   while [ $# -gt 0 ]
   do

      # an incomplete trailing group means the list is malformed
      if [ $# -lt 4 ]
      then
         exit 1
      fi

      array=$1
      arr_status=$2
      prog_status=$3
      sync_status=$4

      #------------------------------------------------------
      #  drive status
      #
      #  prog_status is cleared only after BOTH dsp and sss
      #  have answered, so an array whose sss query keeps
      #  failing is reported after three runs just like one
      #  whose dsp query keeps failing
      #------------------------------------------------------

      if ${UTIL_DIR}/dsp -p -h $array > $out_one 2> /dev/null
      then

         /usr/bin/awk -f $AWK_BAD_DRIVES $out_one > $out_two

         #---------------------------------------------------
         #  collect every line whose status indicates drive
         #  trouble; other status are: AOK, nonexistent,
         #  UNCONFIGURED and CONTROLLER
         #
         #  "FORMATTING" and "REPLACED" mean the problem has
         #  been addressed, so they are not reported
         #
         #  notify system admin, but only 3 times, max
         #---------------------------------------------------

         {
            /usr/bin/grep FAILED     $out_two
            /usr/bin/grep WARNING    $out_two
            /usr/bin/grep MISMATCH   $out_two
            /usr/bin/grep WRONG      $out_two
         } > $out_one

         # grep writes whole lines, so "not empty" means "at least one line"
         if [ -s $out_one ]
         then
            arr_status=`/usr/bin/expr $arr_status + 1`
            if [ $arr_status -le 3 ]
            then
               hostadd=`$GET_ARRAY_INFO -p ${array}`
               send_notice $out_one "drive error" $hostadd
            fi
         else
            arr_status=0
         fi

         #---------------------------------------------------
         #  drive spindle sync status
         #---------------------------------------------------

         if ${UTIL_DIR}/sss -d $array > $out_one 2> /dev/null
         then

            prog_status=0

            #------------------------------------------------
            #  critical sync problems, reported with some
            #  urgency (3 notifications, max):
            #     - not exactly one master
            #       ("MASTER " vs. "MASTER_CTRL")
            #     - a drive that is neither master nor slave
            #     - a "MASTER_CTRL" entry
            #------------------------------------------------

            sync_config_bad=0

            # unquoted on purpose: drops any padding around the count;
            # a failed grep leaves it empty, which counts as "not one"
            master_count=`/usr/bin/grep -c "MASTER " $out_one 2> /dev/null`
            [ ${master_count:-0} -eq 1 ] || sync_config_bad=1

            if /usr/bin/grep "UNSYNC" $out_one > /dev/null 2>&1
            then
               sync_config_bad=1
            fi

            if /usr/bin/grep "MASTER_CTRL" $out_one > /dev/null 2>&1
            then
               sync_config_bad=1
            fi

            if [ $sync_config_bad -ne 0 ]
            then
               sync_status=`/usr/bin/expr $sync_status + 1`
               if [ $sync_status -le 3 ]
               then
                  hostadd=`$GET_ARRAY_INFO -p ${array}`
                  send_notice $out_one "sync configuration error" $hostadd
               fi
            else

               #---------------------------------------------
               #  no critical sync errors: look for drives
               #  that are merely out of sync
               #
               #  this only matters if it persists - the
               #  daemon samples every so often and could get
               #  unlucky - so occurrences are counted per
               #  drive in the bad sync file
               #---------------------------------------------

               sync_status=0
               badsyncfile=`badsync_path $array`

               if /usr/bin/grep "not" $out_one > $out_two 2> /dev/null
               then

                  #------------------------------------------
                  #  build a sorted list with entries of the
                  #  same form as the bad sync file:
                  #      3      0        1
                  #  (chan #) (ID #) (# times)
                  #------------------------------------------

                  temp=`/usr/bin/mktemp -d /var/tmp -c`
                  /usr/bin/awk '{ print $4 "   " $6 "   1"}' $out_two |
                     /usr/bin/awk '{ gsub(":"," ",$2) ; print $0 }' |
                     /usr/bin/sort > $temp
                  /usr/bin/cp $temp $out_two

                  if [ ! -s $badsyncfile ]
                  then
                     # no record yet: the current list becomes "it"
                     /usr/bin/cat $out_two > $badsyncfile
                  elif $COMP_SYNC_LIST $out_two $badsyncfile > $temp 2> /dev/null
                  then
                     # the compare utility produced the new record from
                     # the permanent list and the current one
                     /usr/bin/cat $temp > $badsyncfile
                     temp2=`/usr/bin/mktemp -d /var/tmp -c`

                     #---------------------------------------
                     # ##### IMPORTANT NOTICE #####
                     # If the daemon's period changes, the
                     # bounds on $3 in the awk program below
                     # must be adjusted so that the first
                     # notification occurs after an hour.
                     # $3 is the number of times in a row the
                     # drive was found unsync'd; the window
                     # 4..6 yields 3 notifications, then quits.
                     #---------------------------------------

                     minutes=`/usr/bin/expr $SLEEP_SECS / 60`

                     /usr/bin/awk '($3 >= 4) && ($3 < 7) {
                           print "   drive on channel " $1 " ID " $2 \
                                 " has been out of sync for " $3 * mins " minutes"
                        }' mins=$minutes $badsyncfile > $temp2

                     if [ -s $temp2 ]
                     then
                        hostadd=`$GET_ARRAY_INFO -p ${array}`
                        send_notice $temp2 "sync loss error" $hostadd
                     fi
                     /usr/bin/rm $temp2
                  else
                     # the compare failed for some reason: just make
                     # the current list the permanent one
                     /usr/bin/cat $out_two > $badsyncfile
                  fi
                  /usr/bin/rm $temp

               elif [ -s $badsyncfile ]
               then
                  # every drive is in sync again: drop the record
                  /usr/bin/rm $badsyncfile
               fi
            fi

         else
            note_access_failure
         fi

      else
         note_access_failure
      fi

      # refreshed counters for this array, one line per array
      /usr/bin/echo $array "  " $arr_status "  " $prog_status "  " $sync_status  >> $updated_file
      shift 4

   done

   # Replace the file named by $DEVS_LIST with the counter lines
   # gathered during this run.
   /usr/bin/cp  $updated_file  $DEVS_LIST

   # The scratch files are no longer needed.
   for scratch in $updated_file $out_one $out_two
   do
      /usr/bin/rm $scratch
   done

#------------------------------------------------------------
# the end of garraystat
#------------------------------------------------------------
