#!/bin/ksh
##
## SCRIPT NAME: rmtcmds4sar_locdisk_sortOFallhosts
##
## Where: in $FEDIR/scripts where $FEDIR=/apps/nns_com/fea
##
############################################################################
## PURPOSE: To show the TOP LOCAL DISK ACTIVITY on SGI hosts
##          --- 'A SORTED OVERVIEW OF ALL HOSTS ON THE NETWORK'.
##
## Uses a 'sar -d' command on all hosts, like 'sar -d 3 1' ---
## three seconds of data collection, and one output interval (sample).
##
##   *******************************************************************
##   ** Helps find 'forgotten' processes, in a tight processing loop, **
##   ** generating lots of file I/O, at disk local to the host.       **
##   *******************************************************************
##
## (These 'forgotten' processes MAY NOT HAVE AN ADVERSE IMPACT
##  on the network, even if they are in a tight processing loop,
##  gobbling LOTS OF CPU CYCLES, on the host ---
##  IF those resource-gobblers ARE NOT generating network I/O.
##
##  This report in conjunction with other 'sortOFallhosts'
##  reports can help determine if high local-disk-processing
##  processes are disturbing the network.)
##
## This script puts report output in a file and displays it.
##
## The script puts the hostname beside each 'disk-device' line
## --- and it sorts the resulting list, for all hosts,
## by %busy (and/or several other 'sar -d' columns) ---
## so the TOP HOSTS ON THE NETWORK
## (in terms of local-disk-I/O activity)
## POP UP TO THE TOP OF THE LIST.
##
############################################################################
## CALLED BY: nethosts_tools -> nethosts_tools.chestdef
##            in $FEDIR/scripts
##
############################################################################
## CALL FORMAT:
##    /apps/nns_com/fea/scripts/rmtcmds4sar_locdisk_sortOFallhosts
##
##
## See nethosts_tools.chestdef in $FEDIR/scripts
## for a call initiated within a 'winterm'.
##
############################################################################
## MAINTENANCE HISTORY:
## Written by: B.Montandon O06 19Apr2000 Based on
##                             'rmtcmds4sar_syscalls_sortOFallhosts'
##                             in $FEDIR/scripts
## Updated by: B.Montandon O06 19Apr2000
##
############################################################################

## Fall back to the standard install root when the caller's environment
## does not provide FEDIR (unset or empty).
: "${FEDIR:=/apps/nns_com/fea}"

##############################################################################
## GENERATE THE NAME OF THE OUTPUT-REPORT FILE.
##############################################################################
## 'set_localoutlist' is sourced so that it can set OUTLIST in THIS shell;
## this script then derives its own two file names from that value:
##   $OUTLIST          the final report
##   $OUTLIST_PRESORT  the unsorted 'host + device' rows collected from hosts
##############################################################################

. "$FEDIR/scripts/set_localoutlist"

OUTLIST="${OUTLIST}_sar_locdisk"
OUTLIST_PRESORT="${OUTLIST}_presort"

## Start from a clean slate; rows are appended to these files later on.
rm -f "$OUTLIST" "$OUTLIST_PRESORT"

##############################################################################
## GENERATE THE HOSTLIST from NIS (ypcat hosts).
##############################################################################
## Keeps the third field of every non-comment 'ypcat hosts' line that
## contains 'iaw', sorted, and then appends four fixed host names.
##############################################################################

## . /apps/nns_com/fea/scripts/alarm_sethostlist
## . /apps/ideas/cron/set_hostlist

HOSTLIST=$(ypcat hosts | awk '!/^#/ && /iaw/ { print $3 }' | sort)

HOSTLIST="$HOSTLIST engvis00 engprd00 sgia sgib"

## FOR TESTING:
# HOSTLIST="iaw005 iaw021 iaw030 iaw141 engvis00"

#############################################################################
## PREPARE A HEADER FOR THE REPORT.
#############################################################################
## SAMPLE 'sar -d' OUTPUT:
##   $ sar -d 3 1
##
##   IRIX64 engvis00 6.5 10181058 IP27    04/19/00
##
##   18:49:07  device  %busy  avque  r+w/s  blks/s    w/s  wblks/s  avwait  avserv
##   18:49:10  dks0d1      1    1.0    1.0      48    1.0       48     0.0    10.0
##             dks0d6      0    0.0    0.0       0    0.0        0     0.0     0.0
##             dks3d1      0    0.0    0.0       0    0.0        0     0.0     0.0
##             dks3d2     68    9.7   17.6    9738   17.6     9738   341.1    38.7
##             dks3d3     61    8.9   16.3    8703   16.3     8703   228.6    37.6
##             dks3d5      0    0.0    0.0       0    0.0        0     0.0     0.0
#############################################################################

## Arguments handed to 'sar -d' on every host: seconds per sample, and
## the number of samples.
SAMPLE_DURATION=3
SAMPLES_TAKEN=1

#############################################################################
## COLUMN HEADINGS.
## The headings are laid out with the same field widths that the awk
## 'printf' in the host loop uses for the data rows
## (%-10s %-8s %-10s %6s %6s %5s %5s %7s %7s %6s %6s), so that every
## heading sits directly above its data column.  Labels were chosen to
## fit those widths ('SAMPLE'/'END-TIME' in 8, 'RWBlk' in 5).
#############################################################################
COLHEAD_FMT='%-10s %-8s %-10s %6s %6s %5s %5s %7s %7s %6s %6s'

COLHEAD0=$(printf "$COLHEAD_FMT" 1 2 3 4 5 6 7 8 9 10 11)
COLHEAD1=$(printf "$COLHEAD_FMT" HOST SAMPLE '' '' '' Rd+Wr RWBlk Writes WrBlks '' '')
COLHEAD2=$(printf "$COLHEAD_FMT" NAME END-TIME DEVICE %BUSY AveQue /sec /sec /sec /sec AvWait AvServ)
COLHEAD3=$(printf "$COLHEAD_FMT" ---------- -------- ---------- ------ ------ ----- ----- ------- ------- ------ ------)

## Start the report file (this is the first write, hence '>').
echo "\
************************* $(date '+%Y %b %d %T%p') *************************************

NNS SGI NETWORK HOSTS SORTED BY 'LOCAL-DISK' ACTIVITY (%busy,reads+writes,writes)
--- BUSIEST HOST-DEVICES AT THE TOP. SEVERAL DIFFERENT SORTS.

(Data was collected for $SAMPLE_DURATION seconds on each host.
 See comments at bottom of this report.)
" > "$OUTLIST"

##############################################################################
## IF WE WERE GOING TO EXECUTE A COMMAND/SCRIPT ON EACH HOST,
## WE COULD ASSURE THE USER HAD A .rhosts FILE.
##############################################################################

# echo "+ $USER" > $HOME/.rhosts

. "$FEDIR/scripts/mak_rhosts"

##############################################################################
## LOOP THRU HOSTS -- TO EXECUTE THE 'sar-awk' COMMAND PIPE, with 'rsh'.
##############################################################################
## Use 'ping' to check the accessibility of the host before issuing
## 'rsh'. 'ping' returns faster on an inaccessible host than 'rsh'.
##############################################################################
## CATCH THE OUTPUT FROM A SINGLE PING WITH A SMALL 4-BYTE PACKET.
##############################################################################

##############################################################################
## ping_reply_received PING-OUTPUT
##   Returns 0 when the captured 'ping' output indicates an answer,
##   1 otherwise.  'No answer' means either
##     - ping reported '100.0% packet loss', or
##     - ping printed nothing at all on stdout (e.g. the host name could
##       not be resolved), which previously slipped through as 'reachable'.
##   The '?' in the pattern matches any one character, like the '.' in the
##   grep expression this replaces.
##############################################################################
ping_reply_received ()
{
   case "$1" in
      "" | *100?0%" packet loss"*) return 1 ;;
   esac
   return 0
}

##############################################################################
## format_sar_rows HOSTNAME
##   Filter: reads 'sar -d <secs> 1' output on stdin and writes one
##   fixed-width row per disk device on stdout, each row prefixed with
##   HOSTNAME and the end-time of the sample.
##
##   SAMPLE 'sar -d' OUTPUT:
##
##   $ sar -d 3 1
##                                                           (line 1, blank)
##   IRIX64 engvis00 6.5 10181058 IP27    04/19/00           (line 2)
##                                                           (line 3, blank)
##   18:49:07  device  %busy  avque  r+w/s  blks/s    w/s  wblks/s  avwait  avserv
##   18:49:10  dks0d1      1    1.0    1.0      48    1.0       48     0.0    10.0
##             dks0d6      0    0.0    0.0       0    0.0        0     0.0     0.0
##             dks3d2     68    9.7   17.6    9738   17.6     9738   341.1    38.7
##
##      $1       $2       $3     $4     $5      $6     $7       $8      $9     $10
##
##   Lines 1-4 are headings and are dropped.  Line 5 is the only line that
##   carries the sample time in $1; the time is remembered and repeated on
##   the following device lines, whose fields start one position earlier.
##   Blank lines are skipped so they cannot turn into empty report rows.
##############################################################################
format_sar_rows ()
{
   awk -v HOST="$1" '
      BEGIN   { rowfmt = "%-10s %-8s %-10s %6s %6s %5s %5s %7s %7s %6s %6s \n" }
      NR <= 4 { next }
      NF == 0 { next }
      NR == 5 { sample_time = $1
                printf(rowfmt, HOST, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
                next }
              { printf(rowfmt, HOST, sample_time, $1, $2, $3, $4, $5, $6, $7, $8, $9) }
   '
}

## $HOSTLIST is deliberately unquoted: it is a blank-separated word list.
for HOST in $HOSTLIST
do

   printf '\nPinging %s to check availability.\n' "$HOST"

   ## FOR TESTING:
   # set -x

   ###########################################################################
   ## PING THE HOST BEFORE TRYING 'rsh' --- and CATCH THE OUTPUT
   ## FROM A SINGLE PING WITH A SMALL 4-BYTE PACKET --- to use to
   ## determine whether to do 'rsh' on the host.
   ###########################################################################

   PINGOUT=$(/usr/etc/ping -s 4 -c 1 "$HOST")

   ## FOR TESTING:
   # echo "$PINGOUT"
   # set -

   if ping_reply_received "$PINGOUT"
   then

      #######################################################
      ## Get IP address of host.
      ## (better way? with 'netstat -in'? 'ifconfig'?)
      #######################################################
      ## NOT USED AT THIS TIME.
      #######################################################
      ##
      ## HOSTNAME=$HOST
      ## HOSTNAME_ARP=`/usr/etc/arp $HOSTNAME`
      ## HOSTNAME2=`echo "$HOSTNAME_ARP" |sed "s|-- no entry||"`
      ##
      ## BOOTTIME=`who -b`
      ## # DATETIME=`date`
      ## DATETIME=`date '+%Y %b %d %T%p'`
      #######################################################

      #########################################################################
      ## Using 'rsh $HOST', get the 'sar -d' output, reformat it, and
      ## append the rows to the pre-sort work file.
      ##
      ## NOTE: the exit status seen after this pipeline is the one from the
      ## formatter, not from 'rsh', so an 'rsh' failure is not detected here
      ## (the old RSH_RETCODE test was already disabled for that reason);
      ## a failing host simply contributes no rows.
      #########################################################################

      ## FOR TESTING:
      # set -x

      rsh "$HOST" sar -d "$SAMPLE_DURATION" "$SAMPLES_TAKEN" | \
         format_sar_rows "$HOST" >> "$OUTLIST_PRESORT"

   else
      ## Reported on stdout only; not written to the report file.
      echo " NO OUTPUT from $HOST. NOT PING-ABLE."

   fi
   ## END OF if ping_reply_received "$PINGOUT"

done
## END OF for HOST in $HOSTLIST

## FOR TESTING:
# xpg $OUTLIST_PRESORT

########################################################################
## Sort $OUTLIST_PRESORT by %BUSY column and add to $OUTLIST.
########################################################################
## Keys: column 4 numeric, descending; ties broken by host name
## (column 1) and then device (column 3).  Written with '-k', the
## standard replacement for the obsolete 'sort +3nr -4 +0 -1 +2 -3'.
########################################################################

echo "
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
HOST-SORT #1 SORTED BY ** %BUSY ** (column 4)
_______________________________________________________________________________________
$COLHEAD0
$COLHEAD1
$COLHEAD2
$COLHEAD3
" >> "$OUTLIST"

sort -k 4,4nr -k 1,1 -k 3,3 "$OUTLIST_PRESORT" >> "$OUTLIST"

## FOR TESTING:
# xpg $OUTLIST

########################################################################
## Sort $OUTLIST_PRESORT by READS+WRITES/SEC column and add to $OUTLIST.
########################################################################
## Same scheme as above, keyed on column 6
## (formerly 'sort +5nr -6 +0 -1 +2 -3').
########################################################################

echo "
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
HOST-SORT #2 SORTED BY ** DISK READS+WRITES/SEC ** (column 6)
_______________________________________________________________________________________
$COLHEAD0
$COLHEAD1
$COLHEAD2
$COLHEAD3
" >> "$OUTLIST"

sort -k 6,6nr -k 1,1 -k 3,3 "$OUTLIST_PRESORT" >> "$OUTLIST"

## FOR TESTING:
# xpg $OUTLIST

########################################################################
## Sort $OUTLIST_PRESORT by WRITES/SEC column and add to $OUTLIST.
########################################################################
## Keys: column 8 numeric, descending; ties broken by host name
## (column 1) and then device (column 3).  Written with '-k', the
## standard replacement for the obsolete 'sort +7nr -8 +0 -1 +2 -3'.
########################################################################

echo "
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
HOST-SORT #3 SORTED BY ** DISK WRITES/SEC ** (column 8)
_______________________________________________________________________________________
$COLHEAD0
$COLHEAD1
$COLHEAD2
$COLHEAD3
" >> "$OUTLIST"

sort -k 8,8nr -k 1,1 -k 3,3 "$OUTLIST_PRESORT" >> "$OUTLIST"

## FOR TESTING:
# xpg $OUTLIST

########################################################################
## Add TRAILER to report.
########################################################################
## The text below is expanded by the shell: it picks up the current
## date, $SAMPLE_DURATION, $SAMPLES_TAKEN, $USER and the script name.
########################################################################

echo "
************************* $(date '+%Y %b %d %T%p') *************************************

The report above shows currently ping-able SGI 'network' hosts
--- with their 'LOCAL DISK I/O' activity ---
sorted several different ways:
   - by %BUSY
   - by reads+writes/sec
   - by writes/sec

The 'busiest' host-devices, 'at the moment', appear at the top of each sort.

**************************************************************************************

This report can be used to find 'forgotten' processes that are in a
tight processing loop locally on a host.

In some cases, such host-processes may also be generating lots of I/O
on the network. Other reports (syscalls, ethernet-I/O, ethernet-collisions)
will help determine whether these locally-busy processes are also busy
on the network.

**************************************************************************************

The short sampling period of $SAMPLE_DURATION seconds on each host
--- and the fact that applications (especially interactive applications)
will generate disk I/O in 'bursts' ---
means that this picture could change quite a bit from one 'snapshot'
to the next.

However, these 'snapshots' can still prove to be quite valuable in revealing
hosts (& applications & users) in intense processing situations.

--------

The 'man' pages for 'sar' give the following description of
the 'sar -d' output:

'sar -d' reports activity for each block device, i.e., disk drives.
When data is displayed, the device specification dsk- is generally used
to represent a disk drive. The activity data reported is:

   %busy  - portion of time device was busy servicing a transfer request.
   avque  - average number of requests outstanding during that time.
   r+w/s  - number of data transfers from or to device.
   blks/s - number of bytes transferred in 512-byte (basic block) units.
   avwait - average time in ms. that transfer requests wait idly on queue.
   avserv - average time to be serviced (which for disks includes seek,
            rotational-latency and data-transfer times).

You can use the 'hinv' command on a host to see some summary information
about the disk (or RAID) controllers on the host.
Use 'df -k' to see the local file-systems served by those controllers.

--------

This report was generated from SGI 'network' hosts
--- by userid $USER ---
with the candidate hostnames provided via
NIS = Network Information Service, i.e. by the 'ypcat hosts' command.

Hence, the hosts are limited to 'network' hosts on which $USER can login.

--------

The report was assembled by the script

   $0

The script uses a sequence of commands
('sar -d $SAMPLE_DURATION $SAMPLES_TAKEN','awk','sed') on each host,
followed by several uses of the 'sort' command, applied to the collected
output from all hosts --- so the HOSTNAMES WITH THE HIGHEST
LOCAL-DISK-ACTIVITY ON THE NETWORK POP UP TO THE TOP OF EACH SORT LIST.

--------

IMPLEMENTATION METHODS:

The ' $(basename $0) ' script is intended to be run periodically
(preferably almost daily) by SGI network Administrators ---
Application (CAD/FEA) or System (root) Administrators.

It could be run at a 'quiet' time --- like noon-hour or evening/night.

The script could be implemented as
   - a command alias, via an Administrator's .profile file;
   - a desktop icon, via the 'Find, File QuickFind' tool drawers;
   - a drawer in the SGI toolchest, via an Administrator's .auxchestrc file.

Or the script could be accessed via a drawer in a command toolchest-utility,
like 'nethosts_tools'.

This report can be generated via nnsFEAmenu option 'u n ?'
(Utilities, Net-vu, ?).

....................................................................................
" >> "$OUTLIST"

#####################################################################
## SHOW THE REPORT.
#####################################################################

## FOR TESTING:
# echo "
# DISPLAY: $DISPLAY"
#
# set -x

#####################################################################
## $FEDIR/scripts/shofil Does not work in an 'xwsh' from toolchest.
## Apparently,
## because of '&' batch invocation of shofil.tk within this script.
#####################################################################
# $FEDIR/scripts/shofil $OUTLIST
#####################################################################
## The viewer is started directly instead, in the foreground, with
## the report path and the format code exported into its environment.
#####################################################################

SHOFILENAME="$OUTLIST"
export SHOFILENAME

XLPHP_FORMAT="AV"
export XLPHP_FORMAT

## $FEDIR/tkGUIs/shofil.tk &
## DOES NOT SHOW UP in an 'xwsh'.

"$FEDIR/tkGUIs/shofil.tk"