#!/bin/sh
##
## Nautilus
## SCRIPT: 00_1file_SHOW-STR-MATCH-LINES_awk.sh
##
## PURPOSE: Reads a (huge) file and shows the lines that contain
##          a user-specified string. (The file can contain 'binary' data.)
##
## METHOD:  Uses 'zenity --entry' to prompt for a string.
##
##          Uses 'awk' to read the file and extract matching lines.
##          (Much more efficient than a while-read loop in this script.)
##
##          Puts the results in a temp file and shows it in a GUI
##          text browser/editor of the user's choice.
##
## HOW TO USE: In Nautilus, navigate to a (huge) file, select it,
##             right-click and choose this Nautilus script to run.
##
## Created: 2014jul28 Based on the script
##                    'findANDshow_stringsINfile_plusminusNlines.sh'
##                    of the FE 'xpg' utility.
## Changed: 2014 

## FOR TESTING: (show statements as they execute)
# set -x

#######################################
## Get the filename.
## Also get the current directory, for
## use in report header/trailer lines.
#######################################

## The working directory is recorded for use in the report text;
## the file to search is taken from the first positional argument.
CURDIR=$(pwd)
FILENAME=$1


#######################################################
## Check that the selected file is a text file.
## COMMENTED, for now.
#######################################################

#  FILECHECK=`file "$FILENAME" | egrep 'text|Mail|ASCII'`
 
#  if test "$FILECHECK" = ""
#  then
#     exit
#  fi


##############################################
## Prompt for the search string, using zenity.
##############################################

## Ask for the search string(s) with a 'zenity --entry' dialog.
## Whatever zenity writes to stdout is captured in STRINGS.
STRINGS=$(
   zenity --entry \
      --title "Enter STRING(s) to search for." \
      --entry-text "From:" \
      --text "\
Enter STRING(s) for the (Case-INsensitive) FILES search.

For example, in a huge mail file (INBOX or SENT or ...),
one could look for lines that contain 'From:' or 'Subject:'.

Multiple-strings can be specified by separating the
multiple strings by vertical-bars (|).
Example:  'from:|subject:'"
)

## No string came back from the dialog: there is nothing to search
## for, so quit quietly.
if [ -z "$STRINGS" ]
then
   exit
fi


#########################################################
## Initialize the output file.
##
## NOTE: If the selected file is in a directory for which
##       the user does not have write-permission, we
##       put the output file in /tmp rather than in the
##       current working directory.
## CHANGE: To avoid junking up the curdir, we use /tmp.
#########################################################

## The listing is always written under /tmp, with the user name
## as a prefix of the file name.
## (An earlier variant used /tmp only when "$CURDIR" was not
##  writable; that test is disabled.)
OUTFILE="/tmp/${USER}_temp_STRINGS-MATCH_1file.lis"

## Remove a listing left over from a previous run, if there is one.
if [ -f "$OUTFILE" ]; then rm -f "$OUTFILE"; fi


#######################################################
## Generate a header for the listing.
## (Following are some parameters for the search that
## may be described in the header.)
#######################################################

## Hard-coded search parameters (also quoted in the report text):
##   MAXlineLEN - number of leading characters of each input line
##                that 'cut' passes on to 'awk'.
##   Nlines     - number of context lines shown before and after
##                each 'match-line'.
##   CaseSense  - "no" means the search is case-INsensitive.
MAXlineLEN=3071
Nlines=4
CaseSense="no"

## Write the header, creating (or truncating) the output file.
##
## 'printf %s' is used rather than 'echo' because some /bin/sh
## implementations of 'echo' interpret backslash sequences, which
## would garble a search string or file name containing a backslash.
printf '%s\n' "\
.................... $(date '+%Y %b %d  %a  %T%p %Z') ..........................

LINES THAT CONTAIN THE STRING(s):  $STRINGS

in the FILE

  $FILENAME

in DIRECTORY

  $CURDIR

The following listing includes PLUS-OR-MINUS $Nlines LINES before and
after each 'match-line'.

The matches are determined case-INsensitively. In other words, any
combination of upper and lower case letters in the above string(s)
is considered a match.

Line numbers are shown on the left of each printed line of the input
file, and 'match-lines' are indicated by an asterisk (*) to the left 
of the line numbers.

.................. START OF 'awk' OUTPUT ............................
" >  "$OUTFILE"


###########################################################
## Use 'awk' to read the file and output each 'match-line'.
## --- along with a few summary stats, like number of
## match-lines found.
#############################################################
## Add 'cut -c1-$MAXlineLEN $FILENAME |' before awk, to avoid
## 'Input record too long' error that stops awk dead.
#############################################################

## Feed the first $MAXlineLEN characters of every input line to awk.
## awk appends to "$OUTFILE":
##   - each line that contains at least one of the search strings,
##     flagged with a leading asterisk,
##   - up to N lines before and N lines after each such line,
##   - a summary with the match-line count and the total line count.
##
## Variables handed to awk:
##   N         - number of context lines       (from $Nlines)
##   STRING    - '|'-separated search strings  (from $STRINGS)
##   CASESENSE - "no" = ignore case            (from $CaseSense)
##
## NOTE: 'awk -v' processes backslash escape sequences in the value it
##       is given, so a search string that contains a backslash is not
##       taken literally.
##
## The '--' keeps a file name that starts with '-' from being read as
## an option by 'cut'.

cut -c1-"$MAXlineLEN" -- "$FILENAME" | \
awk  -v N="$Nlines"  -v STRING="$STRINGS" -v CASESENSE="$CaseSense" \
'BEGIN {

   ## Zero the counters reported in the summary.
   TOTlinesREAD = 0
   NUMmatchLINES = 0

   #######################################################
   ## prev[1] ... prev[N] hold the last N lines read;
   ## prev[1] is the most recent one. Start them empty.
   #######################################################
   for ( i = 1 ; i <= N ; i++ ) {
      prev[i] = ""
   }

   ##################################################
   ## For a case-insensitive search, both the search
   ## strings and each input line are upper-cased
   ## before they are compared.
   ## Then split STRING into NS subSTRINGs at each
   ## vertical bar (|).
   ##################################################
   if ( CASESENSE == "no" ) {
      STRING = toupper(STRING)
   }

   NS = split(STRING, subSTRING, "|")

   ###################################################
   ## "aftcount" is the number of lines after the last
   ## match-line that still have to be printed
   ## (N, N-1, ..., 1, or 0).
   ###################################################
   aftcount = 0

   ######################################################
   ## "lastprt" is the line number of the input line that
   ## was printed last. It keeps a line from being
   ## printed twice.
   ######################################################
   lastprt = 0

}
#END OF BEGIN
#START OF BODY
{
   TOTlinesREAD += 1

   ####################################################
   ## HOLDline is the text that is actually searched.
   ## Any CASESENSE value other than "no" results in a
   ## case-sensitive search, so HOLDline is always set.
   ####################################################
   if ( CASESENSE == "no" ) {
      HOLDline = toupper($0)
   } else {
      HOLDline = $0
   }

   ####################################################
   ## Match==1 means the current line contains at least
   ## one of the subSTRINGs.
   ##
   ## index() is used, rather than the ~ operator, so
   ## that the subSTRINGs are not treated as regular
   ## expressions.
   ##
   ## Empty subSTRINGs (from input such as "from:|")
   ## are skipped: some awk implementations may report
   ## a match for an empty search string.
   ##
   ## The loop stops at the first hit, so a line that
   ## contains several subSTRINGs is counted only once
   ## in NUMmatchLINES.
   ####################################################
   Match = 0

   for ( i = 1 ; i <= NS ; i++ ) {
      if ( subSTRING[i] == "" ) { continue }
      if ( index(HOLDline, subSTRING[i]) != 0 ) {
         Match = 1
         break
      }
   }

   ####################################################
   ## A new match-line ends any pending after-a-match
   ## printing; the line is printed as a match-line
   ## below instead, and aftcount restarts at N there.
   ####################################################
   if ( Match == 1 ) {
      NUMmatchLINES += 1
      aftcount = 0
   }

   ######################################################
   ## PRINT ONE OF THE N "AFTER-A-MATCH-LINES":
   ## There was a match up to N lines ago. Print the
   ## current line, follow the last of these lines with a
   ## blank line, and count down.
   ######################################################
   if ( aftcount != 0 ) {
      printf (" %s : %s \n", NR, $0)
      if ( aftcount == 1 ) { print "" }
      aftcount = aftcount - 1
      lastprt = NR
   }

   ######################################################
   ## MATCH-LINE: PRINT N-PREV & CURRENT:
   ## Print those of the N previous lines that were not
   ## printed already (line number greater than lastprt),
   ## then the current line flagged with an asterisk.
   ## Line numbers below 1 never pass the test, because
   ## lastprt is never negative.
   ######################################################
   if ( Match == 1 ) {

      for ( i = N ; i > 0 ; i-- ) {
         recnum = NR - i
         if ( recnum > lastprt ) {
            printf (" %s : %s \n", recnum, prev[i])
         }
      }

      printf ("*%s : %s \n", NR, $0)
      aftcount = N
      lastprt = NR
   }

   ########################################################
   ## Shift prev[N], prev[N-1], ... , prev[2] and store the
   ## current line in prev[1] before reading the next line.
   ########################################################
   for ( i = N ; i > 1 ; i-- ) {
      prev[i] = prev[i-1]
   }

   prev[1] = $0

}
#END OF BODY
## START OF END
END {
 printf ("*********************************************************\n")
 printf ("SUMMARY:\n")
 printf ("Number of lines containing the string(s) = %s\n", NUMmatchLINES)
 printf ("Total number of lines read = %s\n", TOTlinesREAD)
}' >> "$OUTFILE"
## WAS:
## }' "$FILENAME" >> "$OUTFILE"


###############################
## Add a trailer to the listing.
###############################

## Name and directory of this script, for the trailer text.
## "$0" is quoted so that a script path containing spaces is not split
## into several arguments for basename/dirname.
SCRIPT_BASENAME=$(basename -- "$0")
SCRIPT_DIRNAME=$(dirname -- "$0")

## Append the trailer (script identification plus a description of
## the utility) to the listing.
printf '%s\n' "
.................. END OF 'awk' OUTPUT ............................

  The output above is from script

$SCRIPT_BASENAME

   in directory

$SCRIPT_DIRNAME

.................... $(date '+%Y %b %d  %a  %T%p %Z') ..........................
DESCRIPTION OF SCRIPT:

This utility script uses an 'awk' program that essentially extends the
capabilities of the 'egrep' (extended grep) program. ['grep' is a program
that can find lines in a file that contain a given string of characters.]

'egrep' can show the lines of a file that contain matches to *one-or-more*
strings.  Example: 'error', 'fail', 'fatal', or 'warning'.

With 'egrep', the multiple-strings argument is formed by separating the
multiple strings by vertical-bars (|).  Example:  'fatal|error|fail|warning'

But 'egrep' cannot show nearby lines.  The 'awk' program used here essentially
creates an extension of the 'egrep' (extended grep) utility.

----------------------------------------------------------------------------
THE PLUS-OR-MINUS N LINES CAPABILITY:

This utility can show plus-or-minus N lines above and below the lines that
have a match for the search string(s).

For now, N is hard-coded to $Nlines --- so this utility shows plus-or-minus
$Nlines lines above and below the 'match-lines'.

You could say this is an 'eegrep' utility --- extended, extended grep.

----------------------------------------------------------------------------
CASE-(IN)SENSITIVITY OF THE SEARCH:

With 'egrep', one can make the search case-insenstive with the '-i' option.
Likewise, this utility COULD be told to make the search either
case-sensitive or NOT.

For now, this utility is hard-coded to case-sense=\"no\". For example,
to find all lines containing either 'memory' or 'RAM' or 'disk', you
can use the search string 'memory|ram|disk'.

Note that the example search may return too many lines for the string 'ram'
--- lines with words like 'datagram' or 'telegram' or 'ramble' or 'gram'.

We could add a prompt for case-sensitivity of the search. Then, with a
case-sensitivity switch set to ON --- one could use a search string like
'memory|RAM|disk'.

----------------------------------------------------------------------------
MAXIMUM LINE LENGTH OF THE SEARCH:

The first $MAXlineLEN characters of each line of the input file is
searched for matches. Any characters beyond that column in a file
record is not searched for a match.

.................... $(date '+%Y %b %d  %a  %T%p %Z') ..........................
" >>  "$OUTFILE"


############################
## Show the list.
############################

## . $HOME/.gnome2/nautilus-scripts/.set_VIEWERvars.shi

## Source the two include files, then open the listing in the viewer
## named by $TXTVIEWER, in the background.
## The include-file paths are quoted so that a space in $HOME or in
## $DIR_NautilusScripts does not split the path.
## NOTE(review): presumably the first include file sets
##               DIR_NautilusScripts and the second one sets TXTVIEWER
##               --- confirm against those files.
. "$HOME/.freedomenv/feNautilusScripts/set_DIR_NautilusScripts.shi"
. "$DIR_NautilusScripts/.set_VIEWERvars.shi"

## $TXTVIEWER is left unquoted on purpose, in case its value is a
## command followed by options (to be confirmed against the include
## file).
$TXTVIEWER "$OUTFILE" &