/** 
  * Word counter for text or HTML files
  * @author Robert J Morton <robmorton@clara.net>
  * @version 04 September 2000
  * @copyright Sep 2000 Robert J Morton (all rights reserved) */

/* This program accepts a filespec (path + filename). It then counts the words in the given
   file, ignoring all HTML tags it may encounter. It submits each word it encounters to the
   dictionary class for inclusion in the dictionary. This program assumes all HTML files 
   conform to strict XHTML rules: eg. no tolerance of naked & < > characters in the text. 
   This program only counts words in the <BODY> section of a document. 

   Rules:

   1) The end of a sentence is taken as being wherever a full-stop, question mark or 
      exclamation mark is followed by whitespace.

   2) Disconnected apostrophe-s and apostrophe-t are not counted as separate words. 
      See wordCounter() method below for full list of excluded word fragments. */

import java.io.*;                               // for handling the file being scanned

/**
 * Counts the characters, words and sentences in a text or HTML file and submits every
 * counted word to a dictionary.
 *
 * Each character read from the file is first passed through the HTML tag stripper and
 * then through the character-entity decoder; a character is only counted when neither
 * of them returns 0 for it. One instance can be reused for many files: count() resets
 * all totals before it starts. Instances keep per-file state in fields, so a single
 * instance must not be used for two files at the same time.
 */
class wordCnt {

   // Word fragments left behind when an apostrophe splits a contraction; never counted.
   private static final String[] FRAGMENTS = {"s", "t", "ll", "ve", "ts"};

   // Truncated front halves of negative contractions ...
   private static final String[] TRUNCATED = {
      "didn",   "couldn",   "wasn",   "isn",      "doesn",
      "hadn",   "hasn",     "aren",   "wouldn",   "shouldn"
   };

   // ... and the full contraction each one stands for (same index as TRUNCATED).
   private static final String[] RESTORED = {
      "didn't", "couldn't", "wasn't", "isn't",    "doesn't",
      "hadn't", "hasn't",   "aren't", "wouldn't", "shouldn't"
   };

   private final dic D;                         // dictionary that receives every counted word
   private final HTMLtag T;                     // HTML tag stripper
   private final StringBuilder w = new StringBuilder();   // word currently being captured
   private int nc = 0;                          // number of characters counted in the file
   private int nw = 0;                          // number of words counted in the file
   private int ns = 0;                          // number of sentences counted in the file
   private boolean insideWord = false;          // true while we are inside a word
   private boolean endOfSentence = false;       // true once a . ? or ! has been seen


   /**
    * Creates a counter that submits its words to the given dictionary.
    *
    * @param D the dictionary object that each counted word is submitted to
    * @throws Exception kept from the original signature so existing callers still compile
    */
   wordCnt(dic D) throws Exception {
      this.D = D;                               // keep the dictionary reference
      T = new HTMLtag();                        // create a new HTML tag stripper
   }


   /**
    * Counts the characters, words and sentences in one file. The totals are afterwards
    * available from getCharCount(), getWordCount() and getSentCount().
    *
    * A failure to open or read the file does not throw: it is reported on System.err
    * and the totals reflect whatever was read before the failure.
    *
    * @param fp path and name of the file to scan
    * @return true if NO malformed character entity was seen, false if at least one
    *         '^' character came back from the character-entity decoder
    */
   boolean count(String fp) {
      w.setLength(0);                           // start every file with clean state
      nc = 0;
      nw = 0;
      ns = 0;
      insideWord = false;
      endOfSentence = false;
      boolean wellFormed = true;                // no malformed character entities so far
      Reader in = null;
      try {
         in = new BufferedReader(new FileReader(fp));
         int x;                                 // next input character, or -1 at End-Of-File
         while((x = in.read()) != -1) {
            char c = T.capture((char)x);        // 0 means the tag stripper withheld it
            if(c == 0) {
               continue;
            }
            c = charEnt.capture(c);             // 0 means the entity decoder withheld it
            if(c == 0) {
               continue;
            }
            // NOTE(review): '^' is the decoder's error marker according to the original
            // comments; a literal '^' in the text would presumably be flagged too.
            if(c == '^') {
               wellFormed = false;
            }
            ++nc;                               // increment the character count
            wordCounter(c);                     // do word and sentence count
         }
         flushWord();                           // a file may end in the middle of a word
      } catch(Exception e) {
         // Best effort, as before: never propagate, but no longer fail silently.
         System.err.println("wordCnt: could not finish reading " + fp + ": " + e);
      } finally {
         if(in != null) {
            try {
               in.close();                      // always release the file, even after an error
            } catch(IOException ignored) {
               // nothing useful can be done if closing fails
            }
         }
      }
      return wellFormed;
   }


   /**
    * Feeds one character into the word and sentence counters.
    *
    * Letters extend the current word. Any other character ends it; the finished word
    * is counted and submitted unless it is one of the excluded fragments: a decoupled
    * "s", "t", "ll", "ve" or "ts" (as in don't, we'll, we've, don'ts) or a lone
    * capital letter (an initial).
    *
    * A sentence is counted at the first whitespace character seen after a full-stop,
    * question mark or exclamation mark.
    * NOTE(review): the end-of-sentence flag is not cleared by intervening letters or
    * other characters, so "a.b " counts a sentence at the space. Left unchanged because
    * text such as "end.</p><p>Next " may rely on it.
    *
    * @param c the character to process
    */
   private void wordCounter(char c) {
      if(Character.isLetter(c)) {               // this character is part of a word
         w.append(c);
         insideWord = true;
         return;
      }
      flushWord();                              // first non-letter after a word ends it
      if(Character.isWhitespace(c) && endOfSentence) {
         ns++;                                  // whitespace after a . ? ! ends a sentence
         endOfSentence = false;
      } else if(c == '.' || c == '?' || c == '!') {
         endOfSentence = true;                  // remember the terminator until whitespace
      }
   }


   /** Ends the word being captured, if any, counting and submitting it when eligible. */
   private void flushWord() {
      if(!insideWord) {
         return;
      }
      String word = w.toString();
      w.setLength(0);                           // clear the word accumulator
      insideWord = false;                       // we are now outside a word
      if(isCountable(word)) {
         D.submit(abbrevs(word));               // register it in the dictionary
         ++nw;                                  // increment the number of words counted
      }
   }


   /** Returns false for excluded contraction fragments and for lone capital letters. */
   private static boolean isCountable(String word) {
      for(int i = 0; i < FRAGMENTS.length; i++) {
         if(FRAGMENTS[i].equals(word)) {
            return false;
         }
      }
      return !(word.length() == 1 && Character.isUpperCase(word.charAt(0)));
   }


   /**
    * Restores a truncated negative contraction, e.g. "didn" becomes "didn't". The
    * comparison ignores case; the replacement is always lower case.
    *
    * @param w the word to check
    * @return the full contraction if w is one of the known truncated forms, else w
    */
   String abbrevs(String w) {
      for(int i = 0; i < TRUNCATED.length; i++) {
         if(TRUNCATED[i].equalsIgnoreCase(w)) {
            return RESTORED[i];
         }
      }
      return w;
   }


   int getCharCount() {return nc;}              // number of characters in the file
   int getWordCount() {return nw;}              // number of words in the file
   int getSentCount() {return ns;}              // number of sentences in the file

/*  Robert J Morton, the author of this program, 
    is a poor but Right Honourable Fellow of the
    Ancient and Noble Order of the Long-term Unemployed.

    Offers of work please to: robmorton@clara.net  */
}

// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.