/**
  * HTML file word counter and dictionary builder.
  * @author Robert J Morton <robmorton@clara.net>
  * @version 05 Sep 2000 */

/* This code uses the Java 1.1.8 API.  Usage: Put the base directory name in arg0 and the
   name of the directory to be searched (located directly below the base directory) in arg1.
   Sample command line entry: "java wc /home/rob/website tli" Do NOT put a final slash.

   This program counts the words in each HTML file it finds under a specified parent 
   directory. The counting process excludes HTML tags and anything between certain 
   prescribed tags (see HTMLtag.java). It also compiles a dictionary of all the words 
   found in all the files and notes the total number of occurrences of each word for 
   all the files put together. */

import java.io.*;

class wc {
   private static final int widths[] = {28, 10, 9, 10, 8};   // column widths: filespec, chars, words, sentences, words/sentence
   private static final String title = "  FILESPEC                  CHARACTERS    WORDS SENTENCES WORDS/S";
   private static int dl = 0;                                // length of base directory path name + a terminating '/'
   private static wordCnt ct;                                // reference to an instance of the word counter object
   private static dic D;                                     // reference to an instance of the dictionary generator
   private static BufferedWriter o;                          // for output text file wc.txt
   private static int totalChars = 0;                        // running totals over all the files counted
   private static int totalWords = 0;
   private static int totalSents = 0;


/* Entry point.

   args[0] is the base directory and args[1] the directory below it to be searched. One 
   line per HTML file found, followed by a TOTALS line, is written both to wc.txt (in the 
   current directory) and to the console. The dictionary is then saved via dic.save().
   If fewer than two arguments are given, or the resulting path is not a directory, an 
   explanatory message is printed and nothing else is done. */

   public static void main(String args[]) throws Exception {
      if(args.length < 2) {                                  // both command line arguments are required
         usage();
         return;
      }
      String bd = args[0];                                   // name of base directory from command line
      String sd = args[1];                                   // name of search directory below base directory
      String d = bd + "/" + sd;                              // form the full path
      dl = bd.length() + 1;                                  // printed filespecs are relative to the base directory
      if(!new File(d).isDirectory()) {                       // the path must be an existing directory
         System.out.println(d + " is not a directory.");
         return;
      }
      o = new BufferedWriter(                                // open output file in an output stream writer 
             new OutputStreamWriter(                         // and wrap that in a buffered writer
                new FileOutputStream("wc.txt")               // in order to be able to use write()
             )
          );
      try {
         emit(title);                                        // print the table's column headings
         D = new dic();                                      // create a dictionary generator object
         ct = new wordCnt(D);                                // create a word counter object
         scan(d);                                            // search for HTML files in specified directory tree
         o.newLine();                                        // blank line between the file rows and the totals
         emit("TOTALS                      "                 // ASSEMBLE THE 'TOTALS' PRINT LINE
            + format(Integer.toString(totalChars), 1)        // pad out and append total character count
            + format(Integer.toString(totalWords), 2)        // pad out and append total word count
            + format(Integer.toString(totalSents), 3)        // pad out and append total sentence count
            + ratio(totalWords, totalSents, "        "));    // append the words per sentence ratio
      } finally {
         o.close();                                          // always flush and close wc.txt, even after a failure
      }
      D.save();                                              // save the dictionary (to dic.txt, per the usage text)
      System.out.println("Results of this word count are in wc.txt.");
   }


   private static void usage() {                             // explains the command line arguments
      System.out.println("Must specify two parameters on command line.");
      System.out.println("1. Absolute path leading to the directory containing the");
      System.out.println("   files on which you want to do the word count, or the");
      System.out.println("   relative path from where this program is located.");
      System.out.println("2. The name of the parent directory containing the files");
      System.out.println("   on which you want to do the word count.");
      System.out.println("Eg java wc /home/rob/website tli");
      System.out.println("   where 'tli' is the directory containing the HTML");
      System.out.println("   files on which you wish to conduct the word count.");
      System.out.println("   Here 'tli' may itself contain any depth of subdirectories.");
      System.out.println("Results of word count are saved in wc.txt in current directory.");
      System.out.println("The dictionary is saved in dic.txt also in current directory.");
   }


/* This method is recursive. It calls itself. 

   When invoked, it examines the files and directories contained within the directory 'd' 
   passed to it as its parameter. If an entry is an HTML file (name ending in .htm or .html, 
   in any letter case), it writes that file's relative filespec + counts to wc.txt. The 
   'relative' filespec is the path+filename with the base directory stripped off. If an 
   entry is a directory, it simply calls itself to deal with that (sub) directory as it is 
   doing with the current directory. Thus it can handle any depth of sub-directories from 
   the parent. A directory whose contents cannot be listed is reported on the error stream 
   and skipped, so that it does not abort the whole count. */

   private static void scan(String d) throws IOException {
      String names[] = new File(d).list();                   // list all items in this directory
      if(names == null) {                                    // list() yields null if the directory cannot be read
         System.err.println("Cannot list directory " + d + " - skipped.");
         return;
      }
      for(int i = 0; i < names.length; i++) {                // for each item in the directory
         String path = d + "/" + names[i];                   // local, so recursive calls cannot disturb it
         File f = new File(path);                            // create a file object for it
         if(f.isDirectory())                                 // if it is an existing directory
            scan(path);                                      // re-enter this method
         else if(f.isFile() && isHtmlFile(names[i]))         // if it is an existing file and it is an HTML file
            report(path);                                    // count it and print its line of the table
      }
   }


   private static boolean isHtmlFile(String name) {          // true if the file name carries an
      String n = name.toLowerCase();                         // HTML extension. The dot is required
      return n.endsWith(".htm") || n.endsWith(".html");      // so that e.g. "foohtm" is not matched.
   }


/* Counts one HTML file, adds its counts to the running totals and prints its table row.
   The row starts with '*' instead of ' ' when wordCnt.count() returns false (which,
   according to the original author's notes, means the file contains a malformed 
   character entity). */

   private static void report(String path) throws IOException {
      char mark = ct.count(path) ? ' ' : '*';                // count chars, words, sentences in this file
      int chars = ct.getCharCount();                         // get HTML file's character count
      int words = ct.getWordCount();                         // get HTML file's word count
      int sents = ct.getSentCount();                         // get HTML file's sentence count
      totalChars += chars;                                   // and add them to the totals
      totalWords += words;
      totalSents += sents;

      StringBuffer line = new StringBuffer();
      line.append(mark).append(' ');                         // marker column
      line.append(path.substring(dl));                       // filespec relative to the base directory
      while(line.length() < widths[0])                       // pad out filespec to width of filespec column
         line.append(' ');
      line.append(format(Integer.toString(chars), 1));       // pad out and append character count
      line.append(format(Integer.toString(words), 2));       // pad out and append word count
      line.append(format(Integer.toString(sents), 3));       // pad out and append sentence count
      line.append(ratio(words, sents, "     ---"));          // append words per sentence ratio
      emit(line.toString());
   }


   private static String ratio(int words, int sents,         // formats the whole-number words per
                               String none) {                // sentence ratio for the last column,
      if(sents > 0)                                          // or returns 'none' when there are no
         return format(Integer.toString(words / sents), 4);  // sentences, which avoids a division
      return none;                                           // by zero exception.
   }


   private static void emit(String line) throws IOException {
      o.write(line, 0, line.length());                       // write the line to wc.txt
      o.newLine();                                           // ensure a system-native new-line
      System.out.println(line);                              // print the line to console also
   }


   private static String format(String s, int n) {           // pads out a numeric string with
      StringBuffer b = new StringBuffer();                   // leading spaces in order to make
      for(int i = s.length(); i < widths[n]; i++)            // the string the width of column n.
         b.append(' ');                                      // A string that is already wide
      return b.append(s).toString();                         // enough is returned unchanged.
   }

/*  Robert J Morton, the author of this program, 
    is a poor but Right Honourable Fellow of the
    Ancient and Noble Order of the Long-term Unemployed.

    Offers of work please to: robmorton@clara.net  */
}

// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.