/**
* HTML file word counter and dictionary builder.
* @author Robert J Morton <robmorton@clara.net>
* @version 05 Sep 2000 */
/* This code uses the Java 1.1.8 API. Usage: put the base directory path in arg0 and the
name of the search directory beneath it in arg1.
Sample command line entry: "java wc /home/rob/website tli" Do NOT put a final slash.
This program counts the words in each HTML file it finds under the specified parent
directory. The counting process excludes HTML tags and the text between certain
prescribed tags (see HTMLtag.java). It also compiles a dictionary of all the words found
in all the files and notes the total number of occurrences of each word for all the files
put together. */
import java.io.*;
/**
 * Command-line driver: walks a directory tree, runs the word counter over every
 * file whose name ends in "htm", and writes one table row per file (plus a TOTALS
 * row) both to wc.txt in the current directory and to the console. When the scan
 * is complete it asks the dictionary generator to save itself.
 */
class wc {
    private static int dl = 0;          // length of base directory path name + the '/' that follows it
    private static wordCnt ct;          // the word counter shared by every file in the scan
    private static dic D;               // the dictionary generator fed by the word counter
    private static int widths[] = {28, 10, 9, 10, 8};   // column widths: filespec, chars, words, sentences, words/sentence
    private static String title = " FILESPEC CHARACTERS WORDS SENTENCES WORDS/S";
    private static BufferedWriter o;    // writer for the results file wc.txt
    private static int totalChars = 0;  // running totals over all files scanned
    private static int totalWords = 0;
    private static int totalSents = 0;

    /**
     * Program entry point.
     *
     * @param args args[0] is the base directory (no trailing slash); args[1] is the
     *             directory beneath it to scan. Any further arguments are ignored.
     * @throws Exception if the results file cannot be written, or if the word
     *                   counter or dictionary generator fails
     */
    public static void main(String args[]) throws Exception {
        if(args.length < 2) {                       // both parameters are mandatory
            printUsage();
            return;
        }
        String bd = args[0];                        // base directory from command line
        String sd = args[1];                        // search directory below the base directory
        String d = bd + "/" + sd;                   // full path of the directory to scan
        dl = bd.length() + 1;                       // printed filespecs are relative to the base directory
        if(!new File(d).isDirectory()) {
            System.out.println(d + " is not a directory.");
            return;
        }
        o = new BufferedWriter(
            new OutputStreamWriter(
                new FileOutputStream("wc.txt")
            )
        );
        try {
            emit(title);                            // the table's column headings
            D = new dic();
            ct = new wordCnt(D);
            scan(d);                                // one row per HTML file in the tree
            o.newLine();                            // blank line between the file rows and the totals
            String ratio;
            if(totalSents > 0)
                ratio = format(Integer.toString(totalWords / totalSents), 4);
            else                                    // avoid division by zero
                ratio = " ";
            emit(padRight("TOTALS ", widths[0])     // label occupies the filespec column
                + format(Integer.toString(totalChars), 1)
                + format(Integer.toString(totalWords), 2)
                + format(Integer.toString(totalSents), 3)
                + ratio);
        } finally {
            o.close();                              // always flush and release wc.txt, even if the scan failed
        }
        D.save();                                   // per the usage text, the dictionary goes to dic.txt
        System.out.println("Results of this word count are in wc.txt.");
    }

    /* Prints the command-line help shown when fewer than two parameters are given. */
    private static void printUsage() {
        System.out.println("Must specify two parameters on command line.");
        System.out.println("1. Absolute path leading to the directory containing the");
        System.out.println(" files on which you want to do the word count, or the");
        System.out.println(" relative path from where this program is located.");
        System.out.println("2. The name of the parent directory containing the files");
        System.out.println(" on which you want to do the word count.");
        System.out.println("Eg java wc /home/rob/website tli");
        System.out.println(" where 'tli' is the directory containing the HTML");
        System.out.println(" files on which you wish to conduct the word count.");
        System.out.println(" Here 'tli' may itself contain any depth of subdirectories.");
        System.out.println("Results of word count are saved in wc.txt in current directory.");
        System.out.println("The dictionary is saved in dic.txt also in current directory.");
    }

    /* This method is recursive. It calls itself.
       It examines every entry of the directory 'd'. An entry that is itself a directory
       is scanned by a nested call, so any depth of sub-directories is handled. An entry
       that is a plain file whose name ends in "htm" is counted and reported by report().
       A directory whose contents cannot be listed is reported on System.err and skipped
       rather than aborting the whole run. */
    private static void scan(String d) throws IOException {
        String names[] = new File(d).list();
        if(names == null) {                         // list() yields null on I/O error or unreadable directory
            System.err.println("Cannot list directory: " + d);
            return;
        }
        for(int i = 0; i < names.length; i++) {
            String path = d + "/" + names[i];       // local, so recursion cannot clobber it
            File entry = new File(path);
            if(entry.isDirectory()) {
                scan(path);
            // NOTE(review): this matches any name ending in "htm" (no dot required) and
            // does not match ".html"; kept as in the original - confirm this is intended.
            } else if(entry.isFile() && path.endsWith("htm")) {
                report(path);
            }
        }
    }

    /* Counts one file, adds its counts to the running totals and emits its table row.
       The row starts with '*' instead of ' ' when ct.count() returns false, which the
       original author's comments describe as "file contains a malformed character entity". */
    private static void report(String path) throws IOException {
        char mark = ct.count(path) ? ' ' : '*';
        int chars = ct.getCharCount();
        int words = ct.getWordCount();
        int sents = ct.getSentCount();
        totalChars += chars;
        totalWords += words;
        totalSents += sents;
        String ratio;
        if(sents > 0)
            ratio = format(Integer.toString(words / sents), 4);
        else                                        // avoid division by zero
            ratio = format(" ---", 4);              // padded so the column stays aligned
        emit(padRight("" + mark + " " + path.substring(dl), widths[0])  // filespec relative to base directory
            + format(Integer.toString(chars), 1)
            + format(Integer.toString(words), 2)
            + format(Integer.toString(sents), 3)
            + ratio);
    }

    /* Writes one line to wc.txt (with a system-native line ending) and echoes it to the console. */
    private static void emit(String line) throws IOException {
        o.write(line, 0, line.length());
        o.newLine();
        System.out.println(line);
    }

    /* Appends spaces to 'text' until it is at least 'width' characters long.
       Text that is already that long is returned unchanged (never truncated). */
    private static String padRight(String text, int width) {
        StringBuffer b = new StringBuffer(text);
        for(int i = text.length(); i < width; i++)
            b.append(' ');
        return b.toString();
    }

    /* Right-aligns the string 's' in column 'n' by prefixing it with spaces until it is
       widths[n] characters long. A string that is already that long is returned unchanged. */
    private static String format(String s, int n) {
        StringBuffer b = new StringBuffer();
        for(int i = s.length(); i < widths[n]; i++)
            b.append(' ');
        return b.append(s).toString();
    }
    /* Robert J Morton, the author of this program,
       is a poor but Right Honourable Fellow of the
       Ancient and Noble Order of the Long-term Unemployed.
       Offers of work please to: robmorton@clara.net */
}
// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.