/**
* Word counter for text or HTML files
* @author Robert J Morton <robmorton@clara.net>
* @version 04 September 2000
* @copyright Sep 2000 Robert J Morton (all rights reserved) */
/* This program accepts a filespec (path + filename). It then counts the words in the given
file, ignoring all HTML tags it may encounter. It submits each word it encounters to the
dictionary class for inclusion in the dictionary. This program assumes all HTML files
conform to strict XHTML rules: eg. no tolerance of naked & < > characters in the text.
This program only counts words in the <BODY> section of a document.
Rules:
1) The end of a sentence is taken as being wherever a full-stop, question mark or
exclamation mark is followed by whitespace.
2) Disconnected apostrophe-s and apostrophe-t are not counted as separate words.
See wordCounter() method below for full list of excluded word fragments. */
import java.io.*; // for handling the file being scanned
/**
 * Counts the characters, words and sentences in a text or HTML file and
 * submits every counted word to the dictionary supplied at construction.
 *
 * Every character read from the file is passed first through the HTML tag
 * stripper and then through the character-entity filter; only characters for
 * which both filters return a non-zero value are counted and examined.
 * NOTE(review): nothing in this class restricts counting to the <BODY>
 * section - presumably HTMLtag takes care of that; confirm against HTMLtag.
 */
class wordCnt {

    // Fragments left behind when an apostrophe splits a contraction or a
    // possessive (it's, don't, we'll, we've, don'ts). Never counted as words.
    private static final String[] FRAGMENTS = {"s", "t", "ll", "ve", "ts"};

    // Stems left behind when the apostrophe-t is cut off a negative
    // contraction, and the full contractions that replace them.
    // The two arrays are parallel: TRUNCATED[i] is replaced by RESTORED[i].
    private static final String[] TRUNCATED = {
        "didn", "couldn", "wasn", "isn", "doesn",
        "hadn", "hasn", "aren", "wouldn", "shouldn"
    };
    private static final String[] RESTORED = {
        "didn't", "couldn't", "wasn't", "isn't", "doesn't",
        "hadn't", "hasn't", "aren't", "wouldn't", "shouldn't"
    };

    private int nc = 0;                     // number of characters counted in the file
    private int nw = 0;                     // number of words counted in the file
    private int ns = 0;                     // number of sentences counted in the file
    private final dic D;                    // dictionary that receives every counted word
    private final HTMLtag T;                // HTML tag stripper
    private final StringBuilder w = new StringBuilder(); // word currently being captured
    private boolean insideWord = false;     // true while we are inside a word
    private boolean endOfSentence = false;  // true once a . ? or ! has been seen and
                                            // no whitespace has followed it yet

    /**
     * Creates a word counter that registers its words in the given dictionary.
     *
     * @param D the dictionary object to which counted words are submitted
     * @throws Exception if the HTML tag stripper cannot be created
     */
    wordCnt(dic D) throws Exception {
        this.D = D;
        T = new HTMLtag();
    }

    /**
     * Counts the characters, words and sentences in the given file. All
     * counters are reset first, so the getters always describe the file most
     * recently passed to this method.
     *
     * A failure to open or read the file is reported on System.err and is not
     * propagated: the counts gathered up to that point are kept and the
     * return value still reflects only the character-entity check.
     *
     * @param fp filespec (path + filename) of the file to scan
     * @return true if no malformed character entity was flagged in the file,
     *         false if at least one character came back from the entity
     *         filter as '^'
     */
    boolean count(String fp) {
        w.setLength(0);
        nc = 0;
        nw = 0;
        ns = 0;
        insideWord = false;
        endOfSentence = false;
        boolean flag = true;                // no malformed character entities seen so far

        Reader in = null;
        try {
            in = new BufferedReader(new FileReader(fp));
            int x;                          // next input character, or -1 at end-of-file
            while ((x = in.read()) != -1) {
                char c = T.capture((char) x);
                if (c == 0) {
                    continue;               // swallowed by the HTML tag stripper
                }
                c = charEnt.capture(c);
                if (c == 0) {
                    continue;               // swallowed by the character-entity filter
                }
                if (c == '^') {
                    flag = false;           // '^' marks a character-entity error
                }
                ++nc;
                wordCounter(c);
            }
            // A word that runs right up to end-of-file has no terminating
            // non-letter to trigger its registration, so close it here.
            if (insideWord) {
                endWord();
            }
        } catch (Exception e) {
            // Broad on purpose: the filters are project classes whose declared
            // exceptions are not visible here. Report instead of hiding.
            System.err.println("wordCnt: error while scanning " + fp + ": " + e);
        } finally {
            // Always release the file, even when scanning failed part-way.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing a reader fails
                }
            }
        }
        return flag;
    }

    /**
     * Feeds one filtered character to the word and sentence counters.
     *
     * Letters extend the current word. Any other character ends the current
     * word (see isCountable() for the fragments that are not counted) and is
     * then checked for sentence punctuation: a '.', '?' or '!' sets the
     * end-of-sentence flag, and the next whitespace character seen while the
     * flag is set counts one sentence. The flag is cleared only by that
     * whitespace, so other characters in between do not cancel it.
     *
     * @param c the character to examine
     */
    private void wordCounter(char c) {
        if (Character.isLetter(c)) {
            w.append(c);
            insideWord = true;
            return;
        }
        if (insideWord) {
            endWord();                      // first non-letter after a word
        }
        if (Character.isWhitespace(c) && endOfSentence) {
            ns++;
            endOfSentence = false;
        } else if (c == '.' || c == '?' || c == '!') {
            endOfSentence = true;
        }
    }

    /**
     * Closes the word currently being captured: registers it in the
     * dictionary and counts it if it is countable, then clears the
     * accumulator and leaves the "inside a word" state.
     */
    private void endWord() {
        String word = w.toString();
        if (isCountable(word)) {
            D.submit(abbrevs(word));
            ++nw;
        }
        w.setLength(0);
        insideWord = false;
    }

    /**
     * Decides whether a captured letter sequence counts as a word.
     *
     * Not counted: the decoupled contraction fragments "s", "t", "ll", "ve"
     * and "ts" (matched case-sensitively), and any single upper-case letter
     * (an initial).
     *
     * @param word the captured letter sequence
     * @return true if the sequence should be counted and registered
     */
    private static boolean isCountable(String word) {
        for (int i = 0; i < FRAGMENTS.length; i++) {
            if (word.equals(FRAGMENTS[i])) {
                return false;
            }
        }
        return !(word.length() == 1 && Character.isUpperCase(word.charAt(0)));
    }

    /**
     * Replaces a truncated colloquial abbreviation with its proper form,
     * e.g. "didn" becomes "didn't". The match ignores case and the
     * replacement is always the lower-case form.
     *
     * @param w the word to check
     * @return the restored contraction, or w itself if it is not one of the
     *         known truncated stems
     */
    String abbrevs(String w) {
        for (int i = 0; i < TRUNCATED.length; i++) {
            if (w.equalsIgnoreCase(TRUNCATED[i])) {
                return RESTORED[i];
            }
        }
        return w;
    }

    /** @return number of characters counted in the most recently scanned file */
    int getCharCount() {
        return nc;
    }

    /** @return number of words counted in the most recently scanned file */
    int getWordCount() {
        return nw;
    }

    /** @return number of sentences counted in the most recently scanned file */
    int getSentCount() {
        return ns;
    }

    /* Robert J Morton, the author of this program,
    is a poor but Right Honourable Fellow of the
    Ancient and Noble Order of the Long-term Unemployed.
    Offers of work please to: robmorton@clara.net */
}
// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.