/**
* HTML tag extractor
* @author Robert J Morton <robmorton@clara.net>
* @version 04 September 2000
* @copyright Sep 2000 Robert J Morton (all rights reserved) */
/* This program scans an incoming character stream. It returns each character which is not
part of an HTML tag unchanged. It returns a null character in place of each character
which is part of an HTML tag. It returns a null character in place of each character it
encounters OUTSIDE the <body> </body> tags, and in place of each character it encounters
INSIDE the tag-pairs shown in the Tags[] array below. */
/**
 * Character-at-a-time HTML filter.
 *
 * <p>Feed the characters of an HTML document, in order, to {@link #capture(char)}. Each call
 * returns either the character itself (visible text that should be kept) or a null character
 * (the character is to be ignored). A character is ignored when it is part of a tag, when it
 * lies outside the document's {@code <body>} element, or when it lies inside one of the
 * element types listed in {@code EXCLUDED_TAGS}.
 *
 * <p>Excluded elements may be nested in each other (a list inside a list, a table inside a
 * form, ...): text is suppressed until every excluded element that was opened has been closed.
 *
 * <p>Limitations that follow from the simple scanning scheme:
 * <ul>
 *   <li>A tag is recognised as soon as the text accumulated after {@code '<'} equals a name of
 *       interest, so a tag whose name merely begins with such a name is treated like it.</li>
 *   <li>Every {@code '>'} seen while inside a tag ends the tag, including one inside an
 *       attribute value or a comment.</li>
 * </ul>
 *
 * <p>An instance holds the scanning state of one document and is not safe for concurrent use.
 */
class HTMLtag {

    /** Names of the elements whose content is suppressed (words inside are not counted). */
    private static final String[] EXCLUDED_TAGS = {"table", "ul", "ol", "applet", "form"};

    /** The null character returned in place of every suppressed character. */
    private static final char NUL = '\0';

    /** True while scanning between a {@code '<'} and the next {@code '>'}. */
    private boolean inTag = false;

    /** True while between the document's {@code <body>} and {@code </body>} tags. */
    private boolean inBody = false;

    /** Text of the current tag accumulated so far (everything after the {@code '<'}). */
    private String tagText = "";

    /** For each entry of {@code EXCLUDED_TAGS}, how many such elements are currently open. */
    private final int[] openCount = new int[EXCLUDED_TAGS.length];

    /** Sum of {@code openCount}; text is suppressed while this is greater than zero. */
    private int openTotal = 0;

    /**
     * Processes the next character of the HTML stream.
     *
     * @param c the next character of the document
     * @return {@code c} itself if it is visible body text outside every excluded element;
     *     a space at the moment the accumulated tag text becomes {@code br} (to keep the
     *     words on either side of a line break apart); otherwise the null character
     */
    char capture(char c) {
        if (inTag) {
            if (c == '>') {
                inTag = false;
            } else {
                tagText += c;
                trackBody();
                trackExcludedElements();
                // The separator is returned whatever the body/exclusion state is, exactly
                // as it always has been, so that callers relying on it keep working.
                if (tagText.equalsIgnoreCase("br")) {
                    return ' ';
                }
            }
            return NUL; // characters of a tag are never passed through
        }

        if (c == '<') {
            inTag = true;
            tagText = "";
            return NUL;
        }

        if (!inBody || openTotal > 0) {
            return NUL; // outside the body, or inside an excluded element
        }
        return c;
    }

    /** Enters or leaves the document body when the accumulated tag text is a body tag. */
    private void trackBody() {
        if (inBody) {
            if (tagText.equalsIgnoreCase("/body")) {
                inBody = false;
            }
        } else if (tagText.equalsIgnoreCase("body")) {
            inBody = true;
        }
    }

    /**
     * Updates the open-element counters when the accumulated tag text is the start-tag or
     * end-tag name of an excluded element. An end-tag with no matching open element is ignored.
     */
    private void trackExcludedElements() {
        // tagText is never empty here: a character has just been appended to it.
        boolean closing = tagText.charAt(0) == '/';
        String name = closing ? tagText.substring(1) : tagText;

        for (int i = 0; i < EXCLUDED_TAGS.length; i++) {
            if (!name.equalsIgnoreCase(EXCLUDED_TAGS[i])) {
                continue;
            }
            if (!closing) {
                openCount[i]++;
                openTotal++;
            } else if (openCount[i] > 0) {
                openCount[i]--;
                openTotal--;
            }
            return;
        }
    }

    /* Robert J Morton, the author of this program,
    is a poor but Right Honourable Fellow of the
    Ancient and Noble Order of the Long-term Unemployed.
    Offers of work please to: robmorton@clara.net */
}
// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.