/** 
  * HTML tag extractor
  * @author Robert J Morton <robmorton@clara.net>
  * @version 04 September 2000
  * @copyright Sep 2000 Robert J Morton (all rights reserved) */

/* This program scans an incoming character stream. It returns each character which is not 
   part of an HTML tag unchanged. It returns a null character in place of each character 
   which is part of an HTML tag. It returns a null character in place of each character it 
   encounters OUTSIDE the <body> </body> tags, and in place of each character it encounters
   INSIDE the tag-pairs shown in the Tags[] array below. */


/**
  * Character-at-a-time HTML filter.
  *
  * Feed the document to capture() one character at a time. For every input
  * character exactly one character is returned: the character itself when it
  * is countable text, a space when a <br> tag has just been recognised, and
  * a null character otherwise (tag text, anything outside <body>...</body>,
  * and anything inside one of the tag pairs listed in Tags[]).
  *
  * Tag names are compared case-insensitively and are only evaluated once the
  * whole name has been read (at white space, at a '/' following the name or
  * at the closing '>'), so <tablet> is not mistaken for <table>. Nested
  * occurrences of the tag pair currently being skipped are counted, so text
  * stays suppressed until the outermost end tag has been seen. */
class HTMLtag {

   private static final char NUL = '\0';           // returned in place of every suppressed character

   private static final String[] Tags = {
      "table", "ul", "ol", "applet", "form"        // HTML tags between which words are not to be counted
   };

   private final StringBuilder Tag = new StringBuilder();   // tag name accumulator
   private boolean inTag = false;                  // true when inside an HTML tag
   private boolean nameDone = false;               // true once the current tag's name has been evaluated
   private boolean inBody = false;                 // true when between the document's <body> tags
   private boolean betweenTags = false;            // true when between one of the above tag-pairs
   private int t = 0;                              // index in Tags[] of the tag-pair we are between
   private int depth = 0;                          // nesting depth of Tags[t] start tags not yet closed


   /**
     * Filters one character of the incoming HTML stream.
     *
     * @param c the next character of the document
     * @return c itself if it is text to be counted, a space if it completes
     *         the name of a <br> tag, otherwise a null character */
   char capture(char c) {
      if(inTag) {                                  // if inside an HTML tag
         char out = NUL;                           // tag characters are never passed through
         boolean tagEnd = (c == '>');
         if(!nameDone) {                           // if the tag's name is still being read
            boolean nameEnd = tagEnd
                  || Character.isWhitespace(c)
                  || (c == '/' && Tag.length() > 0);   // a leading '/' belongs to an end tag's name
            if(nameEnd) {
               nameDone = true;                    // attributes that follow are skipped, not accumulated
               out = applyTag(Tag.toString());
            } else
               Tag.append(c);
         }
         if(tagEnd)
            inTag = false;                         // reset to 'outside a tag'
         return out;
      }
      if(c == '<') {                               // a tag-start outside a tag
         inTag = true;
         nameDone = false;
         Tag.setLength(0);                         // clear the tag name accumulator
         return NUL;
      }
      return (inBody && !betweenTags) ? c : NUL;   // pass text only inside the body and outside the tag-pairs
   }


   /**
     * Updates the filter state for a tag whose complete name has been read.
     *
     * @param name the tag name as written, with a leading '/' for an end tag
     * @return a space for a <br> tag so that the words on either side of it
     *         stay separated, otherwise a null character. The space is
     *         returned even where text is being suppressed, as it always
     *         was; an extra separator cannot create a word. */
   private char applyTag(String name) {
      boolean closing = name.startsWith("/");
      String bare = closing ? name.substring(1) : name;

      if(bare.equalsIgnoreCase("body"))
         inBody = !closing;                        // <body> enters the document body, </body> leaves it

      if(betweenTags) {                            // already skipping: only the same tag type matters
         if(bare.equalsIgnoreCase(Tags[t])) {
            if(!closing)
               depth++;                            // a nested start tag of the same type
            else if(--depth == 0)
               betweenTags = false;                // the outermost end tag has been reached
         }
      } else if(!closing) {
         for(int i = 0; i < Tags.length; i++)      // for each type of tag being sought
            if(bare.equalsIgnoreCase(Tags[i])) {
               betweenTags = true;                 // show that we are between that tag and its end tag
               t = i;                              // note which tag-pair we are now between
               depth = 1;
               break;                              // no need to look at the remaining tag types
            }
      }

      return (!closing && bare.equalsIgnoreCase("br")) ? ' ' : NUL;
   }

/*  Robert J Morton, the author of this program, 
    is a poor but Right Honourable Fellow of the
    Ancient and Noble Order of the Long-term Unemployed.

    Offers of work please to: robmorton@clara.net  */
}

// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.