/** 
  * Character entity substituter for the text/HTML word counter.
  * @author Robert J Morton <robmorton@clara.net>
  * @version 04 September 2000
  * @copyright Sep 2000 Robert J Morton (all rights reserved) */

// Converts a presented HTML character entity string into the corresponding
// character code (an ISO 8859-1 / Unicode code point, returned as a Java char).

/**
 * Incremental decoder for HTML character entities.
 *
 * <p>Characters are fed one at a time to {@link #capture(char)}. Ordinary
 * characters are passed straight through; the characters that make up an
 * entity ({@code &name;} or {@code &#nnn;}) are swallowed (a NUL char is
 * returned for each) until the terminating semi-colon arrives, at which point
 * the decoded character is returned. An unrecognised, malformed or over-long
 * entity yields {@code '^'}.
 *
 * <p>All state is held in static fields, so only one character stream can be
 * decoded at a time.
 */
class charEnt {
   /**
    * Recognised entity names. MUST stay in ascending {@code String.compareTo}
    * order (upper case sorts before lower case) because {@link #find} does a
    * binary search over it.
    */
   private static final String[] NAMES = {
      "AElig",  "Aacute", "Acirc",  "Agrave", "Aring",  "Atilde", "Auml",   "Ccedil",
      "ETH",    "Eacute", "Ecirc",  "Egrave", "Euml",   "Iacute", "Icirc",  "Igrave",
      "Iuml",   "Ntilde", "Oacute", "Ocirc",  "Ograve", "Oslash", "Otilde", "Ouml",
      "THORN",  "Uacute", "Ucirc",  "Ugrave", "Uuml",   "Yacute", "aacute", "acirc",
      "acute",  "aelig",  "agrave", "amp",    "aring",  "atilde", "auml",   "brvbar",
      "ccedil", "cedil",  "cent",   "copy",   "curren", "deg",    "divide", "eacute",
      "ecirc",  "egrave", "eth",    "euml",   "frac12", "frac14", "frac34", "gt",
      "iacute", "icirc",  "iexcl",  "igrave", "iquest", "iuml",   "laquo",  "lt",
      "macr",   "micro",  "middot", "nbsp",   "not",    "ntilde", "oacute", "ocirc",
      "ograve", "ordf",   "ordm",   "oslash", "otilde", "ouml",   "para",   "plusmn",
      "pound",  "quot",   "raquo",  "reg",    "sect",   "shy",    "sup1",   "sup2",
      "sup3",   "szlig",  "thorn",  "times",  "uacute", "ucirc",  "ugrave", "uml",
      "uuml",   "yacute", "yen",    "yuml"
   };

   /** Character codes; {@code CODES[i]} is the character named by {@code NAMES[i]}. */
   private static final char[] CODES = {
      198, 193, 194, 192, 197, 195, 196, 199, 208, 201, 202, 200, 203, 205, 206, 204,
      207, 209, 211, 212, 210, 216, 213, 214, 222, 218, 219, 217, 220, 221, 225, 226,
      180, 230, 224,  38, 229, 227, 228, 166, 231, 184, 162, 169, 164, 176, 247, 233,
      234, 232, 240, 235, 189, 188, 190,  62, 237, 238, 161, 236, 191, 239, 171,  60,
      175, 181, 183, 160, 172, 241, 243, 244, 242, 170, 186, 248, 245, 246, 182, 177,
      163,  34, 187, 174, 167, 173, 185, 178, 179, 223, 254, 215, 250, 251, 249, 168,
      252, 253, 165, 255
   };

   /** Returned in place of an entity that cannot be decoded. */
   private static final char INVALID = '^';

   /** Values of the capture phase {@link #a}. */
   private static final int PHASE_TEXT = 0;     //not inside an entity
   private static final int PHASE_AMP  = 1;     //the previous character was the opening '&'
   private static final int PHASE_BODY = 2;     //accumulating the entity's characters

   /**
    * Looks up an entity name.
    *
    * @param s the entity name without the leading '&' and trailing ';'
    * @return the index of {@code s} in {@link #NAMES}, or a negative number
    *         if it is not a recognised name
    */
   private static int find(String s) {
      //NAMES is sorted by String.compareTo, which is the ordering binarySearch uses
      return java.util.Arrays.binarySearch(NAMES, s);
   }

   /**
    * Decodes a named entity.
    *
    * @param s the entity name without the leading '&' and trailing ';'
    * @return the character for that name, or {@code '^'} if the name is
    *         unknown, in the hope that it is sufficiently out of context to
    *         flag an invalid entity to the reader
    */
   private static char getChar(String s) {
      int i = find(s);
      return i >= 0 ? CODES[i] : INVALID;
   }

   /**
    * Decodes the digits of a numeric character reference.
    *
    * <p>Only plain ASCII decimal digits are accepted: a sign, a letter or an
    * empty string makes the reference invalid rather than being cast to an
    * arbitrary character.
    *
    * @param digits the text between "&#" and ';'
    * @return the character with that code, or {@code '^'} if {@code digits}
    *         is empty, contains a non-digit, or exceeds the range of a char
    */
   private static char decodeNumeric(String digits) {
      if (digits.length() == 0) return INVALID;
      int code = 0;
      for (int i = 0; i < digits.length(); i++) {
         char d = digits.charAt(i);
         if (d < '0' || d > '9') return INVALID;
         code = code * 10 + (d - '0');
         if (code > Character.MAX_VALUE) return INVALID;   //would not fit in a char
      }
      return (char) code;
   }


   static int a = 0;                            //character entity capture phase (0, 1 or 2)
   static int Ymax = 7;                         //max length of capture string: 4 after "&#", else 7
   static String Y = "";                        //string in which to accumulate the characters of an entity

   /** True when the entity being captured began with "&#" (a numeric reference). */
   private static boolean numeric = false;

   /**
    * Feeds one character of the input stream through the entity decoder.
    *
    * @param c the next input character
    * @return {@code c} itself when outside an entity; NUL (0) while the
    *         characters of an entity are being collected; the decoded
    *         character when the terminating ';' arrives; or {@code '^'} when
    *         the entity is invalid or grows longer than {@link #Ymax}
    *         characters (in which case the offending character is dropped
    *         and normal pass-through resumes)
    */
   static char capture(char c) {
      if (a == PHASE_TEXT) {                    //not currently in a character entity
         if (c != '&') return c;                //an ordinary character so return it
         a = PHASE_AMP;                         //an ampersand opens a new entity
         Y = "";
         numeric = false;
         return 0;
      }

      if (a == PHASE_AMP) {                     //first character after the ampersand
         if (c == '#') {                        //a hash introduces a numeric reference
            numeric = true;
            Ymax = 4;                           //accept up to 4 digits
         } else {
            numeric = false;
            Ymax = 7;                           //accept up to 7 characters of name
            Y += c;                             //this character is the first of the name
         }
         a = PHASE_BODY;
         return 0;
      }

      if (c == ';') {                           //terminating semi-colon: decode what was captured
         a = PHASE_TEXT;
         //Remembering the '#' (instead of "try to parse it as a number, else
         //look it up") keeps "&65;" and "&#amp;" from being accepted.
         return numeric ? decodeNumeric(Y) : getChar(Y);
      }

      if (Y.length() < Ymax) {                  //still room in the capture string
         Y += c;
         return 0;
      }

      a = PHASE_TEXT;                           //too many characters: give up on this entity
      return INVALID;
   }

/*  Robert J Morton, the author of this program, 
    is a poor but Right Honourable Fellow of the
    Ancient and Noble Order of the Long-term Unemployed.

    Offers of work please to: robmorton@clara.net  */
}

// (Web-page navigation text, not part of the program:) This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.