/**
* Character entity substituter for the text/HTML word counter.
* @author Robert J Morton <robmorton@clara.net>
* @version 04 September 2000
* @copyright Sep 2000 Robert J Morton (all rights reserved) */
// Converts a presented HTML character entity string into its Unicode character code.
/**
 * Incremental decoder for HTML character entities, fed one character at a time.
 *
 * <p>Ordinary characters pass straight through {@link #capture(char)}. From an
 * ampersand up to the terminating semi-colon the method returns the null
 * character (0) while it accumulates the entity, then returns the decoded
 * character, or '^' if the entity is unknown, malformed or too long.
 *
 * <p>The capture state is held in static fields, so only one character stream
 * can be decoded at a time.
 */
class charEnt {

    /** Returned by capture() while an entity is still being accumulated. */
    private static final char PENDING = 0;

    /** Returned in place of an entity that is unknown, malformed or too long. */
    private static final char INVALID = '^';

    /** Maximum number of digits captured after "&#". */
    private static final int NUMERIC_LIMIT = 4;

    /** Maximum number of characters captured for a named entity. */
    private static final int NAMED_LIMIT = 7;

    /**
     * Recognised entity names. MUST stay sorted in String.compareTo order
     * (upper case before lower case) because lookups use a binary search.
     */
    private static final String[] NAMES = {
        "AElig", "Aacute", "Acirc", "Agrave", "Aring", "Atilde", "Auml", "Ccedil",
        "ETH", "Eacute", "Ecirc", "Egrave", "Euml", "Iacute", "Icirc", "Igrave",
        "Iuml", "Ntilde", "Oacute", "Ocirc", "Ograve", "Oslash", "Otilde", "Ouml",
        "THORN", "Uacute", "Ucirc", "Ugrave", "Uuml", "Yacute", "aacute", "acirc",
        "acute", "aelig", "agrave", "amp", "aring", "atilde", "auml", "brvbar",
        "ccedil", "cedil", "cent", "copy", "curren", "deg", "divide", "eacute",
        "ecirc", "egrave", "eth", "euml", "frac12", "frac14", "frac34", "gt",
        "iacute", "icirc", "iexcl", "igrave", "iquest", "iuml", "laquo", "lt",
        "macr", "micro", "middot", "nbsp", "not", "ntilde", "oacute", "ocirc",
        "ograve", "ordf", "ordm", "oslash", "otilde", "ouml", "para", "plusmn",
        "pound", "quot", "raquo", "reg", "sect", "shy", "sup1", "sup2",
        "sup3", "szlig", "thorn", "times", "uacute", "ucirc", "ugrave", "uml",
        "uuml", "yacute", "yen", "yuml"
    };

    /** Character code for each entry of NAMES, position for position. */
    private static final char[] CODES = {
        198, 193, 194, 192, 197, 195, 196, 199, 208, 201, 202, 200, 203, 205, 206, 204,
        207, 209, 211, 212, 210, 216, 213, 214, 222, 218, 219, 217, 220, 221, 225, 226,
        180, 230, 224, 38, 229, 227, 228, 166, 231, 184, 162, 169, 164, 176, 247, 233,
        234, 232, 240, 235, 189, 188, 190, 62, 237, 238, 161, 236, 191, 239, 171, 60,
        175, 181, 183, 160, 172, 241, 243, 244, 242, 170, 186, 248, 245, 246, 182, 177,
        163, 34, 187, 174, 167, 173, 185, 178, 179, 223, 254, 215, 250, 251, 249, 168,
        252, 253, 165, 255
    };

    /** Capture phase: 0 = not in an entity, 1 = just seen '&', anything else = gathering. */
    static int a = 0;

    /** Maximum number of characters accepted for the entity currently being captured. */
    static int Ymax = NAMED_LIMIT;

    /** Characters of the current entity gathered so far (without '&', '#' or ';'). */
    static String Y = "";

    /**
     * Feeds the next character of the input stream to the decoder.
     *
     * @param c the next input character
     * @return {@code c} itself when outside an entity; the null character (0)
     *         while an entity is being gathered; the decoded character when
     *         the terminating ';' arrives; or '^' when the entity is unknown,
     *         malformed or longer than allowed
     */
    static char capture(char c) {
        if (a == 0) {                       // not inside an entity
            if (c != '&') {
                return c;                   // ordinary character: pass it through
            }
            a = 1;                          // ampersand: start a new capture
            Y = "";
            return PENDING;
        }

        // A semi-colon ends the entity in every capture phase. Handling it here
        // makes "&;" report an error at once instead of treating ';' as the
        // first letter of a name and swallowing the text that follows.
        if (c == ';') {
            a = 0;
            return decode(Y);
        }

        if (a == 1) {                       // first character after the ampersand
            if (c == '#') {
                Ymax = NUMERIC_LIMIT;       // numeric reference: '#' itself is not stored
            } else {
                Ymax = NAMED_LIMIT;         // named entity: this is its first letter
                Y += c;
            }
            a = 2;
            return PENDING;
        }

        if (Y.length() < Ymax) {            // still room: keep gathering
            Y += c;
            return PENDING;
        }

        a = 0;                              // too long to be an entity: give up
        return INVALID;
    }

    /**
     * Converts the text captured between '&' (and optional '#') and ';' into
     * a character. A string made up only of decimal digits is taken as a
     * character code; anything else is looked up as an entity name.
     */
    private static char decode(String entity) {
        if (isAllDigits(entity)) {
            return decodeNumber(entity);
        }
        int index = java.util.Arrays.binarySearch(NAMES, entity);
        return index >= 0 ? CODES[index] : INVALID;
    }

    /** Returns true if the string is non-empty and consists solely of decimal digits. */
    private static boolean isAllDigits(String text) {
        if (text.isEmpty()) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            if (Character.digit(text.charAt(i), 10) < 0) {
                return false;               // a sign, letter or other non-digit
            }
        }
        return true;
    }

    /**
     * Converts a string of decimal digits into the character with that code,
     * or '^' if the value does not fit in a char (a plain cast would silently
     * wrap round to an unrelated character).
     */
    private static char decodeNumber(String digits) {
        int value = 0;
        for (int i = 0; i < digits.length(); i++) {
            value = value * 10 + Character.digit(digits.charAt(i), 10);
            if (value > Character.MAX_VALUE) {
                return INVALID;             // stop early; also rules out int overflow
            }
        }
        return (char) value;
    }

    /* Robert J Morton, the author of this program,
       is a poor but Right Honourable Fellow of the
       Ancient and Noble Order of the Long-term Unemployed.
       Offers of work please to: robmorton@clara.net */
}
// This page's parent within this Web Site. About this Web Site. Its home page. Email its Author.