The function receives a pointer s to a string containing the name of a given character entity. It searches the first array for this name using a binary search. On finding the name in the array, it notes its position within the array, that is, its index number. It then looks in the corresponding element of the second array to find the entity's 8-bit ISO 8859-1 (Latin-1) character code, which it returns to the function that called it.
/*
 * GetEnt - look up a named HTML character entity.
 *
 * s: NUL-terminated entity name without the leading '&' or the trailing
 *    ';' (for example "amp" or "Eacute"). Matching is case-sensitive.
 *
 * Returns the entity's ISO 8859-1 (Latin-1) character code (34..255), or
 * 0 if s is NULL or does not name an entity in the table.
 */
static int GetEnt(char *s) {
    // Entity names in ascending byte order (upper case sorts before
    // lower case); the binary search below depends on this ordering.
    static const char *const names[] = {
        "AElig",  "Aacute", "Acirc",  "Agrave", "Aring",  "Atilde",
        "Auml",   "Ccedil", "ETH",    "Eacute", "Ecirc",  "Egrave",
        "Euml",   "Iacute", "Icirc",  "Igrave", "Iuml",   "Ntilde",
        "Oacute", "Ocirc",  "Ograve", "Oslash", "Otilde", "Ouml",
        "THORN",  "Uacute", "Ucirc",  "Ugrave", "Uuml",   "Yacute",
        "aacute", "acirc",  "acute",  "aelig",  "agrave", "amp",
        "aring",  "atilde", "auml",   "brvbar", "ccedil", "cedil",
        "cent",   "copy",   "curren", "deg",    "divide", "eacute",
        "ecirc",  "egrave", "eth",    "euml",   "frac12", "frac14",
        "frac34", "gt",     "iacute", "icirc",  "iexcl",  "igrave",
        "iquest", "iuml",   "laquo",  "lt",     "macr",   "micro",
        "middot", "nbsp",   "not",    "ntilde", "oacute", "ocirc",
        "ograve", "ordf",   "ordm",   "oslash", "otilde", "ouml",
        "para",   "plusmn", "pound",  "quot",   "raquo",  "reg",
        "sect",   "shy",    "sup1",   "sup2",   "sup3",   "szlig",
        "thorn",  "times",  "uacute", "ucirc",  "ugrave", "uml",
        "uuml",   "yacute", "yen",    "yuml"
    };
    // codes[i] is the ISO 8859-1 code of names[i]. The values are written
    // without leading zeros: a leading 0 would make the constant octal.
    static const unsigned char codes[] = {
        198, 193, 194, 192, 197, 195, 196, 199, 208, 201, 202, 200,
        203, 205, 206, 204, 207, 209, 211, 212, 210, 216, 213, 214,
        222, 218, 219, 217, 220, 221, 225, 226, 180, 230, 224,  38,
        229, 227, 228, 166, 231, 184, 162, 169, 164, 176, 247, 233,
        234, 232, 240, 235, 189, 188, 190,  62, 237, 238, 161, 236,
        191, 239, 171,  60, 175, 181, 183, 160, 172, 241, 243, 244,
        242, 170, 186, 248, 245, 246, 182, 177, 163,  34, 187, 174,
        167, 173, 185, 178, 179, 223, 254, 215, 250, 251, 249, 168,
        252, 253, 165, 255
    };
    enum { ENT_COUNT = sizeof names / sizeof names[0] };

    int lo = 0;                 // lowest index still in the running
    int hi = ENT_COUNT - 1;     // highest index still in the running

    if (!s) {
        return 0;
    }

    // Classic binary search: the candidate range [lo, hi] shrinks on
    // every pass, so the loop always terminates, found or not.
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        const unsigned char *a = (const unsigned char *)s;
        const unsigned char *b = (const unsigned char *)names[mid];

        // Skip the common prefix; stop at the first difference or at
        // the end of the key.
        while (*a != '\0' && *a == *b) {
            a++;
            b++;
        }
        if (*a == *b) {         // both strings ended together: a match
            return codes[mid];
        }
        if (*a > *b) {
            lo = mid + 1;       // key sorts after names[mid]
        } else {
            hi = mid - 1;       // key sorts before names[mid]
        }
    }
    return 0;                   // not a known entity name
}
Character entities have a numeric equivalent. These are typed as an ampersand and a # followed by up to 3 digits followed by a semicolon. An example would be &#182;. This is the same as the named entity &para;, which creates the ¶ sign. These numeric entities are worked out by the following function without calling GetEnt(s).
/*
 * CharEntConv - resolve HTML character entities in a character stream.
 *
 * Call once for every input character, in order. The parsing state is
 * held in static variables between calls, so only one stream can be
 * processed at a time.
 *
 * c: next character from the input text stream.
 *
 * Returns:
 *   - c unchanged when it is ordinary text outside an entity (this
 *     includes the opening '&' itself);
 *   - 0 for every character consumed while inside an entity;
 *   - at the closing ';', the entity's character code: the decimal
 *     value of a numeric entity ("#182"), or the result of GetEnt() for
 *     a named one. 0 is returned for an unknown name or a malformed
 *     number.
 *
 * An entity body longer than 7 characters is abandoned: the character
 * that overflows the limit yields 0 and later characters are treated as
 * ordinary text again.
 */
int CharEntConv(int c) {        //next character from input text stream
    static char ent[8];         //captured entity body (7 chars + NUL)
    static char *p;             //next free slot in ent[]
    static int cc;              //0 = normal text, else 1 + chars stored

    if (c == '&') {             //start delimiter of a character entity
        // NOTE(review): the '&' itself is returned to the caller
        // unchanged; confirm the caller expects this.
        p = ent;
        cc = 1;
    }
    else if (cc > 0) {          //we are currently inside an entity
        if (c == ';') {         //end delimiter: resolve the entity
            *p = '\0';          //terminate the captured body
            cc = 0;             //back to normal text
            p = ent;
            if (*p == '#') {    //numeric form: '#' then decimal digits
                int value = 0;
                p++;
                // At most 6 digits fit in the buffer, so value stays
                // below 1000000.
                for (; *p != '\0'; p++) {
                    if (*p < '0' || *p > '9') {
                        value = 0;      //malformed number: give up
                        break;
                    }
                    value = value * 10 + (*p - '0');
                }
                c = value;
            }
            else {              //named form: look the name up
                c = GetEnt(ent);
            }
        }
        else {                  //a character of the entity body
            if (cc < (int)sizeof ent) { //keep room for the terminator
                *p++ = (char)c;
                cc++;
            }
            else {              //body too long to be a valid entity
                cc = 0;         //abandon it
            }
            c = 0;              //nothing to emit for this character
        }
    }
    return c;
}