package edu.vt.marian.Document;
import java.io.*;
import java.net.*;
import java.util.*;
import edu.vt.marian.common.*;
/**
A mapping between (e.g., ANSEL) characters and (e.g., OAI XML) entities.
@author Robert France
*/
public class EntityMap
{
/**
 * Mapping for one combining diacritic character. Holds the entity for the
 * bare diacritic plus, where defined, the single combined entity that stands
 * for [diacritic + modified ASCII character].
 *
 * NOTE: In XML combining diacritics follow the modified character;
 * in MARC/ANSEL they precede it. Thus when moving from XML to ANSEL
 * some lookahead is necessary.
 */
private class DiacriticMap
{
    /**
     * Delimited ("&amp;...;") combined entities, indexed by the value of the
     * modified character (0..127). A null slot means no combined entity is
     * defined for that character.
     */
    String [] entityStr;
    /** Delimited ("&amp;...;") entity for the diacritic on its own. */
    String defaultEntity;

    /**
     * @param defEnt undelimited entity name of the bare diacritic.
     */
    public DiacriticMap(String defEnt)
    {
        // A newly allocated String[] already holds null in every slot,
        // which is exactly the "nothing defined yet" starting state.
        entityStr = new String [128];
        defaultEntity = "&" + defEnt + ";";
    }

    /**
     * Define the combined entity for this diacritic applied to a character.
     *
     * @param charValue value of the modified character; used directly as an
     *                  index into the 128-slot table.
     * @param str       undelimited name of the combined entity.
     */
    public void addMap(int charValue, String str)
    {
        entityStr[charValue] = "&" + str + ";";
    }

    /**
     * Write the entity form of this diacritic applied to c.
     *
     * @return true if a single combined entity was written; false if c was
     *         written unchanged followed by the bare diacritic entity.
     */
    public boolean map(char c, BufferedWriter out) throws IOException
    {
        // Characters beyond the table (such craziness does happen) are
        // treated like characters with no combined entity.
        String combined = (c <= 127) ? entityStr[c] : null;
        if ( combined != null )
        {
            out.write(combined);
            return true;
        }
        out.write(c);
        out.write(defaultEntity);
        return false;
    }

    /** Hash on the bare diacritic's entity string. */
    public int hashCode()
    {
        return defaultEntity.hashCode();
    }
}
/**
 * Receiver of the diagnostic messages this class emits through dumpTrace().
 * Set once by the constructor.
 */
Debug debug;
/**
 * Any entity longer than this has got to be a mistake.
 * Used as the size of the buffer in which getStringFromEntityReader()
 * collects an entity, so the count includes the leading '&' and the
 * trailing ';'.
 */
private static int MAX_ENTITY_LENGTH = 30;
/**
 * Mapping from characters to either entity strings or diacritic maps.
 * Keys are Character objects; values are either a String or a DiacriticMap.
 * When the data in charTable is a String, it is "ready to wear" with ampersand
 * and semicolon already in place. See note to addEntity().
 */
private Hashtable charTable;
/**
 * Mapping from entities to character sequences.
 * Keys are whatever resolveEntityStr() returns for the undelimited entity:
 * a String for a symbolic name, an Integer for a numeric reference.
 * Values are Strings of one character, or of two characters (diacritic
 * followed by modified character) for combined entities.
 */
private Hashtable entityTable;
/**
 * Find the Object corresponding to this entity string. If this is a
 * symbolic entity name, use the String form. If on the other hand
 * it is a numeric entity reference (presumably to a UNICODE character)
 * convert it to an Integer, so that the decimal and hexadecimal spellings
 * of one value yield equal keys.
 *
 * An empty string, a lone "#", or a "#" followed by something that is not a
 * well-formed number is returned unchanged as a String; the latter two
 * cases are also reported through the debug trace.
 *
 * @param entityStr The undelineated entity: i.e., the sequence
 * of characters strictly between the '&' and ';' characters.
 * @return an Integer for a well-formed numeric reference, otherwise
 * entityStr itself.
 */
private Object resolveEntityStr(String entityStr)
{
    // An empty name (as in "&;") has no first character to inspect;
    // treat it like any other symbolic name.
    if ( entityStr.length() == 0 || entityStr.charAt(0) != '#' )
        return( entityStr );

    try
    {
        // Allow both hex ("#x...") and decimal ("#...") values. The length
        // test keeps a lone "#" from indexing past the end of the string;
        // it then fails number parsing below and is treated as a string.
        if ( entityStr.length() > 1 && entityStr.charAt(1) == 'x' )
            return( Integer.valueOf(entityStr.substring(2), 16) );
        return( Integer.valueOf(entityStr.substring(1)) );
    }
    catch (NumberFormatException e)
    {
        debug.dumpTrace("EntityMap.resolveEntityString: entity '" + entityStr +
                        "' begins with # but is not a well-formed number. " +
                        "Treating as string.");
        return( entityStr );
    }
}
/**
 * Register a simple two-way mapping between a single character and an
 * entity: character to delimited entity string in charTable, and resolved
 * entity to one-character string in entityTable.
 *
 * NOTE: For numeric entity references, we take the representation
 * of the numeric value that is being added here to be canonical. Decimal
 * and hexadecimal references to the same value (for instance
 * {@code &#233;} and {@code &#xE9;}) resolve to the same entityTable key,
 * while charTable keeps the spelling passed in here. So in a round-trip
 * translation XML->ANSEL->XML either spelling comes back in the form used
 * by the loading file.
 * What this gains us is the confidence to assume that the objects
 * stored by this method in charTable are always Strings.
 *
 * @param charVal   the character being mapped.
 * @param entityStr the undelimited entity (no '&' or ';').
 */
private void addEntity(char charVal, String entityStr)
{
    String delimitedEntity = "&" + entityStr + ";";
    String characterAsString = String.valueOf(charVal);

    charTable.put(new Character(charVal), delimitedEntity);
    entityTable.put(resolveEntityStr(entityStr), characterAsString);
}
/**
 * Is this a combining diacritic, rather than either an unmodified or a
 * composite character?
 *
 * @param inverseMapping a value obtained from entityTable.
 * @return true only if inverseMapping is a one-character String whose
 *         character is mapped to a DiacriticMap in charTable.
 */
private boolean isDiacritic(Object inverseMapping)
{
    if ( inverseMapping instanceof String )
    {
        String chars = (String) inverseMapping;
        if ( chars.length() == 1 )
        {
            // instanceof is false for a missing (null) entry as well.
            Object forward = charTable.get(new Character(chars.charAt(0)));
            return( forward instanceof DiacriticMap );
        }
    }
    return( false ); // Sanity clause: not a single-character String.
}
/**
 * Look up the character sequence for the entity that begins at a given
 * position of a string.
 *
 * @param baseStr the string containing the entity.
 * @param offset the offset in baseStr of the leading '&' of the entity.
 * @return the entityTable value for the entity, or null (after a debug
 *         trace) if there is no ';' after offset, if the entity name is
 *         empty, or if the entity is not in entityTable.
 */
private Object findEntityMapping(String baseStr, int offset)
{
    int end = baseStr.indexOf(';', offset+1);
    if ( end == -1 ) // No terminating semicolon!
    {
        debug.dumpTrace("EntityMap.findEntityMapping(): no terminating ';' for '&' at offset " +
                        offset + " in '" + baseStr + "': writing as '&'.");
        return( null );
    }

    String name = baseStr.substring(offset+1, end);
    // "&;" carries no name at all: report it as unknown instead of asking
    // resolveEntityStr() to examine the first character of an empty string.
    Object mapping = (name.length() == 0)
                     ? null
                     : entityTable.get(resolveEntityStr(name));
    if ( mapping == null )
    {
        debug.dumpTrace("EntityMap.findEntityMapping(): unknown entity '" +
                        name + "': writing as read.");
    }
    return( mapping );
}
/**
 * Create a map that initially knows only the entities for '&' and '<'.
 *
 * @param dbg receiver of this object's debug trace messages.
 */
public EntityMap(Debug dbg)
{
    this.debug = dbg;
    // 256: the absolute maximum size for 8-bit characters.
    this.charTable = new Hashtable(256);
    // 512: just a good guess.
    this.entityTable = new Hashtable(512);

    // Two values that are needed in any SGML, HTML or XML DTD, and that
    // seem to be pretty standard.
    addEntity('&', "#x0026");
    addEntity('<', "#x003C");
}
/**
 * Load mappings from a file, adding them to those already present.
 *
 * FORMAT: one record per line, fields separated by whitespace.
 * <pre>
 *   # ...                      comment, ignored
 *   CHAR e ENTITY              simple character/entity mapping
 *   CHAR d ENTITY              combining diacritic, optionally followed by
 *   TAB MODCHAR ENTITY         one line per combined entity; each such line
 *                              must begin with a TAB character
 * </pre>
 * CHAR and MODCHAR are character values: decimal, or hexadecimal when
 * prefixed with "x" or "0x". ENTITY is the undelimited entity (no '&' or
 * ';'). MODCHAR must lie in 0..127.
 *
 * Blank lines are skipped. Lines that cannot be understood (bad number,
 * missing field, unknown type code, TAB-indented line outside a diacritic
 * set, MODCHAR out of range) are reported through the debug trace and
 * ignored; loading continues with the next line.
 *
 * @param in reader positioned at the start of the mapping data.
 * @throws IOException if reading from in fails.
 */
public void load(BufferedReader in) throws IOException
{
    String line = in.readLine();
    if ( line == null )
    {
        debug.dumpTrace("EntityMap.load(): empty file?!?!.");
        return;
    }
    while ( line != null )
    {
        if ( line.length() == 0 || line.charAt(0) == '#' )
        {
            // Blank line or comment: ignore.
            line = in.readLine();
        }
        else if ( line.charAt(0) == '\t' )
        {
            debug.dumpTrace("EntityMap.load(): indented line '" +
                            line + "' not part of diacritic set: ignoring.");
            line = in.readLine();
        }
        else
        {
            line = loadRecord(in, line);
        }
    }
}

/**
 * Process one non-comment, non-indented record, consuming any indented
 * lines that belong to it. Always moves past the record, so that a bad
 * record can never be processed twice.
 *
 * @return the next unprocessed line, or null at end of input.
 */
private String loadRecord(BufferedReader in, String line) throws IOException
{
    int charVal;
    String type;
    String entityName;
    try
    {
        StringTokenizer st = new StringTokenizer(line);
        charVal = parseCharValue(st.nextToken());
        type = st.nextToken();
        entityName = st.nextToken();
    }
    catch ( NumberFormatException e )
    {
        debug.dumpTrace("EntityMap.load(): cannot understand '" +
                        line + "': ignoring.");
        return( in.readLine() );
    }
    catch ( NoSuchElementException e ) // Fewer than three fields.
    {
        debug.dumpTrace("EntityMap.load(): cannot understand '" +
                        line + "': ignoring.");
        return( in.readLine() );
    }

    if ( type.equals("e") )
    {
        addEntity((char) charVal, entityName);
    }
    else if ( type.equals("d") )
    {
        return( loadDiacriticSet(in, (char) charVal, entityName) );
    }
    else
    {
        debug.dumpTrace("EntityMap.load(): unknown type in '" +
                        line + "': ignoring.");
    }
    return( in.readLine() );
}

/**
 * Register a combining diacritic and read the TAB-indented lines that
 * follow it, each defining one combined entity. The bare diacritic goes
 * into entityTable for the non-composite case; the DiacriticMap (including
 * composites) goes into charTable. A bad indented line is reported and
 * skipped without discarding the rest of the set.
 *
 * @return the first line that is not part of the set, or null at end of
 *         input.
 */
private String loadDiacriticSet(BufferedReader in, char diacritic, String entityName)
    throws IOException
{
    entityTable.put(resolveEntityStr(entityName), String.valueOf(diacritic));
    DiacriticMap dMap = new DiacriticMap(entityName);

    String line = in.readLine();
    while ( line != null && line.length() > 0 && line.charAt(0) == '\t' )
    {
        try
        {
            StringTokenizer st = new StringTokenizer(line);
            int modCharVal = parseCharValue(st.nextToken());
            String combinedName = st.nextToken();
            if ( modCharVal < 0 || modCharVal > 127 )
            {
                // DiacriticMap only has slots for 0..127.
                debug.dumpTrace("EntityMap.load(): modified character out of range in '" +
                                line + "': ignoring.");
            }
            else
            {
                dMap.addMap(modCharVal, combinedName);
                // Character form: diacritic first, then modified character.
                char [] pair = { diacritic, (char) modCharVal };
                entityTable.put(resolveEntityStr(combinedName), new String(pair));
            }
        }
        catch ( NumberFormatException e )
        {
            debug.dumpTrace("EntityMap.load(): cannot understand '" +
                            line + "': ignoring.");
        }
        catch ( NoSuchElementException e ) // Fewer than two fields.
        {
            debug.dumpTrace("EntityMap.load(): cannot understand '" +
                            line + "': ignoring.");
        }
        line = in.readLine();
    }
    charTable.put(new Character(diacritic), dMap);
    return( line );
}

/**
 * Parse a character value field: hexadecimal when prefixed with "x" or
 * "0x", decimal otherwise.
 *
 * @throws NumberFormatException if the digits are not a valid number.
 */
private static int parseCharValue(String token)
{
    if ( token.startsWith("x") )
        return( Integer.parseInt(token.substring(1), 16) );
    if ( token.startsWith("0x") )
        return( Integer.parseInt(token.substring(2), 16) );
    return( Integer.parseInt(token) );
}
/**
* Take a string in a character encoding (e.g., extended ASCII) and convert it to
* an entity-based encoding (e.g., XML entity references).
* @param str The string to be converted.
* @param out A BufferedWriter into which to put the new string.
*/
public void mapStringToEntities(String str, BufferedWriter out) throws IOException
{
Object mapping;
int i;
for (i=0; i= str.length() )
out.write( ((DiacriticMap) mapping).defaultEntity);
else
((DiacriticMap) mapping).map(str.charAt(++i), out);
}
else
debug.dumpTrace("EntityMap.mapStringFromXml(): impossible error: unrecognized object in mapping table.");
}
}
/**
 * Take a string in an entity-based encoding (e.g., XML entity references) and convert it to
 * a character-based encoding (e.g., extended ASCII).
 *
 * In the part of the loop that is visible here, a character that is
 * immediately followed by an entity is examined together with that entity:
 * if the entity is a combining diacritic, the diacritic's character is
 * written before the character it modifies (the reverse of the XML order);
 * otherwise the character is written first. An entity that cannot be
 * resolved is left to be copied through by the following iterations.
 *
 * NOTE(review): the "for" header below is truncated -- everything between
 * "i" and "= str.length()" is missing, which includes the loop condition,
 * the start of the loop body and the test to which the first "else"
 * belongs. The method does not compile as shown and its handling of a
 * character that is itself '&' cannot be seen. Restore the missing text
 * from the original source.
 *
 * @param str The string to be converted.
 * @param out A BufferedWriter into which to put the new string.
 */
public void mapStringFromEntities(String str, BufferedWriter out) throws IOException
{
Object mapping;
// NOTE(review): entityStr and j are not referenced in the visible part of
// this method; they may be used in the missing text.
String entityStr;
int i, j;
char c = 0;
// NOTE(review): truncated line -- see the method comment.
for (i=0; i= str.length() ) || ( str.charAt(i+1) != '&' ) )
{
out.write( c ); // Unmodified character.
}
else // (c != '&') && (next character == '&'): c may be a
{ // modified character.
// The entity to examine starts at the '&' at offset i+1.
if ( (mapping = findEntityMapping(str, i+1)) == null )
{
out.write( c ); // And in following iterations,
} // remainder of entity string.
// Else (mapping!=null), whence there is a ';' somewhere in str.
else if ( isDiacritic(mapping) ) // Reverse order between
{ // XML & ANSEL.
out.write((String) mapping);
out.write( c );
// Move to the entity's closing ';' so that the loop resumes after it.
i = str.indexOf(';', i+1);
}
else
{
out.write( c );
out.write((String) mapping);
// Move to the entity's closing ';' so that the loop resumes after it.
i = str.indexOf(';', i+1);
}
}
}
if ( i < str.length() ) // Last character is not part of an entity,
out.write( c ); // and thus still needs to be written.
}
/**
 * Take a stream in the entity-based encoding and create from it a string
 * in the character-based encoding, stopping at the next < character
 * (and therefore presumably the next tag) or at end-of-stream. The '<'
 * itself is consumed but not returned.
 *
 * Entities that are unknown, unterminated or longer than
 * MAX_ENTITY_LENGTH are copied through as read. A combining-diacritic
 * entity is moved in front of the character that precedes it (XML puts the
 * diacritic after the modified character, ANSEL before); if nothing
 * precedes it in the text collected so far, it is simply appended.
 *
 * @param in reader positioned at the start of the text to convert.
 * @return the converted text read so far.
 * @throws EOFException only when EOF encountered in the middle of an entity.
 * @throws IOException if reading from in fails.
 */
public String getStringFromEntityReader(BufferedReader in) throws IOException
{
    StringBuffer inBuf = new StringBuffer();
    char [] entityChars = new char [MAX_ENTITY_LENGTH];
    Object mapping;
    int i = 0;
    int c; // Incoming character, read as int then cast.
    while ( true )
    {
        c = in.read();
        if ( c == -1 ) // EOF after character or completed entity: OK.
            return( inBuf.toString() );
        switch ((char) c )
        {
            case '&':
                entityChars[0] = '&'; // Prepare for "unknown entity" case.
                i = 1;
                while (c != ';')
                {
                    c = in.read();
                    if ( c == -1 ) // EOF
                        throw new EOFException("EOF within entity");
                    if ( c == '&' )
                    {
                        // The last '&' was unterminated. Append what we've
                        // got and start over; entityChars[0] is still '&'.
                        inBuf.append(entityChars, 0, i);
                        i = 1;
                    }
                    else if ( c == '<' )
                    {
                        // The last '&' was unterminated and the text is
                        // done. Append what we've got and leave.
                        inBuf.append(entityChars, 0, i);
                        return( inBuf.toString() );
                    }
                    else if ( i >= MAX_ENTITY_LENGTH )
                    {
                        // Something screwy. Append what we've got and get
                        // back to business; i == 0 flags the abnormal end.
                        inBuf.append(entityChars, 0, i);
                        inBuf.append((char) c);
                        i = 0;
                        break;
                    }
                    else
                        entityChars[i++] = (char) c;
                }
                if ( i == 0 ) // Abnormal end while collecting entity. Go
                    break;    // back to running main loop.
                String entityStr = new String(entityChars, 0, i);
                if ( (mapping = findEntityMapping(entityStr, 0)) == null )
                {
                    // The ampersand, entity string and semicolon are
                    // already in entityChars.
                    inBuf.append(entityChars, 0, i);
                }
                else if ( isDiacritic(mapping) && inBuf.length() > 0 )
                {
                    // Reverse order between XML & ANSEL: pull back the
                    // modified character and put the diacritic before it.
                    char mod = inBuf.charAt(inBuf.length()-1);
                    inBuf.setLength(inBuf.length()-1);
                    inBuf.append((String) mapping);
                    inBuf.append( mod );
                }
                else
                {
                    // Ordinary entity, or a diacritic with no preceding
                    // character to attach to.
                    inBuf.append((String) mapping);
                }
                break;
            case '<':
                return( inBuf.toString() );
            default:
                inBuf.append((char) c);
                break;
        }
    }
}
}