package edu.vt.marian.Document; import java.io.*; import java.net.*; import java.util.*; import edu.vt.marian.common.*; /** A mapping between (e.g., ANSEL) characters and (e.g., OAI XML) entities. @author Robert France */ public class EntityMap { /** * A mapping for a combining diacritic character, including mappings where * defined from [diacrit+modified character] to single combined entity. * * NOTE: In XML combining diacritics follow the modified character; * in MARC/ANSEL they precede it. Thus when moving from XML to ANSEL * some lookahead is necessary. */ private class DiacriticMap { String [] entityStr; String defaultEntity; public DiacriticMap(String defEnt) { defaultEntity = new String("&" + defEnt + ";"); // Entity strings for modified ASCII characters, where such defined. entityStr = new String [128]; for (int i=0; i<128; i++) entityStr[i] = null; // Begin with all undefined. } public void addMap(int charValue, String str) { entityStr[charValue] = new String("&" + str + ";"); } public boolean map(char c, BufferedWriter out) throws IOException { if ( (c < 0) || (c > 127) || // (Such craziness does happen.) ( entityStr[c] == null ) ) // No combined character defined. { out.write(c); out.write(defaultEntity); return( false ); } else { out.write(entityStr[c]); return( true ); } } public int hashCode() { return( defaultEntity.hashCode() ); } } Debug debug; /** * Any entity longer than this has got to be a mistake. */ private static int MAX_ENTITY_LENGTH = 30; /** Mapping from non-ASCII characters to either entity strings or diacritic maps. When the data in charTable is a String, it is "ready to wear" with ampersand and semicolon already in place. See note to addEntity(). */ private Hashtable charTable; /** Mapping from entity strings to non-ASCII character (sequences). */ private Hashtable entityTable; /** * Find the Object corresonding to this entity string. If this is a * symbolic entity name, use the String form. If on the other hand * it is a numeric entity reference (presumably to a UNICODE character) * convert it to an Integer. * * @param entityStr The undelineated entity: i.e., the sequence * of characters strictly between the '&' and ';' characters. */ private Object resolveEntityStr(String entityStr) { if (entityStr.charAt(0) == '#' ) // Numeric entity reference. { Integer entity; try { if ( entityStr.charAt(1) == 'x' ) // Allow both hex and { // decimal values. String t1 = entityStr.substring(2); entity = new Integer(Integer.parseInt(t1, 16)); } else { String t1 = entityStr.substring(1); entity = new Integer(Integer.parseInt(t1)); } return( entity ); } catch (NumberFormatException e) { debug.dumpTrace("EntityMap.resolveEntityString: entity '" + entityStr + "' begins with # but is not a well-formed number. " + "Treating as string."); return( entityStr ); } } else return( entityStr ); } /** * Add a simple two-way mapping between a single (non-ASCII) character and * an entity string to the map function. * * NOTE: For numeric entity references, we take the representation * of the numeric value that is being added here to be canonical. This * means that in a round-trip translation XMl->ANSEL->XML, for * instance, an entity reference like é of é will end up * as é if the latter is the way that the reference occurs in * the loading file (which is the case with "ansdel_uni_comb.map"). * What this gains us is the confidence to assume that the objects * in charTable can always be assumed to be Strings. */ private void addEntity(char charVal, String entityStr) { charTable.put(new Character(charVal), new String("&"+entityStr+";") ); char [] charArray = new char [1]; charArray[0] = charVal; entityTable.put(resolveEntityStr(entityStr), new String(charArray)); } /** * Is this a combining diacritic, rather than either an unmodified or a * composite character? */ private boolean isDiacritic(Object inverseMapping) { if (! (inverseMapping instanceof String) || // Sanity clause. ( ((String) inverseMapping).length() != 1) ) return( false ); char c = ((String) inverseMapping).charAt(0); Object mapping = charTable.get(new Character(c)); if ( mapping == null ) return( false ); else return( mapping instanceof DiacriticMap ); } /** * @param offset the offset in baseStr of the leading '&' of the entity. */ private Object findEntityMapping(String baseStr, int offset) { int j = baseStr.indexOf(';', offset+1); if (j == -1) // No terminating semicolon! { debug.dumpTrace("EntityMap.findEntityMapping(): no terminating ';' for '&' at offset '" + offset + " in '" + baseStr + "': writing as '&'."); return( null ); } Object entity = resolveEntityStr( baseStr.substring(offset+1, j) ); Object mapping; if ( (mapping = entityTable.get(entity)) == null ) { debug.dumpTrace("EntityMap.findEntityMapping(): unknown entity '" + baseStr.substring(offset+1, j) + "': writing as read."); return( null ); } return( mapping ); } public EntityMap(Debug dbg) { debug = dbg; charTable = new Hashtable(256); // The absolute maximum size for 8-bit characters. entityTable = new Hashtable(512); // Just a good guess. addEntity('&', "#x0026"); // Two values that are needed in any SGML, HTML addEntity('<', "#x003C"); // or XML DTD, and that seem to be pretty standard. } /** * Load from a file. * * FORMAT: */ public void load(BufferedReader in) throws IOException { String line = in.readLine(); if ( line == null ) { debug.dumpTrace("EntityMap.load(): empty file?!?!."); return; } StringTokenizer st; String token; int charVal; String type; try { while( line != null ) { st = new StringTokenizer(line); switch( line.charAt(0) ) { case '#': // Comment: ignore. if ( (line = in.readLine()) == null) throw new EOFException(); break; case '\t': debug.dumpTrace("EntityMap.load(): indented line '" + line + "' not part of diacritic set: ignoring."); if ( (line = in.readLine()) == null) throw new EOFException(); break; default: try { token = st.nextToken(); if ( token.startsWith("x") ) //**NOTE: allow both { // hex and decimal String t1 = token.substring(1); // values. charVal = Integer.parseInt(t1, 16); } else if ( token.startsWith("0x") ) { String t1 = token.substring(2); charVal = Integer.parseInt(t1, 16); } else charVal = Integer.parseInt(token); type = st.nextToken(); token = st.nextToken(); if ( type.equals("e") ) { addEntity((char) charVal, token); if ( (line = in.readLine()) == null) throw new EOFException(); } else if ( type.equals("d") ) { // 'token' is combining diacritic: load into entityTable // for non-composite case, then build DiacriticMap // (including composites) for charTable. char [] defaultCharArray = new char [1]; defaultCharArray[0] = (char) charVal; entityTable.put(resolveEntityStr(token), new String(defaultCharArray)); // Build and load values into DiacriticMap. DiacriticMap dMap = new DiacriticMap(token); int modCharVal; char [] charArray = new char [2]; charArray[0] = defaultCharArray[0]; if ( (line = in.readLine()) != null) while ( line.charAt(0) == '\t' ) { st = new StringTokenizer(line); token = st.nextToken(); if ( token.startsWith("x") ) { String t1 = token.substring(1); modCharVal = Integer.parseInt(t1, 16); } else if ( token.startsWith("0x") ) { String t1 = token.substring(2); modCharVal = Integer.parseInt(t1, 16); } else modCharVal = Integer.parseInt(token); token = st.nextToken(); dMap.addMap(modCharVal, token); charArray[1] = (char) modCharVal; entityTable.put(resolveEntityStr(token), new String(charArray)); if ( (line = in.readLine()) == null) break; } charTable.put(new Character((char) charVal), dMap); } else debug.dumpTrace("EntityMap.load(): unknown type in '" + line + "': ignoring."); } catch( NumberFormatException e ) { debug.dumpTrace("EntityMap.load(): cannot understand '" + line + "': ignoring."); if ( (line = in.readLine()) == null) throw new EOFException(); } } } } catch ( EOFException e ) {} } /** * Take a string in a character encoding (e.g., extended ASCII) and convert it to * an entity-based encoding (e.g., XML entity references). * @param str The string to be converted. * @param out A BufferedWriter into which to put the new string. */ public void mapStringToEntities(String str, BufferedWriter out) throws IOException { Object mapping; int i; for (i=0; i= str.length() ) out.write( ((DiacriticMap) mapping).defaultEntity); else ((DiacriticMap) mapping).map(str.charAt(++i), out); } else debug.dumpTrace("EntityMap.mapStringFromXml(): impossible error: unrecognized object in mapping table."); } } /** * Take a string in an entity-based encoding (e.g., XML entity references) and convert it to * a character-based encoding (e.g., extended ASCII). * @param str The string to be converted. * @param out A BufferedWriter into which to put the new string. */ public void mapStringFromEntities(String str, BufferedWriter out) throws IOException { Object mapping; String entityStr; int i, j; char c = 0; for (i=0; i= str.length() ) || ( str.charAt(i+1) != '&' ) ) { out.write( c ); // Unmodified character. } else // (c != '&') && (next character == '&'): c may be a { // modified character. if ( (mapping = findEntityMapping(str, i+1)) == null ) { out.write( c ); // And in following iterations, } // remainder of entity string. // Else (mapping!=null), whence there is a ';' somewhere in str. else if ( isDiacritic(mapping) ) // Reverse order between { // XML & ANSEL. out.write((String) mapping); out.write( c ); i = str.indexOf(';', i+1); } else { out.write( c ); out.write((String) mapping); i = str.indexOf(';', i+1); } } } if ( i < str.length() ) // Last character is not part of an entity, out.write( c ); // and thus still needs to be written. } /** * Take an stream in the entity-based encoding and create from it a string * in the character-based encoding, stopping at the next < character * (and therefore presumably the next tag) or at end-of-stream. * @throws EOFException only when EOF encountered in the middle of an entity. */ public String getStringFromEntityReader(BufferedReader in) throws IOException { StringBuffer inBuf = new StringBuffer(); char [] entityChars = new char [MAX_ENTITY_LENGTH]; Object mapping; int i = 0, j; int c; // Incoming character, read as int then cast. while ( true ) { c = in.read(); if ( c == -1 ) // EOF after character or completed entity: OK. return( new String(inBuf) ); switch ((char) c ) { case '&': entityChars[0] = '&'; // Prepare for "unknown entity" case. i = 1; while (c != ';') { c = in.read(); if ( c == -1 ) // EOF throw new EOFException("EOF within entity"); if ( c == '&' ) // The last '&' was unterminated. Append { // what we've got and start over. inBuf.append(entityChars, 0, i); i = 1; } else if ( c == '<' ) // The last '&' was unterminated { // and the test is done. Append inBuf.append(entityChars, 0, i); // what we've return( new String(inBuf) ); // got & leave. } else if ( i >= MAX_ENTITY_LENGTH ) // Something screwy. { // Append what we've inBuf.append(entityChars, 0, i); // got and get inBuf.append((char) c); // back to business. i = 0; break; } else entityChars[i++] = (char) c; } if ( i == 0 ) // Abnormal end while collecting entity. Go break; // back to running main loop. String entityStr = new String(entityChars, 0, i); if ( (mapping = findEntityMapping(entityStr, 0)) == null ) { // The ampersand, entity string and semicolon are already inBuf.append(entityChars, 0, i); // in entityChars. } else if ( isDiacritic(mapping) ) // Reverse order between { // XML & ANSEL. char mod = inBuf.charAt(inBuf.length()-1); inBuf.setLength(inBuf.length()-1); inBuf.append((String) mapping); inBuf.append( mod ); } else { inBuf.append((String) mapping); } break; case '<': return( new String(inBuf) ); default: inBuf.append((char) c); break; } } } }