package edu.vt.marian.Document; import java.io.*; import java.net.*; import java.util.*; import edu.vt.marian.common.*; /** A USMARC record (see http://www.loc.gov/marc/).

The primary purpose of this root class is to provide methods for reading and writing MARC records in various formats (USMARC "tape" format, Open Archives Initiative XML MARC transport format, and Atomic transport format). I've also provided a set of basic methods to get at parts of the record, most created originally by Jianxin Zhao. More sophisticated methods, e.g. for presentation to users or reflective self-description, are added in the child class MarcDocument.

@author Robert France */ public class MarcRecord { protected char status; // From MARC Leader position 5: see MARC protected char type; // (Leader 6) documentation for meaning protected char level; // (Leader 7) and acceptable values. protected char controlType; // Leader 8: '#'==normal, 'a'==archival. protected char charEncoding; // Leader 9: should be '#' or else we've got Unicode. protected char encLvl; // Leader 17: "fullness of the bibliographic info": // '#'==full, else lower numbers are better. VT-Lib // flags local theses (including ETDs) with a 'K' here. protected char catForm; // Leader 18: descriptive cataloging convention used. protected char linkedRecRqrd; // Leader 19: Related record required for 76X-78X flds? /** The fixed fields of this object as a Vector of Strings. */ protected Vector fixFields; /** The variable fields of this object as a Vector of MarcVarFields. */ protected Vector varFields; /** The field separator of USMARC record in tape format. */ protected final static char field_separator = '\036'; /** The record end symbol of a USMARC record in tape format. */ protected final static char record_end = '\035'; /** Just used for debugging. */ protected Debug debug; /** Device for mapping ANSEL (and some ASCII) characters to XML entities. */ protected EntityMap xmlMap; /** Has this been properly instantiated (e.g., from a valid tape format or XML string)? */ protected boolean isInstantiated; /** Set instance variables to default values.

NOTE: Called in constructors and in setFrom*(). Added to (not overridden) in MarcDocument class. */ protected void init() { status = ' '; // Blank character: standard USMARC encoding for type = ' '; // "not defined". level = ' '; controlType = ' '; charEncoding = ' '; encLvl = ' '; catForm = ' '; linkedRecRqrd = ' '; fixFields = new Vector(); varFields = new Vector(); isInstantiated = false; } /** Create the correct sort of field object.

NOTE: This may not seem very sensible at this level, but in the MarcDocument subclass, where variable fields with different semantics get created as different subclasses of MarcVarField, it will become indispensable.
DEVEL: This is a method created to be replaced by a private method in the subclass. Should it be protected or private? */ protected MarcVarField newVarField(int ID) { return( new MarcVarField(ID, xmlMap, debug) ); } /** Create a null MarcRecord, with a minimal xmlMap (one with only & and < in it). @param debug -- used for debugging */ public MarcRecord(Debug dbg) { debug = dbg; xmlMap = new EntityMap(dbg); init(); } /** Create a MarcRecord object using an explicit EntityMap. @param xMap --- an EntityMap used when reading or writing the record in XML. @param debug -- used for debugging */ public MarcRecord(EntityMap xMap, Debug dbg) { debug = dbg; xmlMap = xMap; init(); } /** Is this object a valid MARC record? @return true / false

**DEVEL: At this point this method is incomplete and only checks if something has been read in. More checking would be reasonable when we figure out the right level ... */
    public boolean isValid() {
        return isInstantiated;
    }

    /**
     * Return the number of fixed fields contained in this object.
     *
     * @return the number of fixed fields, or 0 if this record has not been
     *         instantiated.
     */
    public int getNumberFixFields() {
        return isInstantiated ? fixFields.size() : 0;
    }

    /**
     * Return the number of variable fields contained in this object.
     *
     * @return the number of variable fields, or 0 if this record has not been
     *         instantiated.
     */
    public int getNumberVarFields() {
        return isInstantiated ? varFields.size() : 0;
    }

    /** Return the fixed field of this object with the specified id. @param id -- the fixed field with this id will be returned @return a MarcFixField -- the fixed field in this object with the specified id
null -- the id is invalid or there is no such fixed field in this object with the specified id */ public MarcFixField getFixFieldById(int id) { if ( ! isInstantiated ) return( null ); if ((id <= 0) || (id > 9)) { debug.dumpTrace("getFixFieldById(): id is not valid"); return null; } // id is valid MarcFixField mff = null; for (int i = 0; i < fixFields.size(); i++) { mff = (MarcFixField) fixFields.elementAt(i); if (mff.getID() == id) { // a match is found, return the field return mff; } } // the specified field is not found return null; } /** Return all the variable fields of this object with the specified id. @param id -- all the variable fields with this id will be returned @return a vector containing all the variable fields found with the specified id.
null -- the id is not valid
the empty vector -- everything was fine, but there are no fields with that id in this record. NOTE: Since a MARC record may have several variable variable fields with the same ID, one ID may return zero, one, or several fields. */ public Vector getVarFieldsById(int id) { if ( ! isInstantiated ) return( null ); if ((id < 10) || (id > 999)) { debug.dumpTrace("getVarFieldsById(): id is not valid"); return null; } // id is valid Vector returnFields = new Vector(); MarcVarField mvf = null; for (int i = 0; i < varFields.size(); i++) { mvf = (MarcVarField) varFields.elementAt(i); if (mvf.getID() == id) { // a match is found, add the field to the return vector returnFields.addElement(mvf); } } // return the final result return returnFields; } /** Return the indexth fixed field of this object. @param index -- this will be used to search the fixed fields @return the fixed field at the specified index or
null if the index is not valid. */ public MarcFixField getFixFieldByIndex(int index) { if ( ! isInstantiated ) return( null ); if ((index < 0) || (index >= fixFields.size())) { // the index is not valid debug.dumpTrace("getFixFieldByIndex(): invalid index"); return null; } // index is valid, return the corresponding subfield return (MarcFixField) fixFields.elementAt(index); } /** Return the indexth variable field of this object. @param index -- this will be used to search the variable fields @return the variable field at the specified index or
null if the index is not valid. */
    public MarcVarField getVarFieldByIndex(int index) {
        if (isInstantiated) {
            boolean inBounds = (index >= 0) && (index < varFields.size());
            if (inBounds) {
                return (MarcVarField) varFields.elementAt(index);
            }
            // Instantiated, but the caller asked for a slot that does not exist.
            debug.dumpTrace("getVarFieldByIndex(): invalid index");
        }
        // Either nothing has been read in yet or the index was out of range.
        return null;
    }

    // Accessors for the flags taken from the MARC leader. Each returns the
    // stored character as-is; init() sets every flag to a blank.

    /** @return the flag from MARC Leader position 5. */
    public char getStatus() {
        return status;
    }

    /** @return the flag from MARC Leader position 6. */
    public char getType() {
        return type;
    }

    /** @return the flag from MARC Leader position 7. */
    public char getBiblioLevel() {
        return level;
    }

    /** @return the flag from MARC Leader position 8. */
    public char getControlType() {
        return controlType;
    }

    /** @return the flag from MARC Leader position 9. */
    public char getCharEncoding() {
        return charEncoding;
    }

    /** @return the flag from MARC Leader position 17. */
    public char getEncodingLevel() {
        return encLvl;
    }

    /** @return the flag from MARC Leader position 18. */
    public char getCatalogingForm() {
        return catForm;
    }

    /** @return the flag from MARC Leader position 19. */
    public char getLinkedRecordRq() {
        return linkedRecRqrd;
    }

    /** Instantiate this MarcRecord object from a raw (tape format) record. @param inStr --- a string containing a USMARC record in tape format. @return OK -- everything jake.
IO_ERROR or PARSE_ERROR -- problems. */ public int setFromTapeFormat(String inStr) { int Err; try { StringReader sr = new StringReader( inStr ); BufferedReader bsr = new BufferedReader( sr ); Err = setFromTapeFormat(bsr); } catch (IOException e) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): exception " + e.toString() + " raised: bailing out."); return( ReturnCodes.IO_ERROR ); } return( Err ); } public int setFromTapeFormat(BufferedReader in) throws IOException { if (in == null) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): null input stream."); return( ReturnCodes.BAD_PARAMS ); } init(); // Blank out any values previously set. // parse record length int recLen = 0; int charsToRead; int charsRead; int k; try { char [] lenField = new char [5]; charsToRead = 5; charsRead = 0; while ( charsRead != 5) { k = in.read(lenField, charsRead, charsToRead); if (k == -1) { if (charsRead != 0) debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot read record length (only " + charsRead + " characters in input)."); throw new EOFException("in length field."); } charsRead += k; charsToRead -= k; } String lenStr = new String(lenField); recLen = Integer.parseInt(lenStr); } catch (NumberFormatException e) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): error parsing record length."); return( ReturnCodes.PARSE_ERROR ); } // Get the rest of the record: NOTE: to make the offsets easier to // understand from the MARC standard, leave space for the record // length field. But don't bother to actually put the recLen in it, // since we'll never look at it. 
char [] rawMarc = new char [recLen]; charsToRead = recLen-5; charsRead = 5; while ( charsRead < recLen) { k = in.read(rawMarc, charsRead, charsToRead); if (k == -1) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot read " + recLen + " characters for full MARC record (only " + (charsRead+5) + " in input)."); throw new EOFException("in main record."); } charsRead += k; charsToRead -= k; } String rawMarcRecord = new String(rawMarc); // parse flags from the header status = rawMarc[5]; type = rawMarc[6]; level = rawMarc[7]; controlType = rawMarc[8]; charEncoding = rawMarc[9]; encLvl = rawMarc[17]; catForm = rawMarc[18]; linkedRecRqrd = rawMarc[19]; // parse field base int fieldBase = 0; try { fieldBase = Integer.parseInt(rawMarcRecord.substring(12, 17)); } catch (NumberFormatException e) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): error parsing field base in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } if ( (fieldBase > recLen) || (fieldBase < 25)) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): field base is not correct in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } // directory contains the id, offset and length of all the fields String directory = rawMarcRecord.substring(24, fieldBase - 1); if ((directory.length() == 0) || ((directory.length() % 12) != 0)) { // invalid directory length debug.dumpTrace("MarcRecord.setFromTapeFormat(): directory length is not valid in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } // directory length is valid: parse fixed and variable fields. 
int numFields = directory.length() / 12; int id, length, offset; int Err; for (int i = 0; i < numFields; i++) { // parse field id try { id = Integer.parseInt(directory.substring(i*12, i*12 + 3)); } catch (NumberFormatException e) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot parse field #" + i + " in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } if ( (id <= 0) || (id > 999) ) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): field #" + i + " has illegal ID " + id + " in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } // parse field length try { length = Integer.parseInt(directory.substring(i*12 + 3, i*12 + 7)); } catch (NumberFormatException e) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot parse length for " + id + " field (#" + i + ") in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } if (length <= 0) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): " + id + " field (#" + i + "): length " + length + " is not correct in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } // parse offset try { offset = Integer.parseInt(directory.substring(i*12 + 7, i*12 + 12)); } catch (NumberFormatException e) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot parse offset for " + id + " field (#" + i + ") in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } if (((offset + length + fieldBase) > recLen) || (offset < 0)) { debug.dumpTrace("MarcRecord.setFromTapeFormat(): " + id + " field (#" + i + "): offset is not correct in '" + rawMarcRecord + "'."); return( ReturnCodes.PARSE_ERROR ); } // create a field if ( rawMarc[fieldBase + offset + length - 1] != field_separator ) { // the last char of the string should be the field separator debug.dumpTrace("MarcRecord.setFromTapeFormat(): " + id + " field (#" + i + "): length (" + length + ") not correct: last character in field is '" + (char) rawMarc[fieldBase + offset + length - 1] + "' in '" + rawMarcRecord + "'."); return( 
ReturnCodes.PARSE_ERROR ); } String data = rawMarcRecord.substring(offset + fieldBase, offset + length + fieldBase - 1); // ends with the field separator, remove it before creating // corresponding field if (id < 10) // this is a fixed field { MarcFixField mff = new MarcFixField(id, data, xmlMap, debug); fixFields.addElement(mff); } else // this is a variable field { MarcVarField mvf = newVarField(id); // Indicators set in setFromtapeFormat(). if ( (Err = mvf.setFromTapeFormat(data)) != ReturnCodes.OK ) return( Err ); //**DEVEL: Or should we just ignore that VarField and keep //** rolling? Sometimes a VarField can be invalid but the //** rest of the record OK. Also, should we dumpTrace() more //** context here while we've got it? --RKF 23Feb01 varFields.addElement(mvf); } } // -- end for (int i = 0; i < numFields; i++) // all the fields have been parsed isInstantiated = true; return( ReturnCodes.OK ); } /** Instantiate this MarcRecord object from an XML stream. @param in -- a BufferedReader (String or InputStream, presumably) from which to read the next record in OAI MARC XML format. @return OK -- everything jake.
IO_ERROR or PARSE_ERROR -- problems.

NOTE: This routine checks the OIA XML for correctness as it is read. In particular, it checks for matched tags and required attributes, and for the presence of at least one fixed field. It does not check the incoming MARC record for correctness as a MARC record. */ public int setFromXml(BufferedReader in) throws IOException { if (in == null) { debug.dumpTrace("setFromXml(): null input."); return( ReturnCodes.IO_ERROR ); } init(); // Blank out any values previously set. Vector bindings; boolean seenOpen = false; boolean seenFixedField = false; boolean seenTypeAttr = false; boolean seenLevelAttr = false; while ( true ) { if ( (bindings = XmlDoc.acceptTag(in)) == null ) { debug.dumpTrace("MarcRecord.setFromXml(): cannot find start tag."); return( ReturnCodes.PARSE_ERROR ); } String tagName = (String) bindings.elementAt(0); // System.err.println("Got <" + tagName + "> tag."); if ( tagName.equals("/oai_marc") ) { if ( seenOpen && seenFixedField && seenTypeAttr && seenLevelAttr ) { isInstantiated = true; return( ReturnCodes.OK ); } else { debug.dumpTrace("MarcRecord.setFromXml(): missing pieces."); return( ReturnCodes.PARSE_ERROR ); } } else if ( tagName.equals("oai_marc") ) { seenOpen = true; int i; for (i=1; i IO_ERROR or PARSE_ERROR -- problems. */ public int presentAsXml(BufferedWriter out) throws IOException { // Write header information. 
out.write(""); out.newLine(); Enumeration fixfld = fixFields.elements(); int Err; try { while ( true ) { MarcFixField ff = (MarcFixField) fixfld.nextElement(); if ( (Err = ff.presentAsXml(out)) != ReturnCodes.OK ) return( Err ); out.newLine(); } } catch( NoSuchElementException e) {}; Enumeration varfld = varFields.elements(); try { while ( true ) { MarcVarField vf = (MarcVarField) varfld.nextElement(); if ( (Err = vf.presentAsXml(out)) != ReturnCodes.OK ) return( Err ); out.newLine(); } } catch( NoSuchElementException e) {}; out.write(""); out.newLine(); return( ReturnCodes.OK ); } /** Send this MarcRecord to an output stream in tape format. @param out -- a BufferedWriter (String or InputStream, presumably) to which to write this. @return OK -- everything jake.
IO_ERROR or PARSE_ERROR -- problems. */ public int presentAsTapeFormat(BufferedWriter out) throws IOException { int i=0; int Err; // First, prepare all the fields as Strings in tape format.. int numFields = fixFields.size() + varFields.size(); int fieldLength = 0; String [] fieldStrs = new String [numFields]; int [] fieldIDs = new int [numFields]; Enumeration fixfld = fixFields.elements(); try { while ( true ) { MarcFixField ff = (MarcFixField) fixfld.nextElement(); fieldIDs[i] = ff.getID(); fieldStrs[i] = ff.getData(); fieldLength += fieldStrs[i].length() + 1; // Allow for field_separatpr. i++; } } catch( NoSuchElementException e) {}; Enumeration varfld = varFields.elements(); try { while ( true ) { MarcVarField vf = (MarcVarField) varfld.nextElement(); StringWriter sw = new StringWriter(); BufferedWriter bsw = new BufferedWriter( sw ); if ( (Err = vf.presentAsTapeFormat(bsw)) != ReturnCodes.OK ) return( Err ); fieldIDs[i] = vf.getID(); bsw.flush(); fieldStrs[i] = sw.toString(); fieldLength += fieldStrs[i].length() + 1; i++; } } catch( NoSuchElementException e) {}; if ( (Err = presentLeaderInTapeFormat(out, numFields, fieldLength)) != ReturnCodes.OK ) return( Err ); // Build and write directory. char [] tagBuf = new char[3]; char [] lenBuf = new char[4]; char [] offBuf = new char[5]; int fieldLen; int offset = 0; for (i=0; i0) && (i>=0); i--, num /= 10) charBuf[i] = (char) (num%10 + (int) '0'); if (num > 0) // Ran out of space. return( ReturnCodes.NO_CAN_DO ); while (i >= 0) charBuf[i--] = '0'; return( ReturnCodes.OK ); } /** Convert data into the closest possible version that uses only ASCII characters.

NOTE: At this point, this function leaves nonprintable characters unchanged. In the primary application, this is silly and probably wrong, but it is morally pure. */ public static void anselToCanonicalAscii(String data, BufferedWriter out) throws IOException { int i; for (i=0; i