package edu.vt.marian.Document; import java.io.*; import java.net.*; import java.util.*; import edu.vt.marian.common.*; /** A USMARC record (see http://www.loc.gov/marc/).
The primary purpose of this root class is for methods for reading and writing MARC records in various formats (USMARC "tape" format, Open Archives Initiative XML MARC trasport format, and Atomic transport format. I've also provided a set of basic methods to get at parts of the record, most created originally by Jianxin Zhao. More sophisticated methods, e.g. for presentation to users or reflective self-description, are added in the child class MarcDocument.
@author Robert France */ public class MarcRecord { protected char status; // From MARC Leader position 5: see MARC protected char type; // (Leader 6) documentation for meaning protected char level; // (Leader 7) and acceptable values. protected char controlType; // Leader 8: '#'==normal, 'a'==archival. protected char charEncoding; // Leader 9: should be '#' or else we've got Unicode. protected char encLvl; // Leader 17: "fullness of the bibliographic info": // '#'==full, else lower numbers are better. VT-Lib // flags local theses (including ETDs) with a 'K' here. protected char catForm; // Leader 18: descriptive cataloging convention used. protected char linkedRecRqrd; // Leader 19: Related record required for 76X-78X flds? /** The fixed fields of this object as a Vector of Strings. */ protected Vector fixFields; /** The variable fields of this object as a Vector of MarcVarFields. */ protected Vector varFields; /** The field separator of USMARC record in tape format. */ protected final static char field_separator = '\036'; /** The record end symbol of a USMARC record in tape format. */ protected final static char record_end = '\035'; /** Just used for debugging. */ protected Debug debug; /** Device for mapping ANSEL (and some ASCII) characters to XML entities. */ protected EntityMap xmlMap; /** Has this been properly instantiated (e.g., from a valid tape format or XML string)? */ protected boolean isInstantiated; /** Set instance variables to default values.
NOTE: Called in constructors and in setFrom*(). Added to (not overridden) in MarcDocument class. */ protected void init() { status = ' '; // Blank character: standard USMARC encoding for type = ' '; // "not defined". level = ' '; controlType = ' '; charEncoding = ' '; encLvl = ' '; catForm = ' '; linkedRecRqrd = ' '; fixFields = new Vector(); varFields = new Vector(); isInstantiated = false; } /** Create the correct sort of field object.
NOTE: This may not seem very sensible at this level, but in
the MarcDocument subclass, where variable fields with different
semantics get created as different subclasses of MarcVarField,
it will become indispensable.
DEVEL: This is a method created to be replaced by a private
method in the subclass. Should it be protected or private?
*/
protected MarcVarField newVarField(int ID)
{
    // Built with this record's entity map and debug sink, so the new field
    // shares both with its parent record.
    MarcVarField freshField = new MarcVarField(ID, xmlMap, debug);
    return freshField;
}
/**
Create a null (not yet instantiated) MarcRecord with a default EntityMap
built from the debug object alone.  According to the original author this
is a minimal map (one with only & and < in it).
@param dbg -- used for debugging; also handed to the new EntityMap
*/
public MarcRecord(Debug dbg)
{
    this.debug = dbg;
    this.xmlMap = new EntityMap(dbg);
    // Leader flags and field vectors start from their blank defaults.
    init();
}
/**
Create a null (not yet instantiated) MarcRecord that uses a
caller-supplied EntityMap.
@param xMap -- an EntityMap used when reading or writing the record
in XML.
@param dbg -- used for debugging
*/
public MarcRecord(EntityMap xMap, Debug dbg)
{
    this.debug = dbg;
    this.xmlMap = xMap;
    // Leader flags and field vectors start from their blank defaults.
    init();
}
/**
Is this object a valid MARC record?
@return true if a setFrom*() call has completed successfully since the
last init(); false otherwise.
**DEVEL: At this point this method is incomplete and only
checks if something has been read in.  More checking would be
reasonable when we figure out the right level ...
*/
public boolean isValid()
{
    return isInstantiated;
}
/**
Count the fixed fields held by this record.
@return the number of fixed fields, or 0 if the record has not been
instantiated.
*/
public int getNumberFixFields()
{
    return isInstantiated ? fixFields.size() : 0;
}
/**
Count the variable fields held by this record.
@return the number of variable fields, or 0 if the record has not been
instantiated.
*/
public int getNumberVarFields()
{
    return isInstantiated ? varFields.size() : 0;
}
/**
Look up the fixed field of this record that carries the given id.
@param id -- tag of the wanted fixed field; must lie in 1..9
@return a MarcFixField -- the first fixed field, in stored order, whose
id matches
null -- the record is not instantiated, the id is out of range
(a trace is dumped in that case), or no fixed field has that id
*/
public MarcFixField getFixFieldById(int id)
{
    if ( ! isInstantiated )
        return null;

    boolean idInRange = (id >= 1) && (id <= 9);
    if ( ! idInRange )
    {
        debug.dumpTrace("getFixFieldById(): id is not valid");
        return null;
    }

    // Walk the fixed fields in stored order and stop at the first match.
    for (Enumeration fields = fixFields.elements(); fields.hasMoreElements(); )
    {
        MarcFixField candidate = (MarcFixField) fields.nextElement();
        if (candidate.getID() == id)
            return candidate;
    }

    // Nothing carries that id.
    return null;
}
/**
Collect every variable field of this record that carries the given id.
@param id -- tag of the wanted variable fields; must lie in 10..999
@return a new Vector of MarcVarFields, in stored order, whose id matches.
null -- the record is not instantiated, or the id is out of
range (a trace is dumped in that case)
the empty vector -- everything was fine, but there are
no fields with that id in this record.
NOTE: Since a MARC record may have several variable fields with
the same ID, one ID may return zero, one, or several fields.
*/
public Vector getVarFieldsById(int id)
{
    if ( ! isInstantiated )
        return null;

    boolean idInRange = (id >= 10) && (id <= 999);
    if ( ! idInRange )
    {
        debug.dumpTrace("getVarFieldsById(): id is not valid");
        return null;
    }

    // Gather matches in the order the fields are stored.
    Vector matches = new Vector();
    for (Enumeration fields = varFields.elements(); fields.hasMoreElements(); )
    {
        MarcVarField candidate = (MarcVarField) fields.nextElement();
        if (candidate.getID() == id)
            matches.addElement(candidate);
    }
    return matches;
}
/**
Return the fixed field stored at a given position in this record.
@param index -- zero-based position within the fixed fields
@return the fixed field at that position, or
null if the record is not instantiated or the index is out of
range (a trace is dumped in the latter case).
*/
public MarcFixField getFixFieldByIndex(int index)
{
    if ( ! isInstantiated )
        return null;

    boolean indexInRange = (index >= 0) && (index < fixFields.size());
    if (indexInRange)
        return (MarcFixField) fixFields.elementAt(index);

    // Out-of-range index: report it and give the caller nothing.
    debug.dumpTrace("getFixFieldByIndex(): invalid index");
    return null;
}
/**
Return the variable field stored at a given position in this record.
@param index -- zero-based position within the variable fields
@return the variable field at that position, or
null if the record is not instantiated or the index is out of
range (a trace is dumped in the latter case).
*/
public MarcVarField getVarFieldByIndex(int index)
{
    if ( ! isInstantiated )
        return null;

    boolean indexInRange = (index >= 0) && (index < varFields.size());
    if (indexInRange)
        return (MarcVarField) varFields.elementAt(index);

    // Out-of-range index: report it and give the caller nothing.
    debug.dumpTrace("getVarFieldByIndex(): invalid index");
    return null;
}
/**
Return the record status flag (MARC Leader position 5).
@return the status character; a blank if it has not been set since the
last init().
*/
public char getStatus()
{
    return this.status;
}
/**
Return the record type flag (MARC Leader position 6).
@return the type character; a blank if it has not been set since the
last init().
*/
public char getType()
{
    return this.type;
}
/**
Return the level flag (MARC Leader position 7).
@return the level character; a blank if it has not been set since the
last init().
*/
public char getBiblioLevel()
{
    return this.level;
}
/**
Return the control type flag (MARC Leader position 8).
@return the control type character; a blank if it has not been set since
the last init().
*/
public char getControlType()
{
    return this.controlType;
}
/**
Return the character encoding flag (MARC Leader position 9).
@return the character encoding character; a blank if it has not been set
since the last init().
*/
public char getCharEncoding()
{
    return this.charEncoding;
}
/**
Return the encoding level flag (MARC Leader position 17).
@return the encoding level character; a blank if it has not been set
since the last init().
*/
public char getEncodingLevel()
{
    return this.encLvl;
}
/**
Return the descriptive cataloging form flag (MARC Leader position 18).
@return the cataloging form character; a blank if it has not been set
since the last init().
*/
public char getCatalogingForm()
{
    return this.catForm;
}
/**
Return the linked-record-required flag (MARC Leader position 19).
@return the flag character; a blank if it has not been set since the
last init().
*/
public char getLinkedRecordRq()
{
    return this.linkedRecRqrd;
}
/**
Instantiate this MarcRecord object from a raw (tape format) record.
Wraps the string in a reader and delegates to
setFromTapeFormat(BufferedReader).
@param inStr --- a string containing a USMARC record in
tape format.
@return OK -- everything jake.
BAD_PARAMS -- inStr is null.
IO_ERROR -- the string ended before a complete record was read, or
another IOException was raised while reading.
PARSE_ERROR (or whatever else the reader version returns) -- problems.
*/
public int setFromTapeFormat(String inStr)
{
    // A null string would make StringReader throw NullPointerException;
    // report it the same way the reader version reports a null stream.
    if (inStr == null)
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): null input string.");
        return( ReturnCodes.BAD_PARAMS );
    }

    int Err;
    try
    {
        StringReader sr = new StringReader( inStr );
        BufferedReader bsr = new BufferedReader( sr );
        Err = setFromTapeFormat(bsr);
    } catch (IOException e)
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): exception " + e.toString() +
                        " raised: bailing out.");
        return( ReturnCodes.IO_ERROR );
    }
    return( Err );
}
/**
Instantiate this MarcRecord object from one raw (tape format) record
read from a stream.
The method reads the five-character record length, then the remainder of
the record, and parses the leader flags, the directory, and each field
the directory names.  Any previously held values are discarded first.
If parsing fails part-way, the record stays marked as not instantiated
even though some leader flags or fields may already have been stored.
@param in -- reader positioned at the first character of a record.
@return OK -- everything jake.
BAD_PARAMS -- in is null.
PARSE_ERROR -- the record length, field base, directory, or a field
is malformed.
Any non-OK code returned by a variable field's own setFromTapeFormat()
is passed through unchanged.
@throws IOException -- an EOFException if the input ends before a whole
record has been read (silently, with no trace, when it ends exactly at a
record boundary), or whatever the reader itself raises.
*/
public int setFromTapeFormat(BufferedReader in) throws IOException
{
    if (in == null)
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): null input stream.");
        return( ReturnCodes.BAD_PARAMS );
    }
    init();     // Blank out any values previously set.

    // The leader occupies positions 0-23; the code below indexes up to
    // position 19 and slices positions 12-16 without further checks.
    final int leaderLen = 24;

    // parse record length
    int recLen = 0;
    int charsToRead;
    int charsRead;
    int k;
    try
    {
        char [] lenField = new char [5];
        charsToRead = 5;
        charsRead = 0;
        while ( charsRead != 5)
        {
            k = in.read(lenField, charsRead, charsToRead);
            if (k == -1)
            {
                // EOF with nothing read is a clean end of input: no trace.
                if (charsRead != 0)
                    debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot read record length (only " +
                                    charsRead + " characters in input).");
                throw new EOFException("in length field.");
            }
            charsRead += k;
            charsToRead -= k;
        }
        String lenStr = new String(lenField);
        recLen = Integer.parseInt(lenStr);
    }
    catch (NumberFormatException e)
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): error parsing record length.");
        return( ReturnCodes.PARSE_ERROR );
    }

    // A length shorter than the leader (or negative, since parseInt accepts
    // a sign) would otherwise surface as NegativeArraySizeException or
    // ArrayIndexOutOfBoundsException below rather than as a parse error.
    if (recLen < leaderLen)
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): record length " + recLen +
                        " is too short to hold a MARC leader.");
        return( ReturnCodes.PARSE_ERROR );
    }

    // Get the rest of the record: NOTE: to make the offsets easier to
    // understand from the MARC standard, leave space for the record
    // length field.  But don't bother to actually put the recLen in it,
    // since we'll never look at it.
    char [] rawMarc = new char [recLen];
    charsToRead = recLen-5;
    charsRead = 5;
    while ( charsRead < recLen)
    {
        k = in.read(rawMarc, charsRead, charsToRead);
        if (k == -1)
        {
            // charsRead already counts the five length characters.
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot read " + recLen +
                            " characters for full MARC record (only " + charsRead +
                            " in input).");
            throw new EOFException("in main record.");
        }
        charsRead += k;
        charsToRead -= k;
    }
    String rawMarcRecord = new String(rawMarc);

    // parse flags from the header
    status = rawMarc[5];
    type = rawMarc[6];
    level = rawMarc[7];
    controlType = rawMarc[8];
    charEncoding = rawMarc[9];
    encLvl = rawMarc[17];
    catForm = rawMarc[18];
    linkedRecRqrd = rawMarc[19];

    // parse field base
    int fieldBase = 0;
    try
    {
        fieldBase = Integer.parseInt(rawMarcRecord.substring(12, 17));
    }
    catch (NumberFormatException e)
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): error parsing field base in '" +
                        rawMarcRecord + "'.");
        return( ReturnCodes.PARSE_ERROR );
    }
    if ( (fieldBase > recLen) || (fieldBase < 25))
    {
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): field base is not correct in '" +
                        rawMarcRecord + "'.");
        return( ReturnCodes.PARSE_ERROR );
    }

    // directory contains the id, offset and length of all the fields;
    // the character just before the field base is left out of it.
    String directory = rawMarcRecord.substring(24, fieldBase - 1);
    if ((directory.length() == 0) || ((directory.length() % 12) != 0))
    {
        // invalid directory length
        debug.dumpTrace("MarcRecord.setFromTapeFormat(): directory length is not valid in '" +
                        rawMarcRecord + "'.");
        return( ReturnCodes.PARSE_ERROR );
    }

    // directory length is valid: parse fixed and variable fields.
    // Each 12-character entry is: 3-digit id, 4-digit length, 5-digit offset.
    int numFields = directory.length() / 12;
    int id, length, offset;
    int Err;
    for (int i = 0; i < numFields; i++)
    {
        // parse field id
        try
        {
            id = Integer.parseInt(directory.substring(i*12, i*12 + 3));
        }
        catch (NumberFormatException e)
        {
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot parse field #" + i + " in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }
        if ( (id <= 0) || (id > 999) )
        {
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): field #" + i + " has illegal ID " + id + " in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }

        // parse field length
        try
        {
            length = Integer.parseInt(directory.substring(i*12 + 3,
                                                          i*12 + 7));
        }
        catch (NumberFormatException e)
        {
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot parse length for " + id + " field (#" + i + ") in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }
        if (length <= 0)
        {
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): " + id + " field (#" + i + "): length " + length + " is not correct in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }

        // parse offset
        try
        {
            offset = Integer.parseInt(directory.substring(i*12 + 7,
                                                          i*12 + 12));
        }
        catch (NumberFormatException e)
        {
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): cannot parse offset for " + id + " field (#" + i + ") in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }
        if (((offset + length + fieldBase) > recLen) || (offset < 0))
        {
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): " + id + " field (#" + i + "): offset is not correct in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }

        // create a field
        if ( rawMarc[fieldBase + offset + length - 1] != field_separator )
        {
            // the last char of the string should be the field separator
            debug.dumpTrace("MarcRecord.setFromTapeFormat(): " + id + " field (#" + i +
                            "): length (" + length +
                            ") not correct: last character in field is '" +
                            (char) rawMarc[fieldBase + offset + length - 1] + "' in '" +
                            rawMarcRecord + "'.");
            return( ReturnCodes.PARSE_ERROR );
        }

        // The field ends with the field separator: leave it out of the
        // data handed to the corresponding field object.
        String data = rawMarcRecord.substring(offset + fieldBase,
                                              offset + length + fieldBase - 1);

        if (id < 10)    // this is a fixed field
        {
            MarcFixField mff = new MarcFixField(id, data, xmlMap, debug);
            fixFields.addElement(mff);
        }
        else    // this is a variable field
        {
            MarcVarField mvf = newVarField(id);
            // Indicators set in setFromTapeFormat().
            if ( (Err = mvf.setFromTapeFormat(data)) != ReturnCodes.OK )
                return( Err );
            //**DEVEL: Or should we just ignore that VarField and keep
            //**  rolling?  Sometimes a VarField can be invalid but the
            //**  rest of the record OK.  Also, should we dumpTrace() more
            //**  context here while we've got it?  --RKF 23Feb01
            varFields.addElement(mvf);
        }
    }   // -- end for (int i = 0; i < numFields; i++)

    // all the fields have been parsed
    isInstantiated = true;
    return( ReturnCodes.OK );
}
/**
Instantiate this MarcRecord object from an XML stream.
@param in -- a BufferedReader (String or InputStream, presumably) from which
to read the next record in OAI MARC XML format.
@return OK -- everything jake.
IO_ERROR or PARSE_ERROR -- problems.
NOTE: This routine checks the OAI XML for correctness as it is
read. In particular, it checks for matched tags and required
attributes, and for the presence of at least one fixed field.
It does not check the incoming MARC record for correctness
as a MARC record.
*/
public int setFromXml(BufferedReader in) throws IOException
{
if (in == null)
{
debug.dumpTrace("setFromXml(): null input.");
return( ReturnCodes.IO_ERROR );
}
init(); // Blank out any values previously set.
Vector bindings;
boolean seenOpen = false;
boolean seenFixedField = false;
boolean seenTypeAttr = false;
boolean seenLevelAttr = false;
while ( true )
{
if ( (bindings = XmlDoc.acceptTag(in)) == null )
{
debug.dumpTrace("MarcRecord.setFromXml(): cannot find start tag.");
return( ReturnCodes.PARSE_ERROR );
}
String tagName = (String) bindings.elementAt(0);
// System.err.println("Got <" + tagName + "> tag.");
if ( tagName.equals("/oai_marc") )
{
if ( seenOpen && seenFixedField && seenTypeAttr && seenLevelAttr )
{
isInstantiated = true;
return( ReturnCodes.OK );
}
else
{
debug.dumpTrace("MarcRecord.setFromXml(): missing pieces.");
return( ReturnCodes.PARSE_ERROR );
}
}
else if ( tagName.equals("oai_marc") )
{
seenOpen = true;
int i;
for (i=1; i
NOTE: At this point, this function leaves nonprintable characters
unchanged. In the primary application, this is silly and
probably wrong, but it is morally pure.
*/
public static void anselToCanonicalAscii(String data, BufferedWriter out)
throws IOException
{
int i;
for (i=0; i
IO_ERROR or PARSE_ERROR -- problems.
*/
public int presentAsTapeFormat(BufferedWriter out) throws IOException
{
int i=0;
int Err;
// First, prepare all the fields as Strings in tape format..
int numFields = fixFields.size() + varFields.size();
int fieldLength = 0;
String [] fieldStrs = new String [numFields];
int [] fieldIDs = new int [numFields];
Enumeration fixfld = fixFields.elements();
try { while ( true )
{
MarcFixField ff = (MarcFixField) fixfld.nextElement();
fieldIDs[i] = ff.getID();
fieldStrs[i] = ff.getData();
fieldLength += fieldStrs[i].length() + 1; // Allow for field_separatpr.
i++;
} } catch( NoSuchElementException e) {};
Enumeration varfld = varFields.elements();
try { while ( true )
{
MarcVarField vf = (MarcVarField) varfld.nextElement();
StringWriter sw = new StringWriter();
BufferedWriter bsw = new BufferedWriter( sw );
if ( (Err = vf.presentAsTapeFormat(bsw)) != ReturnCodes.OK )
return( Err );
fieldIDs[i] = vf.getID();
bsw.flush();
fieldStrs[i] = sw.toString();
fieldLength += fieldStrs[i].length() + 1;
i++;
} } catch( NoSuchElementException e) {};
if ( (Err = presentLeaderInTapeFormat(out, numFields, fieldLength)) != ReturnCodes.OK )
return( Err );
// Build and write directory.
char [] tagBuf = new char[3];
char [] lenBuf = new char[4];
char [] offBuf = new char[5];
int fieldLen;
int offset = 0;
for (i=0; i