package edu.vt.marian.Document; import java.io.*; import java.net.*; import java.util.*; import edu.vt.marian.common.*; /** SgmlDocument
class description: this class represents an NLM SGML document in the system.
designer(s): Jianxin Zhao (jxzhao@csgrad.cs.vt.edu)
implementer(s): Jianxin Zhao (jxzhao@csgrad.cs.vt.edu), Robert France
finished time:
known bugs:
JDK version: 1.1.5
side effects: */ public class SgmlDocument implements Document { /** this string contains all the information of this document the format of the string is in SGML */ private String sgmlString = null; /** this vector contains formatted data from the sgml string */ private Vector fields = null; /** this flag tells whether or not the sgml string has been extracted to the vector fields */ private boolean extracted = false; /** this string array maintains the mapping between tags and fields along with the seperator used to concat strings whne there are multiple tags with the same name */ private final static String[][] tagToField = { {"ab", "description", ""}, // format -- tag name, field name, seperator {"ac", "acronym", ""}, {"ad", "address", ""}, {"eml", "email", ""}, {"fx", "cross reference", ""}, {"gn", "general notes", ""}, {"ho", "holdings", ""}, {"ic", "contact person", ""}, {"kw", "keyword", ""}, {"lun", "limitations on use", ""}, {"mh", "mesh heading", ""}, {"na", "name", ""}, {"nt", "type of orgnization", ""}, {"pb", "publications", ""}, {"rg", "region", ""}, {"sa", "sponsoring agency", ""}, {"site", "other officies", ""}, {"tel", "telephone number", ""}, }; /** those are the return values of methods of this class */ public final static int OK = 0; public final static int NULL_STREAM = 3; public final static int NULL_DOCUMENT_STRING = 6; public final static int NULL_SGML_STRING = 8; public final static int EXTRACT_ERROR = 9; public final static int INVALID_TAG_NAME = 10; public final static int NULL_FIELD_NAME = 11; /** just used for debugging */ Debug debug; /** create an SgmlDocument object from the specified stream. 
@param br the stream from which to read out this document @param debug used for debugging */ public SgmlDocument(BufferedReader br, Debug debug) { this.debug = debug; if (br == null) { debug.dumpTrace("SgmlDocument.[constructor 1]: br is null"); return; } // br is not null, read out raw string from it int num_lines = 0; try { num_lines = Integer.parseInt(br.readLine()); } catch (Exception e0) { debug.dumpTrace("SgmlDocument.[constructor 1]: error reading number of lines"); } if (num_lines > 0) // Even the null string gets an empty line. { int i; sgmlString = new String(""); try { for (i = 0; i < num_lines; i++) { if (i == 0) { // the beginning sgmlString += br.readLine(); } else { // not the first line, so add a line separator between them sgmlString += System.getProperty("line.separator") + br.readLine(); } } } catch (IOException e1) { debug.dumpTrace("SgmlDocument.[constructor 1]: error reading sgml string"); sgmlString = null; } } } /** create an SgmlDocument object from a document string. @param docString a string encoding this document in SGML @param debug used for debugging */ public SgmlDocument(String documentString, Debug debug) { this.debug = debug; sgmlString = documentString; } /** tell whether the object is valid (not whether it has been extracted yet). */ public boolean isValid() { return( sgmlString != null ); } /** tell whether this object and the parameter object represent the same document.
NOTE: At this point we are using String compare on the raw strings to determine equality. This obviously leaves something to be desired. @param d the document used to compare with this object @return true / false */ public boolean equals(SgmlDocument d) { if (d == null) { debug.dumpTrace("SgmlDocument.equals(): d is null"); return( false ); } // Not a great implementation. What if d differs only by white space? return( sgmlString.equals(d.sgmlString) ); } /** print the contents of this object to the specified stream. @param pw the stream to which to write this object @return OK -- this object has been written to the stream correctly
NULL_STREAM -- the parameter stream is null */ public int toStream(PrintWriter pw) { if (pw == null) { debug.dumpTrace("SgmlDocument.toStream(): parameter stream is null"); return NULL_STREAM; } // used to count lines pw.println(LinedString.count_lines(sgmlString)); pw.println(sgmlString); return OK; } /** return the sgml string of the document this object represents. @return the raw form of this document as a string */ public String getDocumentString() { return sgmlString; } /** set the sgml string of the document this object represents. @param documentString this will become the new raw string for this document object @return OK -- the new raw marc record has been set correctly
NULL_DOCUMENT_STRING -- the parameter is null */ public int setDocumentString(String documentString) { if (documentString == null) { debug.dumpTrace("SgmlDocument.setDocumentString(): parameter documentString is null"); return NULL_DOCUMENT_STRING; } // document string is not null sgmlString = documentString; extracted = false; return OK; } /** extract sgml string into different fields. @return OK -- the characteristics have been extracted successfully
other -- other problems
*/
private int extract()
{
if (sgmlString == null)
{
debug.dumpTrace("SgmlDocument.extract(): sgml string is null");
return NULL_SGML_STRING;
}
// sgml string is not null, extract it to different fields
fields = new Vector();
// do some error checking here
int start_index = sgmlString.indexOf("
other -- problems */ private String filt(String tag_data) { int tag_begin = tag_data.indexOf('<'); int tag_end = tag_data.indexOf('>'); int current_position = 0; StringBuffer sb = new StringBuffer(); while ((tag_begin != -1) && (tag_end != -1)) { if (tag_begin > tag_end) { // this is not a tag, skip it sb.append(tag_data.substring(current_position, tag_begin)); current_position = tag_begin + 1; continue; } // find a tag, remove it sb.append(tag_data.substring(current_position, tag_begin)); current_position = tag_end + 1; tag_begin = tag_data.indexOf('<', tag_end + 1); tag_end = tag_data.indexOf('>', tag_end + 1); } // boundary condition sb.append(tag_data.substring(current_position, tag_data.length())); return sb.toString(); } /** add the tag to the corresponding position of the fields vector @return OK -- the characteristics have been extracted successfully
other -- */ private int add_tag(String tag_name, String tag_data) { String field_name = null; int i, j; for (i = 0; i < tagToField.length; i++) { if (tagToField[i][0].equals(tag_name)) { // we found it, assume there is no duplicats field_name = tagToField[i][1]; break; } } if (field_name == null) { // debug.dumpTrace("SgmlDocument.add_tag(): invalid tag name"); return INVALID_TAG_NAME; } // put it into corresponding position of the fields vector for (j = 0; j < fields.size(); j+= 2) { if (field_name.equals((String) fields.elementAt(j))) { // we find the field, append to it String new_field_data = ((String) fields.elementAt(j + 1)) + tagToField[i][2] + tag_data; fields.setElementAt(new_field_data, j + 1); return OK; } } // this is the first time this field appears, append it to the // end of the vector fields.addElement(field_name); fields.addElement(tag_data); return OK; } /** return the number of fields in this document */ public int getNumberFields() { if (! extracted) { extract(); } return fields.size() / 2; } /** return the name of the specified field. */ public String getFieldNameByIndex(int index) { if (! extracted) { extract(); } if ((index < 0) || (index >= (fields.size() / 2))) { debug.dumpTrace("SgmlDocument.get_field_name_by_index(): index is not inlid"); return null; } // valid index, return the name of the field return (String) fields.elementAt(index * 2); } /** return the data of the specified field. */ public String getFieldDataByIndex(int index) { if (! extracted) { extract(); } if ((index < 0) || (index >= (fields.size() / 2))) { debug.dumpTrace("SgmlDocument.get_field_name_by_index(): index is not inlid"); return null; } // valid index, return the name of the field return (String) fields.elementAt(index * 2 + 1); } /** return the data of this document corresponding to the specified field @return the field data in the form of a String, or
null -- extraction problem */ public String getFieldData(String field_name) { if (field_name == null) { debug.dumpTrace("SgmlDocument.getFieldData(): parameter field_name is null"); return null; } // try to find the data of the field if (! extracted) { extract(); } for (int i = 0; i < fields.size(); i += 2) { if (field_name.equals((String) fields.elementAt(i))) { // we found the field, return its data return (String) fields.elementAt(i + 1); } } // this is no such field in this document return null; } /** tell the separator between different text strings in the specified field. @param fieldName name of field to search @return String, or null if no such field exists. */ public String getFieldSeparator(String fieldName) { if (fieldName == null) { debug.dumpTrace("SgmlDocument.get_field_separator(): parameter field name is null"); return null; } // try to find the separator of this field for (int i = 0; i < tagToField.length; i++) { if (fieldName.equals(tagToField[i][1])) { // we found a match, return its seperator return tagToField[i][2]; } } // there is no such field in the document debug.dumpTrace("SgmlDocument.getFieldSeparator(): invalid fieldName"); return null; } /** An attempt to get around declaring public clone() methods. */ public DigInfObj copy() { return( (DigInfObj) new SgmlDocument(sgmlString, debug) ); } /** return the short description of this document in one line. @param markupType how to mark up the string returned (e.g., HTML or ASCII). @return the short description String. */ public String presentShort(int markupType) { String s = getFieldData("name"); if (s == null) { return ""; } s = s.replace('\n', ' '); String s1 = getFieldData("acronym"); if (s1 != null) { // add the acronym too s1 = s1.replace('\n', ' '); s += " (" + s1 + ")"; } return s; } /** return a Vector of metadata attributes for this document. @param markupType how to mark up the string returned (e.g., HTML or ASCII). @return a Vector of triples [attrName, attrType, attrValue]. 
*/
    public Vector presentAttributes(int markupType) {
        debug.dumpTrace("SgmlDocument.presentAttributes(): not yet implemented");
        return null;
    }

    /**
     * Not yet implemented.
     *
     * @return always null
     */
    public Vector attributes() {
        debug.dumpTrace("SgmlDocument.attributes(): not yet implemented");
        return null;
    }

    /**
     * Not yet implemented.
     *
     * @param attrID not used
     * @param markupType not used
     * @return always null
     */
    public Object presentAttribute(int attrID, int markupType) {
        debug.dumpTrace("SgmlDocument.presentAttribute(): not yet implemented");
        return null;
    }

    /**
     * return the full description of this document: the title (name plus
     * optional acronym) followed by every known section, in a fixed order.
     * Sections whose field is missing or empty are left out.
     *
     * @param markupType how to mark up the string returned (e.g., HTML or
     *        ASCII); not consulted by this implementation
     * @return a (potentially very long) String.
     */
    public String presentFull(int markupType) {
        String s, s1;
        StringBuffer sb = new StringBuffer(2048); // A guess on the size.

        // format the title
        // NOTE(review): the empty "" literals below look like markup that was
        // stripped from this copy of the file -- confirm against the
        // canonical source before relying on the title formatting.
        s = getFieldData("name");
        if (s != null) {
            s = s.replace('\n', ' ');
            sb.append("" + s);
            s1 = getFieldData("acronym");
            if (s1 != null) {
                // add the acronym too
                s1 = s1.replace('\n', ' ');
                sb.append(" (" + s1 + ")");
            }
            sb.append("");
        }

        // format other sections
        format_section_1("description", "Description", sb);
        format_section("contact person", "Contact Person(s)", sb);
        format_section("address", "Address", sb);
        format_section("telephone number", "Telephone Number(s)", sb);
        format_section_special("email", sb);
        format_section_1("general notes", "General Notes", sb);
        format_section_1("holdings", "Holdings", sb);
        format_section_1("limitations on use", "Limitations On Use", sb);
        format_section_1("publications", "Publications", sb);
        format_section("other officies", "Other Officies", sb);
        format_section("type of orgnization", "Type of Orgnization", sb);
        format_section("sponsoring agency", "Sponsoring Agency(s)", sb);
        format_section("region", "NULM Region Number", sb);
        format_section("cross reference", "Cross Reference(s)", sb);
        format_section("mesh heading", "MeSH Heading(s)", sb);
        format_section("keyword", "Keyword(s)", sb);
        return( new String(sb) );
    }

    /**
     * Write the short description to the given writer.
     *
     * @param markupType passed on to presentShort(int)
     * @param out destination writer
     * @return ReturnCodes.OK after writing, ReturnCodes.NO_CAN_DO when there
     *         is nothing to write
     * @throws IOException if the writer fails
     */
    public int presentShort(int markupType, BufferedWriter out) throws IOException {
        String str = presentShort(markupType);
        if ( str == null )
            return( ReturnCodes.NO_CAN_DO );
        out.write(str);
        return( ReturnCodes.OK );
    }

    /**
     * Same as presentFull(int).
     *
     * @param markupType passed on to presentFull(int)
     * @return the full description
     */
    public String presentLong(int markupType) {
        return( presentFull(markupType) );
    }

    /**
     * Write the long (= full) description to the given writer.
     *
     * @param markupType passed on to presentFull(int)
     * @param out destination writer
     * @return ReturnCodes.OK after writing (matching the writer variant of
     *         presentShort), ReturnCodes.NO_CAN_DO when there is nothing to write
     * @throws IOException if the writer fails
     */
    public int presentLong(int markupType, BufferedWriter out) throws IOException {
        String str = presentFull(markupType);
        if ( str == null )
            return( ReturnCodes.NO_CAN_DO );
        out.write(str);
        return( ReturnCodes.OK );
    }

    /**
     * Write the full description to the given writer.
     *
     * @param markupType passed on to presentFull(int)
     * @param out destination writer
     * @return ReturnCodes.OK after writing (matching the writer variant of
     *         presentShort), ReturnCodes.NO_CAN_DO when there is nothing to write
     * @throws IOException if the writer fails
     */
    public int presentFull(int markupType, BufferedWriter out) throws IOException {
        String str = presentFull(markupType);
        if ( str == null )
            return( ReturnCodes.NO_CAN_DO );
        out.write(str);
        return( ReturnCodes.OK );
    }

    /**
     * this method will format a section of a sgml document to html string
     * and append the string to the end of the parameter StringBuffer. Every
     * line separator inside the field data ends one output line. Nothing is
     * appended when the field is missing or empty.
     *
     * NOTE(review): the string literals that span a line break below appear
     * to have lost their HTML markup in this copy of the file (presumably a
     * header wrapper and a line-break tag). They are kept exactly as found;
     * restore them from the canonical source.
     *
     * @param section_name name of the field to look up
     * @param section_header heading text to print in front of the data
     * @param sb buffer to append to
     */
    private void format_section(String section_name, String section_header, StringBuffer sb) {
        String s = getFieldData(section_name);
        if ((s == null) || s.equals("")) {
            return;
        }
        String line_separator = System.getProperty("line.separator");
        // format the header
        sb.append("
" + section_header + "
");
        // format the body
        int begin_index = 0;
        int end_index = s.indexOf(line_separator, begin_index);
        while (end_index != -1) {
            // we find a line separator, change it to an html line-break tag.
            sb.append(s.substring(begin_index, end_index) + "
" + line_separator);
            // step over the whole separator, which may be longer than one
            // character (e.g. "\r\n")
            begin_index = end_index + line_separator.length();
            end_index = s.indexOf(line_separator, begin_index);
        }
        // process the boundary condition
        if (begin_index != s.length()) {
            sb.append(s.substring(begin_index));
        }
    }
/** this method will format a section of a sgml document to html string
and append the string to the end of the parameter StringBuffer. Unlike
format_section, it appends the field data unchanged: line separators in
the data are not converted to html line-break tags.
Nothing is appended when the field is missing or empty.

NOTE(review): the string literal in the append call below spans line
breaks, which looks like HTML markup stripped from this copy of the file.
Restore it from the canonical source.

@param section_name name of the field to look up with getFieldData
@param section_header heading text placed in front of the field data
@param sb buffer to append to
*/
private void format_section_1(String section_name, String section_header,
StringBuffer sb)
{
String s = getFieldData(section_name);
if ((s == null) || s.equals(""))
{
return;
}
// append the header, directly followed by the raw field data
sb.append("
" + section_header + "
"
+ s);
}
/** this method deals with special fields, currently only the "email"
field is accepted; any other section name is traced and ignored.
The field data is split into tokens on the characters of the platform
line separator. A token containing '@' is listed as an email address,
every other token is listed as a URL. Each list is appended to the
buffer, under its own heading, only when it is not empty; the email list
comes first.

NOTE(review): the string literals that span line breaks below look like
HTML markup stripped from this copy of the file. Restore them from the
canonical source.

@param section_name name of the field to format; must be "email"
@param sb buffer to append to
*/
private void format_section_special(String section_name, StringBuffer sb)
{
// reject everything except the one supported field
if (! section_name.equals("email"))
{
debug.dumpTrace("class response, method format_section_special,invalid section name");
return;
}
String s = getFieldData(section_name);
if ((s == null) || s.equals(""))
{
return;
}
// the two lists are collected separately so each can get its own heading
StringBuffer eml_sb = new StringBuffer();
StringBuffer url_sb = new StringBuffer();
StringTokenizer st = new StringTokenizer(s, System.getProperty("line.separator"));
String s1 = null;
while (st.hasMoreTokens())
{
s1 = st.nextToken();
if (s1.indexOf('@') != -1)
{
// this is an email address
eml_sb.append(s1 + "
" + System.getProperty("line.separator"));
}
else
{
// this is an URL address
url_sb.append(s1 + "
" + System.getProperty("line.separator"));
}
}
if (eml_sb.length() != 0)
{
// there is at least one email address
sb.append("
Email(s)
"); sb.append(eml_sb.toString()); } if (url_sb.length() != 0) { // there is at least one URL address sb.append("
URL(s)
"); sb.append(url_sb.toString()); } } }