package edu.vt.marian.Document; import java.io.*; import java.net.*; import java.util.*; import edu.vt.marian.common.*; /** SgmlDocument

class description: this class represents an NLM SGML document in the system.

designer(s): Jianxin Zhao (jxzhao@csgrad.cs.vt.edu)

implementator(s): Jianxin Zhao (jxzhao@csgrad.cs.vt.edu), Robert France

finished time:

known bugs:

JDK version: 1.1.5

side effects: */ public class SgmlDocument implements Document { /** this string contains all the information of this document the format of the string is in SGML */ private String sgmlString = null; /** this vector contains formatted data from the sgml string */ private Vector fields = null; /** this flag tells whether or not the sgml string has been extracted to the vector fields */ private boolean extracted = false; /** this string array maintains the mapping between tags and fields along with the seperator used to concat strings whne there are multiple tags with the same name */ private final static String[][] tagToField = { {"ab", "description", ""}, // format -- tag name, field name, seperator {"ac", "acronym", ""}, {"ad", "address", ""}, {"eml", "email", ""}, {"fx", "cross reference", ""}, {"gn", "general notes", ""}, {"ho", "holdings", ""}, {"ic", "contact person", ""}, {"kw", "keyword", ""}, {"lun", "limitations on use", ""}, {"mh", "mesh heading", ""}, {"na", "name", ""}, {"nt", "type of orgnization", ""}, {"pb", "publications", ""}, {"rg", "region", ""}, {"sa", "sponsoring agency", ""}, {"site", "other officies", ""}, {"tel", "telephone number", ""}, }; /** those are the return values of methods of this class */ public final static int OK = 0; public final static int NULL_STREAM = 3; public final static int NULL_DOCUMENT_STRING = 6; public final static int NULL_SGML_STRING = 8; public final static int EXTRACT_ERROR = 9; public final static int INVALID_TAG_NAME = 10; public final static int NULL_FIELD_NAME = 11; /** just used for debugging */ Debug debug; /** create an SgmlDocument object from the specified stream. @param br the stream from which to read out this document @param debug used for debugging */ public SgmlDocument(BufferedReader br, Debug debug) { this.debug = debug; if (br == null) { debug.dumpTrace("SgmlDocument.[constructor 1]: br is null"); return; } // br is not null, read out raw string from it int num_lines = 0; try { num_lines = Integer.parseInt(br.readLine()); } catch (Exception e0) { debug.dumpTrace("SgmlDocument.[constructor 1]: error reading number of lines"); } if (num_lines > 0) // Even the null string gets an empty line. { int i; sgmlString = new String(""); try { for (i = 0; i < num_lines; i++) { if (i == 0) { // the beginning sgmlString += br.readLine(); } else { // not the first line, so add a line separator between them sgmlString += System.getProperty("line.separator") + br.readLine(); } } } catch (IOException e1) { debug.dumpTrace("SgmlDocument.[constructor 1]: error reading sgml string"); sgmlString = null; } } } /** create an SgmlDocument object from a document string. @param docString a string encoding this document in SGML @param debug used for debugging */ public SgmlDocument(String documentString, Debug debug) { this.debug = debug; sgmlString = documentString; } /** tell whether the object is valid (not whether it has been extracted yet). */ public boolean isValid() { return( sgmlString != null ); } /** tell whether this object and the parameter object represent the same document.

NOTE: At this point we are using String compare on the raw strings to determine equality. This obviously leaves something to be desired. @param d the document used to compare with this object @return true / false */ public boolean equals(SgmlDocument d) { if (d == null) { debug.dumpTrace("SgmlDocument.equals(): d is null"); return( false ); } // Not a great implementation. What if d differs only by white space? return( sgmlString.equals(d.sgmlString) ); } /** print the contents of this object to the specified stream. @param pw the stream to which to write this object @return OK -- this object has been written to the stream correctly

NULL_STREAM -- the parameter stream is null */ public int toStream(PrintWriter pw) { if (pw == null) { debug.dumpTrace("SgmlDocument.toStream(): parameter stream is null"); return NULL_STREAM; } // used to count lines pw.println(LinedString.count_lines(sgmlString)); pw.println(sgmlString); return OK; } /** return the sgml string of the document this object represents. @return the raw form of this document as a string */ public String getDocumentString() { return sgmlString; } /** set the sgml string of the document this object represents. @param documentString this will become the new raw string for this document object @return OK -- the new raw marc record has been set correctly

NULL_DOCUMENT_STRING -- the parameter is null */ public int setDocumentString(String documentString) { if (documentString == null) { debug.dumpTrace("SgmlDocument.setDocumentString(): parameter documentString is null"); return NULL_DOCUMENT_STRING; } // document string is not null sgmlString = documentString; extracted = false; return OK; } /** extract sgml string into different fields. @return OK -- the characteristics have been extracted successfully

other -- other problems */ private int extract() { if (sgmlString == null) { debug.dumpTrace("SgmlDocument.extract(): sgml string is null"); return NULL_SGML_STRING; } // sgml string is not null, extract it to different fields fields = new Vector(); // do some error checking here int start_index = sgmlString.indexOf(""); if (start_index == -1) { debug.dumpTrace("SgmlDocument.extract(): sgml string doesn't contain doc tag"); return EXTRACT_ERROR; } start_index += 5; int end_index = sgmlString.indexOf(""); if (end_index == -1) { debug.dumpTrace("SgmlDocument.extract(): sgml string doesn't contain /doc tag"); return EXTRACT_ERROR; } if (start_index >= end_index) { debug.dumpTrace("SgmlDocument.extract(): tags doc and /doc misplaced"); return EXTRACT_ERROR; } // parsing all the tags int tag_name_begin_index, tag_name_end_index, tag_data_begin_index, tag_data_end_index; while (start_index < end_index) { // get the name of the tag tag_name_begin_index = sgmlString.indexOf("<", start_index); if ((tag_name_begin_index == -1) || (tag_name_begin_index == end_index)) { break; } tag_name_end_index = sgmlString.indexOf(">", tag_name_begin_index); if ((tag_name_end_index == -1) || (tag_name_end_index > end_index)) { debug.dumpTrace("SgmlDocument.extract(): tag name not ended"); return EXTRACT_ERROR; } if (tag_name_begin_index == (tag_name_end_index - 1)) { debug.dumpTrace("SgmlDocument.extract(): empty tag identified"); return EXTRACT_ERROR; } String tag_name = sgmlString.substring(tag_name_begin_index + 1, tag_name_end_index); // get the data of the tag tag_data_begin_index = tag_name_end_index + 1; tag_data_end_index = sgmlString.indexOf("", tag_data_begin_index ); if ((tag_data_end_index == -1) || (tag_data_end_index >= end_index)) { debug.dumpTrace("SgmlDocument.extract(): parse field error"); return EXTRACT_ERROR; } String tag_data = sgmlString.substring(tag_data_begin_index, tag_data_end_index); // remove tags from tag data and then add this tag to fields vector tag_data = filt(tag_data); add_tag(tag_name, tag_data); // prepare for the next tag start_index = tag_data_end_index + 3 + tag_name.length(); } // all the tags has been parsed extracted = true; return OK; } /** remove all the tags inside the parameter tag_data @return OK -- the characteristics have been extracted successfully

other -- problems */ private String filt(String tag_data) { int tag_begin = tag_data.indexOf('<'); int tag_end = tag_data.indexOf('>'); int current_position = 0; StringBuffer sb = new StringBuffer(); while ((tag_begin != -1) && (tag_end != -1)) { if (tag_begin > tag_end) { // this is not a tag, skip it sb.append(tag_data.substring(current_position, tag_begin)); current_position = tag_begin + 1; continue; } // find a tag, remove it sb.append(tag_data.substring(current_position, tag_begin)); current_position = tag_end + 1; tag_begin = tag_data.indexOf('<', tag_end + 1); tag_end = tag_data.indexOf('>', tag_end + 1); } // boundary condition sb.append(tag_data.substring(current_position, tag_data.length())); return sb.toString(); } /** add the tag to the corresponding position of the fields vector @return OK -- the characteristics have been extracted successfully

other -- */ private int add_tag(String tag_name, String tag_data) { String field_name = null; int i, j; for (i = 0; i < tagToField.length; i++) { if (tagToField[i][0].equals(tag_name)) { // we found it, assume there is no duplicats field_name = tagToField[i][1]; break; } } if (field_name == null) { // debug.dumpTrace("SgmlDocument.add_tag(): invalid tag name"); return INVALID_TAG_NAME; } // put it into corresponding position of the fields vector for (j = 0; j < fields.size(); j+= 2) { if (field_name.equals((String) fields.elementAt(j))) { // we find the field, append to it String new_field_data = ((String) fields.elementAt(j + 1)) + tagToField[i][2] + tag_data; fields.setElementAt(new_field_data, j + 1); return OK; } } // this is the first time this field appears, append it to the // end of the vector fields.addElement(field_name); fields.addElement(tag_data); return OK; } /** return the number of fields in this document */ public int getNumberFields() { if (! extracted) { extract(); } return fields.size() / 2; } /** return the name of the specified field. */ public String getFieldNameByIndex(int index) { if (! extracted) { extract(); } if ((index < 0) || (index >= (fields.size() / 2))) { debug.dumpTrace("SgmlDocument.get_field_name_by_index(): index is not inlid"); return null; } // valid index, return the name of the field return (String) fields.elementAt(index * 2); } /** return the data of the specified field. */ public String getFieldDataByIndex(int index) { if (! extracted) { extract(); } if ((index < 0) || (index >= (fields.size() / 2))) { debug.dumpTrace("SgmlDocument.get_field_name_by_index(): index is not inlid"); return null; } // valid index, return the name of the field return (String) fields.elementAt(index * 2 + 1); } /** return the data of this document corresponding to the specified field @return the field data in the form of a String, or

null -- extraction problem */ public String getFieldData(String field_name) { if (field_name == null) { debug.dumpTrace("SgmlDocument.getFieldData(): parameter field_name is null"); return null; } // try to find the data of the field if (! extracted) { extract(); } for (int i = 0; i < fields.size(); i += 2) { if (field_name.equals((String) fields.elementAt(i))) { // we found the field, return its data return (String) fields.elementAt(i + 1); } } // this is no such field in this document return null; } /** tell the separator between different text strings in the specified field. @param fieldName name of field to search @return String, or null if no such field exists. */ public String getFieldSeparator(String fieldName) { if (fieldName == null) { debug.dumpTrace("SgmlDocument.get_field_separator(): parameter field name is null"); return null; } // try to find the separator of this field for (int i = 0; i < tagToField.length; i++) { if (fieldName.equals(tagToField[i][1])) { // we found a match, return its seperator return tagToField[i][2]; } } // there is no such field in the document debug.dumpTrace("SgmlDocument.getFieldSeparator(): invalid fieldName"); return null; } /** An attempt to get around declaring public clone() methods. */ public DigInfObj copy() { return( (DigInfObj) new SgmlDocument(sgmlString, debug) ); } /** return the short description of this document in one line. @param markupType how to mark up the string returned (e.g., HTML or ASCII). @return the short description String. */ public String presentShort(int markupType) { String s = getFieldData("name"); if (s == null) { return ""; } s = s.replace('\n', ' '); String s1 = getFieldData("acronym"); if (s1 != null) { // add the acronym too s1 = s1.replace('\n', ' '); s += " (" + s1 + ")"; } return s; } /** return a Vector of metadata attributes for this document. @param markupType how to mark up the string returned (e.g., HTML or ASCII). @return a Vector of triples [attrName, attrType, attrValue]. */ public Vector presentAttributes(int markupType) { debug.dumpTrace("SgmlDocument.presentAttributes(): not yet implemented"); return null; } public Vector attributes() { debug.dumpTrace("SgmlDocument.pattributes(): not yet implemented"); return null; } public Object presentAttribute(int attrID, int markupType) { debug.dumpTrace("SgmlDocument.presentAttribute(): not yet implemented"); return null; } /** return the full description of this document. @param markupType how to mark up the string returned (e.g., HTML or ASCII). @return a (potentially very long) String. */ public String presentFull(int markupType) { String s, s1; StringBuffer sb = new StringBuffer(2048); // A guess on the size. // format the title s = getFieldData("name"); if (s != null) { s = s.replace('\n', ' '); sb.append("" + s); s1 = getFieldData("acronym"); if (s1 != null) { // add the acronym too s1 = s1.replace('\n', ' '); sb.append(" (" + s1 + ")"); } sb.append(""); } // format other sections format_section_1("description", "Description", sb); format_section("contact person", "Contact Person(s)", sb); format_section("address", "Address", sb); format_section("telephone number", "Telephone Number(s)", sb); format_section_special("email", sb); format_section_1("general notes", "General Notes", sb); format_section_1("holdings", "Holdings", sb); format_section_1("limitations on use", "Limitations On Use", sb); format_section_1("publications", "Publications", sb); format_section("other officies", "Other Officies", sb); format_section("type of orgnization", "Type of Orgnization", sb); format_section("sponsoring agency", "Sponsoring Agency(s)", sb); format_section("region", "NULM Region Number", sb); format_section("cross reference", "Cross Reference(s)", sb); format_section("mesh heading", "MeSH Heading(s)", sb); format_section("keyword", "Keyword(s)", sb); return( new String(sb) ); } public int presentShort(int markupType, BufferedWriter out) throws IOException { String str = presentShort(markupType); if ( str == null ) return( ReturnCodes.NO_CAN_DO ); out.write(str); return( ReturnCodes.OK ); } public String presentLong(int markupType) { return( presentFull(markupType) ); } public int presentLong(int markupType, BufferedWriter out) throws IOException { String str = presentFull(markupType); if ( str == null ) return( ReturnCodes.NO_CAN_DO ); out.write(str); return( ReturnCodes.NOT_YET_IMPLEMENTED ); } public int presentFull(int markupType, BufferedWriter out) throws IOException { String str = presentFull(markupType); if ( str == null ) return( ReturnCodes.NO_CAN_DO ); out.write(str); return( ReturnCodes.NOT_YET_IMPLEMENTED ); } /** this method will format a section of a sgml document to html string and append the string to the end of the parameter StringBuffer */ private void format_section(String section_name, String section_header, StringBuffer sb) { String s = getFieldData(section_name); if ((s == null) || s.equals("")) { return; } // format the header sb.append("

" + section_header + "

"); // format the body int begin_index = 0; int end_index = s.indexOf(System.getProperty("line.separator"), begin_index); while (end_index != -1) { // we find a line separator, change it to html
tag. sb.append(s.substring(begin_index, end_index) + "
" + System.getProperty("line.separator")); begin_index = end_index + 1; end_index = s.indexOf(System.getProperty("line.separator"), begin_index); } // process the boundary condition if (begin_index != s.length()) { sb.append(s.substring(begin_index)); } } /** this method will format a section of a sgml document to html string and append the string to the end of the parameter StringBuffer, it will not change line separators to html
tags. */ private void format_section_1(String section_name, String section_header, StringBuffer sb) { String s = getFieldData(section_name); if ((s == null) || s.equals("")) { return; } // format the header sb.append("

" + section_header + "

" + s); } /** this method deals with special fields, currently only email and URL are supported */ private void format_section_special(String section_name, StringBuffer sb) { if (! section_name.equals("email")) { debug.dumpTrace("class response, method format_section_special,invalid section name"); return; } String s = getFieldData(section_name); if ((s == null) || s.equals("")) { return; } StringBuffer eml_sb = new StringBuffer(); StringBuffer url_sb = new StringBuffer(); StringTokenizer st = new StringTokenizer(s, System.getProperty("line.separator")); String s1 = null; while (st.hasMoreTokens()) { s1 = st.nextToken(); if (s1.indexOf('@') != -1) { // this is an email address eml_sb.append(s1 + "
" + System.getProperty("line.separator")); } else { // this is an URL address url_sb.append(s1 + "
" + System.getProperty("line.separator")); } } if (eml_sb.length() != 0) { // there is at least one email address sb.append("

Email(s)

"); sb.append(eml_sb.toString()); } if (url_sb.length() != 0) { // there is at least one URL address sb.append("

URL(s)

"); sb.append(url_sb.toString()); } } }