SOLR-1437 . javadocs added. skipTag inlined and use loop instead of recursion

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@816577 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Noble Paul 2009-09-18 10:32:15 +00:00
parent aa02a6a5c8
commit 0db6aefc6c
1 changed files with 195 additions and 92 deletions

View File

@ -28,35 +28,51 @@ import java.util.regex.Pattern;
/**
* <p>
* A streaming xpath parser which uses StAX for XML parsing. It supports only a
* subset of xpath syntax.
* </p>
* A streaming xpath parser which uses StAX for XML parsing. It supports only
* a subset of xpath syntax.
* </p><pre>
* /a/b/subject[@qualifier='fullTitle']
* /a/b/subject[@qualifier=]/subtag
* /a/b/subject/@qualifier
* /a/b/c
*
* </pre>
* Keep in mind that the wild-card syntax '//' is not supported
* A record is a Map<String,Object> . The key is the provided name
* and the value is a String or a List<String>
*
* This class is thread-safe for parsing xml. But adding fields is not
* thread-safe. The recommended usage is to addField() in one thread and
* then share the instance across threads.
* </p>
* <p/>
* <b>This API is experimental and may change in the future.</b>
* This class is thread-safe for parsing xml . But adding fields is not thread-safe. The recommended usage is
* to addField() in one thread and then share the instance across threads.
*
* <p>
* @version $Id$
* @since solr 1.3
*/
public class XPathRecordReader {
private Node rootNode = new Node("/", null);
/**Use this flag in the addField() method to fetch all the cdata under a specific tag
*
/**
* The FLATTEN flag indicates that all text and cdata under a specific
* tag should be recursivly fetched and appended to the current Node's
* value.
*/
public static final int FLATTEN = 1;
/**
* @param forEachXpath The XPATH for which a record is emitted. At the start of this xpath tag, it starts collecting the fields and at the close
* of the tag ,a record is emitted and the fields collected since the tag start is included in the record. If there
* are fields collected in the parent tag(s) they also will be included in the record but not cleared after emitting the record.
* It can use the ' | ' syntax of XPATH to pass in multiple xpaths.
* A constructor called with a '|' seperated list of Xpath expressions
* which define sub sections of the XML stream that are to be emitted
* seperate records.
*
* @param forEachXpath The XPATH for which a record is emitted. Once the
* xpath tag is encountered, the Node.parse method starts collecting wanted
* fields and at the close of the tag, a record is emitted containing all
* fields collected since the tag start. Once
* emitted the collected fields are cleared. Any fields collected in the parent tag or above
* will also be included in the record, but these are not
* cleared after emitting the record.
* It uses the ' | ' syntax of XPATH to pass in multiple xpaths.
*/
public XPathRecordReader(String forEachXpath) {
String[] splits = forEachXpath.split("\\|");
@ -64,10 +80,21 @@ public class XPathRecordReader {
split = split.trim();
if (split.length() == 0)
continue;
// The created Node has a name set to the full forEach attribute xpath
addField0(split, split, false, true, 0);
}
}
/**
* A wrapper around {@link #addField0 addField0()} to create a series of Nodes
* based on the supplied Xpath for the given fieldName. The created nodes
* are inserted into a Node tree.
*
* @param name The name for this field in the emitted record
* @param xpath The xpath expression for this field
* @param multiValued If 'true' then the emitted record will have values in
* a List<String>
*/
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) {
if (!xpath.startsWith("/"))
throw new RuntimeException("xpath must start with '/' : " + xpath);
@ -75,11 +102,16 @@ public class XPathRecordReader {
return this;
}
/**Add a field's XPATH and its name.
* @param name . The name by which this field is referred in the emitted record
* @param xpath . The xpath to this field
* @param multiValued . If this is 'true' , then the emitted record will have a List<String> as value
* @param flags . The only supported flag is 'FLATTEN'
/**
* A wrapper around {@link #addField0 addField0()} to create a series of Nodes
* based on the supplied Xpath for the given fieldName. The created nodes
* are inserted into a Node tree.
*
* @param name The name for this field in the emitted record
* @param xpath The xpath expression for this field
* @param multiValued If 'true' then the emitted record will have values in
* a List<String>
* @param flags FLATTEN: Recursivly combine text from all child XML elements
*/
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
if (!xpath.startsWith("/"))
@ -88,6 +120,18 @@ public class XPathRecordReader {
return this;
}
/**
* Splits the XPATH into a List of xpath segments and calls build() to
* construct a tree of Nodes representing xpath segments. The resulting
* tree structure ends up describing all the Xpaths we are interested in.
*
* @param xpath The xpath expression for this field
* @param name The name for this field in the emitted record
* @param multiValued If 'true' then the emitted record will have values in
* a List<String>
* @param isRecord When 'true' flags that this XPATH is from a forEach statement
* @param flags The only supported flag is 'FLATTEN'
*/
private void addField0(String xpath, String name, boolean multiValued,
boolean isRecord, int flags) {
List<String> paths = splitEscapeQuote(xpath);
@ -96,6 +140,14 @@ public class XPathRecordReader {
rootNode.build(paths, name, multiValued, isRecord, flags);
}
/**
* Uses {@link #streamRecords streamRecords} to parse the XML source but
* collects the emitted records into a List which is returned upon completion.
*
* @param r the stream reader
* @return results a List of emitted records
*
*/
public List<Map<String, Object>> getAllRecords(Reader r) {
final List<Map<String, Object>> results = new ArrayList<Map<String, Object>>();
streamRecords(r, new Handler() {
@ -106,8 +158,12 @@ public class XPathRecordReader {
return results;
}
/** Stream records as and when they are colected
* @param r The reader
/**
* Creates an XML stream reader on top of whatever reader has been
* configured. Then calls parse() with a handler which is
* invoked forEach record emitted.
*
* @param r the stream reader
* @param handler The callback instance
*/
public void streamRecords(Reader r, Handler handler) {
@ -120,60 +176,73 @@ public class XPathRecordReader {
}
}
/**For each node/leaf in the tree there is one object of this class
/**
* For each node/leaf in the Node tree there is one object of this class.
* This tree of objects represents all the XPaths we are interested in.
* For each Xpath segment of interest we create a node. In most cases the
* node (branch) is rather basic , but for the final portion (leaf) of any Xpath we add
* more information to the Node. When parsing the XML document we
* step though this tree as we stream records from the reader. If the XML
* document departs from this tree we skip start tags till we are back on
* the tree.
*
*/
private class Node {
/**name of the tag/attribute*/
String name;
/**The field name as passed in the addField() . This will be used in the record*/
String fieldName;
/**stores the xpath name such as '@attr='xyz'*/
String xpathName;
/**The xpath of the record. if this is a record node */
String forEachPath;
/**child attribute nodes */
List<Node> attributes;
/**child nodes*/
List<Node> childNodes;
/**if attribs are used in the xpath their names and values*/
String name; // genrally: segment of the Xpath represented by this Node
String fieldName; // the fieldname in the emitted record (key of the map)
String xpathName; // the segment of the Xpath represented by this Node
String forEachPath; // the full Xpath from the forEach entity attribute
List<Node> attributes; // a List of attribute Nodes associated with this Node
List<Node> childNodes; // a List of child Nodes of this node
List<Map.Entry<String, String>> attribAndValues;
Node parent; // parent Node in the tree
boolean hasText=false; // flag: store/emit streamed text for this node
boolean multiValued=false; //flag: this fields values are returned as a List
boolean isRecord=false; //flag: this Node starts a new record
private boolean flatten; //flag: child text is also to be emitted
/**Parent node of this node */
Node parent;
boolean hasText = false, multiValued = false, isRecord = false;
private boolean flatten;
public Node(String name, Node p) {
// Create a basic Node, suitable for the mid portions of any Xpath.
// Node.xpathName and Node.name are set to same value
xpathName = this.name = name;
parent = p;
}
public Node(String name, String fieldName, boolean multiValued) {
this.name = name;
this.fieldName = fieldName;
this.multiValued = multiValued;
// This is only called from build() when describing an attribute.
this.name = name; // a segment from the Xpath
this.fieldName = fieldName; // name to store collected values against
this.multiValued = multiValued; // return collected values in a List
}
/**This is the method where all the parsing happens. For each tag/subtag this gets called recursively.
/**
* This is the method where all the XML parsing happens. For each
* tag/subtag read from the source, this method is called recursively.
*
*/
private void parse(XMLStreamReader parser, Handler handler,
Map<String, Object> values, Stack<Set<String>> stack,
boolean recordStarted) throws IOException, XMLStreamException {
Set<String> valuesAddedinThisFrame = null;
if (isRecord) {
// This Node is a match for an XPATH from a forEach attribute,
// prepare to emit a new record when its END_ELEMENT is matched
recordStarted = true;
valuesAddedinThisFrame = new HashSet<String>();
stack.push(valuesAddedinThisFrame);
} else if (recordStarted) {
// This node is a child of some parent which matched against forEach
// attribute. Continue to add values to an existing record.
valuesAddedinThisFrame = stack.peek();
} else {
//if this tag has an attribute or text which is a brank/leaf just push an item up the stack
if (attributes != null || hasText)
valuesAddedinThisFrame = new HashSet<String>();
stack.push(valuesAddedinThisFrame);
}
try {
if (attributes != null) {
for (Node node : attributes) {
@ -184,13 +253,15 @@ public class XPathRecordReader {
}
}
}
Set<Node> childrenFound = new HashSet<Node>();
// for any normal event , parser.next() should be called in each iteration.
// But for CDATA | CHARACTERS | SPACE it should not do so because handling of
// CDATA itself would have consumed the next event. CDATA may throw multiple events
// so all the events are slurped till a START_ELEMENT is encountered.
// Internally we have to gobble CDATA | CHARACTERS | SPACE events as we
// store text, the gobbling continues till we have fetched some other
// event. We use "isNextEventFetched" to indcate that the gobbling has
// already fetched the next event.
boolean isNextEventFetched = false;
int event = -1;
while (true) {
if (!isNextEventFetched) {
event = parser.next();
@ -214,6 +285,8 @@ public class XPathRecordReader {
if ((event == CDATA || event == CHARACTERS || event == SPACE)
&& hasText) {
valuesAddedinThisFrame.add(fieldName);
// becuase we are fetching events here we need to ensure the outer
// loop does not end up doing an extra parser.next()
isNextEventFetched = true;
String text = parser.getText();
event = parser.next();
@ -236,6 +309,8 @@ public class XPathRecordReader {
}
}
} else {
// We are not flatten-ing, so look to see if any of the child
// elements are wanted, and recurse if any are found.
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
}
} else {
@ -243,6 +318,7 @@ public class XPathRecordReader {
}
event = parser.next();
}
// save the text we have read against the fieldName in the Map values
putText(values, text, fieldName, multiValued);
} else if (event == START_ELEMENT) {
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
@ -276,8 +352,15 @@ public class XPathRecordReader {
if (n != null) {
childrenFound.add(n);
n.parse(parser, handler, values, stack, recordStarted);
} else {
skipTag(parser);
}
else {
// skip ELEMENTS till source document is back within the tree
int count=1; // we have had our first START_ELEMENT
while ( count != 0 ) {
int token = parser.next();
if (token == START_ELEMENT) count++;
else if (token == END_ELEMENT) count--;
}
}
}
@ -311,8 +394,9 @@ public class XPathRecordReader {
return true;
}
/**If there is no value available for a field in a subtag then add a null
* TODO : needs better explanation
/**
* A recursive routine that walks the Node tree from a supplied start
* pushing a null string onto every multiValued fieldName's List of values.
*/
private void putNulls(Map<String, Object> values) {
if (attributes != null) {
@ -329,7 +413,11 @@ public class XPathRecordReader {
}
}
/**Handle multivalued fields by adding List<String>
/**
* Add the field name and text into the values Map. If it is a non multivalued field, then the text
* is simply placed in the object portion of the Map. If it is a
* multivalued field then the text is pushed onto a List which is
* the object portion of the Map.
*/
@SuppressWarnings("unchecked")
private void putText(Map<String, Object> values, String value,
@ -346,49 +434,58 @@ public class XPathRecordReader {
}
}
/**Skip a tag w/o processing the tag or its subtags
*/
private void skipTag(XMLStreamReader parser) throws IOException,
XMLStreamException {
int type;
while ((type = parser.next()) != END_ELEMENT) {
if (type == START_ELEMENT)
skipTag(parser);
}
}
/**Build the node structure from the xpath
* @param paths the xpaths split by '/'
* @param fieldName name of the field
* @param multiValued . is multiValued or not
* @param record is this xpath a record or a field
* @param flags extra flags
/**
* Build a Node tree structure representing all Xpaths of intrest to us.
* This must be done before parsing of the XML stream starts. Each node
* holds one portion of an Xpath. Taking each Xpath segment in turn this
* method walks the Node tree and finds where the new segment should be
* inserted. It creates a Node representing a field's name, XPATH and
* some flags and inserts the Node into the Node tree.
*
*/
private void build(List<String> paths, String fieldName,
boolean multiValued, boolean record, int flags) {
String name = paths.remove(0);
private void build(
List<String> paths, // a List of segments from the split xpaths
String fieldName, // the fieldName assoc with this Xpath
boolean multiValued, // flag if this fieldName is multiValued or not
boolean record, // is this xpath a record or a field
int flags // are we to flatten matching xpaths
) {
// recursivly walk the paths Lists adding new Nodes as required
String name = paths.remove(0); // shift out next Xpath segment
if (paths.isEmpty() && name.startsWith("@")) {
// we have reached end of element portion of Xpath and can now only
// have an element attribute. Add it to this nodes list of attributes
if (attributes == null) {
attributes = new ArrayList<Node>();
}
name = name.substring(1);
name = name.substring(1); // strip the '@'
attributes.add(new Node(name, fieldName, multiValued));
} else {
if (childNodes == null)
childNodes = new ArrayList<Node>();
// does this "name" already exist as a child node.
Node n = getOrAddChildNode(name);
if (paths.isEmpty()) {
// We have reached the end of paths. When parsing the actual
// input we have traversed to a position where we actutally have to
// do something. getOrAddChildNode() will have created and returned
// a new minimal Node with name and xpathName already populated. We
// need to add more information
if (record) {
n.isRecord = true;
n.forEachPath = fieldName;
// forEach attribute
n.isRecord = true; // flag: forEach attribute, prepare to emit rec
n.forEachPath = fieldName; // the full forEach attribute xpath
} else {
n.hasText = true;
n.fieldName = fieldName;
n.multiValued = multiValued;
n.flatten = flags == FLATTEN;
// xpath with content we want to store and return
n.hasText = true; // we have to store text found here
n.fieldName = fieldName; // name to store collected text against
n.multiValued = multiValued; // true: text be stored in a List
n.flatten = flags == FLATTEN; // true: store text from child tags
}
} else {
// recurse to handle next paths segment
n.build(paths, fieldName, multiValued, record, flags);
}
}
@ -398,8 +495,8 @@ public class XPathRecordReader {
for (Node n : childNodes)
if (n.xpathName.equals(xpathName))
return n;
Node n = new Node(xpathName, this);
// new territory! add a new node for this Xpath bitty
Node n = new Node(xpathName, this); // a minimal Node initalization
Matcher m = ATTRIB_PRESENT_WITHVAL.matcher(xpathName);
if (m.find()) {
n.name = m.group(1);
@ -419,16 +516,20 @@ public class XPathRecordReader {
childNodes.add(n);
return n;
}
}
} // end of class Node
/**If a field has List then they have to be deep-copied for thread safety
/**
* Copies a supplied Map to a new Map which is returned. Used to copy a
* records values. If a fields value is a List then they have to be
* deep-copied for thread safety
*/
private Map<String, Object> getDeepCopy(Map<String, Object> values) {
Map<String, Object> result = new HashMap<String, Object>();
for (Map.Entry<String, Object> entry : values.entrySet()) {
if (entry.getValue() instanceof List) {
result.put(entry.getKey(),new ArrayList((List) entry.getValue()));
} else{
} else {
result.put(entry.getKey(),entry.getValue());
}
}
@ -436,8 +537,9 @@ public class XPathRecordReader {
}
/**
* Used for handling cases where there is a slash '/' character
* inside the attribute value e.g. x@html='text/html'. We need to split
* The Xpath is split into segments using the '/' s a seperator. However
* this method deals with special cases where there is a slash '/' character
* inside the attribute value e.g. x/@html='text/html'. We need to split
* by '/' excluding the '/' which is a part of the attribute's value.
*/
private static List<String> splitEscapeQuote(String str) {
@ -465,15 +567,16 @@ public class XPathRecordReader {
factory.setProperty(XMLInputFactory.SUPPORT_DTD , Boolean.FALSE);
}
/**Implement this interface to stream records as and when it is found.
/**Implement this interface to stream records as and when one is found.
*
*/
public static interface Handler {
/**
* @param record The record map . The key is the field name as provided in the addField() methods. The value
* can be a single String (for single valued) or a List<String> (for multiValued)
* if an Exception is thrown from this method the parsing will be aborted
* @param xpath . The forEach XPATH for which this record is being emitted
* @param record The record map. The key is the field name as provided in
* the addField() methods. The value can be a single String (for single
* valued fields) or a List<String> (for multiValued).
* @param xpath The forEach XPATH for which this record is being emitted
* If there is any change all parsing will be aborted and the Exception is propogated up
*/
public void handle(Map<String, Object> record, String xpath);
}