mirror of https://github.com/apache/lucene.git
javadocs
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@814618 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9f24548e6d
commit
38688edbaa
|
@ -31,16 +31,33 @@ import java.util.regex.Pattern;
|
||||||
* A streaming xpath parser which uses StAX for XML parsing. It supports only a
|
* A streaming xpath parser which uses StAX for XML parsing. It supports only a
|
||||||
* subset of xpath syntax.
|
* subset of xpath syntax.
|
||||||
* </p>
|
* </p>
|
||||||
|
* /a/b/subject[@qualifier='fullTitle']
|
||||||
|
* /a/b/subject/@qualifier
|
||||||
|
* /a/b/c
|
||||||
|
*
|
||||||
|
* Keep in mind that the wild-card syntax '//' is not supported
|
||||||
|
*
|
||||||
* <p/>
|
* <p/>
|
||||||
* <b>This API is experimental and may change in the future.</b>
|
* <b>This API is experimental and may change in the future.</b>
|
||||||
|
* This class is thread-safe for parsing XML, but adding fields is not thread-safe. The recommended usage is
|
||||||
|
* to addField() in one thread and then share the instance across threads.
|
||||||
*
|
*
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
* @since solr 1.3
|
* @since solr 1.3
|
||||||
*/
|
*/
|
||||||
public class XPathRecordReader {
|
public class XPathRecordReader {
|
||||||
private Node rootNode = new Node("/", null);
|
private Node rootNode = new Node("/", null);
|
||||||
|
/**Use this flag in the addField() method to fetch all the cdata under a specific tag
|
||||||
|
*
|
||||||
|
*/
|
||||||
public static final int FLATTEN = 1;
|
public static final int FLATTEN = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param forEachXpath The XPATH for which a record is emitted. At the start of this xpath tag, it starts collecting the fields and at the close
|
||||||
|
* of the tag, a record is emitted and the fields collected since the tag start are included in the record. If there
|
||||||
|
* are fields collected in the parent tag(s) they also will be included in the record but not cleared after emitting the record.
|
||||||
|
* It can use the ' | ' syntax of XPATH to pass in multiple xpaths.
|
||||||
|
*/
|
||||||
public XPathRecordReader(String forEachXpath) {
|
public XPathRecordReader(String forEachXpath) {
|
||||||
String[] splits = forEachXpath.split("\\|");
|
String[] splits = forEachXpath.split("\\|");
|
||||||
for (String split : splits) {
|
for (String split : splits) {
|
||||||
|
@ -58,6 +75,12 @@ public class XPathRecordReader {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**Add a field's XPATH and its name.
|
||||||
|
* @param name . The name by which this field is referred in the emitted record
|
||||||
|
* @param xpath . The xpath to this field
|
||||||
|
* @param multiValued . If this is 'true', then the emitted record will have a {@code List<String>} as value
|
||||||
|
* @param flags . The only supported flag is 'FLATTEN'
|
||||||
|
*/
|
||||||
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
|
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
|
||||||
if (!xpath.startsWith("/"))
|
if (!xpath.startsWith("/"))
|
||||||
throw new RuntimeException("xpath must start with '/' : " + xpath);
|
throw new RuntimeException("xpath must start with '/' : " + xpath);
|
||||||
|
@ -83,6 +106,10 @@ public class XPathRecordReader {
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Stream records as and when they are collected
|
||||||
|
* @param r The reader
|
||||||
|
* @param handler The callback instance
|
||||||
|
*/
|
||||||
public void streamRecords(Reader r, Handler handler) {
|
public void streamRecords(Reader r, Handler handler) {
|
||||||
try {
|
try {
|
||||||
XMLStreamReader parser = factory.createXMLStreamReader(r);
|
XMLStreamReader parser = factory.createXMLStreamReader(r);
|
||||||
|
@ -93,13 +120,26 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**For each node/leaf in the tree there is one object of this class
|
||||||
|
*/
|
||||||
private class Node {
|
private class Node {
|
||||||
String name, fieldName, xpathName, forEachPath;
|
/**name of the tag/attribute*/
|
||||||
|
String name;
|
||||||
List<Node> attributes, childNodes;
|
|
||||||
|
|
||||||
|
/**The field name as passed in the addField() . This will be used in the record*/
|
||||||
|
String fieldName;
|
||||||
|
/**stores the xpath name such as '@attr='xyz'*/
|
||||||
|
String xpathName;
|
||||||
|
/**The xpath of the record. if this is a record node */
|
||||||
|
String forEachPath;
|
||||||
|
/**child attribute nodes */
|
||||||
|
List<Node> attributes;
|
||||||
|
/**child nodes*/
|
||||||
|
List<Node> childNodes;
|
||||||
|
/**if attribs are used in the xpath their names and values*/
|
||||||
List<Map.Entry<String, String>> attribAndValues;
|
List<Map.Entry<String, String>> attribAndValues;
|
||||||
|
|
||||||
|
/**Parent node of this node */
|
||||||
Node parent;
|
Node parent;
|
||||||
|
|
||||||
boolean hasText = false, multiValued = false, isRecord = false;
|
boolean hasText = false, multiValued = false, isRecord = false;
|
||||||
|
@ -117,6 +157,8 @@ public class XPathRecordReader {
|
||||||
this.multiValued = multiValued;
|
this.multiValued = multiValued;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**This is the method where all the parsing happens. For each tag/subtag this gets called recursively.
|
||||||
|
*/
|
||||||
private void parse(XMLStreamReader parser, Handler handler,
|
private void parse(XMLStreamReader parser, Handler handler,
|
||||||
Map<String, Object> values, Stack<Set<String>> stack,
|
Map<String, Object> values, Stack<Set<String>> stack,
|
||||||
boolean recordStarted) throws IOException, XMLStreamException {
|
boolean recordStarted) throws IOException, XMLStreamException {
|
||||||
|
@ -203,7 +245,8 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
|
/*If a record has ended (tag closed) then clear up all the fields found
|
||||||
|
in this record after this tag started */
|
||||||
Set<String> cleanThis = null;
|
Set<String> cleanThis = null;
|
||||||
if (isRecord || !recordStarted) {
|
if (isRecord || !recordStarted) {
|
||||||
cleanThis = stack.pop();
|
cleanThis = stack.pop();
|
||||||
|
@ -218,6 +261,9 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**if a new tag is encountered, check if it is of interest or not (if there is a matching child Node).
|
||||||
|
* if yes continue parsing else skip
|
||||||
|
*/
|
||||||
private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound,
|
private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound,
|
||||||
Handler handler, Map<String, Object> values,
|
Handler handler, Map<String, Object> values,
|
||||||
Stack<Set<String>> stack, boolean recordStarted)
|
Stack<Set<String>> stack, boolean recordStarted)
|
||||||
|
@ -231,6 +277,8 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**check if the current tag is to be parsed or not. if yes return the Node object
|
||||||
|
*/
|
||||||
private Node getMatchingChild(XMLStreamReader parser) {
|
private Node getMatchingChild(XMLStreamReader parser) {
|
||||||
if (childNodes == null)
|
if (childNodes == null)
|
||||||
return null;
|
return null;
|
||||||
|
@ -259,6 +307,9 @@ public class XPathRecordReader {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**If there is no value available for a field in a subtag then add a null
|
||||||
|
* TODO : needs better explanation
|
||||||
|
*/
|
||||||
private void putNulls(Map<String, Object> values) {
|
private void putNulls(Map<String, Object> values) {
|
||||||
if (attributes != null) {
|
if (attributes != null) {
|
||||||
for (Node n : attributes) {
|
for (Node n : attributes) {
|
||||||
|
@ -274,6 +325,8 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**Handle multivalued fields by adding {@code List<String>}
|
||||||
|
*/
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
private void putText(Map<String, Object> values, String value,
|
private void putText(Map<String, Object> values, String value,
|
||||||
String fieldName, boolean multiValued) {
|
String fieldName, boolean multiValued) {
|
||||||
|
@ -289,6 +342,8 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**Skip a tag w/o processing the tag or its subtags
|
||||||
|
*/
|
||||||
private void skipTag(XMLStreamReader parser) throws IOException,
|
private void skipTag(XMLStreamReader parser) throws IOException,
|
||||||
XMLStreamException {
|
XMLStreamException {
|
||||||
int type;
|
int type;
|
||||||
|
@ -298,7 +353,14 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void build(List<String> paths, String fieldName,
|
/**Build the node structure from the xpath
|
||||||
|
* @param paths the xpaths split by '/'
|
||||||
|
* @param fieldName name of the field
|
||||||
|
* @param multiValued . is multiValued or not
|
||||||
|
* @param record is this xpath a record or a field
|
||||||
|
* @param flags extra flags
|
||||||
|
*/
|
||||||
|
private void build(List<String> paths, String fieldName,
|
||||||
boolean multiValued, boolean record, int flags) {
|
boolean multiValued, boolean record, int flags) {
|
||||||
String name = paths.remove(0);
|
String name = paths.remove(0);
|
||||||
if (paths.isEmpty() && name.startsWith("@")) {
|
if (paths.isEmpty() && name.startsWith("@")) {
|
||||||
|
@ -355,6 +417,8 @@ public class XPathRecordReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**If a field has List then they have to be deep-copied for thread safety
|
||||||
|
*/
|
||||||
private Map<String, Object> getDeepCopy(Map<String, Object> values) {
|
private Map<String, Object> getDeepCopy(Map<String, Object> values) {
|
||||||
Map<String, Object> result = new HashMap<String, Object>();
|
Map<String, Object> result = new HashMap<String, Object>();
|
||||||
for (Map.Entry<String, Object> entry : values.entrySet()) {
|
for (Map.Entry<String, Object> entry : values.entrySet()) {
|
||||||
|
@ -397,7 +461,16 @@ public class XPathRecordReader {
|
||||||
factory.setProperty(XMLInputFactory.SUPPORT_DTD , Boolean.FALSE);
|
factory.setProperty(XMLInputFactory.SUPPORT_DTD , Boolean.FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**Implement this interface to stream records as and when they are found.
|
||||||
|
*
|
||||||
|
*/
|
||||||
public static interface Handler {
|
public static interface Handler {
|
||||||
|
/**
|
||||||
|
* @param record The record map . The key is the field name as provided in the addField() methods. The value
|
||||||
|
* can be a single String (for single valued) or a {@code List<String>} (for multiValued)
|
||||||
|
* if an Exception is thrown from this method the parsing will be aborted
|
||||||
|
* @param xpath . The forEach XPATH for which this record is being emitted
|
||||||
|
*/
|
||||||
public void handle(Map<String, Object> record, String xpath);
|
public void handle(Map<String, Object> record, String xpath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue