SOLR-1003 -- XPathEntityprocessor must allow slurping all text from a given xml node and its children

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@741268 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2009-02-05 19:53:10 +00:00
parent 6f00fa6f7d
commit 34b0ed7cff
4 changed files with 60 additions and 11 deletions

View File

@ -59,6 +59,9 @@ New Features
13.SOLR-980: A PlainTextEntityProcessor which can read from any DataSource<Reader> and output a String.
(Nathan Adams, Noble Paul via shalin)
14.SOLR-1003: XPathEntityprocessor must allow slurping all text from a given xml node and its children.
(Noble Paul via shalin)
Optimizations
----------------------
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used

View File

@ -122,9 +122,14 @@ public class XPathEntityProcessor extends EntityProcessorBase {
for (Map<String, String> field : context.getAllEntityFields()) {
if (field.get(XPATH) == null)
continue;
int flags = 0;
if ("true".equals(field.get("flatten"))) {
flags = XPathRecordReader.FLATTEN;
}
xpathReader.addField(field.get(DataImporter.COLUMN),
field.get(XPATH), Boolean.parseBoolean(field
.get(DataImporter.MULTI_VALUED)));
field.get(XPATH),
Boolean.parseBoolean(field.get(DataImporter.MULTI_VALUED)),
flags);
}
} catch (RuntimeException e) {
throw new DataImportHandlerException(SEVERE,

View File

@ -39,6 +39,7 @@ import java.util.regex.Pattern;
*/
public class XPathRecordReader {
private Node rootNode = new Node("/", null);
public static final int FLATTEN = 1;
public XPathRecordReader(String forEachXpath) {
String[] splits = forEachXpath.split("\\|");
@ -46,24 +47,30 @@ public class XPathRecordReader {
split = split.trim();
if (split.length() == 0)
continue;
addField0(split, split, false, true);
addField0(split, split, false, true, 0);
}
}
public synchronized XPathRecordReader addField(String name, String xpath,
boolean multiValued) {
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) {
if (!xpath.startsWith("/"))
throw new RuntimeException("xpath must start with '/' : " + xpath);
addField0(xpath, name, multiValued, false);
addField0(xpath, name, multiValued, false, 0);
return this;
}
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
if (!xpath.startsWith("/"))
throw new RuntimeException("xpath must start with '/' : " + xpath);
addField0(xpath, name, multiValued, false, flags);
return this;
}
private void addField0(String xpath, String name, boolean multiValued,
boolean isRecord) {
boolean isRecord, int flags) {
List<String> paths = new LinkedList<String>(Arrays.asList(xpath.split("/")));
if ("".equals(paths.get(0).trim()))
paths.remove(0);
rootNode.build(paths, name, multiValued, isRecord);
rootNode.build(paths, name, multiValued, isRecord, flags);
}
public List<Map<String, Object>> getAllRecords(Reader r) {
@ -97,6 +104,8 @@ public class XPathRecordReader {
boolean hasText = false, multiValued = false, isRecord = false;
private boolean flatten;
public Node(String name, Node p) {
xpathName = this.name = name;
parent = p;
@ -167,7 +176,22 @@ public class XPathRecordReader {
if(event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText();
} else if(event == START_ELEMENT) {
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
if (flatten) {
int starts = 1;
while (true) {
event = parser.next();
if (event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText();
} else if (event == START_ELEMENT) {
starts++;
} else if (event == END_ELEMENT) {
starts--;
if (starts == 0) break;
}
}
} else {
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
}
} else {
break;
}
@ -275,7 +299,7 @@ public class XPathRecordReader {
}
public void build(List<String> paths, String fieldName,
boolean multiValued, boolean record) {
boolean multiValued, boolean record, int flags) {
String name = paths.remove(0);
if (paths.isEmpty() && name.startsWith("@")) {
if (attributes == null) {
@ -296,9 +320,10 @@ public class XPathRecordReader {
n.hasText = true;
n.fieldName = fieldName;
n.multiValued = multiValued;
n.flatten = flags == FLATTEN;
}
} else {
n.build(paths, fieldName, multiValued, record);
n.build(paths, fieldName, multiValued, record, flags);
}
}
}

View File

@ -152,7 +152,23 @@ public class TestXPathRecordReader {
Assert.assertTrue(p.contains("This text is"));
Assert.assertTrue(p.contains("and this text is"));
Assert.assertTrue(p.contains("!"));
// Should not contain content from child elements
Assert.assertFalse(p.contains("bold"));
}
@Test
public void mixedContentFlattened() {
String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
" <xhtml:b>bold</xhtml:b> and this text is \n" +
" <xhtml:u>underlined</xhtml:u>!\n" +
"</xhtml:p>";
XPathRecordReader rr = new XPathRecordReader("/p");
rr.addField("p", "/p", false, XPathRecordReader.FLATTEN);
List<Map<String, Object>> l = rr.getAllRecords(new StringReader(xml));
Map<String, Object> row = l.get(0);
Assert.assertEquals("This text is \n" +
" bold and this text is \n" +
" underlined!", ((String)row.get("p")).trim() );
}
@Test