mirror of https://github.com/apache/lucene.git
SOLR-1003 -- XPathEntityprocessor must allow slurping all text from a given xml node and its children
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@741268 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6f00fa6f7d
commit
34b0ed7cff
|
@ -59,6 +59,9 @@ New Features
|
|||
13.SOLR-980: A PlainTextEntityProcessor which can read from any DataSource<Reader> and output a String.
|
||||
(Nathan Adams, Noble Paul via shalin)
|
||||
|
||||
14.SOLR-1003: XPathEntityprocessor must allow slurping all text from a given xml node and its children.
|
||||
(Noble Paul via shalin)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used
|
||||
|
|
|
@ -122,9 +122,14 @@ public class XPathEntityProcessor extends EntityProcessorBase {
|
|||
for (Map<String, String> field : context.getAllEntityFields()) {
|
||||
if (field.get(XPATH) == null)
|
||||
continue;
|
||||
int flags = 0;
|
||||
if ("true".equals(field.get("flatten"))) {
|
||||
flags = XPathRecordReader.FLATTEN;
|
||||
}
|
||||
xpathReader.addField(field.get(DataImporter.COLUMN),
|
||||
field.get(XPATH), Boolean.parseBoolean(field
|
||||
.get(DataImporter.MULTI_VALUED)));
|
||||
field.get(XPATH),
|
||||
Boolean.parseBoolean(field.get(DataImporter.MULTI_VALUED)),
|
||||
flags);
|
||||
}
|
||||
} catch (RuntimeException e) {
|
||||
throw new DataImportHandlerException(SEVERE,
|
||||
|
|
|
@ -39,6 +39,7 @@ import java.util.regex.Pattern;
|
|||
*/
|
||||
public class XPathRecordReader {
|
||||
private Node rootNode = new Node("/", null);
|
||||
public static final int FLATTEN = 1;
|
||||
|
||||
public XPathRecordReader(String forEachXpath) {
|
||||
String[] splits = forEachXpath.split("\\|");
|
||||
|
@ -46,24 +47,30 @@ public class XPathRecordReader {
|
|||
split = split.trim();
|
||||
if (split.length() == 0)
|
||||
continue;
|
||||
addField0(split, split, false, true);
|
||||
addField0(split, split, false, true, 0);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized XPathRecordReader addField(String name, String xpath,
|
||||
boolean multiValued) {
|
||||
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued) {
|
||||
if (!xpath.startsWith("/"))
|
||||
throw new RuntimeException("xpath must start with '/' : " + xpath);
|
||||
addField0(xpath, name, multiValued, false);
|
||||
addField0(xpath, name, multiValued, false, 0);
|
||||
return this;
|
||||
}
|
||||
|
||||
public synchronized XPathRecordReader addField(String name, String xpath, boolean multiValued, int flags) {
|
||||
if (!xpath.startsWith("/"))
|
||||
throw new RuntimeException("xpath must start with '/' : " + xpath);
|
||||
addField0(xpath, name, multiValued, false, flags);
|
||||
return this;
|
||||
}
|
||||
|
||||
private void addField0(String xpath, String name, boolean multiValued,
|
||||
boolean isRecord) {
|
||||
boolean isRecord, int flags) {
|
||||
List<String> paths = new LinkedList<String>(Arrays.asList(xpath.split("/")));
|
||||
if ("".equals(paths.get(0).trim()))
|
||||
paths.remove(0);
|
||||
rootNode.build(paths, name, multiValued, isRecord);
|
||||
rootNode.build(paths, name, multiValued, isRecord, flags);
|
||||
}
|
||||
|
||||
public List<Map<String, Object>> getAllRecords(Reader r) {
|
||||
|
@ -97,6 +104,8 @@ public class XPathRecordReader {
|
|||
|
||||
boolean hasText = false, multiValued = false, isRecord = false;
|
||||
|
||||
private boolean flatten;
|
||||
|
||||
public Node(String name, Node p) {
|
||||
xpathName = this.name = name;
|
||||
parent = p;
|
||||
|
@ -167,7 +176,22 @@ public class XPathRecordReader {
|
|||
if(event == CDATA || event == CHARACTERS || event == SPACE) {
|
||||
text = text + parser.getText();
|
||||
} else if(event == START_ELEMENT) {
|
||||
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
|
||||
if (flatten) {
|
||||
int starts = 1;
|
||||
while (true) {
|
||||
event = parser.next();
|
||||
if (event == CDATA || event == CHARACTERS || event == SPACE) {
|
||||
text = text + parser.getText();
|
||||
} else if (event == START_ELEMENT) {
|
||||
starts++;
|
||||
} else if (event == END_ELEMENT) {
|
||||
starts--;
|
||||
if (starts == 0) break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
@ -275,7 +299,7 @@ public class XPathRecordReader {
|
|||
}
|
||||
|
||||
public void build(List<String> paths, String fieldName,
|
||||
boolean multiValued, boolean record) {
|
||||
boolean multiValued, boolean record, int flags) {
|
||||
String name = paths.remove(0);
|
||||
if (paths.isEmpty() && name.startsWith("@")) {
|
||||
if (attributes == null) {
|
||||
|
@ -296,9 +320,10 @@ public class XPathRecordReader {
|
|||
n.hasText = true;
|
||||
n.fieldName = fieldName;
|
||||
n.multiValued = multiValued;
|
||||
n.flatten = flags == FLATTEN;
|
||||
}
|
||||
} else {
|
||||
n.build(paths, fieldName, multiValued, record);
|
||||
n.build(paths, fieldName, multiValued, record, flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -152,7 +152,23 @@ public class TestXPathRecordReader {
|
|||
Assert.assertTrue(p.contains("This text is"));
|
||||
Assert.assertTrue(p.contains("and this text is"));
|
||||
Assert.assertTrue(p.contains("!"));
|
||||
// Should not contain content from child elements
|
||||
Assert.assertFalse(p.contains("bold"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mixedContentFlattened() {
|
||||
String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
|
||||
" <xhtml:b>bold</xhtml:b> and this text is \n" +
|
||||
" <xhtml:u>underlined</xhtml:u>!\n" +
|
||||
"</xhtml:p>";
|
||||
XPathRecordReader rr = new XPathRecordReader("/p");
|
||||
rr.addField("p", "/p", false, XPathRecordReader.FLATTEN);
|
||||
List<Map<String, Object>> l = rr.getAllRecords(new StringReader(xml));
|
||||
Map<String, Object> row = l.get(0);
|
||||
Assert.assertEquals("This text is \n" +
|
||||
" bold and this text is \n" +
|
||||
" underlined!", ((String)row.get("p")).trim() );
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue