SOLR-999 -- XPathRecordReader fails on XMLs with nodes mixed with CDATA content

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@739962 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2009-02-02 11:30:18 +00:00
parent 164c2481e2
commit 33f9318049
3 changed files with 49 additions and 12 deletions

View File

@ -106,6 +106,9 @@ Bug Fixes
13. SOLR-985: Fix thread-safety issue with TemplateString for concurrent imports with multiple cores.
(Ryuuichi Kumai via shalin)
14. SOLR-999: XPathRecordReader fails on XML documents in which an element mixes text (or CDATA) content with child elements.
(Fergus McMenemie, Noble Paul via shalin)
Documentation
----------------------

View File

@ -162,19 +162,20 @@ public class XPathRecordReader {
skipNextEvent = true;
String text = parser.getText();
event = parser.next();
while (event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText();
while (true) {
if(event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText();
} else if(event == START_ELEMENT) {
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
} else {
break;
}
event = parser.next();
}
putText(values, text, fieldName, multiValued);
} else if (event == START_ELEMENT) {
Node n = getMatchingChild(parser);
if (n != null) {
childrenFound.add(n);
n.parse(parser, handler, values, stack, recordStarted);
} else {
skipTag(parser);
}
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
}
}
} finally {
@ -193,6 +194,19 @@ public class XPathRecordReader {
}
}
/**
 * Handles a start-element event at the parser's current position.
 *
 * <p>Looks up the child node matching the current element through
 * {@code getMatchingChild}. When a match exists it is added to
 * {@code childrenFound} and parsing is delegated to that node; when there is
 * no match the element is handed to {@code skipTag}.
 *
 * @param parser        the stream reader, positioned on the start element
 * @param childrenFound collects every child node that matched an element
 * @param handler       passed through unchanged to the matched node's parse
 * @param values        passed through unchanged to the matched node's parse
 * @param stack         passed through unchanged to the matched node's parse
 * @param recordStarted passed through unchanged to the matched node's parse
 * @throws IOException        propagated from the delegated calls
 * @throws XMLStreamException propagated from the delegated calls
 */
private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound,
    Handler handler, Map<String, Object> values,
    Stack<Set<String>> stack, boolean recordStarted)
    throws IOException, XMLStreamException {
  Node matched = getMatchingChild(parser);
  if (matched == null) {
    // No configured child corresponds to this element.
    skipTag(parser);
    return;
  }
  childrenFound.add(matched);
  matched.parse(parser, handler, values, stack, recordStarted);
}
private Node getMatchingChild(XMLStreamReader parser) {
if (childNodes == null)
return null;

View File

@ -25,9 +25,7 @@ import java.util.List;
import java.util.Map;
/**
* <p>
* Test for XPathRecordReader
* </p>
* <p> Test for XPathRecordReader </p>
*
* @version $Id$
* @since solr 1.3
@ -135,6 +133,28 @@ public class TestXPathRecordReader {
Assert.assertNull(((List) l.get(1).get("b")).get(0));
}
/**
 * Reads a {@code p} element whose text is interleaved with {@code b} and
 * {@code u} child elements, and checks that the first record holds the child
 * values for {@code /p/b} and {@code /p/u} and that the {@code /p} value
 * contains each of the surrounding text fragments.
 */
@Test
public void mixedContent() {
  String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
      " <xhtml:b>bold</xhtml:b> and this text is \n" +
      " <xhtml:u>underlined</xhtml:u>!\n" +
      "</xhtml:p>";

  XPathRecordReader reader = new XPathRecordReader("/p");
  reader.addField("p", "/p", true);
  reader.addField("b", "/p/b", true);
  reader.addField("u", "/p/u", true);

  List<Map<String, Object>> records = reader.getAllRecords(new StringReader(xml));
  Map<String, Object> first = records.get(0);

  // Child element values come first, in the same order as before.
  List boldValues = (List) first.get("b");
  Assert.assertEquals("bold", boldValues.get(0));
  List underlinedValues = (List) first.get("u");
  Assert.assertEquals("underlined", underlinedValues.get(0));

  // The parent's own text must still include every fragment around the children.
  List paragraphValues = (List) first.get("p");
  String paragraphText = (String) paragraphValues.get(0);
  for (String fragment : new String[] {"This text is", "and this text is", "!"}) {
    Assert.assertTrue(paragraphText.contains(fragment));
  }
}
@Test
public void elems2LevelWithAttrib() {
String xml = "<root>\n" + "\t<a>\n" + "\t <b k=\"x\">\n"