SOLR-999 -- XPathRecordReader fails on XMLs with nodes mixed with CDATA content

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@739962 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2009-02-02 11:30:18 +00:00
parent 164c2481e2
commit 33f9318049
3 changed files with 49 additions and 12 deletions

View File

@ -106,6 +106,9 @@ Bug Fixes
13. SOLR-985: Fix thread-safety issue with TemplateString for concurrent imports with multiple cores. 13. SOLR-985: Fix thread-safety issue with TemplateString for concurrent imports with multiple cores.
(Ryuuichi Kumai via shalin) (Ryuuichi Kumai via shalin)
14. SOLR-999: XPathRecordReader fails on XMLs with nodes mixed with CDATA content.
(Fergus McMenemie, Noble Paul via shalin)
Documentation Documentation
---------------------- ----------------------

View File

@ -162,19 +162,20 @@ public class XPathRecordReader {
skipNextEvent = true; skipNextEvent = true;
String text = parser.getText(); String text = parser.getText();
event = parser.next(); event = parser.next();
while (event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText(); while (true) {
if(event == CDATA || event == CHARACTERS || event == SPACE) {
text = text + parser.getText();
} else if(event == START_ELEMENT) {
handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
} else {
break;
}
event = parser.next(); event = parser.next();
} }
putText(values, text, fieldName, multiValued); putText(values, text, fieldName, multiValued);
} else if (event == START_ELEMENT) { } else if (event == START_ELEMENT) {
Node n = getMatchingChild(parser); handleStartElement(parser, childrenFound, handler, values, stack, recordStarted);
if (n != null) {
childrenFound.add(n);
n.parse(parser, handler, values, stack, recordStarted);
} else {
skipTag(parser);
}
} }
} }
} finally { } finally {
@ -193,6 +194,19 @@ public class XPathRecordReader {
} }
} }
/**
 * Handles a START_ELEMENT event encountered while parsing this node.
 * <p>
 * Looks up the child node matching the element the parser is positioned on.
 * When there is no match the whole tag is skipped; otherwise the matched
 * child is recorded in <code>childrenFound</code> and parsing is delegated
 * to it.
 * </p>
 *
 * @param parser        the stream reader, positioned on a start element
 * @param childrenFound collects the child nodes matched so far
 * @param handler       passed through unchanged to the child's parse call
 * @param values        passed through unchanged to the child's parse call
 * @param stack         passed through unchanged to the child's parse call
 * @param recordStarted passed through unchanged to the child's parse call
 * @throws IOException        declared by this method; propagated from the calls it makes
 * @throws XMLStreamException declared by this method; propagated from the calls it makes
 */
private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound,
                                Handler handler, Map<String, Object> values,
                                Stack<Set<String>> stack, boolean recordStarted)
        throws IOException, XMLStreamException {
  Node matchedChild = getMatchingChild(parser);
  if (matchedChild == null) {
    // No matching child for this element: skip the tag.
    skipTag(parser);
    return;
  }
  childrenFound.add(matchedChild);
  matchedChild.parse(parser, handler, values, stack, recordStarted);
}
private Node getMatchingChild(XMLStreamReader parser) { private Node getMatchingChild(XMLStreamReader parser) {
if (childNodes == null) if (childNodes == null)
return null; return null;

View File

@ -25,9 +25,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
/** /**
* <p> * <p> Test for XPathRecordReader </p>
* Test for XPathRecordReader
* </p>
* *
* @version $Id$ * @version $Id$
* @since solr 1.3 * @since solr 1.3
@ -135,6 +133,28 @@ public class TestXPathRecordReader {
Assert.assertNull(((List) l.get(1).get("b")).get(0)); Assert.assertNull(((List) l.get(1).get("b")).get(0));
} }
/**
 * Checks a record whose root element mixes text with child elements:
 * the child fields "b" and "u" must hold their own text, and the "p"
 * field must contain the text pieces that surround the children.
 */
@Test
public void mixedContent() {
  String xml = "<xhtml:p xmlns:xhtml=\"http://xhtml.com/\" >This text is \n" +
          " <xhtml:b>bold</xhtml:b> and this text is \n" +
          " <xhtml:u>underlined</xhtml:u>!\n" +
          "</xhtml:p>";

  XPathRecordReader reader = new XPathRecordReader("/p");
  reader.addField("p", "/p", true);
  reader.addField("b", "/p/b", true);
  reader.addField("u", "/p/u", true);

  List<Map<String, Object>> records = reader.getAllRecords(new StringReader(xml));
  Map<String, Object> firstRecord = records.get(0);

  // Child elements are extracted into their own multi-valued fields.
  List boldValues = (List) firstRecord.get("b");
  Assert.assertEquals("bold", boldValues.get(0));
  List underlinedValues = (List) firstRecord.get("u");
  Assert.assertEquals("underlined", underlinedValues.get(0));

  // Text before, between and after the child elements must appear in "p".
  List paragraphValues = (List) firstRecord.get("p");
  String paragraphText = (String) paragraphValues.get(0);
  String[] expectedFragments = {"This text is", "and this text is", "!"};
  for (String fragment : expectedFragments) {
    Assert.assertTrue(paragraphText.contains(fragment));
  }
}
@Test @Test
public void elems2LevelWithAttrib() { public void elems2LevelWithAttrib() {
String xml = "<root>\n" + "\t<a>\n" + "\t <b k=\"x\">\n" String xml = "<root>\n" + "\t<a>\n" + "\t <b k=\"x\">\n"