From 33f9318049773dd6bc64b5dc4f02b4e372ea8423 Mon Sep 17 00:00:00 2001 From: Shalin Shekhar Mangar Date: Mon, 2 Feb 2009 11:30:18 +0000 Subject: [PATCH] SOLR-999 -- XPathRecordReader fails on XMLs with nodes mixed with CDATA content git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@739962 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/dataimporthandler/CHANGES.txt | 3 ++ .../handler/dataimport/XPathRecordReader.java | 32 +++++++++++++------ .../dataimport/TestXPathRecordReader.java | 26 +++++++++++++-- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/contrib/dataimporthandler/CHANGES.txt b/contrib/dataimporthandler/CHANGES.txt index 297ff40e1f5..9f0e04b3499 100644 --- a/contrib/dataimporthandler/CHANGES.txt +++ b/contrib/dataimporthandler/CHANGES.txt @@ -106,6 +106,9 @@ Bug Fixes 13. SOLR-985: Fix thread-safety issue with TemplateString for concurrent imports with multiple cores. (Ryuuichi Kumai via shalin) +14. SOLR-999: XPathRecordReader fails on XMLs with nodes mixed with CDATA content. 
+ (Fergus McMenemie, Noble Paul via shalin) + Documentation ---------------------- diff --git a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java index d3b028d70d8..aa02fe4fda8 100644 --- a/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java +++ b/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/XPathRecordReader.java @@ -162,19 +162,20 @@ public class XPathRecordReader { skipNextEvent = true; String text = parser.getText(); event = parser.next(); - while (event == CDATA || event == CHARACTERS || event == SPACE) { - text = text + parser.getText(); + + while (true) { + if(event == CDATA || event == CHARACTERS || event == SPACE) { + text = text + parser.getText(); + } else if(event == START_ELEMENT) { + handleStartElement(parser, childrenFound, handler, values, stack, recordStarted); + } else { + break; + } event = parser.next(); } putText(values, text, fieldName, multiValued); } else if (event == START_ELEMENT) { - Node n = getMatchingChild(parser); - if (n != null) { - childrenFound.add(n); - n.parse(parser, handler, values, stack, recordStarted); - } else { - skipTag(parser); - } + handleStartElement(parser, childrenFound, handler, values, stack, recordStarted); } } } finally { @@ -193,6 +194,19 @@ public class XPathRecordReader { } } + private void handleStartElement(XMLStreamReader parser, Set<Node> childrenFound, + Handler handler, Map<String, Object> values, + Stack<Set<String>> stack, boolean recordStarted) + throws IOException, XMLStreamException { + Node n = getMatchingChild(parser); + if (n != null) { + childrenFound.add(n); + n.parse(parser, handler, values, stack, recordStarted); + } else { + skipTag(parser); + } + } + private Node getMatchingChild(XMLStreamReader parser) { if (childNodes == null) return null; diff --git 
a/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java b/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java index 32d60c4e2bd..98fd5176a49 100644 --- a/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java +++ b/contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathRecordReader.java @@ -25,9 +25,7 @@ import java.util.List; import java.util.Map; /** - *

<p>
- * Test for XPathRecordReader
- * </p>
+ * <p> Test for XPathRecordReader </p>

* * @version $Id$ * @since solr 1.3 @@ -135,6 +133,28 @@ public class TestXPathRecordReader { Assert.assertNull(((List) l.get(1).get("b")).get(0)); } + @Test + public void mixedContent() { + String xml = "<p>This text is \n" + + " <b>bold</b> and this text is \n" + + " <u>underlined</u>!\n" + + "</p>"; + XPathRecordReader rr = new XPathRecordReader("/p"); + rr.addField("p", "/p", true); + rr.addField("b", "/p/b", true); + rr.addField("u", "/p/u", true); + List<Map<String, Object>> l = rr.getAllRecords(new StringReader(xml)); + Map<String, Object> row = l.get(0); + + Assert.assertEquals("bold", ((List) row.get("b")).get(0)); + Assert.assertEquals("underlined", ((List) row.get("u")).get(0)); + String p = (String) ((List) row.get("p")).get(0); + Assert.assertTrue(p.contains("This text is")); + Assert.assertTrue(p.contains("and this text is")); + Assert.assertTrue(p.contains("!")); + + } + @Test public void elems2LevelWithAttrib() { String xml = "\n" + "\t\n" + "\t \n"