diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 04da8853d53..c3036967c8c 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -293,6 +293,9 @@ Bug Fixes * LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer could stop early if the Reader only partially fills the provided buffer. (Mike McCandless) + + * LUCENE-3937: Work around a XERCES-J bug in the benchmark module. + (Uwe Schindler, Robert Muir, Mike McCandless) Documentation diff --git a/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template b/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template deleted file mode 100644 index c96a64ceba2..00000000000 --- a/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - org.apache.lucene - lucene-parent - @version@ - - 4.0.0 - org.apache.lucene - lucene-xercesImpl - Lucene Specific xercesImpl - @version@ - Lucene Specific xercesImpl v2.9.1 patched with XERCESJ-1257 - jar - diff --git a/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar b/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar deleted file mode 100644 index 6eacbf558b1..00000000000 --- a/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[bbb5aa7ad5bcea61c5c66ceb2ba340431cc7262d] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/modules/benchmark/lib/xercesImpl-2.9.1.jar b/modules/benchmark/lib/xercesImpl-2.9.1.jar new file mode 100644 index 00000000000..4e3c02df36e --- /dev/null +++ b/modules/benchmark/lib/xercesImpl-2.9.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[547f56300d93fe36587910739e095f03e287d47e] was removed in git history. +Apache SVN contains full history. 
\ No newline at end of file diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java index 5153ad0c4eb..50dd6802684 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -20,12 +20,17 @@ package org.apache.lucene.benchmark.byTask.feeds; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; import java.util.HashMap; import java.util.Map; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.util.ThreadInterruptedException; +import org.apache.lucene.util.IOUtils; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -172,7 +177,11 @@ public class EnwikiContentSource extends ContentSource { while(true){ final InputStream localFileIS = is; try { - reader.parse(new InputSource(localFileIS)); + // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF-8, so we simply provide a Reader. + CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder)))); } catch (IOException ioe) { synchronized(EnwikiContentSource.this) { if (localFileIS != is) {