From 4d31bb3a7c3fc6fe0a1193489671762a41bca7f3 Mon Sep 17 00:00:00 2001
From: Uwe Schindler <uschindler@apache.org>
Date: Thu, 29 Mar 2012 22:04:15 +0000
Subject: [PATCH] LUCENE-3937: Workaround a XERCES-J bug in benchmark module.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1307141 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                    |  3 ++
 .../lib/lucene-xercesImpl-pom.xml.template    | 36 -------------------
 .../xercesImpl-2.9.1-patched-XERCESJ-1257.jar |  2 --
 modules/benchmark/lib/xercesImpl-2.9.1.jar    |  2 ++
 .../byTask/feeds/EnwikiContentSource.java     | 11 +++++-
 5 files changed, 15 insertions(+), 39 deletions(-)
 delete mode 100644 modules/benchmark/lib/lucene-xercesImpl-pom.xml.template
 delete mode 100644 modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
 create mode 100644 modules/benchmark/lib/xercesImpl-2.9.1.jar
diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index 04da8853d53..c3036967c8c 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -293,6 +293,9 @@ Bug Fixes
  * LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
    could stop early if the Reader only partially fills the provided
    buffer. (Mike McCandless) 
+   
+ * LUCENE-3937: Workaround a XERCES-J bug in benchmark module.
+   (Uwe Schindler, Robert Muir, Mike McCandless)
   
 Documentation
 
diff --git a/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template b/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template
deleted file mode 100644
index c96a64ceba2..00000000000
--- a/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template
+++ /dev/null
@@ -1,36 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-
-  <!--
-    Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-  -->
-
-  <parent>
-    <groupId>org.apache.lucene</groupId>
-    <artifactId>lucene-parent</artifactId>
-    <version>@version@</version>
-  </parent>
-  <modelVersion>4.0.0</modelVersion>
-  <groupId>org.apache.lucene</groupId>
-  <artifactId>lucene-xercesImpl</artifactId>
-  <name>Lucene Specific xercesImpl</name>
-  <version>@version@</version>
-  <description>Lucene Specific xercesImpl v2.9.1 patched with XERCESJ-1257</description>
-  <packaging>jar</packaging>
-</project>
diff --git a/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar b/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
deleted file mode 100644
index 6eacbf558b1..00000000000
--- a/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
+++ /dev/null
@@ -1,2 +0,0 @@
-AnyObjectId[bbb5aa7ad5bcea61c5c66ceb2ba340431cc7262d] was removed in git history.
-Apache SVN contains full history.
\ No newline at end of file
diff --git a/modules/benchmark/lib/xercesImpl-2.9.1.jar b/modules/benchmark/lib/xercesImpl-2.9.1.jar
new file mode 100644
index 00000000000..4e3c02df36e
--- /dev/null
+++ b/modules/benchmark/lib/xercesImpl-2.9.1.jar
@@ -0,0 +1,2 @@
+AnyObjectId[547f56300d93fe36587910739e095f03e287d47e] was removed in git history.
+Apache SVN contains full history.
\ No newline at end of file
diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
index 5153ad0c4eb..50dd6802684 100644
--- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
+++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
@@ -20,12 +20,17 @@ package org.apache.lucene.benchmark.byTask.feeds;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
 import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.IOUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -172,7 +177,11 @@ public class EnwikiContentSource extends ContentSource {
         while(true){
           final InputStream localFileIS = is;
           try {
-            reader.parse(new InputSource(localFileIS));
+            // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
+            CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+                .onMalformedInput(CodingErrorAction.REPORT)
+                .onUnmappableCharacter(CodingErrorAction.REPORT);
+            reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
           } catch (IOException ioe) {
             synchronized(EnwikiContentSource.this) {
               if (localFileIS != is) {