From 819344aeab0f4c8697c15ab842fdb784fe1faaf0 Mon Sep 17 00:00:00 2001
From: Uwe Schindler <uschindler@apache.org>
Date: Sun, 31 Oct 2010 13:56:46 +0000
Subject: [PATCH] LUCENE-2732: Fix charset problems in XML loading in
 HyphenationCompoundWordTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029345 13f79535-47bb-0310-9956-ffa450edef68
---
 modules/analysis/CHANGES.txt                  |   5 +
 .../compound/CompoundWordTokenFilterBase.java |   3 +-
 .../HyphenationCompoundWordTokenFilter.java   |  31 +++--
 .../compound/hyphenation/PatternParser.java   | 117 ++----------------
 .../compound/hyphenation/hyphenation.dtd      |   1 -
 .../compound/TestCompoundWordTokenFilter.java |  18 ++-
 ...enationCompoundWordTokenFilterFactory.java |  25 ++--
 7 files changed, 61 insertions(+), 139 deletions(-)
 rename modules/analysis/common/src/{java => resources}/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd (97%)

diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt
index 71bfd317333..f9945411419 100644
--- a/modules/analysis/CHANGES.txt
+++ b/modules/analysis/CHANGES.txt
@@ -2,6 +2,11 @@ Analysis Module Change Log
 
 ======================= Trunk (not yet released) =======================
 
+Bug fixes
+
+ * LUCENE-2732: Fix charset problems in XML loading in
+   HyphenationCompoundWordTokenFilter.  (Uwe Schinder)
+   
 API Changes
 
  * LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor 
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
index 06f3ac73c3f..a98da16a9ad 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.LinkedList;
+import java.util.Locale;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Token;
@@ -224,7 +225,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
     for (Object obj : col) {
       String string = (String) obj;
-      target.add(string.toLowerCase());
+      target.add(string.toLowerCase(Locale.ENGLISH));
     }
   }
   
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
index 4ca92541619..4e46b1696fa 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.compound;
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.Set;
 
@@ -267,7 +266,7 @@ public class HyphenationCompoundWordTokenFilter extends
    */
   public static HyphenationTree getHyphenationTree(String hyphenationFilename)
       throws Exception {
-    return getHyphenationTree(new File(hyphenationFilename));
+    return getHyphenationTree(new InputSource(hyphenationFilename));
   }
 
   /**
@@ -279,8 +278,7 @@ public class HyphenationCompoundWordTokenFilter extends
    */
   public static HyphenationTree getHyphenationTree(File hyphenationFile)
       throws Exception {
-    return getHyphenationTree(new InputStreamReader(new FileInputStream(
-        hyphenationFile), "ISO-8859-1"));
+    return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
   }
 
   /**
@@ -289,13 +287,32 @@ public class HyphenationCompoundWordTokenFilter extends
    * @param hyphenationReader the reader of the XML grammar to load from
    * @return An object representing the hyphenation patterns
    * @throws Exception
+   * @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
+   * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
+   * stream, if you like.
    */
+  @Deprecated
   public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
       throws Exception {
+    final InputSource is = new InputSource(hyphenationReader);
+    // we need this to load the DTD in very old parsers (like the one in JDK 1.4).
+    // The DTD itsself is provided via EntityResolver, so it should always load, but
+    // some parsers still want to have a base URL (Crimson).
+    is.setSystemId("urn:java:" + HyphenationTree.class.getName());
+    return getHyphenationTree(is);
+  }
+
+  /**
+   * Create a hyphenator tree
+   * 
+   * @param hyphenationSource the InputSource pointing to the XML grammar
+   * @return An object representing the hyphenation patterns
+   * @throws Exception
+   */
+  public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
+      throws Exception {
     HyphenationTree tree = new HyphenationTree();
-
-    tree.loadPatterns(new InputSource(hyphenationReader));
-
+    tree.loadPatterns(hyphenationSource);
     return tree;
   }
 
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
index bf04b3a3697..c11a96745b8 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java
@@ -91,7 +91,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
    * @throws HyphenationException In case of an exception while parsing
    */
   public void parse(String filename) throws HyphenationException {
-    parse(new File(filename));
+    parse(new InputSource(filename));
   }
 
   /**
@@ -266,7 +266,15 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
   //
   @Override
   public InputSource resolveEntity(String publicId, String systemId) {
-    return HyphenationDTDGenerator.generateDTD();
+    // supply the internal hyphenation.dtd if possible
+    if (
+      (systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) ||
+      ("hyphenation-info".equals(publicId))
+    ) {
+      // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
+      return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm());
+    }
+    return null;
   }
 
   //
@@ -373,35 +381,6 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
 
   }
 
-  //
-  // ErrorHandler methods
-  //
-
-  /**
-   * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
-   */
-  @Override
-  public void warning(SAXParseException ex) {
-    errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage();
-  }
-
-  /**
-   * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
-   */
-  @Override
-  public void error(SAXParseException ex) {
-    errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
-  }
-
-  /**
-   * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
-   */
-  @Override
-  public void fatalError(SAXParseException ex) throws SAXException {
-    errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage();
-    throw ex;
-  }
-
   /**
    * Returns a string of the location.
    */
@@ -446,79 +425,3 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
     }
   }
 }
-
-class HyphenationDTDGenerator {
-  public static final String DTD_STRING=
-    "<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n"+
-    "<!--\n"+
-    "  Copyright 1999-2004 The Apache Software Foundation\n"+
-    "\n"+
-    "  Licensed under the Apache License, Version 2.0 (the \"License\");\n"+
-    "  you may not use this file except in compliance with the License.\n"+
-    "  You may obtain a copy of the License at\n"+
-    "\n"+
-    "       http://www.apache.org/licenses/LICENSE-2.0\n"+
-    "\n"+
-    "  Unless required by applicable law or agreed to in writing, software\n"+
-    "  distributed under the License is distributed on an \"AS IS\" BASIS,\n"+
-    "  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"+
-    "  See the License for the specific language governing permissions and\n"+
-    "  limitations under the License.\n"+
-    "-->\n"+
-    "<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n"+
-    "\n"+
-    "<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n"+
-    "                           classes, exceptions?, patterns)>\n"+
-    "\n"+
-    "<!-- Hyphen character to be used in the exception list as shortcut for\n"+
-    "     <hyphen pre-break=\"-\"/>. Defaults to '-'\n"+
-    "-->\n"+
-    "<!ELEMENT hyphen-char EMPTY>\n"+
-    "<!ATTLIST hyphen-char value CDATA #REQUIRED>\n"+
-    "\n"+
-    "<!-- Default minimun length in characters of hyphenated word fragments\n"+
-    "     before and after the line break. For some languages this is not\n"+
-    "     only for aesthetic purposes, wrong hyphens may be generated if this\n"+
-    "     is not accounted for.\n"+
-    "-->\n"+
-    "<!ELEMENT hyphen-min EMPTY>\n"+
-    "<!ATTLIST hyphen-min before CDATA #REQUIRED>\n"+
-    "<!ATTLIST hyphen-min after CDATA #REQUIRED>\n"+
-    "\n"+
-    "<!-- Character equivalent classes: space separated list of character groups, all\n"+
-    "     characters in a group are to be treated equivalent as far as\n"+
-    "     the hyphenation algorithm is concerned. The first character in a group\n"+
-    "     is the group's equivalent character. Patterns should only contain\n"+
-    "     first characters. It also defines word characters, i.e. a word that\n"+
-    "     contains characters not present in any of the classes is not hyphenated.\n"+
-    "-->\n"+
-    "<!ELEMENT classes (#PCDATA)>\n"+
-    "\n"+
-    "<!-- Hyphenation exceptions: space separated list of hyphenated words.\n"+
-    "     A hyphen is indicated by the hyphen tag, but you can use the\n"+
-    "     hyphen-char defined previously as shortcut. This is in cases\n"+
-    "     when the algorithm procedure finds wrong hyphens or you want\n"+
-    "     to provide your own hyphenation for some words.\n"+
-    "-->\n"+
-    "<!ELEMENT exceptions (#PCDATA|hyphen)* >\n"+
-    "\n"+
-    "<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n"+
-    "     characters as described before, between any two word characters a digit\n"+
-    "     in the range 0 to 9 may be specified. The absence of a digit is equivalent\n"+
-    "     to zero. The '.' character is reserved to indicate begining or ending\n"+
-    "     of words. -->\n"+
-    "<!ELEMENT patterns (#PCDATA)>\n"+
-    "\n"+
-    "<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n"+
-    "     with pre-break, post-break and no-break attributes.\n"+
-    "     To be used in the exceptions list, the hyphen character is not\n"+
-    "     automatically added -->\n"+
-    "<!ELEMENT hyphen EMPTY>\n"+
-    "<!ATTLIST hyphen pre CDATA #IMPLIED>\n"+
-    "<!ATTLIST hyphen no CDATA #IMPLIED>\n"+
-    "<!ATTLIST hyphen post CDATA #IMPLIED>\n";
-  
- public static InputSource generateDTD() {
-    return new InputSource(new StringReader(DTD_STRING));
-  }
-}
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd b/modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd
similarity index 97%
rename from modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd
rename to modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd
index 3cbd50eebc0..daca530737f 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd
+++ b/modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd
@@ -14,7 +14,6 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 -->
-<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
 
 <!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
                            classes, exceptions?, patterns)>
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index ebf5f541449..8500b551dd1 100644
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -17,9 +17,9 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */
 
-import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import org.xml.sax.InputSource;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
@@ -31,10 +31,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   public void testHyphenationCompoundWordsDA() throws Exception {
     String[] dict = { "læse", "hest" };
 
-    Reader reader = getHyphenationReader();
-
+    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
-        .getHyphenationTree(reader);
+        .getHyphenationTree(is);
 
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, 
         new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
@@ -50,10 +49,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
   public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
     String[] dict = { "basketball", "basket", "ball", "kurv" };
-    Reader reader = getHyphenationReader();
 
+    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
-        .getHyphenationTree(reader);
+        .getHyphenationTree(is);
 
     // the word basket will not be added due to the longest match option
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, 
@@ -73,9 +72,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
    * This can be controlled with the min/max subword size.
    */
   public void testHyphenationOnly() throws Exception {
-    Reader reader = getHyphenationReader();
+    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
-      .getHyphenationTree(reader);
+        .getHyphenationTree(is);
     
     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
         TEST_VERSION_CURRENT,
@@ -185,7 +184,4 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
   }
 
-  private Reader getHyphenationReader() throws Exception {
-    return new InputStreamReader(getClass().getResourceAsStream("da_UTF8.xml"), "UTF-8");
-  }
 }
diff --git a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java
index a018c734f9b..339f1666849 100644
--- a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java
@@ -17,10 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-
 import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
@@ -33,6 +29,8 @@ import org.apache.solr.common.SolrException;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 
 import java.util.Map;
+import java.io.InputStream;
+import org.xml.sax.InputSource;
 
 /**
  * Factory for {@link HyphenationCompoundWordTokenFilter}
@@ -57,7 +55,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa
   private HyphenationTree hyphenator;
   private String dictFile;
   private String hypFile;
-  private String encoding = "UTF-8"; // default to UTF-8 encoding
+  private String encoding;
   private int minWordSize;
   private int minSubwordSize;
   private int maxSubwordSize;
@@ -82,18 +80,21 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa
   }
   
   public void inform(ResourceLoader loader) {
-    Reader reader = null;
+    InputStream stream = null;
     try {
       if (dictFile != null) // the dictionary can be empty.
         dictionary = getWordSet(loader, dictFile, false);
-      
-      InputStream hyph = loader.openResource(hypFile);
-      reader = new InputStreamReader(hyph, encoding);
-      hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader);
-    } catch (Exception e) { // TODO: getHyphenationTree really shouldnt throw "Exception"
+      // TODO: Broken, because we cannot resolve real system id
+      // ResourceLoader should also supply method like ClassLoader to get resource URL
+      stream = loader.openResource(hypFile);
+      final InputSource is = new InputSource(stream);
+      is.setEncoding(encoding); // if it's null let xml parser decide
+      is.setSystemId(hypFile);
+      hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
+    } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception"
       throw new RuntimeException(e);
     } finally {
-      IOUtils.closeQuietly(reader);
+      IOUtils.closeQuietly(stream);
     }
   }