From 819344aeab0f4c8697c15ab842fdb784fe1faaf0 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Sun, 31 Oct 2010 13:56:46 +0000 Subject: [PATCH] LUCENE-2732: Fix charset problems in XML loading in HyphenationCompoundWordTokenFilter git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029345 13f79535-47bb-0310-9956-ffa450edef68 --- modules/analysis/CHANGES.txt | 5 + .../compound/CompoundWordTokenFilterBase.java | 3 +- .../HyphenationCompoundWordTokenFilter.java | 31 +++-- .../compound/hyphenation/PatternParser.java | 117 ++---------------- .../compound/hyphenation/hyphenation.dtd | 1 - .../compound/TestCompoundWordTokenFilter.java | 18 ++- ...enationCompoundWordTokenFilterFactory.java | 25 ++-- 7 files changed, 61 insertions(+), 139 deletions(-) rename modules/analysis/common/src/{java => resources}/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd (97%) diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index 71bfd317333..f9945411419 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -2,6 +2,11 @@ Analysis Module Change Log ======================= Trunk (not yet released) ======================= +Bug fixes + + * LUCENE-2732: Fix charset problems in XML loading in + HyphenationCompoundWordTokenFilter. 
(Uwe Schindler) + API Changes * LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java index 06f3ac73c3f..a98da16a9ad 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.LinkedList; +import java.util.Locale; import java.util.Set; import org.apache.lucene.analysis.Token; @@ -224,7 +225,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter { protected static final void addAllLowerCase(CharArraySet target, Collection col) { for (Object obj : col) { String string = (String) obj; - target.add(string.toLowerCase()); + target.add(string.toLowerCase(Locale.ENGLISH)); } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java index 4ca92541619..4e46b1696fa 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java @@ -19,7 +19,6 @@ package org.apache.lucene.analysis.compound; import java.io.File; import java.io.FileInputStream; -import java.io.InputStreamReader; import java.io.Reader; import java.util.Set; @@ -267,7 +266,7 @@ public class HyphenationCompoundWordTokenFilter extends */ public static HyphenationTree getHyphenationTree(String hyphenationFilename) throws Exception { - return 
getHyphenationTree(new File(hyphenationFilename)); + return getHyphenationTree(new InputSource(hyphenationFilename)); } /** @@ -279,8 +278,7 @@ public class HyphenationCompoundWordTokenFilter extends */ public static HyphenationTree getHyphenationTree(File hyphenationFile) throws Exception { - return getHyphenationTree(new InputStreamReader(new FileInputStream( - hyphenationFile), "ISO-8859-1")); + return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm())); } /** @@ -289,13 +287,32 @@ public class HyphenationCompoundWordTokenFilter extends * @param hyphenationReader the reader of the XML grammar to load from * @return An object representing the hyphenation patterns * @throws Exception + * @deprecated Don't use Readers with fixed charset to load XML files, unless programmatically created. + * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input + * stream, if you like. */ + @Deprecated public static HyphenationTree getHyphenationTree(Reader hyphenationReader) throws Exception { + final InputSource is = new InputSource(hyphenationReader); + // we need this to load the DTD in very old parsers (like the one in JDK 1.4). + // The DTD itself is provided via EntityResolver, so it should always load, but + // some parsers still want to have a base URL (Crimson). 
+ is.setSystemId("urn:java:" + HyphenationTree.class.getName()); + return getHyphenationTree(is); + } + + /** + * Create a hyphenator tree + * + * @param hyphenationSource the InputSource pointing to the XML grammar + * @return An object representing the hyphenation patterns + * @throws Exception + */ + public static HyphenationTree getHyphenationTree(InputSource hyphenationSource) + throws Exception { HyphenationTree tree = new HyphenationTree(); - - tree.loadPatterns(new InputSource(hyphenationReader)); - + tree.loadPatterns(hyphenationSource); return tree; } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java index bf04b3a3697..c11a96745b8 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java @@ -91,7 +91,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer { * @throws HyphenationException In case of an exception while parsing */ public void parse(String filename) throws HyphenationException { - parse(new File(filename)); + parse(new InputSource(filename)); } /** @@ -266,7 +266,15 @@ public class PatternParser extends DefaultHandler implements PatternConsumer { // @Override public InputSource resolveEntity(String publicId, String systemId) { - return HyphenationDTDGenerator.generateDTD(); + // supply the internal hyphenation.dtd if possible + if ( + (systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) || + ("hyphenation-info".equals(publicId)) + ) { + // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm()); + return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm()); + } + return null; } // @@ -373,35 +381,6 @@ public class 
PatternParser extends DefaultHandler implements PatternConsumer { } - // - // ErrorHandler methods - // - - /** - * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException) - */ - @Override - public void warning(SAXParseException ex) { - errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage(); - } - - /** - * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException) - */ - @Override - public void error(SAXParseException ex) { - errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage(); - } - - /** - * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException) - */ - @Override - public void fatalError(SAXParseException ex) throws SAXException { - errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage(); - throw ex; - } - /** * Returns a string of the location. */ @@ -446,79 +425,3 @@ public class PatternParser extends DefaultHandler implements PatternConsumer { } } } - -class HyphenationDTDGenerator { - public static final String DTD_STRING= - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"+ - "\n"; - - public static InputSource generateDTD() { - return new InputSource(new StringReader(DTD_STRING)); - } -} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd b/modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd similarity index 97% rename from modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd rename to modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd index 3cbd50eebc0..daca530737f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd +++ 
b/modules/analysis/common/src/resources/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd @@ -14,7 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. --> - diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index ebf5f541449..8500b551dd1 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -17,9 +17,9 @@ package org.apache.lucene.analysis.compound; * limitations under the License. */ -import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; +import org.xml.sax.InputSource; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Tokenizer; @@ -31,10 +31,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { public void testHyphenationCompoundWordsDA() throws Exception { String[] dict = { "læse", "hest" }; - Reader reader = getHyphenationReader(); - + InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter - .getHyphenationTree(reader); + .getHyphenationTree(is); HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader( @@ -50,10 +49,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { public void testHyphenationCompoundWordsDELongestMatch() throws Exception { String[] dict = { "basketball", "basket", "ball", "kurv" }; - Reader reader = getHyphenationReader(); + InputSource is = new 
InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter - .getHyphenationTree(reader); + .getHyphenationTree(is); // the word basket will not be added due to the longest match option HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT, @@ -73,9 +72,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { * This can be controlled with the min/max subword size. */ public void testHyphenationOnly() throws Exception { - Reader reader = getHyphenationReader(); + InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter - .getHyphenationTree(reader); + .getHyphenationTree(is); HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( TEST_VERSION_CURRENT, @@ -185,7 +184,4 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); } - private Reader getHyphenationReader() throws Exception { - return new InputStreamReader(getClass().getResourceAsStream("da_UTF8.xml"), "UTF-8"); - } } diff --git a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java index a018c734f9b..339f1666849 100644 --- a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java @@ -17,10 +17,6 @@ package org.apache.solr.analysis; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; - import org.apache.commons.io.IOUtils; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; @@ -33,6 +29,8 @@ import 
org.apache.solr.common.SolrException; import org.apache.solr.util.plugin.ResourceLoaderAware; import java.util.Map; +import java.io.InputStream; +import org.xml.sax.InputSource; /** * Factory for {@link HyphenationCompoundWordTokenFilter} @@ -57,7 +55,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa private HyphenationTree hyphenator; private String dictFile; private String hypFile; - private String encoding = "UTF-8"; // default to UTF-8 encoding + private String encoding; private int minWordSize; private int minSubwordSize; private int maxSubwordSize; @@ -82,18 +80,21 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa } public void inform(ResourceLoader loader) { - Reader reader = null; + InputStream stream = null; try { if (dictFile != null) // the dictionary can be empty. dictionary = getWordSet(loader, dictFile, false); - - InputStream hyph = loader.openResource(hypFile); - reader = new InputStreamReader(hyph, encoding); - hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader); - } catch (Exception e) { // TODO: getHyphenationTree really shouldnt throw "Exception" + // TODO: Broken, because we cannot resolve real system id + // ResourceLoader should also supply method like ClassLoader to get resource URL + stream = loader.openResource(hypFile); + final InputSource is = new InputSource(stream); + is.setEncoding(encoding); // if it's null let xml parser decide + is.setSystemId(hypFile); + hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); + } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception" throw new RuntimeException(e); } finally { - IOUtils.closeQuietly(reader); + IOUtils.closeQuietly(stream); } }