LUCENE-2732: Fix charset problems in XML loading in HyphenationCompoundWordTokenFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029345 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2010-10-31 13:56:46 +00:00
parent abffef82c7
commit 819344aeab
7 changed files with 61 additions and 139 deletions

View File

@ -2,6 +2,11 @@ Analysis Module Change Log
======================= Trunk (not yet released) =======================
Bug fixes
* LUCENE-2732: Fix charset problems in XML loading in
HyphenationCompoundWordTokenFilter. (Uwe Schinder)
API Changes
* LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.Token;
@ -224,7 +225,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
for (Object obj : col) {
String string = (String) obj;
target.add(string.toLowerCase());
target.add(string.toLowerCase(Locale.ENGLISH));
}
}

View File

@ -19,7 +19,6 @@ package org.apache.lucene.analysis.compound;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Set;
@ -267,7 +266,7 @@ public class HyphenationCompoundWordTokenFilter extends
*/
public static HyphenationTree getHyphenationTree(String hyphenationFilename)
throws Exception {
return getHyphenationTree(new File(hyphenationFilename));
return getHyphenationTree(new InputSource(hyphenationFilename));
}
/**
@ -279,8 +278,7 @@ public class HyphenationCompoundWordTokenFilter extends
*/
public static HyphenationTree getHyphenationTree(File hyphenationFile)
throws Exception {
return getHyphenationTree(new InputStreamReader(new FileInputStream(
hyphenationFile), "ISO-8859-1"));
return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
}
/**
@ -289,13 +287,32 @@ public class HyphenationCompoundWordTokenFilter extends
* @param hyphenationReader the reader of the XML grammar to load from
* @return An object representing the hyphenation patterns
* @throws Exception
* @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
* Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
* stream, if you like.
*/
@Deprecated
public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
throws Exception {
final InputSource is = new InputSource(hyphenationReader);
// we need this to load the DTD in very old parsers (like the one in JDK 1.4).
// The DTD itsself is provided via EntityResolver, so it should always load, but
// some parsers still want to have a base URL (Crimson).
is.setSystemId("urn:java:" + HyphenationTree.class.getName());
return getHyphenationTree(is);
}
/**
* Create a hyphenator tree
*
* @param hyphenationSource the InputSource pointing to the XML grammar
* @return An object representing the hyphenation patterns
* @throws Exception
*/
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
throws Exception {
HyphenationTree tree = new HyphenationTree();
tree.loadPatterns(new InputSource(hyphenationReader));
tree.loadPatterns(hyphenationSource);
return tree;
}

View File

@ -91,7 +91,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
* @throws HyphenationException In case of an exception while parsing
*/
public void parse(String filename) throws HyphenationException {
parse(new File(filename));
parse(new InputSource(filename));
}
/**
@ -266,7 +266,15 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
//
@Override
public InputSource resolveEntity(String publicId, String systemId) {
return HyphenationDTDGenerator.generateDTD();
// supply the internal hyphenation.dtd if possible
if (
(systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) ||
("hyphenation-info".equals(publicId))
) {
// System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm());
}
return null;
}
//
@ -373,35 +381,6 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
}
//
// ErrorHandler methods
//
/**
* @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
*/
@Override
public void warning(SAXParseException ex) {
errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage();
}
/**
* @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
*/
@Override
public void error(SAXParseException ex) {
errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
}
/**
* @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
*/
@Override
public void fatalError(SAXParseException ex) throws SAXException {
errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage();
throw ex;
}
/**
* Returns a string of the location.
*/
@ -446,79 +425,3 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
}
}
}
class HyphenationDTDGenerator {
public static final String DTD_STRING=
"<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n"+
"<!--\n"+
" Copyright 1999-2004 The Apache Software Foundation\n"+
"\n"+
" Licensed under the Apache License, Version 2.0 (the \"License\");\n"+
" you may not use this file except in compliance with the License.\n"+
" You may obtain a copy of the License at\n"+
"\n"+
" http://www.apache.org/licenses/LICENSE-2.0\n"+
"\n"+
" Unless required by applicable law or agreed to in writing, software\n"+
" distributed under the License is distributed on an \"AS IS\" BASIS,\n"+
" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"+
" See the License for the specific language governing permissions and\n"+
" limitations under the License.\n"+
"-->\n"+
"<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n"+
"\n"+
"<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n"+
" classes, exceptions?, patterns)>\n"+
"\n"+
"<!-- Hyphen character to be used in the exception list as shortcut for\n"+
" <hyphen pre-break=\"-\"/>. Defaults to '-'\n"+
"-->\n"+
"<!ELEMENT hyphen-char EMPTY>\n"+
"<!ATTLIST hyphen-char value CDATA #REQUIRED>\n"+
"\n"+
"<!-- Default minimun length in characters of hyphenated word fragments\n"+
" before and after the line break. For some languages this is not\n"+
" only for aesthetic purposes, wrong hyphens may be generated if this\n"+
" is not accounted for.\n"+
"-->\n"+
"<!ELEMENT hyphen-min EMPTY>\n"+
"<!ATTLIST hyphen-min before CDATA #REQUIRED>\n"+
"<!ATTLIST hyphen-min after CDATA #REQUIRED>\n"+
"\n"+
"<!-- Character equivalent classes: space separated list of character groups, all\n"+
" characters in a group are to be treated equivalent as far as\n"+
" the hyphenation algorithm is concerned. The first character in a group\n"+
" is the group's equivalent character. Patterns should only contain\n"+
" first characters. It also defines word characters, i.e. a word that\n"+
" contains characters not present in any of the classes is not hyphenated.\n"+
"-->\n"+
"<!ELEMENT classes (#PCDATA)>\n"+
"\n"+
"<!-- Hyphenation exceptions: space separated list of hyphenated words.\n"+
" A hyphen is indicated by the hyphen tag, but you can use the\n"+
" hyphen-char defined previously as shortcut. This is in cases\n"+
" when the algorithm procedure finds wrong hyphens or you want\n"+
" to provide your own hyphenation for some words.\n"+
"-->\n"+
"<!ELEMENT exceptions (#PCDATA|hyphen)* >\n"+
"\n"+
"<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n"+
" characters as described before, between any two word characters a digit\n"+
" in the range 0 to 9 may be specified. The absence of a digit is equivalent\n"+
" to zero. The '.' character is reserved to indicate begining or ending\n"+
" of words. -->\n"+
"<!ELEMENT patterns (#PCDATA)>\n"+
"\n"+
"<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n"+
" with pre-break, post-break and no-break attributes.\n"+
" To be used in the exceptions list, the hyphen character is not\n"+
" automatically added -->\n"+
"<!ELEMENT hyphen EMPTY>\n"+
"<!ATTLIST hyphen pre CDATA #IMPLIED>\n"+
"<!ATTLIST hyphen no CDATA #IMPLIED>\n"+
"<!ATTLIST hyphen post CDATA #IMPLIED>\n";
public static InputSource generateDTD() {
return new InputSource(new StringReader(DTD_STRING));
}
}

View File

@ -14,7 +14,6 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
classes, exceptions?, patterns)>

View File

@ -17,9 +17,9 @@ package org.apache.lucene.analysis.compound;
* limitations under the License.
*/
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import org.xml.sax.InputSource;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
@ -31,10 +31,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testHyphenationCompoundWordsDA() throws Exception {
String[] dict = { "læse", "hest" };
Reader reader = getHyphenationReader();
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
@ -50,10 +49,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
String[] dict = { "basketball", "basket", "ball", "kurv" };
Reader reader = getHyphenationReader();
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
.getHyphenationTree(is);
// the word basket will not be added due to the longest match option
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
@ -73,9 +72,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
* This can be controlled with the min/max subword size.
*/
public void testHyphenationOnly() throws Exception {
Reader reader = getHyphenationReader();
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
@ -185,7 +184,4 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
private Reader getHyphenationReader() throws Exception {
return new InputStreamReader(getClass().getResourceAsStream("da_UTF8.xml"), "UTF-8");
}
}

View File

@ -17,10 +17,6 @@
package org.apache.solr.analysis;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
@ -33,6 +29,8 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import java.util.Map;
import java.io.InputStream;
import org.xml.sax.InputSource;
/**
* Factory for {@link HyphenationCompoundWordTokenFilter}
@ -57,7 +55,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa
private HyphenationTree hyphenator;
private String dictFile;
private String hypFile;
private String encoding = "UTF-8"; // default to UTF-8 encoding
private String encoding;
private int minWordSize;
private int minSubwordSize;
private int maxSubwordSize;
@ -82,18 +80,21 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa
}
public void inform(ResourceLoader loader) {
Reader reader = null;
InputStream stream = null;
try {
if (dictFile != null) // the dictionary can be empty.
dictionary = getWordSet(loader, dictFile, false);
InputStream hyph = loader.openResource(hypFile);
reader = new InputStreamReader(hyph, encoding);
hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader);
} catch (Exception e) { // TODO: getHyphenationTree really shouldnt throw "Exception"
// TODO: Broken, because we cannot resolve real system id
// ResourceLoader should also supply method like ClassLoader to get resource URL
stream = loader.openResource(hypFile);
final InputSource is = new InputSource(stream);
is.setEncoding(encoding); // if it's null let xml parser decide
is.setSystemId(hypFile);
hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
} catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception"
throw new RuntimeException(e);
} finally {
IOUtils.closeQuietly(reader);
IOUtils.closeQuietly(stream);
}
}