mirror of https://github.com/apache/lucene.git
LUCENE-2732: Fix charset problems in XML loading in HyphenationCompoundWordTokenFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029345 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
abffef82c7
commit
819344aeab
|
@ -2,6 +2,11 @@ Analysis Module Change Log
|
|||
|
||||
======================= Trunk (not yet released) =======================
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-2732: Fix charset problems in XML loading in
|
||||
HyphenationCompoundWordTokenFilter. (Uwe Schindler)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-2413: Deprecated PatternAnalyzer in common/miscellaneous, in favor
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
@ -224,7 +225,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
|
||||
for (Object obj : col) {
|
||||
String string = (String) obj;
|
||||
target.add(string.toLowerCase());
|
||||
target.add(string.toLowerCase(Locale.ENGLISH));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.analysis.compound;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -267,7 +266,7 @@ public class HyphenationCompoundWordTokenFilter extends
|
|||
*/
|
||||
public static HyphenationTree getHyphenationTree(String hyphenationFilename)
|
||||
throws Exception {
|
||||
return getHyphenationTree(new File(hyphenationFilename));
|
||||
return getHyphenationTree(new InputSource(hyphenationFilename));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -279,8 +278,7 @@ public class HyphenationCompoundWordTokenFilter extends
|
|||
*/
|
||||
public static HyphenationTree getHyphenationTree(File hyphenationFile)
|
||||
throws Exception {
|
||||
return getHyphenationTree(new InputStreamReader(new FileInputStream(
|
||||
hyphenationFile), "ISO-8859-1"));
|
||||
return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -289,13 +287,32 @@ public class HyphenationCompoundWordTokenFilter extends
|
|||
* @param hyphenationReader the reader of the XML grammar to load from
|
||||
* @return An object representing the hyphenation patterns
|
||||
* @throws Exception
|
||||
* @deprecated Don't use Readers with fixed charset to load XML files, unless programmatically created.
|
||||
* Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
|
||||
* stream, if you like.
|
||||
*/
|
||||
@Deprecated
|
||||
public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
|
||||
throws Exception {
|
||||
final InputSource is = new InputSource(hyphenationReader);
|
||||
// we need this to load the DTD in very old parsers (like the one in JDK 1.4).
|
||||
// The DTD itself is provided via EntityResolver, so it should always load, but
|
||||
// some parsers still want to have a base URL (Crimson).
|
||||
is.setSystemId("urn:java:" + HyphenationTree.class.getName());
|
||||
return getHyphenationTree(is);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a hyphenator tree
|
||||
*
|
||||
* @param hyphenationSource the InputSource pointing to the XML grammar
|
||||
* @return An object representing the hyphenation patterns
|
||||
* @throws Exception
|
||||
*/
|
||||
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
|
||||
throws Exception {
|
||||
HyphenationTree tree = new HyphenationTree();
|
||||
|
||||
tree.loadPatterns(new InputSource(hyphenationReader));
|
||||
|
||||
tree.loadPatterns(hyphenationSource);
|
||||
return tree;
|
||||
}
|
||||
|
||||
|
|
|
@ -91,7 +91,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
|
|||
* @throws HyphenationException In case of an exception while parsing
|
||||
*/
|
||||
public void parse(String filename) throws HyphenationException {
|
||||
parse(new File(filename));
|
||||
parse(new InputSource(filename));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -266,7 +266,15 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
|
|||
//
|
||||
@Override
|
||||
public InputSource resolveEntity(String publicId, String systemId) {
|
||||
return HyphenationDTDGenerator.generateDTD();
|
||||
// supply the internal hyphenation.dtd if possible
|
||||
if (
|
||||
(systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) ||
|
||||
("hyphenation-info".equals(publicId))
|
||||
) {
|
||||
// System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
|
||||
return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -373,35 +381,6 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
|
|||
|
||||
}
|
||||
|
||||
//
|
||||
// ErrorHandler methods
|
||||
//
|
||||
|
||||
/**
|
||||
* @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
|
||||
*/
|
||||
@Override
|
||||
public void warning(SAXParseException ex) {
|
||||
errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage();
|
||||
}
|
||||
|
||||
/**
|
||||
* @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
|
||||
*/
|
||||
@Override
|
||||
public void error(SAXParseException ex) {
|
||||
errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
|
||||
}
|
||||
|
||||
/**
|
||||
* @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
|
||||
*/
|
||||
@Override
|
||||
public void fatalError(SAXParseException ex) throws SAXException {
|
||||
errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage();
|
||||
throw ex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string of the location.
|
||||
*/
|
||||
|
@ -446,79 +425,3 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
class HyphenationDTDGenerator {
|
||||
public static final String DTD_STRING=
|
||||
"<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n"+
|
||||
"<!--\n"+
|
||||
" Copyright 1999-2004 The Apache Software Foundation\n"+
|
||||
"\n"+
|
||||
" Licensed under the Apache License, Version 2.0 (the \"License\");\n"+
|
||||
" you may not use this file except in compliance with the License.\n"+
|
||||
" You may obtain a copy of the License at\n"+
|
||||
"\n"+
|
||||
" http://www.apache.org/licenses/LICENSE-2.0\n"+
|
||||
"\n"+
|
||||
" Unless required by applicable law or agreed to in writing, software\n"+
|
||||
" distributed under the License is distributed on an \"AS IS\" BASIS,\n"+
|
||||
" WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"+
|
||||
" See the License for the specific language governing permissions and\n"+
|
||||
" limitations under the License.\n"+
|
||||
"-->\n"+
|
||||
"<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n"+
|
||||
"\n"+
|
||||
"<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n"+
|
||||
" classes, exceptions?, patterns)>\n"+
|
||||
"\n"+
|
||||
"<!-- Hyphen character to be used in the exception list as shortcut for\n"+
|
||||
" <hyphen pre-break=\"-\"/>. Defaults to '-'\n"+
|
||||
"-->\n"+
|
||||
"<!ELEMENT hyphen-char EMPTY>\n"+
|
||||
"<!ATTLIST hyphen-char value CDATA #REQUIRED>\n"+
|
||||
"\n"+
|
||||
"<!-- Default minimun length in characters of hyphenated word fragments\n"+
|
||||
" before and after the line break. For some languages this is not\n"+
|
||||
" only for aesthetic purposes, wrong hyphens may be generated if this\n"+
|
||||
" is not accounted for.\n"+
|
||||
"-->\n"+
|
||||
"<!ELEMENT hyphen-min EMPTY>\n"+
|
||||
"<!ATTLIST hyphen-min before CDATA #REQUIRED>\n"+
|
||||
"<!ATTLIST hyphen-min after CDATA #REQUIRED>\n"+
|
||||
"\n"+
|
||||
"<!-- Character equivalent classes: space separated list of character groups, all\n"+
|
||||
" characters in a group are to be treated equivalent as far as\n"+
|
||||
" the hyphenation algorithm is concerned. The first character in a group\n"+
|
||||
" is the group's equivalent character. Patterns should only contain\n"+
|
||||
" first characters. It also defines word characters, i.e. a word that\n"+
|
||||
" contains characters not present in any of the classes is not hyphenated.\n"+
|
||||
"-->\n"+
|
||||
"<!ELEMENT classes (#PCDATA)>\n"+
|
||||
"\n"+
|
||||
"<!-- Hyphenation exceptions: space separated list of hyphenated words.\n"+
|
||||
" A hyphen is indicated by the hyphen tag, but you can use the\n"+
|
||||
" hyphen-char defined previously as shortcut. This is in cases\n"+
|
||||
" when the algorithm procedure finds wrong hyphens or you want\n"+
|
||||
" to provide your own hyphenation for some words.\n"+
|
||||
"-->\n"+
|
||||
"<!ELEMENT exceptions (#PCDATA|hyphen)* >\n"+
|
||||
"\n"+
|
||||
"<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n"+
|
||||
" characters as described before, between any two word characters a digit\n"+
|
||||
" in the range 0 to 9 may be specified. The absence of a digit is equivalent\n"+
|
||||
" to zero. The '.' character is reserved to indicate begining or ending\n"+
|
||||
" of words. -->\n"+
|
||||
"<!ELEMENT patterns (#PCDATA)>\n"+
|
||||
"\n"+
|
||||
"<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n"+
|
||||
" with pre-break, post-break and no-break attributes.\n"+
|
||||
" To be used in the exceptions list, the hyphen character is not\n"+
|
||||
" automatically added -->\n"+
|
||||
"<!ELEMENT hyphen EMPTY>\n"+
|
||||
"<!ATTLIST hyphen pre CDATA #IMPLIED>\n"+
|
||||
"<!ATTLIST hyphen no CDATA #IMPLIED>\n"+
|
||||
"<!ATTLIST hyphen post CDATA #IMPLIED>\n";
|
||||
|
||||
public static InputSource generateDTD() {
|
||||
return new InputSource(new StringReader(DTD_STRING));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,6 @@
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
|
||||
|
||||
<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
|
||||
classes, exceptions?, patterns)>
|
|
@ -17,9 +17,9 @@ package org.apache.lucene.analysis.compound;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
@ -31,10 +31,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
|
|||
public void testHyphenationCompoundWordsDA() throws Exception {
|
||||
String[] dict = { "læse", "hest" };
|
||||
|
||||
Reader reader = getHyphenationReader();
|
||||
|
||||
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
|
||||
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
|
||||
.getHyphenationTree(reader);
|
||||
.getHyphenationTree(is);
|
||||
|
||||
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
|
||||
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
|
||||
|
@ -50,10 +49,10 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
|
||||
String[] dict = { "basketball", "basket", "ball", "kurv" };
|
||||
Reader reader = getHyphenationReader();
|
||||
|
||||
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
|
||||
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
|
||||
.getHyphenationTree(reader);
|
||||
.getHyphenationTree(is);
|
||||
|
||||
// the word basket will not be added due to the longest match option
|
||||
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
|
||||
|
@ -73,9 +72,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
|
|||
* This can be controlled with the min/max subword size.
|
||||
*/
|
||||
public void testHyphenationOnly() throws Exception {
|
||||
Reader reader = getHyphenationReader();
|
||||
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
|
||||
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
|
||||
.getHyphenationTree(reader);
|
||||
.getHyphenationTree(is);
|
||||
|
||||
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
|
||||
TEST_VERSION_CURRENT,
|
||||
|
@ -185,7 +184,4 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
|
|||
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
|
||||
}
|
||||
|
||||
private Reader getHyphenationReader() throws Exception {
|
||||
return new InputStreamReader(getClass().getResourceAsStream("da_UTF8.xml"), "UTF-8");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,10 +17,6 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
|
||||
|
@ -33,6 +29,8 @@ import org.apache.solr.common.SolrException;
|
|||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.InputStream;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
/**
|
||||
* Factory for {@link HyphenationCompoundWordTokenFilter}
|
||||
|
@ -57,7 +55,7 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa
|
|||
private HyphenationTree hyphenator;
|
||||
private String dictFile;
|
||||
private String hypFile;
|
||||
private String encoding = "UTF-8"; // default to UTF-8 encoding
|
||||
private String encoding;
|
||||
private int minWordSize;
|
||||
private int minSubwordSize;
|
||||
private int maxSubwordSize;
|
||||
|
@ -82,18 +80,21 @@ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFa
|
|||
}
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
Reader reader = null;
|
||||
InputStream stream = null;
|
||||
try {
|
||||
if (dictFile != null) // the dictionary can be empty.
|
||||
dictionary = getWordSet(loader, dictFile, false);
|
||||
|
||||
InputStream hyph = loader.openResource(hypFile);
|
||||
reader = new InputStreamReader(hyph, encoding);
|
||||
hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader);
|
||||
} catch (Exception e) { // TODO: getHyphenationTree really shouldnt throw "Exception"
|
||||
// TODO: Broken, because we cannot resolve real system id
|
||||
// ResourceLoader should also supply method like ClassLoader to get resource URL
|
||||
stream = loader.openResource(hypFile);
|
||||
final InputSource is = new InputSource(stream);
|
||||
is.setEncoding(encoding); // if it's null let xml parser decide
|
||||
is.setSystemId(hypFile);
|
||||
hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
|
||||
} catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception"
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(reader);
|
||||
IOUtils.closeQuietly(stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue