mirror of https://github.com/apache/lucene.git
SOLR-1984: add HyphenationCompoundWordTokenFilterFactory
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@962555 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
70d7eb1766
commit
c5bc95a357
|
@ -196,6 +196,8 @@ New Features
|
|||
will now return warnings about (gsingers)
|
||||
|
||||
* SOLR-1985: FastVectorHighlighter: add wrapper class for Lucene's SingleFragListBuilder (koji)
|
||||
|
||||
* SOLR-1984: Add HyphenationCompoundWordTokenFilterFactory. (PB via rmuir)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
|
||||
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
|
||||
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
|
||||
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.solr.analysis.BaseTokenFilterFactory;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Factory for {@link HyphenationCompoundWordTokenFilter}
|
||||
* <p>
|
||||
* This factory accepts the following parameters:
|
||||
* <ul>
|
||||
* <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern.
|
||||
* See <a href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
|
||||
* <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
|
||||
* <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
|
||||
* <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to 5.
|
||||
* <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
|
||||
* <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
|
||||
* <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword
|
||||
* to the stream. defaults to false.
|
||||
* </ul>
|
||||
* <p>
|
||||
* @see HyphenationCompoundWordTokenFilter
|
||||
*/
|
||||
public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||
private CharArraySet dictionary;
|
||||
private HyphenationTree hyphenator;
|
||||
private String dictFile;
|
||||
private String hypFile;
|
||||
private String encoding = "UTF-8"; // default to UTF-8 encoding
|
||||
private int minWordSize;
|
||||
private int minSubwordSize;
|
||||
private int maxSubwordSize;
|
||||
private boolean onlyLongestMatch;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
assureMatchVersion();
|
||||
dictFile = args.get("dictionary");
|
||||
if (args.containsKey("encoding"))
|
||||
encoding = args.get("encoding");
|
||||
hypFile = args.get("hyphenator");
|
||||
if (null == hypFile) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
|
||||
"Missing required parameter: hyphenator");
|
||||
}
|
||||
|
||||
minWordSize = getInt("minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
|
||||
minSubwordSize = getInt("minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
|
||||
maxSubwordSize = getInt("maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
|
||||
onlyLongestMatch = getBoolean("onlyLongestMatch", false);
|
||||
}
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
Reader reader = null;
|
||||
try {
|
||||
if (dictFile != null) // the dictionary can be empty.
|
||||
dictionary = getWordSet(loader, dictFile, false);
|
||||
|
||||
InputStream hyph = loader.openResource(hypFile);
|
||||
reader = new InputStreamReader(hyph, encoding);
|
||||
hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader);
|
||||
} catch (Exception e) { // TODO: getHyphenationTree really shouldnt throw "Exception"
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public HyphenationCompoundWordTokenFilter create(TokenStream input) {
|
||||
return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Hyphenation compound filter factory is working.
|
||||
*/
|
||||
public class TestHyphenationCompoundWordTokenFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the factory works with hyphenation grammar+dictionary: using default options.
|
||||
*/
|
||||
public void testHyphenationWithDictionary() throws Exception {
|
||||
Reader reader = new StringReader("min veninde som er lidt af en læsehest");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||
HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
|
||||
ResourceLoader loader = new SolrResourceLoader(null, null);
|
||||
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
|
||||
args.put("hyphenator", "da_UTF8.xml");
|
||||
args.put("dictionary", "da_compoundDictionary.txt");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure the factory works with no dictionary: using hyphenation grammar only.
|
||||
* Also change the min/max subword sizes from the default. When using no dictionary,
|
||||
* its generally necessary to tweak these, or you get lots of expansions.
|
||||
*/
|
||||
public void testHyphenationOnly() throws Exception {
|
||||
Reader reader = new StringReader("basketballkurv");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
|
||||
HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
|
||||
ResourceLoader loader = new SolrResourceLoader(null, null);
|
||||
Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
|
||||
args.put("hyphenator", "da_UTF8.xml");
|
||||
args.put("minSubwordSize", "2");
|
||||
args.put("maxSubwordSize", "4");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
|
||||
);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,19 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# A set of words for testing the HyphenationCompound factory,
|
||||
# in conjunction with the danish hyphenation grammar.
|
||||
læse
|
||||
hest
|
|
@ -0,0 +1,68 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!--
|
||||
Copyright 1999-2004 The Apache Software Foundation
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
|
||||
|
||||
<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
|
||||
classes, exceptions?, patterns)>
|
||||
|
||||
<!-- Hyphen character to be used in the exception list as shortcut for
|
||||
<hyphen pre-break="-"/>. Defaults to '-'
|
||||
-->
|
||||
<!ELEMENT hyphen-char EMPTY>
|
||||
<!ATTLIST hyphen-char value CDATA #REQUIRED>
|
||||
|
||||
<!-- Default minimum length in characters of hyphenated word fragments
|
||||
before and after the line break. For some languages this is not
|
||||
only for aesthetic purposes, wrong hyphens may be generated if this
|
||||
is not accounted for.
|
||||
-->
|
||||
<!ELEMENT hyphen-min EMPTY>
|
||||
<!ATTLIST hyphen-min before CDATA #REQUIRED>
|
||||
<!ATTLIST hyphen-min after CDATA #REQUIRED>
|
||||
|
||||
<!-- Character equivalent classes: space separated list of character groups, all
|
||||
characters in a group are to be treated equivalent as far as
|
||||
the hyphenation algorithm is concerned. The first character in a group
|
||||
is the group's equivalent character. Patterns should only contain
|
||||
first characters. It also defines word characters, i.e. a word that
|
||||
contains characters not present in any of the classes is not hyphenated.
|
||||
-->
|
||||
<!ELEMENT classes (#PCDATA)>
|
||||
|
||||
<!-- Hyphenation exceptions: space separated list of hyphenated words.
|
||||
A hyphen is indicated by the hyphen tag, but you can use the
|
||||
hyphen-char defined previously as shortcut. This is in cases
|
||||
when the algorithm procedure finds wrong hyphens or you want
|
||||
to provide your own hyphenation for some words.
|
||||
-->
|
||||
<!ELEMENT exceptions (#PCDATA|hyphen)* >
|
||||
|
||||
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
|
||||
characters as described before, between any two word characters a digit
|
||||
in the range 0 to 9 may be specified. The absence of a digit is equivalent
|
||||
to zero. The '.' character is reserved to indicate beginning or ending
|
||||
of words. -->
|
||||
<!ELEMENT patterns (#PCDATA)>
|
||||
|
||||
<!-- A "full hyphen" equivalent to TeX's \discretionary
|
||||
with pre-break, post-break and no-break attributes.
|
||||
To be used in the exceptions list, the hyphen character is not
|
||||
automatically added -->
|
||||
<!ELEMENT hyphen EMPTY>
|
||||
<!ATTLIST hyphen pre CDATA #IMPLIED>
|
||||
<!ATTLIST hyphen no CDATA #IMPLIED>
|
||||
<!ATTLIST hyphen post CDATA #IMPLIED>
|
Loading…
Reference in New Issue