mirror of https://github.com/apache/lucene.git
SOLR-3020: Add KeywordAttribute support to HunspellStemFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1229519 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
56a28c69f8
commit
f19317d318
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
|
import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -34,6 +35,7 @@ public final class HunspellStemFilter extends TokenFilter {
|
||||||
|
|
||||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||||
private final HunspellStemmer stemmer;
|
private final HunspellStemmer stemmer;
|
||||||
|
|
||||||
private List<Stem> buffer;
|
private List<Stem> buffer;
|
||||||
|
@ -84,6 +86,10 @@ public final class HunspellStemFilter extends TokenFilter {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (keywordAtt.isKeyword()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
|
buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
|
||||||
if (buffer.isEmpty()) { // we do not know this word, return it unchanged
|
if (buffer.isEmpty()) { // we do not know this word, return it unchanged
|
||||||
|
|
|
@ -0,0 +1,61 @@
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StringReader;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
|
public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
private static HunspellDictionary DICTIONARY;
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeClass() throws IOException, ParseException {
|
||||||
|
DICTIONARY = createDict(true);
|
||||||
|
}
|
||||||
|
public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException {
|
||||||
|
InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
|
||||||
|
InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
|
||||||
|
|
||||||
|
return new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40, ignoreCase);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple test for KeywordAttribute
|
||||||
|
*/
|
||||||
|
public void testKeywordAttribute() throws IOException {
|
||||||
|
MockTokenizer tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
|
||||||
|
tokenizer.setEnableChecks(true);
|
||||||
|
HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY);
|
||||||
|
assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
|
||||||
|
|
||||||
|
// assert with keywork marker
|
||||||
|
tokenizer = new MockTokenizer(new StringReader("lucene is awesome"), MockTokenizer.WHITESPACE, true);
|
||||||
|
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
|
||||||
|
filter = new HunspellStemFilter(new KeywordMarkerFilter(tokenizer, set), DICTIONARY);
|
||||||
|
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue