mirror of https://github.com/apache/lucene.git
LUCENE-5468: Stem -> CharsRef
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
48f5564450
commit
caaa01d220
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
/**
|
||||
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
|
||||
|
@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter {
|
|||
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
|
||||
private final Stemmer stemmer;
|
||||
|
||||
private List<Stem> buffer;
|
||||
private List<CharsRef> buffer;
|
||||
private State savedState;
|
||||
|
||||
private final boolean dedup;
|
||||
|
@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter {
|
|||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (buffer != null && !buffer.isEmpty()) {
|
||||
Stem nextStem = buffer.remove(0);
|
||||
CharsRef nextStem = buffer.remove(0);
|
||||
restoreState(savedState);
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
|
||||
termAtt.setLength(nextStem.getStemLength());
|
||||
termAtt.setEmpty().append(nextStem);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter {
|
|||
return true;
|
||||
}
|
||||
|
||||
Stem stem = buffer.remove(0);
|
||||
termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
|
||||
termAtt.setLength(stem.getStemLength());
|
||||
CharsRef stem = buffer.remove(0);
|
||||
termAtt.setEmpty().append(stem);
|
||||
|
||||
if (!buffer.isEmpty()) {
|
||||
savedState = captureState();
|
||||
|
|
|
@ -1,98 +0,0 @@
|
|||
package org.apache.lucene.analysis.hunspell2;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
|
||||
* that were used to change the word into the stem.
|
||||
*/
|
||||
final class Stem {
|
||||
final List<Affix> prefixes = new ArrayList<Affix>();
|
||||
final List<Affix> suffixes = new ArrayList<Affix>();
|
||||
final char stem[];
|
||||
final int stemLength;
|
||||
|
||||
/**
|
||||
* Creates a new Stem wrapping the given word stem
|
||||
*
|
||||
* @param stem Stem of a word
|
||||
*/
|
||||
public Stem(char stem[], int stemLength) {
|
||||
this.stem = stem;
|
||||
this.stemLength = stemLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
|
||||
* depth first, the prefix is added to the front of the list
|
||||
*
|
||||
* @param prefix Prefix to add to the list of prefixes for this stem
|
||||
*/
|
||||
public void addPrefix(Affix prefix) {
|
||||
prefixes.add(0, prefix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
|
||||
* depth first, the suffix is added to the end of the list
|
||||
*
|
||||
* @param suffix Suffix to add to the list of suffixes for this stem
|
||||
*/
|
||||
public void addSuffix(Affix suffix) {
|
||||
suffixes.add(suffix);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of prefixes used to generate the stem
|
||||
*
|
||||
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
|
||||
*/
|
||||
public List<Affix> getPrefixes() {
|
||||
return prefixes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of suffixes used to generate the stem
|
||||
*
|
||||
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
|
||||
*/
|
||||
public List<Affix> getSuffixes() {
|
||||
return suffixes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the text of the word's stem.
|
||||
* @see #getStemLength()
|
||||
*/
|
||||
public char[] getStem() {
|
||||
return stem;
|
||||
}
|
||||
|
||||
/** Returns the valid length of the text in {@link #getStem()} */
|
||||
public int getStemLength() {
|
||||
return stemLength;
|
||||
}
|
||||
|
||||
/** Only use this if you really need a string (e.g. for testing) */
|
||||
public String getStemString() {
|
||||
return new String(stem, 0, stemLength);
|
||||
}
|
||||
}
|
|
@ -24,6 +24,7 @@ import java.util.List;
|
|||
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
|
@ -63,7 +64,7 @@ final class Stemmer {
|
|||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<Stem> stem(String word) {
|
||||
public List<CharsRef> stem(String word) {
|
||||
return stem(word.toCharArray(), word.length());
|
||||
}
|
||||
|
||||
|
@ -73,10 +74,10 @@ final class Stemmer {
|
|||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<Stem> stem(char word[], int length) {
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
public List<CharsRef> stem(char word[], int length) {
|
||||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
|
||||
stems.add(new Stem(word, length));
|
||||
stems.add(new CharsRef(word, 0, length));
|
||||
}
|
||||
stems.addAll(stem(word, length, null, 0));
|
||||
return stems;
|
||||
|
@ -88,18 +89,18 @@ final class Stemmer {
|
|||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<Stem> uniqueStems(char word[], int length) {
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
public List<CharsRef> uniqueStems(char word[], int length) {
|
||||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
|
||||
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
|
||||
stems.add(new Stem(word, length));
|
||||
stems.add(new CharsRef(word, 0, length));
|
||||
terms.add(word);
|
||||
}
|
||||
List<Stem> otherStems = stem(word, length, null, 0);
|
||||
for (Stem s : otherStems) {
|
||||
if (!terms.contains(s.stem)) {
|
||||
List<CharsRef> otherStems = stem(word, length, null, 0);
|
||||
for (CharsRef s : otherStems) {
|
||||
if (!terms.contains(s)) {
|
||||
stems.add(s);
|
||||
terms.add(s.stem);
|
||||
terms.add(s);
|
||||
}
|
||||
}
|
||||
return stems;
|
||||
|
@ -115,8 +116,8 @@ final class Stemmer {
|
|||
* @param recursionDepth Level of recursion this stemming step is at
|
||||
* @return List of stems, or empty list if no stems are found
|
||||
*/
|
||||
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
|
||||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
|
@ -131,10 +132,7 @@ final class Stemmer {
|
|||
// TODO: can we do this in-place?
|
||||
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
|
||||
|
||||
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
|
||||
for (Stem stem : stemList) {
|
||||
stem.addSuffix(suffix);
|
||||
}
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
|
||||
|
||||
stems.addAll(stemList);
|
||||
}
|
||||
|
@ -156,10 +154,7 @@ final class Stemmer {
|
|||
.append(word, deAffixedStart, deAffixedLength)
|
||||
.toString();
|
||||
|
||||
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
|
||||
for (Stem stem : stemList) {
|
||||
stem.addPrefix(prefix);
|
||||
}
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
|
||||
|
||||
stems.addAll(stemList);
|
||||
}
|
||||
|
@ -177,18 +172,18 @@ final class Stemmer {
|
|||
* @param recursionDepth Level of recursion this stemming step is at
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
|
||||
public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
|
||||
segment.setLength(0);
|
||||
segment.append(strippedWord, 0, length);
|
||||
if (!affix.checkCondition(segment)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
|
||||
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
|
||||
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
|
||||
stems.add(new Stem(strippedWord, length));
|
||||
stems.add(new CharsRef(strippedWord, 0, length));
|
||||
}
|
||||
|
||||
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase {
|
|||
private void assertStemsTo(String s, String... expected) {
|
||||
Arrays.sort(expected);
|
||||
|
||||
List<Stem> stems = stemmer.stem(s);
|
||||
List<CharsRef> stems = stemmer.stem(s);
|
||||
String actual[] = new String[stems.size()];
|
||||
for (int i = 0; i < actual.length; i++) {
|
||||
actual[i] = stems.get(i).getStemString();
|
||||
actual[i] = stems.get(i).toString();
|
||||
}
|
||||
Arrays.sort(actual);
|
||||
|
||||
|
|
Loading…
Reference in New Issue