LUCENE-5468: Stem -> CharsRef

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-02-25 20:07:05 +00:00
parent 48f5564450
commit caaa01d220
4 changed files with 28 additions and 131 deletions

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
/** /**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter {
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final Stemmer stemmer; private final Stemmer stemmer;
private List<Stem> buffer; private List<CharsRef> buffer;
private State savedState; private State savedState;
private final boolean dedup; private final boolean dedup;
@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter {
@Override @Override
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) { if (buffer != null && !buffer.isEmpty()) {
Stem nextStem = buffer.remove(0); CharsRef nextStem = buffer.remove(0);
restoreState(savedState); restoreState(savedState);
posIncAtt.setPositionIncrement(0); posIncAtt.setPositionIncrement(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); termAtt.setEmpty().append(nextStem);
termAtt.setLength(nextStem.getStemLength());
return true; return true;
} }
@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter {
return true; return true;
} }
Stem stem = buffer.remove(0); CharsRef stem = buffer.remove(0);
termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); termAtt.setEmpty().append(stem);
termAtt.setLength(stem.getStemLength());
if (!buffer.isEmpty()) { if (!buffer.isEmpty()) {
savedState = captureState(); savedState = captureState();

View File

@ -1,98 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
/**
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
* that were used to change the word into the stem.
*/
final class Stem {
final List<Affix> prefixes = new ArrayList<Affix>();
final List<Affix> suffixes = new ArrayList<Affix>();
final char stem[];
final int stemLength;
/**
* Creates a new Stem wrapping the given word stem
*
* @param stem Stem of a word
*/
public Stem(char stem[], int stemLength) {
this.stem = stem;
this.stemLength = stemLength;
}
/**
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
* depth first, the prefix is added to the front of the list
*
* @param prefix Prefix to add to the list of prefixes for this stem
*/
public void addPrefix(Affix prefix) {
prefixes.add(0, prefix);
}
/**
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
* depth first, the suffix is added to the end of the list
*
* @param suffix Suffix to add to the list of suffixes for this stem
*/
public void addSuffix(Affix suffix) {
suffixes.add(suffix);
}
/**
* Returns the list of prefixes used to generate the stem
*
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
*/
public List<Affix> getPrefixes() {
return prefixes;
}
/**
* Returns the list of suffixes used to generate the stem
*
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
*/
public List<Affix> getSuffixes() {
return suffixes;
}
/**
* Returns the text of the word's stem.
* @see #getStemLength()
*/
public char[] getStem() {
return stem;
}
/** Returns the valid length of the text in {@link #getStem()} */
public int getStemLength() {
return stemLength;
}
/** Only use this if you really need a string (e.g. for testing) */
public String getStemString() {
return new String(stem, 0, stemLength);
}
}

View File

@ -24,6 +24,7 @@ import java.util.List;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
/** /**
@ -63,7 +64,7 @@ final class Stemmer {
* @param word Word to find the stems for * @param word Word to find the stems for
* @return List of stems for the word * @return List of stems for the word
*/ */
public List<Stem> stem(String word) { public List<CharsRef> stem(String word) {
return stem(word.toCharArray(), word.length()); return stem(word.toCharArray(), word.length());
} }
@ -73,10 +74,10 @@ final class Stemmer {
* @param word Word to find the stems for * @param word Word to find the stems for
* @return List of stems for the word * @return List of stems for the word
*/ */
public List<Stem> stem(char word[], int length) { public List<CharsRef> stem(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>(); List<CharsRef> stems = new ArrayList<CharsRef>();
if (dictionary.lookupWord(word, 0, length, scratch) != null) { if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length)); stems.add(new CharsRef(word, 0, length));
} }
stems.addAll(stem(word, length, null, 0)); stems.addAll(stem(word, length, null, 0));
return stems; return stems;
@ -88,18 +89,18 @@ final class Stemmer {
* @param word Word to find the stems for * @param word Word to find the stems for
* @return List of stems for the word * @return List of stems for the word
*/ */
public List<Stem> uniqueStems(char word[], int length) { public List<CharsRef> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>(); List<CharsRef> stems = new ArrayList<CharsRef>();
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false); CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
if (dictionary.lookupWord(word, 0, length, scratch) != null) { if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length)); stems.add(new CharsRef(word, 0, length));
terms.add(word); terms.add(word);
} }
List<Stem> otherStems = stem(word, length, null, 0); List<CharsRef> otherStems = stem(word, length, null, 0);
for (Stem s : otherStems) { for (CharsRef s : otherStems) {
if (!terms.contains(s.stem)) { if (!terms.contains(s)) {
stems.add(s); stems.add(s);
terms.add(s.stem); terms.add(s);
} }
} }
return stems; return stems;
@ -115,8 +116,8 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at * @param recursionDepth Level of recursion this stemming step is at
* @return List of stems, or empty list if no stems are found * @return List of stems, or empty list if no stems are found
*/ */
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) { private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
List<Stem> stems = new ArrayList<Stem>(); List<CharsRef> stems = new ArrayList<CharsRef>();
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i); List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@ -131,10 +132,7 @@ final class Stemmer {
// TODO: can we do this in-place? // TODO: can we do this in-place?
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
for (Stem stem : stemList) {
stem.addSuffix(suffix);
}
stems.addAll(stemList); stems.addAll(stemList);
} }
@ -156,10 +154,7 @@ final class Stemmer {
.append(word, deAffixedStart, deAffixedLength) .append(word, deAffixedStart, deAffixedLength)
.toString(); .toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
for (Stem stem : stemList) {
stem.addPrefix(prefix);
}
stems.addAll(stemList); stems.addAll(stemList);
} }
@ -177,18 +172,18 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at * @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found * @return List of stems for the word, or an empty list if none are found
*/ */
public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
segment.setLength(0); segment.setLength(0);
segment.append(strippedWord, 0, length); segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) { if (!affix.checkCondition(segment)) {
return Collections.emptyList(); return Collections.emptyList();
} }
List<Stem> stems = new ArrayList<Stem>(); List<CharsRef> stems = new ArrayList<CharsRef>();
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch); char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) { if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
stems.add(new Stem(strippedWord, length)); stems.add(new CharsRef(strippedWord, 0, length));
} }
if (affix.isCrossProduct() && recursionDepth < recursionCap) { if (affix.isCrossProduct() && recursionDepth < recursionCap) {

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase {
private void assertStemsTo(String s, String... expected) { private void assertStemsTo(String s, String... expected) {
Arrays.sort(expected); Arrays.sort(expected);
List<Stem> stems = stemmer.stem(s); List<CharsRef> stems = stemmer.stem(s);
String actual[] = new String[stems.size()]; String actual[] = new String[stems.size()];
for (int i = 0; i < actual.length; i++) { for (int i = 0; i < actual.length; i++) {
actual[i] = stems.get(i).getStemString(); actual[i] = stems.get(i).toString();
} }
Arrays.sort(actual); Arrays.sort(actual);