LUCENE-5468: Stem -> CharsRef

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-02-25 20:07:05 +00:00
parent 48f5564450
commit caaa01d220
4 changed files with 28 additions and 131 deletions
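
In short, this patch replaces the dedicated Stem class with org.apache.lucene.util.CharsRef throughout the hunspell2 stemmer. A minimal before/after sketch of the consumer-side change, using the attribute and variable names exactly as they appear in the filter diff below:

// Before: Stem exposed a raw char[] plus a valid length that had to be copied manually.
Stem nextStem = buffer.remove(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
termAtt.setLength(nextStem.getStemLength());

// After: CharsRef implements CharSequence, so it can be appended to the term attribute directly.
CharsRef nextStem = buffer.remove(0);
termAtt.setEmpty().append(nextStem);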

org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter {
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final Stemmer stemmer;
private List<Stem> buffer;
private List<CharsRef> buffer;
private State savedState;
private final boolean dedup;
@@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter {
@Override
public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
Stem nextStem = buffer.remove(0);
CharsRef nextStem = buffer.remove(0);
restoreState(savedState);
posIncAtt.setPositionIncrement(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
termAtt.setLength(nextStem.getStemLength());
termAtt.setEmpty().append(nextStem);
return true;
}
@@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter {
return true;
}
Stem stem = buffer.remove(0);
termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
termAtt.setLength(stem.getStemLength());
CharsRef stem = buffer.remove(0);
termAtt.setEmpty().append(stem);
if (!buffer.isEmpty()) {
savedState = captureState();

org/apache/lucene/analysis/hunspell2/Stem.java (deleted)

@@ -1,98 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
/**
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
* that were used to change the word into the stem.
*/
final class Stem {
final List<Affix> prefixes = new ArrayList<Affix>();
final List<Affix> suffixes = new ArrayList<Affix>();
final char stem[];
final int stemLength;
/**
* Creates a new Stem wrapping the given word stem
*
* @param stem Stem of a word
*/
public Stem(char stem[], int stemLength) {
this.stem = stem;
this.stemLength = stemLength;
}
/**
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
* depth first, the prefix is added to the front of the list
*
* @param prefix Prefix to add to the list of prefixes for this stem
*/
public void addPrefix(Affix prefix) {
prefixes.add(0, prefix);
}
/**
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
* depth first, the suffix is added to the end of the list
*
* @param suffix Suffix to add to the list of suffixes for this stem
*/
public void addSuffix(Affix suffix) {
suffixes.add(suffix);
}
/**
* Returns the list of prefixes used to generate the stem
*
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
*/
public List<Affix> getPrefixes() {
return prefixes;
}
/**
* Returns the list of suffixes used to generate the stem
*
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
*/
public List<Affix> getSuffixes() {
return suffixes;
}
/**
* Returns the text of the word's stem.
* @see #getStemLength()
*/
public char[] getStem() {
return stem;
}
/** Returns the valid length of the text in {@link #getStem()} */
public int getStemLength() {
return stemLength;
}
/** Only use this if you really need a string (e.g. for testing) */
public String getStemString() {
return new String(stem, 0, stemLength);
}
}

org/apache/lucene/analysis/hunspell2/Stemmer.java

@@ -24,6 +24,7 @@ import java.util.List;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
/**
@@ -63,7 +64,7 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(String word) {
public List<CharsRef> stem(String word) {
return stem(word.toCharArray(), word.length());
}
@@ -73,10 +74,10 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
public List<CharsRef> stem(char word[], int length) {
List<CharsRef> stems = new ArrayList<CharsRef>();
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length));
stems.add(new CharsRef(word, 0, length));
}
stems.addAll(stem(word, length, null, 0));
return stems;
@@ -88,18 +89,18 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
public List<CharsRef> uniqueStems(char word[], int length) {
List<CharsRef> stems = new ArrayList<CharsRef>();
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length));
stems.add(new CharsRef(word, 0, length));
terms.add(word);
}
List<Stem> otherStems = stem(word, length, null, 0);
for (Stem s : otherStems) {
if (!terms.contains(s.stem)) {
List<CharsRef> otherStems = stem(word, length, null, 0);
for (CharsRef s : otherStems) {
if (!terms.contains(s)) {
stems.add(s);
terms.add(s.stem);
terms.add(s);
}
}
return stems;
@@ -115,8 +116,8 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems, or empty list if no stems are found
*/
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
List<Stem> stems = new ArrayList<Stem>();
private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
List<CharsRef> stems = new ArrayList<CharsRef>();
for (int i = 0; i < length; i++) {
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@@ -131,10 +132,7 @@ final class Stemmer {
// TODO: can we do this in-place?
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
for (Stem stem : stemList) {
stem.addSuffix(suffix);
}
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
stems.addAll(stemList);
}
@@ -156,10 +154,7 @@ final class Stemmer {
.append(word, deAffixedStart, deAffixedLength)
.toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
for (Stem stem : stemList) {
stem.addPrefix(prefix);
}
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
stems.addAll(stemList);
}
@@ -177,18 +172,18 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) {
return Collections.emptyList();
}
List<Stem> stems = new ArrayList<Stem>();
List<CharsRef> stems = new ArrayList<CharsRef>();
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
stems.add(new Stem(strippedWord, length));
stems.add(new CharsRef(strippedWord, 0, length));
}
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
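
For callers of Stemmer, only the element type of the returned list changes. A hypothetical usage sketch (the word value and the printing loop are assumptions; the uniqueStems signature and the CharsRef.toString() conversion come from the diffs in this commit):

// Hypothetical caller of the updated API: Stemmer now hands back CharsRef values.
String word = "drinkable"; // assumed example input
List<CharsRef> stems = stemmer.uniqueStems(word.toCharArray(), word.length());
for (CharsRef stem : stems) {
  // CharsRef.toString() materializes the stem text, taking over from Stem.getStemString().
  System.out.println(stem.toString());
}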

org/apache/lucene/analysis/hunspell2/TestStemmer.java

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License.
*/
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase {
private void assertStemsTo(String s, String... expected) {
Arrays.sort(expected);
List<Stem> stems = stemmer.stem(s);
List<CharsRef> stems = stemmer.stem(s);
String actual[] = new String[stems.size()];
for (int i = 0; i < actual.length; i++) {
actual[i] = stems.get(i).getStemString();
actual[i] = stems.get(i).toString();
}
Arrays.sort(actual);