mirror of https://github.com/apache/lucene.git
LUCENE-5468: Stem -> CharsRef
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68
parent 48f5564450
commit caaa01d220
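LUCENE-5468 drops the hunspell2 Stem wrapper class and makes Stemmer return plain org.apache.lucene.util.CharsRef instances, so Hunspell2StemFilter can append the stem text straight onto the term attribute instead of copying a char[] and setting its length. A minimal caller-side sketch of the migration follows; the helper class and method names are illustrative only, while List<CharsRef>, CharsRef.toString(), and termAtt.setEmpty().append(...) come from the diff itself.

import java.util.List;
import org.apache.lucene.util.CharsRef;

// Illustrative helper, not part of the commit: shows how code that used to call
// Stem.getStem()/getStemLength()/getStemString() consumes CharsRef results instead.
class StemMigrationSketch {
  static String[] stemStrings(List<CharsRef> stems) {
    String[] out = new String[stems.size()];
    for (int i = 0; i < stems.size(); i++) {
      // Before: actual[i] = stems.get(i).getStemString();
      // After:  CharsRef.toString() materializes the chars/offset/length window.
      out[i] = stems.get(i).toString();
    }
    return out;
  }
}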
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.CharsRef;
 
 /**
  * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter {
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
   private final Stemmer stemmer;
   
-  private List<Stem> buffer;
+  private List<CharsRef> buffer;
   private State savedState;
   
   private final boolean dedup;
@@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (buffer != null && !buffer.isEmpty()) {
-      Stem nextStem = buffer.remove(0);
+      CharsRef nextStem = buffer.remove(0);
       restoreState(savedState);
       posIncAtt.setPositionIncrement(0);
-      termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
-      termAtt.setLength(nextStem.getStemLength());
+      termAtt.setEmpty().append(nextStem);
       return true;
     }
     
@@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter {
       return true;
     }
     
-    Stem stem = buffer.remove(0);
-    termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
-    termAtt.setLength(stem.getStemLength());
+    CharsRef stem = buffer.remove(0);
+    termAtt.setEmpty().append(stem);
     
     if (!buffer.isEmpty()) {
       savedState = captureState();
@@ -1,98 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.List;
-
-/**
- * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
- * that were used to change the word into the stem.
- */
-final class Stem {
-  final List<Affix> prefixes = new ArrayList<Affix>();
-  final List<Affix> suffixes = new ArrayList<Affix>();
-  final char stem[];
-  final int stemLength;
-
-  /**
-   * Creates a new Stem wrapping the given word stem
-   *
-   * @param stem Stem of a word
-   */
-  public Stem(char stem[], int stemLength) {
-    this.stem = stem;
-    this.stemLength = stemLength;
-  }
-
-  /**
-   * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
-   * depth first, the prefix is added to the front of the list
-   *
-   * @param prefix Prefix to add to the list of prefixes for this stem
-   */
-  public void addPrefix(Affix prefix) {
-    prefixes.add(0, prefix);
-  }
-
-  /**
-   * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
-   * depth first, the suffix is added to the end of the list
-   *
-   * @param suffix Suffix to add to the list of suffixes for this stem
-   */
-  public void addSuffix(Affix suffix) {
-    suffixes.add(suffix);
-  }
-
-  /**
-   * Returns the list of prefixes used to generate the stem
-   *
-   * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
-   */
-  public List<Affix> getPrefixes() {
-    return prefixes;
-  }
-
-  /**
-   * Returns the list of suffixes used to generate the stem
-   *
-   * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
-   */
-  public List<Affix> getSuffixes() {
-    return suffixes;
-  }
-
-  /**
-   * Returns the text of the word's stem.
-   * @see #getStemLength()
-   */
-  public char[] getStem() {
-    return stem;
-  }
-
-  /** Returns the valid length of the text in {@link #getStem()} */
-  public int getStemLength() {
-    return stemLength;
-  }
-
-  /** Only use this if you really need a string (e.g. for testing) */
-  public String getStemString() {
-    return new String(stem, 0, stemLength);
-  }
-}
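With the Stem class deleted above, the affix bookkeeping it carried (addPrefix/addSuffix, getPrefixes/getSuffixes) disappears as well; the Stemmer hunks below drop the loops that populated it and keep only the stem text. For reference, a hedged sketch of what replaces the removed getStem()/getStemLength()/getStemString() accessors, assuming only CharsRef's public chars/offset/length fields; the helper class name is hypothetical.

import org.apache.lucene.util.CharsRef;

// Illustrative only: a CharsRef already exposes the data the removed accessors returned.
class CharsRefTextSketch {
  static String stemText(CharsRef stem) {
    // Equivalent of the deleted Stem.getStemString(): new String(stem, 0, stemLength)
    return new String(stem.chars, stem.offset, stem.length);
  }
}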
@@ -24,6 +24,7 @@ import java.util.List;
 
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
 
 /**
@@ -63,7 +64,7 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<Stem> stem(String word) {
+  public List<CharsRef> stem(String word) {
     return stem(word.toCharArray(), word.length());
   }
   
@@ -73,10 +74,10 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<Stem> stem(char word[], int length) {
-    List<Stem> stems = new ArrayList<Stem>();
+  public List<CharsRef> stem(char word[], int length) {
+    List<CharsRef> stems = new ArrayList<CharsRef>();
     if (dictionary.lookupWord(word, 0, length, scratch) != null) {
-      stems.add(new Stem(word, length));
+      stems.add(new CharsRef(word, 0, length));
     }
     stems.addAll(stem(word, length, null, 0));
     return stems;
@@ -88,18 +89,18 @@ final class Stemmer {
    * @param word Word to find the stems for
    * @return List of stems for the word
    */
-  public List<Stem> uniqueStems(char word[], int length) {
-    List<Stem> stems = new ArrayList<Stem>();
+  public List<CharsRef> uniqueStems(char word[], int length) {
+    List<CharsRef> stems = new ArrayList<CharsRef>();
     CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
     if (dictionary.lookupWord(word, 0, length, scratch) != null) {
-      stems.add(new Stem(word, length));
+      stems.add(new CharsRef(word, 0, length));
       terms.add(word);
     }
-    List<Stem> otherStems = stem(word, length, null, 0);
-    for (Stem s : otherStems) {
-      if (!terms.contains(s.stem)) {
+    List<CharsRef> otherStems = stem(word, length, null, 0);
+    for (CharsRef s : otherStems) {
+      if (!terms.contains(s)) {
         stems.add(s);
-        terms.add(s.stem);
+        terms.add(s);
       }
     }
     return stems;
@@ -115,8 +116,8 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems, or empty list if no stems are found
    */
-  private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
-    List<Stem> stems = new ArrayList<Stem>();
+  private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
+    List<CharsRef> stems = new ArrayList<CharsRef>();
     
     for (int i = 0; i < length; i++) {
       List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@@ -131,10 +132,7 @@ final class Stemmer {
         // TODO: can we do this in-place?
         String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
         
-        List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
-        for (Stem stem : stemList) {
-          stem.addSuffix(suffix);
-        }
+        List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
         
         stems.addAll(stemList);
       }
@@ -156,10 +154,7 @@ final class Stemmer {
             .append(word, deAffixedStart, deAffixedLength)
             .toString();
         
-        List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
-        for (Stem stem : stemList) {
-          stem.addPrefix(prefix);
-        }
+        List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
         
         stems.addAll(stemList);
       }
@@ -177,18 +172,18 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     if (!affix.checkCondition(segment)) {
       return Collections.emptyList();
     }
     
-    List<Stem> stems = new ArrayList<Stem>();
+    List<CharsRef> stems = new ArrayList<CharsRef>();
     
     char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
     if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
-      stems.add(new Stem(strippedWord, length));
+      stems.add(new CharsRef(strippedWord, 0, length));
     }
     
     if (affix.isCrossProduct() && recursionDepth < recursionCap) {
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2;
  * limitations under the License.
  */
 
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase {
   private void assertStemsTo(String s, String... expected) {
     Arrays.sort(expected);
     
-    List<Stem> stems = stemmer.stem(s);
+    List<CharsRef> stems = stemmer.stem(s);
     String actual[] = new String[stems.size()];
     for (int i = 0; i < actual.length; i++) {
-      actual[i] = stems.get(i).getStemString();
+      actual[i] = stems.get(i).toString();
     }
     Arrays.sort(actual);
     