LUCENE-5468: Stem -> CharsRef

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-02-25 20:07:05 +00:00
parent 48f5564450
commit caaa01d220
4 changed files with 28 additions and 131 deletions
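
In short, this patch replaces the dedicated Stem class with org.apache.lucene.util.CharsRef throughout the hunspell2 stemmer. A minimal before/after sketch of the consumer-side change, using the attribute and variable names exactly as they appear in the filter diff below:

// Before: Stem exposed a raw char[] plus a valid length that had to be copied manually.
Stem nextStem = buffer.remove(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
termAtt.setLength(nextStem.getStemLength());

// After: CharsRef implements CharSequence, so it can be appended to the term attribute directly.
CharsRef nextStem = buffer.remove(0);
termAtt.setEmpty().append(nextStem);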

org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java

@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter {
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final Stemmer stemmer;
private List<Stem> buffer;
private List<CharsRef> buffer;
private State savedState;
private final boolean dedup;
@@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter {
@Override
public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
Stem nextStem = buffer.remove(0);
CharsRef nextStem = buffer.remove(0);
restoreState(savedState);
posIncAtt.setPositionIncrement(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
termAtt.setLength(nextStem.getStemLength());
termAtt.setEmpty().append(nextStem);
return true;
}
@@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter {
return true;
}
Stem stem = buffer.remove(0);
termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
termAtt.setLength(stem.getStemLength());
CharsRef stem = buffer.remove(0);
termAtt.setEmpty().append(stem);
if (!buffer.isEmpty()) {
savedState = captureState();

org/apache/lucene/analysis/hunspell2/Stem.java (deleted)

@@ -1,98 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
/**
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
* that were used to change the word into the stem.
*/
final class Stem {
final List<Affix> prefixes = new ArrayList<Affix>();
final List<Affix> suffixes = new ArrayList<Affix>();
final char stem[];
final int stemLength;
/**
* Creates a new Stem wrapping the given word stem
*
* @param stem Stem of a word
*/
public Stem(char stem[], int stemLength) {
this.stem = stem;
this.stemLength = stemLength;
}
/**
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
* depth first, the prefix is added to the front of the list
*
* @param prefix Prefix to add to the list of prefixes for this stem
*/
public void addPrefix(Affix prefix) {
prefixes.add(0, prefix);
}
/**
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
* depth first, the suffix is added to the end of the list
*
* @param suffix Suffix to add to the list of suffixes for this stem
*/
public void addSuffix(Affix suffix) {
suffixes.add(suffix);
}
/**
* Returns the list of prefixes used to generate the stem
*
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
*/
public List<Affix> getPrefixes() {
return prefixes;
}
/**
* Returns the list of suffixes used to generate the stem
*
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
*/
public List<Affix> getSuffixes() {
return suffixes;
}
/**
* Returns the text of the word's stem.
* @see #getStemLength()
*/
public char[] getStem() {
return stem;
}
/** Returns the valid length of the text in {@link #getStem()} */
public int getStemLength() {
return stemLength;
}
/** Only use this if you really need a string (e.g. for testing) */
public String getStemString() {
return new String(stem, 0, stemLength);
}
}

org/apache/lucene/analysis/hunspell2/Stemmer.java

@@ -24,6 +24,7 @@ import java.util.List;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
/**
@@ -63,7 +64,7 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(String word) {
public List<CharsRef> stem(String word) {
return stem(word.toCharArray(), word.length());
}
@@ -73,10 +74,10 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
public List<CharsRef> stem(char word[], int length) {
List<CharsRef> stems = new ArrayList<CharsRef>();
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length));
stems.add(new CharsRef(word, 0, length));
}
stems.addAll(stem(word, length, null, 0));
return stems;
@@ -88,18 +89,18 @@ final class Stemmer {
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
public List<CharsRef> uniqueStems(char word[], int length) {
List<CharsRef> stems = new ArrayList<CharsRef>();
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length));
stems.add(new CharsRef(word, 0, length));
terms.add(word);
}
List<Stem> otherStems = stem(word, length, null, 0);
for (Stem s : otherStems) {
if (!terms.contains(s.stem)) {
List<CharsRef> otherStems = stem(word, length, null, 0);
for (CharsRef s : otherStems) {
if (!terms.contains(s)) {
stems.add(s);
terms.add(s.stem);
terms.add(s);
}
}
return stems;
@@ -115,8 +116,8 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems, or empty list if no stems are found
*/
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
List<Stem> stems = new ArrayList<Stem>();
private List<CharsRef> stem(char word[], int length, char[] flags, int recursionDepth) {
List<CharsRef> stems = new ArrayList<CharsRef>();
for (int i = 0; i < length; i++) {
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
@@ -131,10 +132,7 @@ final class Stemmer {
// TODO: can we do this in-place?
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
for (Stem stem : stemList) {
stem.addSuffix(suffix);
}
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
stems.addAll(stemList);
}
@@ -156,10 +154,7 @@ final class Stemmer {
.append(word, deAffixedStart, deAffixedLength)
.toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
for (Stem stem : stemList) {
stem.addPrefix(prefix);
}
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
stems.addAll(stemList);
}
@@ -177,18 +172,18 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) {
return Collections.emptyList();
}
List<Stem> stems = new ArrayList<Stem>();
List<CharsRef> stems = new ArrayList<CharsRef>();
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
stems.add(new Stem(strippedWord, length));
stems.add(new CharsRef(strippedWord, 0, length));
}
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
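
For callers of Stemmer, only the element type of the returned list changes. A hypothetical usage sketch (the word value and the printing loop are assumptions; the uniqueStems signature and the CharsRef.toString() conversion come from the diffs in this commit):

// Hypothetical caller of the updated API: Stemmer now hands back CharsRef values.
String word = "drinkable"; // assumed example input
List<CharsRef> stems = stemmer.uniqueStems(word.toCharArray(), word.length());
for (CharsRef stem : stems) {
  // CharsRef.toString() materializes the stem text, taking over from Stem.getStemString().
  System.out.println(stem.toString());
}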

org/apache/lucene/analysis/hunspell2/TestStemmer.java

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License.
*/
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase {
private void assertStemsTo(String s, String... expected) {
Arrays.sort(expected);
List<Stem> stems = stemmer.stem(s);
List<CharsRef> stems = stemmer.stem(s);
String actual[] = new String[stems.size()];
for (int i = 0; i < actual.length; i++) {
actual[i] = stems.get(i).getStemString();
actual[i] = stems.get(i).toString();
}
Arrays.sort(actual);