mirror of https://github.com/apache/lucene.git
LUCENE-3889: remove unnecessary/unused base class
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303026 13f79535-47bb-0310-9956-ffa450edef68
Parent: 7f8076fefc
Commit: b7a7e5a625
CHANGES.txt
@@ -189,11 +189,6 @@ API Changes
   not take position overlaps into account while recalculating norms.
   (Uwe Schindler, Robert Muir)

-* LUCENE-3305: Added SegmentingTokenizerBase, which breaks text into sentences
-  with BreakIterator and allows subclasses to decompose sentences into words, or
-  use the sentence boundary information for other reasons (e.g. attribute/position increment)
-  (Robert Muir)
-
 Changes in runtime behavior

 * LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work
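The removed entry above describes sentence segmentation via java.text.BreakIterator. For reference, a minimal standalone sketch of that JDK API (the class name and demo strings are illustrative, not part of this commit); on the quoted sentence it reports the same two boundaries the test below asserts:

import java.text.BreakIterator;
import java.util.Locale;

public class SentenceBreakDemo {
  public static void main(String[] args) {
    // A sentence instance reports boundaries between sentences.
    BreakIterator it = BreakIterator.getSentenceInstance(Locale.ROOT);
    String text = "He said, \"Are you going?\" John shook his head.";
    it.setText(text);
    int start = it.first();
    for (int end = it.next(); end != BreakIterator.DONE; start = end, end = it.next()) {
      System.out.println("[" + text.substring(start, end) + "]");
    }
    // Prints: [He said, "Are you going?" ] then [John shook his head.]
  }
}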
org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (deleted)
@@ -1,180 +0,0 @@
package org.apache.lucene.analysis.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * Breaks text into sentences with a {@link BreakIterator} and
 * allows subclasses to decompose these sentences into words.
 * <p>
 * This can be used by subclasses that need sentence context
 * for tokenization purposes, such as CJK segmenters.
 * <p>
 * Additionally it can be used by subclasses that want to mark
 * sentence boundaries (with a custom attribute, extra token, position
 * increment, etc.) for downstream processing.
 *
 * @lucene.experimental
 */
public abstract class SegmentingTokenizerBase extends Tokenizer {
  protected static final int BUFFERMAX = 4096;
  protected final char buffer[] = new char[BUFFERMAX];
  /** true length of text in the buffer */
  private int length = 0;
  /** length in buffer that can be evaluated safely, up to a safe end point */
  private int usableLength = 0;
  /** accumulated offset of previous buffers for this reader, for offsetAtt */
  protected int offset = 0;

  private final BreakIterator iterator;
  private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /**
   * Constructs a new SegmentingTokenizerBase from the given Reader, using
   * the provided BreakIterator for sentence segmentation.
   * <p>
   * Note that you should never share BreakIterators across different
   * TokenStreams; instead, a newly created or cloned one should always
   * be provided to this constructor.
   */
  public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
    super(input);
    this.iterator = iterator;
  }
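  // Why BreakIterators must not be shared: a BreakIterator is stateful.
  // setText(), current(), and next() all mutate its internal position, so
  // two TokenStreams driving one instance would corrupt each other's
  // iteration state mid-stream.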
  @Override
  public final boolean incrementToken() throws IOException {
    if (length == 0 || !incrementWord()) {
      while (!incrementSentence()) {
        refill();
        if (length <= 0) // no more chars to read
          return false;
      }
    }

    return true;
  }

  @Override
  public void reset() throws IOException {
    wrapper.setText(buffer, 0, 0);
    iterator.setText(wrapper);
    length = usableLength = offset = 0;
  }

  @Override
  public void reset(Reader input) throws IOException {
    this.input = input;
    reset();
  }

  @Override
  public final void end() throws IOException {
    final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** Returns the last unambiguous break position in the text. */
  private int findSafeEnd() {
    for (int i = length - 1; i >= 0; i--)
      if (isSafeEnd(buffer[i]))
        return i + 1;
    return -1;
  }

  /** For sentence tokenization, these are the unambiguous break positions. */
  protected boolean isSafeEnd(char ch) {
    switch(ch) {
      case 0x000D: // CARRIAGE RETURN
      case 0x000A: // LINE FEED
      case 0x0085: // NEXT LINE
      case 0x2028: // LINE SEPARATOR
      case 0x2029: // PARAGRAPH SEPARATOR
        return true;
      default:
        return false;
    }
  }

  /**
   * Refill the buffer, accumulating the offset and setting usableLength to the
   * last unambiguous break position.
   */
  private void refill() throws IOException {
    offset += usableLength;
    int leftover = length - usableLength;
    System.arraycopy(buffer, usableLength, buffer, 0, leftover);
    int requested = buffer.length - leftover;
    int returned = input.read(buffer, leftover, requested);
    length = returned < 0 ? leftover : returned + leftover;
    if (returned < requested) /* reader has been emptied, process the rest */
      usableLength = length;
    else { /* still more data to be read, find a safe stopping place */
      usableLength = findSafeEnd();
      if (usableLength < 0)
        usableLength = length; /* more than BUFFERMAX of text without breaks,
                                  may truncate tokens */
    }

    wrapper.setText(buffer, 0, Math.max(0, usableLength));
    iterator.setText(wrapper);
  }
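  // Buffer invariants maintained by refill():
  //  - buffer[0..length) holds yet-unconsumed characters from the reader,
  //  - buffer[0..usableLength) is the prefix ending at an unambiguous break
  //    (or the whole buffer once the reader is drained),
  //  - offset is the absolute position of buffer[0] in the full input, so
  //    token offsets are computed as correctOffset(offset + local position).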
  /**
   * Returns true if there is a token from the buffer, or false if it is
   * exhausted.
   */
  private boolean incrementSentence() throws IOException {
    if (length == 0) // we must refill the buffer
      return false;

    while (true) {
      int start = iterator.current();

      if (start == BreakIterator.DONE)
        return false; // BreakIterator exhausted

      // find the next set of boundaries
      int end = iterator.next();

      if (end == BreakIterator.DONE)
        return false; // BreakIterator exhausted

      setNextSentence(start, end);
      if (incrementWord()) {
        return true;
      }
    }
  }

  /** Provides the next input sentence for analysis */
  protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);

  /** Returns true if another word is available */
  protected abstract boolean incrementWord();
}
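As context for the class removed above, here is a hedged sketch (not part of this commit) of the "extra token" use its javadoc mentions: a subclass that emits each sentence followed by a synthetic boundary token. SentenceMarkerTokenizer and EOS_MARKER are hypothetical names introduced only for illustration:

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

class SentenceMarkerTokenizer extends SegmentingTokenizerBase {
  static final String EOS_MARKER = "<EOS>"; // hypothetical boundary token
  private int sentenceStart, sentenceEnd;
  private int state; // 0 = nothing pending, 1 = emit sentence, 2 = emit marker

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  SentenceMarkerTokenizer(Reader input) {
    super(input, BreakIterator.getSentenceInstance(new Locale("")));
  }

  @Override
  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
    this.sentenceStart = sentenceStart;
    this.sentenceEnd = sentenceEnd;
    state = 1;
  }

  @Override
  protected boolean incrementWord() {
    if (state == 1) {         // first call per sentence: the sentence itself
      clearAttributes();
      termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
      offsetAtt.setOffset(correctOffset(offset + sentenceStart),
                          correctOffset(offset + sentenceEnd));
      state = 2;
      return true;
    } else if (state == 2) {  // second call: a zero-width boundary marker
      clearAttributes();
      termAtt.setEmpty().append(EOS_MARKER);
      offsetAtt.setOffset(correctOffset(offset + sentenceEnd),
                          correctOffset(offset + sentenceEnd));
      state = 0;
      return true;
    }
    return false;             // sentence consumed; advance to the next one
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    state = 0;
  }
}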
org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (deleted)
@@ -1,224 +0,0 @@
package org.apache.lucene.analysis.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/** Basic tests for {@link SegmentingTokenizerBase} */
public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
  private Analyzer sentence = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  private Analyzer sentenceAndWord = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
  public void testBasics() throws IOException {
    assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
        new String[] { "The acronym for United States is U.S. but this doesn't end a sentence" }
    );
    assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He said, \"Are you going?\" ",
                       "John shook his head." }
    );
  }

  /** Test a subclass that sets some custom attribute values */
  public void testCustomAttributes() throws IOException {
    assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
        new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
        new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
        new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 } // posinc of 2 at the sentence boundary
    );
  }

  /** Tests tokenstream reuse */
  public void testReuse() throws IOException {
    assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
        new String[] { "He", "said", "Are", "you", "going" },
        new int[] { 0, 3, 10, 14, 18 },
        new int[] { 2, 7, 13, 17, 23 },
        new int[] { 1, 1, 1, 1, 1 }
    );
    assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
        new String[] { "John", "shook", "his", "head" },
        new int[] { 0, 5, 11, 15 },
        new int[] { 4, 10, 14, 19 },
        new int[] { 1, 1, 1, 1 }
    );
  }

  /** Tests TokenStream.end() */
  public void testEnd() throws IOException {
    // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
    // we add some junk whitespace to the end just to test it.
    assertAnalyzesTo(sentenceAndWord, "John shook his head ",
        new String[] { "John", "shook", "his", "head" }
    );
    assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
        new String[] { "John", "shook", "his", "head" }
    );
  }

  /** Tests terms which span across boundaries */
  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, '\n');
    sb.append(whitespace);
    // 4094 newlines push this text across the BUFFERMAX (4096) refill boundary
    sb.append("testing 1234");
    String input = sb.toString();
    assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
  }

  /** Tests the handling of binary/malformed data */
  public void testHugeTerm() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 40960; i++) {
      sb.append('a');
    }
    String input = sb.toString();
    // with no safe break points, tokens are truncated at BUFFERMAX (4096) chars
    char token[] = new char[4096];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
    String expected[] = {
        expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken,
        expectedToken
    };
    assertAnalyzesTo(sentence, input, expected);
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER);
    checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER);
  }

  // some tokenizers for testing

  /** silly tokenizer that just returns whole sentences as tokens */
  static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    boolean hasSentence;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public WholeSentenceTokenizer(Reader input) {
      super(input, BreakIterator.getSentenceInstance(new Locale("")));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      hasSentence = true;
    }

    @Override
    protected boolean incrementWord() {
      if (hasSentence) {
        hasSentence = false;
        clearAttributes();
        termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
        return true;
      } else {
        return false;
      }
    }
  }

  /**
   * simple tokenizer that bumps posinc + 1 for tokens after a
   * sentence boundary, to inhibit phrase queries without slop.
   */
  static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    int wordStart, wordEnd;
    int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    public SentenceAndWordTokenizer(Reader input) {
      super(input, BreakIterator.getSentenceInstance(new Locale("")));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      posBoost++; // carry a +1 increment into the next sentence's first word
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      posBoost = -1;
    }

    @Override
    protected boolean incrementWord() {
      // skip past any non-word characters up to the sentence end
      wordStart = wordEnd;
      while (wordStart < sentenceEnd) {
        if (Character.isLetterOrDigit(buffer[wordStart]))
          break;
        wordStart++;
      }

      if (wordStart == sentenceEnd) return false;

      // consume the run of letters/digits that forms the word
      wordEnd = wordStart+1;
      while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
        wordEnd++;

      clearAttributes();
      termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
      posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
      posBoost = 0;
      return true;
    }
  }
}
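To round out the picture, a sketch of how a caller would consume tokens from one of these analyzers directly, assuming the Lucene 3.x/4.0-era TokenStream consumption pattern (TokenDump, the field name, and the input text are placeholders):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

class TokenDump {
  // Prints one token per line from the given analyzer.
  static void dump(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                   // prepare the stream for consumption
    while (ts.incrementToken()) { // pull tokens one at a time
      System.out.println(term.toString());
    }
    ts.end();                     // record the final offset state
    ts.close();
  }
}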