LUCENE-3889: remove unnecessary/unused base class

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303026 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-03-20 17:28:26 +00:00
parent 7f8076fefc
commit b7a7e5a625
3 changed files with 0 additions and 409 deletions

CHANGES.txt

@@ -189,11 +189,6 @@ API Changes
  not take position overlaps into account while recalculating norms.
  (Uwe Schindler, Robert Muir)

* LUCENE-3305: Added SegmentingTokenizerBase, which breaks text into sentences
  with BreakIterator and allows subclasses to decompose sentences into words, or
  use the sentence boundary information for other reasons (e.g. attribute/position increment)
  (Robert Muir)

Changes in runtime behavior

* LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work

org/apache/lucene/analysis/util/SegmentingTokenizerBase.java

@@ -1,180 +0,0 @@
package org.apache.lucene.analysis.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
 * Breaks text into sentences with a {@link BreakIterator} and
 * allows subclasses to decompose these sentences into words.
 * <p>
 * This can be used by subclasses that need sentence context
 * for tokenization purposes, such as CJK segmenters.
 * <p>
 * Additionally it can be used by subclasses that want to mark
 * sentence boundaries (with a custom attribute, extra token, position
 * increment, etc) for downstream processing.
 *
 * @lucene.experimental
 */
public abstract class SegmentingTokenizerBase extends Tokenizer {
  protected static final int BUFFERMAX = 4096;
  protected final char buffer[] = new char[BUFFERMAX];

  /** true length of text in the buffer */
  private int length = 0;

  /** length in buffer that can be evaluated safely, up to a safe end point */
  private int usableLength = 0;

  /** accumulated offset of previous buffers for this reader, for offsetAtt */
  protected int offset = 0;

  private final BreakIterator iterator;
  private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /**
   * Construct a new SegmentingTokenizerBase from the given Reader, using
   * the provided BreakIterator for sentence segmentation.
   * <p>
   * Note that you should never share BreakIterators across different
   * TokenStreams; instead a newly created or cloned one should always
   * be provided to this constructor.
   */
  public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
    super(input);
    this.iterator = iterator;
  }
  @Override
  public final boolean incrementToken() throws IOException {
    if (length == 0 || !incrementWord()) {
      while (!incrementSentence()) {
        refill();
        if (length <= 0) // no more chars to read
          return false;
      }
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    wrapper.setText(buffer, 0, 0);
    iterator.setText(wrapper);
    length = usableLength = offset = 0;
  }
  @Override
  public void reset(Reader input) throws IOException {
    this.input = input;
    reset();
  }

  @Override
  public final void end() throws IOException {
    final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }
  /** Returns the last unambiguous break position in the text. */
  private int findSafeEnd() {
    for (int i = length - 1; i >= 0; i--)
      if (isSafeEnd(buffer[i]))
        return i + 1;
    return -1;
  }

  /** For sentence tokenization, these are the unambiguous break positions. */
  protected boolean isSafeEnd(char ch) {
    switch(ch) {
      case 0x000D: // CARRIAGE RETURN
      case 0x000A: // LINE FEED
      case 0x0085: // NEXT LINE
      case 0x2028: // LINE SEPARATOR
      case 0x2029: // PARAGRAPH SEPARATOR
        return true;
      default:
        return false;
    }
  }
  /**
   * Refill the buffer, accumulating the offset and setting usableLength to the
   * last unambiguous break position.
   */
  private void refill() throws IOException {
    offset += usableLength;
    int leftover = length - usableLength;
    System.arraycopy(buffer, usableLength, buffer, 0, leftover);
    int requested = buffer.length - leftover;
    int returned = input.read(buffer, leftover, requested);
    length = returned < 0 ? leftover : returned + leftover;
    if (returned < requested) {
      /* reader has been emptied, process the rest */
      usableLength = length;
    } else {
      /* still more data to be read, find a safe-stopping place */
      usableLength = findSafeEnd();
      if (usableLength < 0) {
        /* more than BUFFERMAX chars of text without a safe break: tokens may be truncated */
        usableLength = length;
      }
    }
    wrapper.setText(buffer, 0, Math.max(0, usableLength));
    iterator.setText(wrapper);
  }
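
  // Worked example of refill(), under the constants above (BUFFERMAX = 4096):
  // given 5000 chars of input whose last newline before the buffer end sits at
  // index 4093, the first refill() reads 4096 chars and findSafeEnd() returns
  // 4094, so usableLength = 4094 and 2 chars are left over. The next refill()
  // advances offset by 4094, copies those 2 chars to the front, and reads the
  // remaining 904 chars; the reader is then exhausted, so usableLength = length = 906.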
  /**
   * Returns true if there is a token from the buffer, or false if it is
   * exhausted.
   */
  private boolean incrementSentence() throws IOException {
    if (length == 0) // we must refill the buffer
      return false;
    while (true) {
      int start = iterator.current();
      if (start == BreakIterator.DONE)
        return false; // BreakIterator exhausted
      // find the next set of boundaries
      int end = iterator.next();
      if (end == BreakIterator.DONE)
        return false; // BreakIterator exhausted
      setNextSentence(start, end);
      if (incrementWord()) {
        return true;
      }
    }
  }

  /** Provides the next input sentence for analysis */
  protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);

  /** Returns true if another word is available */
  protected abstract boolean incrementWord();
}
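
To make the setNextSentence/incrementWord contract above concrete, here is a hedged sketch of a minimal subclass plus the standard TokenStream consumption loop. The class name SentenceTokens and its main() driver are illustrative only; the subclass mirrors the WholeSentenceTokenizer test class in the next file.

import java.io.Reader;
import java.io.StringReader;
import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;

// Hypothetical subclass: emits each sentence as a single token, then drives
// itself through the usual reset / incrementToken / end / close workflow.
public class SentenceTokens extends SegmentingTokenizerBase {
  private int start, end;
  private boolean pending;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  public SentenceTokens(Reader input) {
    super(input, BreakIterator.getSentenceInstance(new Locale("")));
  }

  @Override
  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
    start = sentenceStart;
    end = sentenceEnd;
    pending = true; // one "word" per sentence: the sentence itself
  }

  @Override
  protected boolean incrementWord() {
    if (!pending) return false;
    pending = false;
    clearAttributes();
    // buffer and offset are protected fields inherited from the base class
    termAtt.copyBuffer(buffer, start, end - start);
    offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
    return true;
  }

  public static void main(String[] args) throws Exception {
    Tokenizer ts = new SentenceTokens(new StringReader("One sentence. Another one."));
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println("'" + term + "'");
    }
    ts.end();
    ts.close();
  }
}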

org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java

@@ -1,224 +0,0 @@
package org.apache.lucene.analysis.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/** Basic tests for {@link SegmentingTokenizerBase} */
public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
  private Analyzer sentence = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  private Analyzer sentenceAndWord = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
  public void testBasics() throws IOException {
    assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
        new String[] { "The acronym for United States is U.S. but this doesn't end a sentence" }
    );
    assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He said, \"Are you going?\" ",
                       "John shook his head." }
    );
  }

  /** Test a subclass that sets some custom attribute values */
  public void testCustomAttributes() throws IOException {
    assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
        new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
        new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
        new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 }
    );
  }

  /** Tests tokenstream reuse */
  public void testReuse() throws IOException {
    assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
        new String[] { "He", "said", "Are", "you", "going" },
        new int[] { 0, 3, 10, 14, 18 },
        new int[] { 2, 7, 13, 17, 23 },
        new int[] { 1, 1, 1, 1, 1 }
    );
    assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
        new String[] { "John", "shook", "his", "head" },
        new int[] { 0, 5, 11, 15 },
        new int[] { 4, 10, 14, 19 },
        new int[] { 1, 1, 1, 1 }
    );
  }
  /** Tests TokenStream.end() */
  public void testEnd() throws IOException {
    // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
    // we add some junk whitespace to the end just to test it.
    assertAnalyzesTo(sentenceAndWord, "John shook his head ",
        new String[] { "John", "shook", "his", "head" }
    );
    assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
        new String[] { "John", "shook", "his", "head" }
    );
  }

  /** Tests terms which span across boundaries */
  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, '\n');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
  }
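
  // Why 4094 newlines: BUFFERMAX is 4096, so "testing 1234" starts at index
  // 4094 and only "te" fits in the first buffer. The trailing newlines are
  // safe break points, so refill() carries "te" over into the next buffer and
  // "testing" is not split across the boundary.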
  /** Tests the handling of binary/malformed data */
  public void testHugeTerm() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 40960; i++) {
      sb.append('a');
    }
    String input = sb.toString();
    char token[] = new char[4096];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
    String expected[] = {
        expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken,
        expectedToken
    };
    assertAnalyzesTo(sentence, input, expected);
  }
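
  // Why ten 4096-char tokens: with no safe break characters anywhere in the
  // 40960 'a's, each refill() fills the whole BUFFERMAX-sized buffer and the
  // base class truncates there, emitting one token per buffer: 40960 / 4096 = 10.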
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, sentence, 10000 * RANDOM_MULTIPLIER);
    checkRandomData(random, sentenceAndWord, 10000 * RANDOM_MULTIPLIER);
  }
  // some tokenizers for testing

  /** silly tokenizer that just returns whole sentences as tokens */
  static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    boolean hasSentence;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public WholeSentenceTokenizer(Reader input) {
      super(input, BreakIterator.getSentenceInstance(new Locale("")));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      hasSentence = true;
    }

    @Override
    protected boolean incrementWord() {
      if (hasSentence) {
        hasSentence = false;
        clearAttributes();
        termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
        offsetAtt.setOffset(correctOffset(offset + sentenceStart), correctOffset(offset + sentenceEnd));
        return true;
      } else {
        return false;
      }
    }
  }
  /**
   * Simple tokenizer that bumps the position increment by 1 for the first
   * token after a sentence boundary, to inhibit phrase queries without slop.
   */
  static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    int wordStart, wordEnd;
    int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    public SentenceAndWordTokenizer(Reader input) {
      super(input, BreakIterator.getSentenceInstance(new Locale("")));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      posBoost++;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      posBoost = -1;
    }

    @Override
    protected boolean incrementWord() {
      wordStart = wordEnd;
      // skip over non-word characters up to the sentence end
      while (wordStart < sentenceEnd) {
        if (Character.isLetterOrDigit(buffer[wordStart]))
          break;
        wordStart++;
      }
      if (wordStart == sentenceEnd) return false;
      // consume the run of letters/digits that forms the word
      wordEnd = wordStart + 1;
      while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
        wordEnd++;
      clearAttributes();
      termAtt.copyBuffer(buffer, wordStart, wordEnd - wordStart);
      offsetAtt.setOffset(correctOffset(offset + wordStart), correctOffset(offset + wordEnd));
      posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
      posBoost = 0;
      return true;
    }
  }
}