mirror of https://github.com/apache/lucene.git
LUCENE-3889: remove unnecessary/unused base class
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303026 13f79535-47bb-0310-9956-ffa450edef68
Parent: 7f8076fefc
Commit: b7a7e5a625
CHANGES.txt
@@ -189,11 +189,6 @@ API Changes
   not take position overlaps into account while recalculating norms.
   (Uwe Schindler, Robert Muir)

-* LUCENE-3305: Added SegmentingTokenizerBase, which breaks text into sentences
-  with BreakIterator and allows subclasses to decompose sentences into words, or
-  use the sentence boundary information for other reasons (e.g. attribute/position increment)
-  (Robert Muir)
-
 Changes in runtime behavior

 * LUCENE-3626: PKIndexSplitter and MultiPassIndexSplitter now work
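The removed entry above describes sentence segmentation via java.text.BreakIterator. For reference, a minimal standalone sketch of that JDK API (the class name and demo strings are illustrative, not part of this commit); on the quoted sentence it reports the same two boundaries the test below asserts:

import java.text.BreakIterator;
import java.util.Locale;

public class SentenceBreakDemo {
  public static void main(String[] args) {
    // A sentence instance reports boundaries between sentences.
    BreakIterator it = BreakIterator.getSentenceInstance(Locale.ROOT);
    String text = "He said, \"Are you going?\" John shook his head.";
    it.setText(text);
    int start = it.first();
    for (int end = it.next(); end != BreakIterator.DONE; start = end, end = it.next()) {
      System.out.println("[" + text.substring(start, end) + "]");
    }
    // Prints: [He said, "Are you going?" ] then [John shook his head.]
  }
}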
org/apache/lucene/analysis/util/SegmentingTokenizerBase.java (deleted)
@@ -1,180 +0,0 @@
package org.apache.lucene.analysis.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/**
 * Breaks text into sentences with a {@link BreakIterator} and
 * allows subclasses to decompose these sentences into words.
 * <p>
 * This can be used by subclasses that need sentence context
 * for tokenization purposes, such as CJK segmenters.
 * <p>
 * Additionally it can be used by subclasses that want to mark
 * sentence boundaries (with a custom attribute, extra token, position
 * increment, etc.) for downstream processing.
 *
 * @lucene.experimental
 */
public abstract class SegmentingTokenizerBase extends Tokenizer {
  protected static final int BUFFERMAX = 4096;
  protected final char buffer[] = new char[BUFFERMAX];
  /** true length of text in the buffer */
  private int length = 0;
  /** length in buffer that can be evaluated safely, up to a safe end point */
  private int usableLength = 0;
  /** accumulated offset of previous buffers for this reader, for offsetAtt */
  protected int offset = 0;

  private final BreakIterator iterator;
  private final CharArrayIterator wrapper = CharArrayIterator.newSentenceInstance();

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  /**
   * Constructs a new SegmentingTokenizerBase from the given Reader, using
   * the provided BreakIterator for sentence segmentation.
   * <p>
   * Note that you should never share BreakIterators across different
   * TokenStreams; instead, a newly created or cloned one should always
   * be provided to this constructor.
   */
  public SegmentingTokenizerBase(Reader input, BreakIterator iterator) {
    super(input);
    this.iterator = iterator;
  }
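  // Why BreakIterators must not be shared: a BreakIterator is stateful.
  // setText(), current(), and next() all mutate its internal position, so
  // two TokenStreams driving one instance would corrupt each other's
  // iteration state mid-stream.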
  @Override
  public final boolean incrementToken() throws IOException {
    if (length == 0 || !incrementWord()) {
      while (!incrementSentence()) {
        refill();
        if (length <= 0) // no more chars to read
          return false;
      }
    }

    return true;
  }

  @Override
  public void reset() throws IOException {
    wrapper.setText(buffer, 0, 0);
    iterator.setText(wrapper);
    length = usableLength = offset = 0;
  }

  @Override
  public void reset(Reader input) throws IOException {
    this.input = input;
    reset();
  }

  @Override
  public final void end() throws IOException {
    final int finalOffset = correctOffset(length < 0 ? offset : offset + length);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /** Returns the last unambiguous break position in the text. */
  private int findSafeEnd() {
    for (int i = length - 1; i >= 0; i--)
      if (isSafeEnd(buffer[i]))
        return i + 1;
    return -1;
  }

  /** For sentence tokenization, these are the unambiguous break positions. */
  protected boolean isSafeEnd(char ch) {
    switch(ch) {
      case 0x000D: // CARRIAGE RETURN
      case 0x000A: // LINE FEED
      case 0x0085: // NEXT LINE
      case 0x2028: // LINE SEPARATOR
      case 0x2029: // PARAGRAPH SEPARATOR
        return true;
      default:
        return false;
    }
  }

  /**
   * Refill the buffer, accumulating the offset and setting usableLength to the
   * last unambiguous break position.
   */
  private void refill() throws IOException {
    offset += usableLength;
    int leftover = length - usableLength;
    System.arraycopy(buffer, usableLength, buffer, 0, leftover);
    int requested = buffer.length - leftover;
    int returned = input.read(buffer, leftover, requested);
    length = returned < 0 ? leftover : returned + leftover;
    if (returned < requested) /* reader has been emptied, process the rest */
      usableLength = length;
    else { /* still more data to be read, find a safe stopping place */
      usableLength = findSafeEnd();
      if (usableLength < 0)
        usableLength = length; /* more than BUFFERMAX of text without breaks,
                                  may truncate tokens */
    }

    wrapper.setText(buffer, 0, Math.max(0, usableLength));
    iterator.setText(wrapper);
  }
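  // Buffer invariants maintained by refill():
  //  - buffer[0..length) holds yet-unconsumed characters from the reader,
  //  - buffer[0..usableLength) is the prefix ending at an unambiguous break
  //    (or the whole buffer once the reader is drained),
  //  - offset is the absolute position of buffer[0] in the full input, so
  //    token offsets are computed as correctOffset(offset + local position).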
  /**
   * Returns true if there is a token from the buffer, or false if it is
   * exhausted.
   */
  private boolean incrementSentence() throws IOException {
    if (length == 0) // we must refill the buffer
      return false;

    while (true) {
      int start = iterator.current();

      if (start == BreakIterator.DONE)
        return false; // BreakIterator exhausted

      // find the next set of boundaries
      int end = iterator.next();

      if (end == BreakIterator.DONE)
        return false; // BreakIterator exhausted

      setNextSentence(start, end);
      if (incrementWord()) {
        return true;
      }
    }
  }

  /** Provides the next input sentence for analysis */
  protected abstract void setNextSentence(int sentenceStart, int sentenceEnd);

  /** Returns true if another word is available */
  protected abstract boolean incrementWord();
}
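As context for the class removed above, here is a hedged sketch (not part of this commit) of the "extra token" use its javadoc mentions: a subclass that emits each sentence followed by a synthetic boundary token. SentenceMarkerTokenizer and EOS_MARKER are hypothetical names introduced only for illustration:

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Locale;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

class SentenceMarkerTokenizer extends SegmentingTokenizerBase {
  static final String EOS_MARKER = "<EOS>"; // hypothetical boundary token
  private int sentenceStart, sentenceEnd;
  private int state; // 0 = nothing pending, 1 = emit sentence, 2 = emit marker

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  SentenceMarkerTokenizer(Reader input) {
    super(input, BreakIterator.getSentenceInstance(new Locale("")));
  }

  @Override
  protected void setNextSentence(int sentenceStart, int sentenceEnd) {
    this.sentenceStart = sentenceStart;
    this.sentenceEnd = sentenceEnd;
    state = 1;
  }

  @Override
  protected boolean incrementWord() {
    if (state == 1) {         // first call per sentence: the sentence itself
      clearAttributes();
      termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd - sentenceStart);
      offsetAtt.setOffset(correctOffset(offset + sentenceStart),
                          correctOffset(offset + sentenceEnd));
      state = 2;
      return true;
    } else if (state == 2) {  // second call: a zero-width boundary marker
      clearAttributes();
      termAtt.setEmpty().append(EOS_MARKER);
      offsetAtt.setOffset(correctOffset(offset + sentenceEnd),
                          correctOffset(offset + sentenceEnd));
      state = 0;
      return true;
    }
    return false;             // sentence consumed; advance to the next one
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    state = 0;
  }
}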
org/apache/lucene/analysis/util/TestSegmentingTokenizerBase.java (deleted)
@@ -1,224 +0,0 @@
package org.apache.lucene.analysis.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.text.BreakIterator;
import java.util.Arrays;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/** Basic tests for {@link SegmentingTokenizerBase} */
public class TestSegmentingTokenizerBase extends BaseTokenStreamTestCase {
  private Analyzer sentence = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new WholeSentenceTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };

  private Analyzer sentenceAndWord = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new SentenceAndWordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  /** Some simple examples, just outputting the whole sentence boundaries as "terms" */
  public void testBasics() throws IOException {
    assertAnalyzesTo(sentence, "The acronym for United States is U.S. but this doesn't end a sentence",
        new String[] { "The acronym for United States is U.S. but this doesn't end a sentence" }
    );
    assertAnalyzesTo(sentence, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He said, \"Are you going?\" ",
                       "John shook his head." }
    );
  }

  /** Test a subclass that sets some custom attribute values */
  public void testCustomAttributes() throws IOException {
    assertAnalyzesTo(sentenceAndWord, "He said, \"Are you going?\" John shook his head.",
        new String[] { "He", "said", "Are", "you", "going", "John", "shook", "his", "head" },
        new int[] { 0, 3, 10, 14, 18, 26, 31, 37, 41 },
        new int[] { 2, 7, 13, 17, 23, 30, 36, 40, 45 },
        new int[] { 1, 1, 1, 1, 1, 2, 1, 1, 1 } // posinc of 2 at the sentence boundary
    );
  }

  /** Tests tokenstream reuse */
  public void testReuse() throws IOException {
    assertAnalyzesToReuse(sentenceAndWord, "He said, \"Are you going?\"",
        new String[] { "He", "said", "Are", "you", "going" },
        new int[] { 0, 3, 10, 14, 18 },
        new int[] { 2, 7, 13, 17, 23 },
        new int[] { 1, 1, 1, 1, 1 }
    );
    assertAnalyzesToReuse(sentenceAndWord, "John shook his head.",
        new String[] { "John", "shook", "his", "head" },
        new int[] { 0, 5, 11, 15 },
        new int[] { 4, 10, 14, 19 },
        new int[] { 1, 1, 1, 1 }
    );
  }

  /** Tests TokenStream.end() */
  public void testEnd() throws IOException {
    // BaseTokenStreamTestCase asserts that end() is set to our StringReader's length for us here.
    // we add some junk whitespace to the end just to test it.
    assertAnalyzesTo(sentenceAndWord, "John shook his head ",
        new String[] { "John", "shook", "his", "head" }
    );
    assertAnalyzesTo(sentenceAndWord, "John shook his head. ",
        new String[] { "John", "shook", "his", "head" }
    );
  }

  /** Tests terms which span across boundaries */
  public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, '\n');
    sb.append(whitespace);
    // 4094 newlines push this text across the BUFFERMAX (4096) refill boundary
    sb.append("testing 1234");
    String input = sb.toString();
    assertAnalyzesTo(sentenceAndWord, input, new String[] { "testing", "1234" });
  }

  /** Tests the handling of binary/malformed data */
  public void testHugeTerm() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < 40960; i++) {
      sb.append('a');
    }
    String input = sb.toString();
    // with no safe break points, tokens are truncated at BUFFERMAX (4096) chars
    char token[] = new char[4096];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
    String expected[] = {
        expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken,
        expectedToken, expectedToken, expectedToken,
        expectedToken
    };
    assertAnalyzesTo(sentence, input, expected);
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, sentence, 10000*RANDOM_MULTIPLIER);
    checkRandomData(random, sentenceAndWord, 10000*RANDOM_MULTIPLIER);
  }

  // some tokenizers for testing

  /** silly tokenizer that just returns whole sentences as tokens */
  static class WholeSentenceTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    boolean hasSentence;

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public WholeSentenceTokenizer(Reader input) {
      super(input, BreakIterator.getSentenceInstance(new Locale("")));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      hasSentence = true;
    }

    @Override
    protected boolean incrementWord() {
      if (hasSentence) {
        hasSentence = false;
        clearAttributes();
        termAtt.copyBuffer(buffer, sentenceStart, sentenceEnd-sentenceStart);
        offsetAtt.setOffset(correctOffset(offset+sentenceStart), correctOffset(offset+sentenceEnd));
        return true;
      } else {
        return false;
      }
    }
  }

  /**
   * simple tokenizer that bumps posinc + 1 for tokens after a
   * sentence boundary, to inhibit phrase queries without slop.
   */
  static class SentenceAndWordTokenizer extends SegmentingTokenizerBase {
    int sentenceStart, sentenceEnd;
    int wordStart, wordEnd;
    int posBoost = -1; // initially set to -1 so the first word in the document doesn't get a pos boost

    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    public SentenceAndWordTokenizer(Reader input) {
      super(input, BreakIterator.getSentenceInstance(new Locale("")));
    }

    @Override
    protected void setNextSentence(int sentenceStart, int sentenceEnd) {
      this.wordStart = this.wordEnd = this.sentenceStart = sentenceStart;
      this.sentenceEnd = sentenceEnd;
      posBoost++; // carry a +1 increment into the next sentence's first word
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      posBoost = -1;
    }

    @Override
    protected boolean incrementWord() {
      // skip past any non-word characters up to the sentence end
      wordStart = wordEnd;
      while (wordStart < sentenceEnd) {
        if (Character.isLetterOrDigit(buffer[wordStart]))
          break;
        wordStart++;
      }

      if (wordStart == sentenceEnd) return false;

      // consume the run of letters/digits that forms the word
      wordEnd = wordStart+1;
      while (wordEnd < sentenceEnd && Character.isLetterOrDigit(buffer[wordEnd]))
        wordEnd++;

      clearAttributes();
      termAtt.copyBuffer(buffer, wordStart, wordEnd-wordStart);
      offsetAtt.setOffset(correctOffset(offset+wordStart), correctOffset(offset+wordEnd));
      posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + posBoost);
      posBoost = 0;
      return true;
    }
  }
}
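To round out the picture, a sketch of how a caller would consume tokens from one of these analyzers directly, assuming the Lucene 3.x/4.0-era TokenStream consumption pattern (TokenDump, the field name, and the input text are placeholders):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

class TokenDump {
  // Prints one token per line from the given analyzer.
  static void dump(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                   // prepare the stream for consumption
    while (ts.incrementToken()) { // pull tokens one at a time
      System.out.println(term.toString());
    }
    ts.end();                     // record the final offset state
    ts.close();
  }
}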