LUCENE-1906: Fix backwards problems with CharStream and Tokenizers with custom reset(Reader) method.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@813671 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2009-09-11 06:12:13 +00:00
parent 05fa98b1bb
commit 4666489857
18 changed files with 63 additions and 75 deletions

View File

@@ -350,10 +350,12 @@ API Changes
a top level reader and docID.
(Shai Erera, Chris Hostetter, Martin Ruckli, Mark Miller via Mike McCandless)
* LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
CharFilter and MappingCharFilter, which allows chaining & mapping
of characters before tokenizers run. (Koji Sekiguchi via Mike
McCandless)
* LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allow
chaining & mapping of characters before tokenizers run. CharStream (a subclass of
Reader) is the base class for custom java.io.Readers that support offset
correction. Tokenizers got an additional method correctOffset() that is passed
down to the underlying CharStream if input is a subclass of CharStream/-Filter.
(Koji Sekiguchi via Mike McCandless, Uwe Schindler)
* LUCENE-1703: Add IndexWriter.waitForMerges. (Tim Smith via Mike
McCandless)
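
(Illustration, editor's addition, not part of the patch.) The entry above is easiest to see end-to-end. The following minimal sketch (the class name CharFilterOffsetDemo is made up for this example) chains a MappingCharFilter in front of a WhitespaceTokenizer using the APIs touched by this issue; the tokenizer's offsets should come back corrected to the original reader, matching what TestMappingCharFilter asserts further down.

import java.io.StringReader;

import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class CharFilterOffsetDemo {
  public static void main(String[] args) throws Exception {
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("aa", "a");   // shorten "aa" to "a" before tokenization

    // New in this issue: MappingCharFilter also accepts a plain Reader.
    CharStream cs = new MappingCharFilter(map, new StringReader("aa bb"));
    TokenStream ts = new WhitespaceTokenizer(cs);

    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      // Offsets refer to the original "aa bb", not the filtered "a bb",
      // so this should print:  a 0-2  and  bb 3-5
      System.out.println(termAtt.term() + " " + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
  }
}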

View File

@@ -42,7 +42,7 @@
<property name="Name" value="Lucene"/>
<property name="dev.version" value="2.9"/>
<property name="version" value="${dev.version}"/>
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090903"/>
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090911"/>
<property name="spec.version" value="${version}"/>
<property name="year" value="2000-${current.year}"/>
<property name="final.name" value="lucene-${name}-${version}"/>

View File

@@ -285,7 +285,7 @@ public final class CJKTokenizer extends Tokenizer {
if (length > 0) {
termAtt.setTermBuffer(buffer, 0, length);
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
return true;
} else if (dataLen == -1) {

View File

@@ -104,7 +104,7 @@ public final class ChineseTokenizer extends Tokenizer {
//System.out.println(new String(buffer, 0,
//length));
termAtt.setTermBuffer(buffer, 0, length);
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
else

View File

@@ -207,7 +207,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
termAtt.setTermBuffer(inStr, start, gramSize);
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
gramSize++;
return true;
}

View File

@@ -124,7 +124,7 @@ public class NGramTokenizer extends Tokenizer {
int oldPos = pos;
pos++;
termAtt.setTermBuffer(inStr, oldPos, gramSize);
offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
return true;
}

View File

@@ -116,7 +116,7 @@ public final class SentenceTokenizer extends Tokenizer {
return false;
else {
termAtt.setTermBuffer(buffer.toString());
offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
typeAtt.setType("sentence");
return true;
}

View File

@@ -202,7 +202,7 @@ public abstract class AbstractTestCase extends TestCase {
return false;
termAtt.setTermBuffer(snippet, startTerm, lenTerm);
offsetAtt.setOffset(startOffset, startOffset + lenTerm);
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
return true;
}

View File

@@ -17,7 +17,6 @@
package org.apache.lucene.wikipedia.analysis;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@@ -127,10 +126,6 @@ public class WikipediaTokenizer extends Tokenizer {
private TermAttribute termAtt;
private FlagsAttribute flagsAtt;
void setInput(Reader reader) {
this.input = CharReader.get(reader);
}
/**
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner.
@@ -267,7 +262,7 @@ public class WikipediaTokenizer extends Tokenizer {
//trim the buffer
String s = buffer.toString().trim();
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -305,7 +300,7 @@ public class WikipediaTokenizer extends Tokenizer {
//trim the buffer
String s = buffer.toString().trim();
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -318,7 +313,7 @@ public class WikipediaTokenizer extends Tokenizer {
private void setupToken() {
scanner.getText(termAtt);
final int start = scanner.yychar();
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
}
/*
@@ -332,7 +327,7 @@ public class WikipediaTokenizer extends Tokenizer {
}
public void reset(Reader reader) throws IOException {
setInput(reader);
super.reset(reader);
reset();
}

View File

@@ -43,7 +43,7 @@ public abstract class BaseCharFilter extends CharFilter {
/** Retrieve the corrected offset. Note that this method
* is slow, if you correct positions far before the most
* recently added position, as it's a simple linear
* searhc backwards through all offset corrections added
* search backwards through all offset corrections added
* by {@link #addOffCorrectMap}. */
protected int correct(int currentOff) {
if (pcmList == null || pcmList.isEmpty()) {
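
(Illustration, editor's addition.) A rough sketch of how the correction map described in that javadoc behaves, assuming this version of BaseCharFilter exposes addOffCorrectMap(int off, int cumulativeDiff) and a CharStream-taking constructor; the class OffsetCorrectionDemo exists only for this example and does no real filtering.

import java.io.StringReader;

import org.apache.lucene.analysis.BaseCharFilter;
import org.apache.lucene.analysis.CharReader;

public class OffsetCorrectionDemo extends BaseCharFilter {
  public OffsetCorrectionDemo() {
    super(CharReader.get(new StringReader("")));
  }

  public static void main(String[] args) {
    OffsetCorrectionDemo f = new OffsetCorrectionDemo();
    // Pretend the filter removed 2 chars before output offset 5,
    // and 5 chars in total before output offset 10.
    f.addOffCorrectMap(5, 2);
    f.addOffCorrectMap(10, 5);
    System.out.println(f.correct(3));    // 3  (before any correction point)
    System.out.println(f.correct(7));    // 9  (7 + 2)
    System.out.println(f.correct(12));   // 17 (12 + 5)
  }
}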

View File

@@ -21,6 +21,9 @@ import java.io.IOException;
/**
* Subclasses of CharFilter can be chained to filter CharStream.
* They can be used as {@link java.io.Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter/CharStream subclass is used.
*
* @version $Id$
*
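
(Illustration, editor's addition.) Because a CharFilter is itself a CharStream, filters can be stacked; each layer reads the output of the layer below it, and any offset corrections are expected to compose through the chain. The helper class ChainedCharFilters below is only a sketch.

import java.io.StringReader;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;

public class ChainedCharFilters {
  public static CharStream build(String text) {
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("a", "b");
    map.add("b", "c");
    // The inner filter maps "a" -> "b"; the outer filter then sees that
    // "b" and maps it to "c". A Tokenizer consuming the returned stream
    // would produce "c" for the input "a".
    return new MappingCharFilter(map,
        new MappingCharFilter(map, CharReader.get(new StringReader(text))));
  }
}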

View File

@@ -20,12 +20,11 @@ package org.apache.lucene.analysis;
import java.io.Reader;
/**
* CharStream adds <a
* href="CharStream.html#correctOffset(int)">correctOffset</a>
* functionality over Reader. All Tokenizers accept a
* CharStream as input, which enables arbitrary character
* based filtering before tokenization. The {@link
* #correctOffset} method fixed offsets to account for
* CharStream adds {@link #correctOffset}
* functionality over {@link Reader}. All Tokenizers accept a
* CharStream instead of {@link Reader} as input, which enables
* arbitrary character based filtering before tokenization.
* The {@link #correctOffset} method fixes offsets to account for
* removal or insertion of characters, so that the offsets
* reported in the tokens match the character offsets of the
* original Reader.
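
(Illustration, editor's addition.) A minimal custom CharStream, assuming its abstract surface is just Reader's read/close plus correctOffset(int) as described above; the class name UpperCaseCharStream is made up. Since it emits exactly one output character per input character, offsets need no correction.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.CharStream;

public class UpperCaseCharStream extends CharStream {
  private final Reader in;

  public UpperCaseCharStream(Reader in) {
    this.in = in;
  }

  public int read(char[] cbuf, int off, int len) throws IOException {
    int n = in.read(cbuf, off, len);
    for (int i = 0; i < n; i++) {
      cbuf[off + i] = Character.toUpperCase(cbuf[off + i]);
    }
    return n;
  }

  public void close() throws IOException {
    in.close();
  }

  // One output character per input character, so offsets map straight through.
  public int correctOffset(int currentOff) {
    return currentOff;
  }
}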

View File

@@ -104,13 +104,13 @@ public abstract class CharTokenizer extends Tokenizer {
}
termAtt.setTermLength(length);
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
public final void end() {
// set final offset
int finalOffset = input.correctOffset(offset);
int finalOffset = correctOffset(offset);
offsetAtt.setOffset(finalOffset, finalOffset);
}

View File

@@ -76,8 +76,8 @@ public class KeywordTokenizer extends Tokenizer {
buffer = termAtt.resizeTermBuffer(1+buffer.length);
}
termAtt.setTermLength(upto);
finalOffset = input.correctOffset(upto);
offsetAtt.setOffset(input.correctOffset(0), finalOffset);
finalOffset = correctOffset(upto);
offsetAtt.setOffset(correctOffset(0), finalOffset);
return true;
}
return false;

View File

@@ -18,6 +18,7 @@
package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedList;
/**
@@ -35,11 +36,18 @@ public class MappingCharFilter extends BaseCharFilter {
private int charPointer;
private int nextCharCounter;
/** Default constructor that takes a {@link CharStream}. */
public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
super(in);
this.normMap = normMap;
}
/** Easy-use constructor that takes a {@link Reader}. */
public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
super(CharReader.get(in));
this.normMap = normMap;
}
public int read() throws IOException {
while(true) {
if (replacement != null && charPointer < replacement.length()) {

View File

@@ -40,7 +40,7 @@ import java.io.IOException;
public abstract class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
protected CharStream input;
protected Reader input;
/** Construct a tokenizer with null input. */
protected Tokenizer() {}
@@ -49,11 +49,6 @@ public abstract class Tokenizer extends TokenStream {
protected Tokenizer(Reader input) {
this.input = CharReader.get(input);
}
/** Construct a token stream processing the given input. */
protected Tokenizer(CharStream input) {
this.input = input;
}
/** Construct a tokenizer with null input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory) {
@@ -65,12 +60,6 @@ public abstract class Tokenizer extends TokenStream {
super(factory);
this.input = CharReader.get(input);
}
/** Construct a token stream processing the given input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory, CharStream input) {
super(factory);
this.input = input;
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source) {
@@ -83,28 +72,25 @@ public abstract class Tokenizer extends TokenStream {
this.input = CharReader.get(input);
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, CharStream input) {
super(source);
this.input = input;
}
/** By default, closes the input Reader. */
public void close() throws IOException {
input.close();
}
/** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass
* this method calls {@link CharStream#correctOffset}, else returns <code>currentOff</code>.
* @param currentOff offset as seen in the output
* @return corrected offset based on the input
* @see CharStream#correctOffset
*/
protected final int correctOffset(int currentOff) {
return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
}
/** Expert: Reset the tokenizer to a new reader. Typically, an
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
public void reset(Reader input) throws IOException {
this.input = CharReader.get(input);
}
/** Expert: Reset the tokenizer to a new CharStream. Typically, an
* analyzer (in its reusableTokenStream method) will use
* this to re-use a previously created tokenizer. */
public void reset(CharStream input) throws IOException {
this.input = input;
}
}
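
(Illustration, editor's addition.) A sketch of how a subclass is meant to use the new protected correctOffset() helper and reset(Reader), following the same pattern as KeywordTokenizer above; the class name SingleTokenTokenizer is made up.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class SingleTokenTokenizer extends Tokenizer {
  private boolean done = false;
  private final TermAttribute termAtt;
  private final OffsetAttribute offsetAtt;

  public SingleTokenTokenizer(Reader input) {
    super(input);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (done) return false;
    clearAttributes();
    done = true;
    // Read the whole input as a single token.
    StringBuffer sb = new StringBuffer();
    char[] buf = new char[256];
    for (int n = input.read(buf); n != -1; n = input.read(buf)) {
      sb.append(buf, 0, n);
    }
    if (sb.length() == 0) return false;
    termAtt.setTermBuffer(sb.toString());
    // correctOffset() delegates to CharStream.correctOffset when the input
    // is a CharStream/CharFilter, otherwise it returns the offset as-is.
    offsetAtt.setOffset(correctOffset(0), correctOffset(sb.length()));
    return true;
  }

  public void reset(Reader input) throws IOException {
    super.reset(input);
    done = false;
  }
}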

View File

@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.standard;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -92,10 +91,6 @@ public class StandardTokenizer extends Tokenizer {
*/
private boolean replaceInvalidAcronym;
void setInput(Reader reader) {
input = CharReader.get(reader);
}
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/** Set the max allowed token length. Any token longer
@@ -152,7 +147,7 @@
private void init(Reader input, boolean replaceInvalidAcronym) {
this.replaceInvalidAcronym = replaceInvalidAcronym;
setInput(input);
this.input = input;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -186,7 +181,7 @@
posIncrAtt.setPositionIncrement(posIncr);
scanner.getText(termAtt);
final int start = scanner.yychar();
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
@@ -210,7 +205,7 @@
public final void end() {
// set final offset
int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
offsetAtt.setOffset(finalOffset, finalOffset);
}
@@ -237,7 +232,7 @@
}
public void reset(Reader reader) throws IOException {
setInput(reader);
super.reset(reader);
reset();
}

View File

@@ -41,7 +41,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
}
public void testReaderReset() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
char[] buf = new char[10];
int len = cs.read(buf, 0, 10);
assertEquals( 1, len );
@@ -57,55 +57,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
}
public void testNothingChange() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
}
public void test1to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
}
public void test1to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "j" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
}
public void test1to3() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "k" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
}
public void test2to4() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "ll" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
}
public void test2to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aa" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
}
public void test3to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "bbb" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
}
public void test4to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "cccc" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
}
public void test5to0() throws Exception {
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "empty" ) ) );
CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokenStreamContents(ts, new String[0]);
}