mirror of https://github.com/apache/lucene.git
LUCENE-1906: Fix backwards problems with CharStream and Tokenizers with custom reset(Reader) method.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@813671 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
05fa98b1bb
commit
4666489857
10
CHANGES.txt
10
CHANGES.txt
|
@ -350,10 +350,12 @@ API Changes
|
|||
a top level reader and docID.
|
||||
(Shai Erera, Chris Hostetter, Martin Ruckli, Mark Miller via Mike McCandless)
|
||||
|
||||
* LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
|
||||
CharFilter and MappingCharFilter, which allows chaining & mapping
|
||||
of characters before tokenizers run. (Koji Sekiguchi via Mike
|
||||
McCandless)
|
||||
* LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allows
|
||||
chaining & mapping of characters before tokenizers run. CharStream (subclass of
|
||||
Reader) is the base class for custom java.io.Reader's, that support offset
|
||||
correction. Tokenizers got an additional method correctOffset() that is passed
|
||||
down to the underlying CharStream if input is a subclass of CharStream/-Filter.
|
||||
(Koji Sekiguchi via Mike McCandless, Uwe Schindler)
|
||||
|
||||
* LUCENE-1703: Add IndexWriter.waitForMerges. (Tim Smith via Mike
|
||||
McCandless)
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
<property name="Name" value="Lucene"/>
|
||||
<property name="dev.version" value="2.9"/>
|
||||
<property name="version" value="${dev.version}"/>
|
||||
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090903"/>
|
||||
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090911"/>
|
||||
<property name="spec.version" value="${version}"/>
|
||||
<property name="year" value="2000-${current.year}"/>
|
||||
<property name="final.name" value="lucene-${name}-${version}"/>
|
||||
|
|
|
@ -285,7 +285,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
|
||||
if (length > 0) {
|
||||
termAtt.setTermBuffer(buffer, 0, length);
|
||||
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
|
||||
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
|
||||
return true;
|
||||
} else if (dataLen == -1) {
|
||||
|
|
|
@ -104,7 +104,7 @@ public final class ChineseTokenizer extends Tokenizer {
|
|||
//System.out.println(new String(buffer, 0,
|
||||
//length));
|
||||
termAtt.setTermBuffer(buffer, 0, length);
|
||||
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
|
||||
return true;
|
||||
}
|
||||
else
|
||||
|
|
|
@ -207,7 +207,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
|||
int start = side == Side.FRONT ? 0 : inLen - gramSize;
|
||||
int end = start + gramSize;
|
||||
termAtt.setTermBuffer(inStr, start, gramSize);
|
||||
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
|
||||
gramSize++;
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -124,7 +124,7 @@ public class NGramTokenizer extends Tokenizer {
|
|||
int oldPos = pos;
|
||||
pos++;
|
||||
termAtt.setTermBuffer(inStr, oldPos, gramSize);
|
||||
offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
|
||||
offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ public final class SentenceTokenizer extends Tokenizer {
|
|||
return false;
|
||||
else {
|
||||
termAtt.setTermBuffer(buffer.toString());
|
||||
offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
|
||||
offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
|
||||
typeAtt.setType("sentence");
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -202,7 +202,7 @@ public abstract class AbstractTestCase extends TestCase {
|
|||
return false;
|
||||
|
||||
termAtt.setTermBuffer(snippet, startTerm, lenTerm);
|
||||
offsetAtt.setOffset(startOffset, startOffset + lenTerm);
|
||||
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.lucene.wikipedia.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
|
@ -127,10 +126,6 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
private TermAttribute termAtt;
|
||||
private FlagsAttribute flagsAtt;
|
||||
|
||||
void setInput(Reader reader) {
|
||||
this.input = CharReader.get(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
|
||||
* <code>input</code> to a newly created JFlex scanner.
|
||||
|
@ -267,7 +262,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
//trim the buffer
|
||||
String s = buffer.toString().trim();
|
||||
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||
offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
|
||||
offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
|
||||
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||
|
@ -305,7 +300,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
//trim the buffer
|
||||
String s = buffer.toString().trim();
|
||||
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||
offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
|
||||
offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
|
||||
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||
|
@ -318,7 +313,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
private void setupToken() {
|
||||
scanner.getText(termAtt);
|
||||
final int start = scanner.yychar();
|
||||
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -332,7 +327,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public void reset(Reader reader) throws IOException {
|
||||
setInput(reader);
|
||||
super.reset(reader);
|
||||
reset();
|
||||
}
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ public abstract class BaseCharFilter extends CharFilter {
|
|||
/** Retrieve the corrected offset. Note that this method
|
||||
* is slow, if you correct positions far before the most
|
||||
* recently added position, as it's a simple linear
|
||||
* searhc backwards through all offset corrections added
|
||||
* search backwards through all offset corrections added
|
||||
* by {@link #addOffCorrectMap}. */
|
||||
protected int correct(int currentOff) {
|
||||
if (pcmList == null || pcmList.isEmpty()) {
|
||||
|
|
|
@ -21,6 +21,9 @@ import java.io.IOException;
|
|||
|
||||
/**
|
||||
* Subclasses of CharFilter can be chained to filter CharStream.
|
||||
* They can be used as {@link java.io.Reader} with additional offset
|
||||
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
|
||||
* if a CharFilter/CharStream subclass is used.
|
||||
*
|
||||
* @version $Id$
|
||||
*
|
||||
|
|
|
@ -20,12 +20,11 @@ package org.apache.lucene.analysis;
|
|||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* CharStream adds <a
|
||||
* href="CharStream.html#correctOffset(int)">correctOffset</a>
|
||||
* functionality over Reader. All Tokenizers accept a
|
||||
* CharStream as input, which enables arbitrary character
|
||||
* based filtering before tokenization. The {@link
|
||||
* #correctOffset} method fixed offsets to account for
|
||||
* CharStream adds {@link #correctOffset}
|
||||
* functionality over {@link Reader}. All Tokenizers accept a
|
||||
* CharStream instead of {@link Reader} as input, which enables
|
||||
* arbitrary character based filtering before tokenization.
|
||||
* The {@link #correctOffset} method fixed offsets to account for
|
||||
* removal or insertion of characters, so that the offsets
|
||||
* reported in the tokens match the character offsets of the
|
||||
* original Reader.
|
||||
|
|
|
@ -104,13 +104,13 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
termAtt.setTermLength(length);
|
||||
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
|
||||
return true;
|
||||
}
|
||||
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = input.correctOffset(offset);
|
||||
int finalOffset = correctOffset(offset);
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
|
|
|
@ -76,8 +76,8 @@ public class KeywordTokenizer extends Tokenizer {
|
|||
buffer = termAtt.resizeTermBuffer(1+buffer.length);
|
||||
}
|
||||
termAtt.setTermLength(upto);
|
||||
finalOffset = input.correctOffset(upto);
|
||||
offsetAtt.setOffset(input.correctOffset(0), finalOffset);
|
||||
finalOffset = correctOffset(upto);
|
||||
offsetAtt.setOffset(correctOffset(0), finalOffset);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
|
@ -35,11 +36,18 @@ public class MappingCharFilter extends BaseCharFilter {
|
|||
private int charPointer;
|
||||
private int nextCharCounter;
|
||||
|
||||
/** Default constructor that takes a {@link CharStream}. */
|
||||
public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
|
||||
super(in);
|
||||
this.normMap = normMap;
|
||||
}
|
||||
|
||||
/** Easy-use constructor that takes a {@link Reader}. */
|
||||
public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
|
||||
super(CharReader.get(in));
|
||||
this.normMap = normMap;
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
while(true) {
|
||||
if (replacement != null && charPointer < replacement.length()) {
|
||||
|
|
|
@ -40,7 +40,7 @@ import java.io.IOException;
|
|||
|
||||
public abstract class Tokenizer extends TokenStream {
|
||||
/** The text source for this Tokenizer. */
|
||||
protected CharStream input;
|
||||
protected Reader input;
|
||||
|
||||
/** Construct a tokenizer with null input. */
|
||||
protected Tokenizer() {}
|
||||
|
@ -50,11 +50,6 @@ public abstract class Tokenizer extends TokenStream {
|
|||
this.input = CharReader.get(input);
|
||||
}
|
||||
|
||||
/** Construct a token stream processing the given input. */
|
||||
protected Tokenizer(CharStream input) {
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
/** Construct a tokenizer with null input using the given AttributeFactory. */
|
||||
protected Tokenizer(AttributeFactory factory) {
|
||||
super(factory);
|
||||
|
@ -66,12 +61,6 @@ public abstract class Tokenizer extends TokenStream {
|
|||
this.input = CharReader.get(input);
|
||||
}
|
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeFactory. */
|
||||
protected Tokenizer(AttributeFactory factory, CharStream input) {
|
||||
super(factory);
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeSource. */
|
||||
protected Tokenizer(AttributeSource source) {
|
||||
super(source);
|
||||
|
@ -83,28 +72,25 @@ public abstract class Tokenizer extends TokenStream {
|
|||
this.input = CharReader.get(input);
|
||||
}
|
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeSource. */
|
||||
protected Tokenizer(AttributeSource source, CharStream input) {
|
||||
super(source);
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
/** By default, closes the input Reader. */
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
/** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass
|
||||
* this method calls {@link CharStream#correctOffset}, else returns <code>currentOff</code>.
|
||||
* @param currentOff offset as seen in the output
|
||||
* @return corrected offset based on the input
|
||||
* @see CharStream#correctOffset
|
||||
*/
|
||||
protected final int correctOffset(int currentOff) {
|
||||
return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
|
||||
}
|
||||
|
||||
/** Expert: Reset the tokenizer to a new reader. Typically, an
|
||||
* analyzer (in its reusableTokenStream method) will use
|
||||
* this to re-use a previously created tokenizer. */
|
||||
public void reset(Reader input) throws IOException {
|
||||
this.input = CharReader.get(input);
|
||||
}
|
||||
|
||||
/** Expert: Reset the tokenizer to a new CharStream. Typically, an
|
||||
* analyzer (in its reusableTokenStream method) will use
|
||||
* this to re-use a previously created tokenizer. */
|
||||
public void reset(CharStream input) throws IOException {
|
||||
this.input = input;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.analysis.standard;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
|
@ -92,10 +91,6 @@ public class StandardTokenizer extends Tokenizer {
|
|||
*/
|
||||
private boolean replaceInvalidAcronym;
|
||||
|
||||
void setInput(Reader reader) {
|
||||
input = CharReader.get(reader);
|
||||
}
|
||||
|
||||
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
||||
|
||||
/** Set the max allowed token length. Any token longer
|
||||
|
@ -152,7 +147,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
|
||||
private void init(Reader input, boolean replaceInvalidAcronym) {
|
||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||
setInput(input);
|
||||
this.input = input;
|
||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
@ -186,7 +181,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
posIncrAtt.setPositionIncrement(posIncr);
|
||||
scanner.getText(termAtt);
|
||||
final int start = scanner.yychar();
|
||||
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
|
||||
// This 'if' should be removed in the next release. For now, it converts
|
||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||
// remain.
|
||||
|
@ -210,7 +205,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
|
||||
public final void end() {
|
||||
// set final offset
|
||||
int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
|
||||
int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
|
@ -237,7 +232,7 @@ public class StandardTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public void reset(Reader reader) throws IOException {
|
||||
setInput(reader);
|
||||
super.reset(reader);
|
||||
reset();
|
||||
}
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testReaderReset() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
|
||||
char[] buf = new char[10];
|
||||
int len = cs.read(buf, 0, 10);
|
||||
assertEquals( 1, len );
|
||||
|
@ -57,55 +57,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testNothingChange() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
|
||||
}
|
||||
|
||||
public void test1to1() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
|
||||
}
|
||||
|
||||
public void test1to2() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "j" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
|
||||
}
|
||||
|
||||
public void test1to3() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "k" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
|
||||
}
|
||||
|
||||
public void test2to4() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "ll" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
|
||||
}
|
||||
|
||||
public void test2to1() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aa" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
|
||||
}
|
||||
|
||||
public void test3to1() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "bbb" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
|
||||
}
|
||||
|
||||
public void test4to2() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "cccc" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
|
||||
}
|
||||
|
||||
public void test5to0() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "empty" ) ) );
|
||||
CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokenStreamContents(ts, new String[0]);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue