LUCENE-4321: Don't extend the trappy/broken FilterReader and simplify CharFilter so subclasses are correct readers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1376158 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-08-22 17:51:19 +00:00
parent eaad73b6d1
commit 568156b710
11 changed files with 176 additions and 27 deletions

View File

@ -61,6 +61,14 @@ API Changes
only exists so that this statistic can be accessed for Lucene 3.x
segments, which don't support Terms.size(). (Uwe Schindler, Robert Muir)
* LUCENE-4321: Change CharFilter to extend Reader directly, as FilterReader
overdelegates (read(), read(char[], int, int), skip, etc). This made it
hard to implement CharFilters that were correct. Instead only close() is
delegated by default: read(char[], int, int) and correct(int) are abstract
so that its obvious which methods you should implement. The protected
inner Reader is 'input' like CharFilter in the 3.x series, instead of 'in'.
(Dawid Weiss, Uwe Schindler, Robert Muir)
Bug Fixes
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor

View File

@ -67,8 +67,8 @@ public class MappingCharFilter extends BaseCharFilter {
@Override
public void reset() throws IOException {
super.reset();
buffer.reset(in);
input.reset();
buffer.reset(input);
replacement = null;
inputOff = 0;
}

View File

@ -34,7 +34,7 @@ public class PersianCharFilter extends CharFilter {
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
final int charsRead = super.read(cbuf, off, len);
final int charsRead = input.read(cbuf, off, len);
if (charsRead > 0) {
final int end = off + charsRead;
while (off < end) {
@ -46,6 +46,17 @@ public class PersianCharFilter extends CharFilter {
return charsRead;
}
// optimized impl: some other charfilters consume with read()
@Override
public int read() throws IOException {
int ch = input.read();
if (ch == '\u200C') {
return ' ';
} else {
return ch;
}
}
@Override
protected int correct(int currentOff) {
return currentOff; // we don't change the length of the string

View File

@ -72,7 +72,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
private void fill() throws IOException {
StringBuilder buffered = new StringBuilder();
char [] temp = new char [1024];
for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
buffered.append(temp, 0, cnt);
}
transformedInput = new StringReader(processPattern(buffered).toString());

View File

@ -782,31 +782,51 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
readSomething = true;
return in.read(cbuf, off, len);
return input.read(cbuf, off, len);
}
@Override
public int read() throws IOException {
readSomething = true;
return in.read();
return input.read();
}
@Override
public int read(CharBuffer target) throws IOException {
readSomething = true;
return in.read(target);
return input.read(target);
}
@Override
public int read(char[] cbuf) throws IOException {
readSomething = true;
return in.read(cbuf);
return input.read(cbuf);
}
@Override
public long skip(long n) throws IOException {
readSomething = true;
return in.skip(n);
return input.skip(n);
}
@Override
public void mark(int readAheadLimit) throws IOException {
input.mark(readAheadLimit);
}
@Override
public boolean markSupported() {
return input.markSupported();
}
@Override
public boolean ready() throws IOException {
return input.ready();
}
@Override
public void reset() throws IOException {
input.reset();
}
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.analysis.fa;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
public class TestPersianCharFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new MockTokenizer(reader));
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new PersianCharFilter(reader);
}
};
public void testBasics() throws Exception {
assertAnalyzesTo(analyzer, "this is a\u200Ctest",
new String[] { "this", "is", "a", "test" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
}

View File

@ -17,7 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.FilterReader;
import java.io.IOException;
import java.io.Reader;
/**
@ -25,15 +25,37 @@ import java.io.Reader;
* They can be used as {@link java.io.Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter subclass is used.
* <p>
* This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
* transforming the input in some way from {@link #input}, and {@link #correct(int)}
* to adjust the offsets to match the originals.
* <p>
* You can optionally provide more efficient implementations of additional methods
* like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
* but this is not required.
*/
public abstract class CharFilter extends FilterReader {
// the way java.io.FilterReader should work!
public abstract class CharFilter extends Reader {
/**
* The underlying character-input stream.
*/
protected final Reader input;
/**
* Create a new CharFilter wrapping the provided reader.
* @param in a Reader, can also be a CharFilter for chaining.
* @param input a Reader, can also be a CharFilter for chaining.
*/
public CharFilter(Reader in) {
super(in);
public CharFilter(Reader input) {
super(input);
this.input = input;
}
/**
* Closes the underlying input stream.
*/
@Override
public void close() throws IOException {
input.close();
}
/**
@ -50,6 +72,6 @@ public abstract class CharFilter extends FilterReader {
*/
public final int correctOffset(int currentOff) {
final int corrected = correct(currentOff);
return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected;
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
}
}

View File

@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -50,6 +51,11 @@ public class TestCharFilter extends LuceneTestCase {
super(in);
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return input.read(cbuf, off, len);
}
@Override
protected int correct(int currentOff) {
return currentOff + 1;
@ -62,6 +68,11 @@ public class TestCharFilter extends LuceneTestCase {
super(in);
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
return input.read(cbuf, off, len);
}
@Override
protected int correct(int currentOff) {
return currentOff + 2;

View File

@ -43,11 +43,6 @@ public class MockCharFilter extends CharFilter {
this(in, 0);
}
@Override
public void close() throws IOException {
in.close();
}
int currentOffset = -1;
int delta = 0;
int bufferedCh = -1;
@ -66,7 +61,7 @@ public class MockCharFilter extends CharFilter {
}
// otherwise actually read one
int ch = in.read();
int ch = input.read();
if (ch < 0)
return ch;

View File

@ -19,12 +19,16 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import com.carrotsearch.randomizedtesting.RandomizedContext;
/**
* Tokenizer for testing.
* <p>
@ -77,6 +81,9 @@ public class MockTokenizer extends Tokenizer {
private int lastOffset = 0; // only for asserting
private boolean enableChecks = true;
// evil: but we don't change the behavior with this random, we only switch up how we read
private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
super(factory, input);
this.runAutomaton = runAutomaton;
@ -139,14 +146,14 @@ public class MockTokenizer extends Tokenizer {
}
protected int readCodePoint() throws IOException {
int ch = input.read();
int ch = readChar();
if (ch < 0) {
return ch;
} else {
assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
off++;
if (Character.isHighSurrogate((char) ch)) {
int ch2 = input.read();
int ch2 = readChar();
if (ch2 >= 0) {
off++;
assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
@ -159,6 +166,33 @@ public class MockTokenizer extends Tokenizer {
}
}
protected int readChar() throws IOException {
switch(random.nextInt(10)) {
case 0: {
// read(char[])
char c[] = new char[1];
int ret = input.read(c);
return ret < 0 ? ret : c[0];
}
case 1: {
// read(char[], int, int)
char c[] = new char[2];
int ret = input.read(c, 1, 1);
return ret < 0 ? ret : c[1];
}
case 2: {
// read(CharBuffer)
char c[] = new char[1];
CharBuffer cb = CharBuffer.wrap(c);
int ret = input.read(cb);
return ret < 0 ? ret : c[0];
}
default:
// read()
return input.read();
}
}
protected boolean isTokenChar(int c) {
state = runAutomaton.step(state, c);
if (state < 0) {

View File

@ -103,7 +103,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
return ch;
}
numRead++;
return in.read();
return input.read();
}
private int nextSkipWS() throws IOException {
@ -118,7 +118,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
return pushed.charAt(len-1);
}
numRead++;
int ch = in.read();
int ch = input.read();
push(ch);
return ch;
}
@ -180,11 +180,11 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
private void saveState() throws IOException {
lastMark = numRead;
in.mark(readAheadLimit);
input.mark(readAheadLimit);
}
private void restoreState() throws IOException {
in.reset();
input.reset();
pushed.setLength(0);
}