mirror of https://github.com/apache/lucene.git
LUCENE-4321: Don't extend the trappy/broken FilterReader and simplify CharFilter so subclasses are correct readers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1376158 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
eaad73b6d1
commit
568156b710
|
@ -61,6 +61,14 @@ API Changes
|
||||||
only exists so that this statistic can be accessed for Lucene 3.x
|
only exists so that this statistic can be accessed for Lucene 3.x
|
||||||
segments, which don't support Terms.size(). (Uwe Schindler, Robert Muir)
|
segments, which don't support Terms.size(). (Uwe Schindler, Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-4321: Change CharFilter to extend Reader directly, as FilterReader
|
||||||
|
overdelegates (read(), read(char[], int, int), skip, etc). This made it
|
||||||
|
hard to implement CharFilters that were correct. Instead only close() is
|
||||||
|
delegated by default: read(char[], int, int) and correct(int) are abstract
|
||||||
|
so that its obvious which methods you should implement. The protected
|
||||||
|
inner Reader is 'input' like CharFilter in the 3.x series, instead of 'in'.
|
||||||
|
(Dawid Weiss, Uwe Schindler, Robert Muir)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
|
|
||||||
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
|
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
|
||||||
|
|
|
@ -67,8 +67,8 @@ public class MappingCharFilter extends BaseCharFilter {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void reset() throws IOException {
|
public void reset() throws IOException {
|
||||||
super.reset();
|
input.reset();
|
||||||
buffer.reset(in);
|
buffer.reset(input);
|
||||||
replacement = null;
|
replacement = null;
|
||||||
inputOff = 0;
|
inputOff = 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class PersianCharFilter extends CharFilter {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
final int charsRead = super.read(cbuf, off, len);
|
final int charsRead = input.read(cbuf, off, len);
|
||||||
if (charsRead > 0) {
|
if (charsRead > 0) {
|
||||||
final int end = off + charsRead;
|
final int end = off + charsRead;
|
||||||
while (off < end) {
|
while (off < end) {
|
||||||
|
@ -46,6 +46,17 @@ public class PersianCharFilter extends CharFilter {
|
||||||
return charsRead;
|
return charsRead;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// optimized impl: some other charfilters consume with read()
|
||||||
|
@Override
|
||||||
|
public int read() throws IOException {
|
||||||
|
int ch = input.read();
|
||||||
|
if (ch == '\u200C') {
|
||||||
|
return ' ';
|
||||||
|
} else {
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int correct(int currentOff) {
|
protected int correct(int currentOff) {
|
||||||
return currentOff; // we don't change the length of the string
|
return currentOff; // we don't change the length of the string
|
||||||
|
|
|
@ -72,7 +72,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
|
||||||
private void fill() throws IOException {
|
private void fill() throws IOException {
|
||||||
StringBuilder buffered = new StringBuilder();
|
StringBuilder buffered = new StringBuilder();
|
||||||
char [] temp = new char [1024];
|
char [] temp = new char [1024];
|
||||||
for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
|
for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
|
||||||
buffered.append(temp, 0, cnt);
|
buffered.append(temp, 0, cnt);
|
||||||
}
|
}
|
||||||
transformedInput = new StringReader(processPattern(buffered).toString());
|
transformedInput = new StringReader(processPattern(buffered).toString());
|
||||||
|
|
|
@ -782,31 +782,51 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
||||||
@Override
|
@Override
|
||||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
readSomething = true;
|
readSomething = true;
|
||||||
return in.read(cbuf, off, len);
|
return input.read(cbuf, off, len);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int read() throws IOException {
|
public int read() throws IOException {
|
||||||
readSomething = true;
|
readSomething = true;
|
||||||
return in.read();
|
return input.read();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int read(CharBuffer target) throws IOException {
|
public int read(CharBuffer target) throws IOException {
|
||||||
readSomething = true;
|
readSomething = true;
|
||||||
return in.read(target);
|
return input.read(target);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int read(char[] cbuf) throws IOException {
|
public int read(char[] cbuf) throws IOException {
|
||||||
readSomething = true;
|
readSomething = true;
|
||||||
return in.read(cbuf);
|
return input.read(cbuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long skip(long n) throws IOException {
|
public long skip(long n) throws IOException {
|
||||||
readSomething = true;
|
readSomething = true;
|
||||||
return in.skip(n);
|
return input.skip(n);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void mark(int readAheadLimit) throws IOException {
|
||||||
|
input.mark(readAheadLimit);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean markSupported() {
|
||||||
|
return input.markSupported();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean ready() throws IOException {
|
||||||
|
return input.ready();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void reset() throws IOException {
|
||||||
|
input.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.fa;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
|
||||||
|
public class TestPersianCharFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
return new TokenStreamComponents(new MockTokenizer(reader));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Reader initReader(String fieldName, Reader reader) {
|
||||||
|
return new PersianCharFilter(reader);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public void testBasics() throws Exception {
|
||||||
|
assertAnalyzesTo(analyzer, "this is a\u200Ctest",
|
||||||
|
new String[] { "this", "is", "a", "test" });
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws Exception {
|
||||||
|
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,7 +17,7 @@ package org.apache.lucene.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.FilterReader;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -25,15 +25,37 @@ import java.io.Reader;
|
||||||
* They can be used as {@link java.io.Reader} with additional offset
|
* They can be used as {@link java.io.Reader} with additional offset
|
||||||
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
|
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
|
||||||
* if a CharFilter subclass is used.
|
* if a CharFilter subclass is used.
|
||||||
|
* <p>
|
||||||
|
* This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
|
||||||
|
* transforming the input in some way from {@link #input}, and {@link #correct(int)}
|
||||||
|
* to adjust the offsets to match the originals.
|
||||||
|
* <p>
|
||||||
|
* You can optionally provide more efficient implementations of additional methods
|
||||||
|
* like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
|
||||||
|
* but this is not required.
|
||||||
*/
|
*/
|
||||||
public abstract class CharFilter extends FilterReader {
|
// the way java.io.FilterReader should work!
|
||||||
|
public abstract class CharFilter extends Reader {
|
||||||
|
/**
|
||||||
|
* The underlying character-input stream.
|
||||||
|
*/
|
||||||
|
protected final Reader input;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new CharFilter wrapping the provided reader.
|
* Create a new CharFilter wrapping the provided reader.
|
||||||
* @param in a Reader, can also be a CharFilter for chaining.
|
* @param input a Reader, can also be a CharFilter for chaining.
|
||||||
*/
|
*/
|
||||||
public CharFilter(Reader in) {
|
public CharFilter(Reader input) {
|
||||||
super(in);
|
super(input);
|
||||||
|
this.input = input;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the underlying input stream.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
input.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -50,6 +72,6 @@ public abstract class CharFilter extends FilterReader {
|
||||||
*/
|
*/
|
||||||
public final int correctOffset(int currentOff) {
|
public final int correctOffset(int currentOff) {
|
||||||
final int corrected = correct(currentOff);
|
final int corrected = correct(currentOff);
|
||||||
return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected;
|
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
@ -50,6 +51,11 @@ public class TestCharFilter extends LuceneTestCase {
|
||||||
super(in);
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
return input.read(cbuf, off, len);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int correct(int currentOff) {
|
protected int correct(int currentOff) {
|
||||||
return currentOff + 1;
|
return currentOff + 1;
|
||||||
|
@ -62,6 +68,11 @@ public class TestCharFilter extends LuceneTestCase {
|
||||||
super(in);
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
return input.read(cbuf, off, len);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int correct(int currentOff) {
|
protected int correct(int currentOff) {
|
||||||
return currentOff + 2;
|
return currentOff + 2;
|
||||||
|
|
|
@ -43,11 +43,6 @@ public class MockCharFilter extends CharFilter {
|
||||||
this(in, 0);
|
this(in, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws IOException {
|
|
||||||
in.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
int currentOffset = -1;
|
int currentOffset = -1;
|
||||||
int delta = 0;
|
int delta = 0;
|
||||||
int bufferedCh = -1;
|
int bufferedCh = -1;
|
||||||
|
@ -66,7 +61,7 @@ public class MockCharFilter extends CharFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// otherwise actually read one
|
// otherwise actually read one
|
||||||
int ch = in.read();
|
int ch = input.read();
|
||||||
if (ch < 0)
|
if (ch < 0)
|
||||||
return ch;
|
return ch;
|
||||||
|
|
||||||
|
|
|
@ -19,12 +19,16 @@ package org.apache.lucene.analysis;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.nio.CharBuffer;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||||
import org.apache.lucene.util.automaton.RegExp;
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
|
|
||||||
|
import com.carrotsearch.randomizedtesting.RandomizedContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizer for testing.
|
* Tokenizer for testing.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -77,6 +81,9 @@ public class MockTokenizer extends Tokenizer {
|
||||||
private int lastOffset = 0; // only for asserting
|
private int lastOffset = 0; // only for asserting
|
||||||
private boolean enableChecks = true;
|
private boolean enableChecks = true;
|
||||||
|
|
||||||
|
// evil: but we don't change the behavior with this random, we only switch up how we read
|
||||||
|
private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
|
||||||
|
|
||||||
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
||||||
super(factory, input);
|
super(factory, input);
|
||||||
this.runAutomaton = runAutomaton;
|
this.runAutomaton = runAutomaton;
|
||||||
|
@ -139,14 +146,14 @@ public class MockTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected int readCodePoint() throws IOException {
|
protected int readCodePoint() throws IOException {
|
||||||
int ch = input.read();
|
int ch = readChar();
|
||||||
if (ch < 0) {
|
if (ch < 0) {
|
||||||
return ch;
|
return ch;
|
||||||
} else {
|
} else {
|
||||||
assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
|
assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
|
||||||
off++;
|
off++;
|
||||||
if (Character.isHighSurrogate((char) ch)) {
|
if (Character.isHighSurrogate((char) ch)) {
|
||||||
int ch2 = input.read();
|
int ch2 = readChar();
|
||||||
if (ch2 >= 0) {
|
if (ch2 >= 0) {
|
||||||
off++;
|
off++;
|
||||||
assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
|
assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
|
||||||
|
@ -159,6 +166,33 @@ public class MockTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected int readChar() throws IOException {
|
||||||
|
switch(random.nextInt(10)) {
|
||||||
|
case 0: {
|
||||||
|
// read(char[])
|
||||||
|
char c[] = new char[1];
|
||||||
|
int ret = input.read(c);
|
||||||
|
return ret < 0 ? ret : c[0];
|
||||||
|
}
|
||||||
|
case 1: {
|
||||||
|
// read(char[], int, int)
|
||||||
|
char c[] = new char[2];
|
||||||
|
int ret = input.read(c, 1, 1);
|
||||||
|
return ret < 0 ? ret : c[1];
|
||||||
|
}
|
||||||
|
case 2: {
|
||||||
|
// read(CharBuffer)
|
||||||
|
char c[] = new char[1];
|
||||||
|
CharBuffer cb = CharBuffer.wrap(c);
|
||||||
|
int ret = input.read(cb);
|
||||||
|
return ret < 0 ? ret : c[0];
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
// read()
|
||||||
|
return input.read();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
protected boolean isTokenChar(int c) {
|
protected boolean isTokenChar(int c) {
|
||||||
state = runAutomaton.step(state, c);
|
state = runAutomaton.step(state, c);
|
||||||
if (state < 0) {
|
if (state < 0) {
|
||||||
|
|
|
@ -103,7 +103,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
|
||||||
return ch;
|
return ch;
|
||||||
}
|
}
|
||||||
numRead++;
|
numRead++;
|
||||||
return in.read();
|
return input.read();
|
||||||
}
|
}
|
||||||
|
|
||||||
private int nextSkipWS() throws IOException {
|
private int nextSkipWS() throws IOException {
|
||||||
|
@ -118,7 +118,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
|
||||||
return pushed.charAt(len-1);
|
return pushed.charAt(len-1);
|
||||||
}
|
}
|
||||||
numRead++;
|
numRead++;
|
||||||
int ch = in.read();
|
int ch = input.read();
|
||||||
push(ch);
|
push(ch);
|
||||||
return ch;
|
return ch;
|
||||||
}
|
}
|
||||||
|
@ -180,11 +180,11 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
|
||||||
|
|
||||||
private void saveState() throws IOException {
|
private void saveState() throws IOException {
|
||||||
lastMark = numRead;
|
lastMark = numRead;
|
||||||
in.mark(readAheadLimit);
|
input.mark(readAheadLimit);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void restoreState() throws IOException {
|
private void restoreState() throws IOException {
|
||||||
in.reset();
|
input.reset();
|
||||||
pushed.setLength(0);
|
pushed.setLength(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue