mirror of https://github.com/apache/lucene.git
LUCENE-4321: Don't extend the trappy/broken FilterReader and simplify CharFilter so subclasses are correct readers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1376158 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
eaad73b6d1
commit
568156b710
|
@ -61,6 +61,14 @@ API Changes
|
|||
only exists so that this statistic can be accessed for Lucene 3.x
|
||||
segments, which don't support Terms.size(). (Uwe Schindler, Robert Muir)
|
||||
|
||||
* LUCENE-4321: Change CharFilter to extend Reader directly, as FilterReader
|
||||
overdelegates (read(), read(char[], int, int), skip, etc). This made it
|
||||
hard to implement CharFilters that were correct. Instead only close() is
|
||||
delegated by default: read(char[], int, int) and correct(int) are abstract
|
||||
so that its obvious which methods you should implement. The protected
|
||||
inner Reader is 'input' like CharFilter in the 3.x series, instead of 'in'.
|
||||
(Dawid Weiss, Uwe Schindler, Robert Muir)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
|
||||
|
|
|
@ -67,8 +67,8 @@ public class MappingCharFilter extends BaseCharFilter {
|
|||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
buffer.reset(in);
|
||||
input.reset();
|
||||
buffer.reset(input);
|
||||
replacement = null;
|
||||
inputOff = 0;
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ public class PersianCharFilter extends CharFilter {
|
|||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
final int charsRead = super.read(cbuf, off, len);
|
||||
final int charsRead = input.read(cbuf, off, len);
|
||||
if (charsRead > 0) {
|
||||
final int end = off + charsRead;
|
||||
while (off < end) {
|
||||
|
@ -45,6 +45,17 @@ public class PersianCharFilter extends CharFilter {
|
|||
}
|
||||
return charsRead;
|
||||
}
|
||||
|
||||
// optimized impl: some other charfilters consume with read()
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
int ch = input.read();
|
||||
if (ch == '\u200C') {
|
||||
return ' ';
|
||||
} else {
|
||||
return ch;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correct(int currentOff) {
|
||||
|
|
|
@ -72,7 +72,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
|
|||
private void fill() throws IOException {
|
||||
StringBuilder buffered = new StringBuilder();
|
||||
char [] temp = new char [1024];
|
||||
for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
|
||||
for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
|
||||
buffered.append(temp, 0, cnt);
|
||||
}
|
||||
transformedInput = new StringReader(processPattern(buffered).toString());
|
||||
|
|
|
@ -782,31 +782,51 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
readSomething = true;
|
||||
return in.read(cbuf, off, len);
|
||||
return input.read(cbuf, off, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
readSomething = true;
|
||||
return in.read();
|
||||
return input.read();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(CharBuffer target) throws IOException {
|
||||
readSomething = true;
|
||||
return in.read(target);
|
||||
return input.read(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf) throws IOException {
|
||||
readSomething = true;
|
||||
return in.read(cbuf);
|
||||
return input.read(cbuf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long skip(long n) throws IOException {
|
||||
readSomething = true;
|
||||
return in.skip(n);
|
||||
return input.skip(n);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mark(int readAheadLimit) throws IOException {
|
||||
input.mark(readAheadLimit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean markSupported() {
|
||||
return input.markSupported();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean ready() throws IOException {
|
||||
return input.ready();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
input.reset();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
package org.apache.lucene.analysis.fa;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
|
||||
public class TestPersianCharFilter extends BaseTokenStreamTestCase {
|
||||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
return new TokenStreamComponents(new MockTokenizer(reader));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Reader initReader(String fieldName, Reader reader) {
|
||||
return new PersianCharFilter(reader);
|
||||
}
|
||||
};
|
||||
|
||||
public void testBasics() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "this is a\u200Ctest",
|
||||
new String[] { "this", "is", "a", "test" });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
}
|
|
@ -17,7 +17,7 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.FilterReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
|
@ -25,17 +25,39 @@ import java.io.Reader;
|
|||
* They can be used as {@link java.io.Reader} with additional offset
|
||||
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
|
||||
* if a CharFilter subclass is used.
|
||||
* <p>
|
||||
* This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
|
||||
* transforming the input in some way from {@link #input}, and {@link #correct(int)}
|
||||
* to adjust the offsets to match the originals.
|
||||
* <p>
|
||||
* You can optionally provide more efficient implementations of additional methods
|
||||
* like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
|
||||
* but this is not required.
|
||||
*/
|
||||
public abstract class CharFilter extends FilterReader {
|
||||
// the way java.io.FilterReader should work!
|
||||
public abstract class CharFilter extends Reader {
|
||||
/**
|
||||
* The underlying character-input stream.
|
||||
*/
|
||||
protected final Reader input;
|
||||
|
||||
/**
|
||||
* Create a new CharFilter wrapping the provided reader.
|
||||
* @param in a Reader, can also be a CharFilter for chaining.
|
||||
* @param input a Reader, can also be a CharFilter for chaining.
|
||||
*/
|
||||
public CharFilter(Reader in) {
|
||||
super(in);
|
||||
public CharFilter(Reader input) {
|
||||
super(input);
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes the underlying input stream.
|
||||
*/
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses override to correct the current offset.
|
||||
*
|
||||
|
@ -50,6 +72,6 @@ public abstract class CharFilter extends FilterReader {
|
|||
*/
|
||||
public final int correctOffset(int currentOff) {
|
||||
final int corrected = correct(currentOff);
|
||||
return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected;
|
||||
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
|
@ -50,6 +51,11 @@ public class TestCharFilter extends LuceneTestCase {
|
|||
super(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
return input.read(cbuf, off, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correct(int currentOff) {
|
||||
return currentOff + 1;
|
||||
|
@ -61,6 +67,11 @@ public class TestCharFilter extends LuceneTestCase {
|
|||
protected CharFilter2(Reader in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
return input.read(cbuf, off, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correct(int currentOff) {
|
||||
|
|
|
@ -42,11 +42,6 @@ public class MockCharFilter extends CharFilter {
|
|||
public MockCharFilter(Reader in) {
|
||||
this(in, 0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
in.close();
|
||||
}
|
||||
|
||||
int currentOffset = -1;
|
||||
int delta = 0;
|
||||
|
@ -66,7 +61,7 @@ public class MockCharFilter extends CharFilter {
|
|||
}
|
||||
|
||||
// otherwise actually read one
|
||||
int ch = in.read();
|
||||
int ch = input.read();
|
||||
if (ch < 0)
|
||||
return ch;
|
||||
|
||||
|
|
|
@ -19,12 +19,16 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.RandomizedContext;
|
||||
|
||||
/**
|
||||
* Tokenizer for testing.
|
||||
* <p>
|
||||
|
@ -77,6 +81,9 @@ public class MockTokenizer extends Tokenizer {
|
|||
private int lastOffset = 0; // only for asserting
|
||||
private boolean enableChecks = true;
|
||||
|
||||
// evil: but we don't change the behavior with this random, we only switch up how we read
|
||||
private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
|
||||
|
||||
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
|
||||
super(factory, input);
|
||||
this.runAutomaton = runAutomaton;
|
||||
|
@ -139,14 +146,14 @@ public class MockTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
protected int readCodePoint() throws IOException {
|
||||
int ch = input.read();
|
||||
int ch = readChar();
|
||||
if (ch < 0) {
|
||||
return ch;
|
||||
} else {
|
||||
assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
|
||||
off++;
|
||||
if (Character.isHighSurrogate((char) ch)) {
|
||||
int ch2 = input.read();
|
||||
int ch2 = readChar();
|
||||
if (ch2 >= 0) {
|
||||
off++;
|
||||
assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
|
||||
|
@ -158,6 +165,33 @@ public class MockTokenizer extends Tokenizer {
|
|||
return ch;
|
||||
}
|
||||
}
|
||||
|
||||
protected int readChar() throws IOException {
|
||||
switch(random.nextInt(10)) {
|
||||
case 0: {
|
||||
// read(char[])
|
||||
char c[] = new char[1];
|
||||
int ret = input.read(c);
|
||||
return ret < 0 ? ret : c[0];
|
||||
}
|
||||
case 1: {
|
||||
// read(char[], int, int)
|
||||
char c[] = new char[2];
|
||||
int ret = input.read(c, 1, 1);
|
||||
return ret < 0 ? ret : c[1];
|
||||
}
|
||||
case 2: {
|
||||
// read(CharBuffer)
|
||||
char c[] = new char[1];
|
||||
CharBuffer cb = CharBuffer.wrap(c);
|
||||
int ret = input.read(cb);
|
||||
return ret < 0 ? ret : c[0];
|
||||
}
|
||||
default:
|
||||
// read()
|
||||
return input.read();
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean isTokenChar(int c) {
|
||||
state = runAutomaton.step(state, c);
|
||||
|
|
|
@ -103,7 +103,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
|
|||
return ch;
|
||||
}
|
||||
numRead++;
|
||||
return in.read();
|
||||
return input.read();
|
||||
}
|
||||
|
||||
private int nextSkipWS() throws IOException {
|
||||
|
@ -118,7 +118,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
|
|||
return pushed.charAt(len-1);
|
||||
}
|
||||
numRead++;
|
||||
int ch = in.read();
|
||||
int ch = input.read();
|
||||
push(ch);
|
||||
return ch;
|
||||
}
|
||||
|
@ -180,11 +180,11 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
|
|||
|
||||
private void saveState() throws IOException {
|
||||
lastMark = numRead;
|
||||
in.mark(readAheadLimit);
|
||||
input.mark(readAheadLimit);
|
||||
}
|
||||
|
||||
private void restoreState() throws IOException {
|
||||
in.reset();
|
||||
input.reset();
|
||||
pushed.setLength(0);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue