LUCENE-4321: Don't extend the trappy/broken FilterReader and simplify CharFilter so subclasses are correct readers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1376158 13f79535-47bb-0310-9956-ffa450edef68
2012-08-22 17:51:19 +00:00 · 2012-08-22 17:51:19 +00:00 · 568156b710
parent eaad73b6d1
commit 568156b710
11 changed files with 176 additions and 27 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -61,6 +61,14 @@ API Changes
  only exists so that this statistic can be accessed for Lucene 3.x 
  segments, which don't support Terms.size().  (Uwe Schindler, Robert Muir)

+* LUCENE-4321: Change CharFilter to extend Reader directly, as FilterReader
+  overdelegates (read(), read(char[], int, int), skip, etc). This made it
+  hard to implement CharFilters that were correct. Instead only close() is
+  delegated by default: read(char[], int, int) and correct(int) are abstract
+  so that its obvious which methods you should implement.  The protected 
+  inner Reader is 'input' like CharFilter in the 3.x series, instead of 'in'.  
+  (Dawid Weiss, Uwe Schindler, Robert Muir)
+
 Bug Fixes

 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
@ -67,8 +67,8 @@ public class MappingCharFilter extends BaseCharFilter {

  @Override
  public void reset() throws IOException {
-    super.reset();
-    buffer.reset(in);
+    input.reset();
+    buffer.reset(input);
    replacement = null;
    inputOff = 0;
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
@ -34,7 +34,7 @@ public class PersianCharFilter extends CharFilter {
  
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
-    final int charsRead = super.read(cbuf, off, len);
+    final int charsRead = input.read(cbuf, off, len);
    if (charsRead > 0) {
      final int end = off + charsRead;
      while (off < end) {
@ -45,6 +45,17 @@ public class PersianCharFilter extends CharFilter {
    }
    return charsRead;
  }
+  
+  // optimized impl: some other charfilters consume with read()
+  @Override
+  public int read() throws IOException {
+    int ch = input.read();
+    if (ch == '\u200C') {
+      return ' ';
+    } else {
+      return ch;
+    }
+  }

  @Override
  protected int correct(int currentOff) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
@ -72,7 +72,7 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
  private void fill() throws IOException {
    StringBuilder buffered = new StringBuilder();
    char [] temp = new char [1024];
-    for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
+    for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
      buffered.append(temp, 0, cnt);
    }
    transformedInput = new StringReader(processPattern(buffered).toString());
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@ -782,31 +782,51 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      readSomething = true;
-      return in.read(cbuf, off, len);
+      return input.read(cbuf, off, len);
    }

    @Override
    public int read() throws IOException {
      readSomething = true;
-      return in.read();
+      return input.read();
    }

    @Override
    public int read(CharBuffer target) throws IOException {
      readSomething = true;
-      return in.read(target);
+      return input.read(target);
    }

    @Override
    public int read(char[] cbuf) throws IOException {
      readSomething = true;
-      return in.read(cbuf);
+      return input.read(cbuf);
    }

    @Override
    public long skip(long n) throws IOException {
      readSomething = true;
-      return in.skip(n);
+      return input.skip(n);
+    }
+
+    @Override
+    public void mark(int readAheadLimit) throws IOException {
+      input.mark(readAheadLimit);
+    }
+
+    @Override
+    public boolean markSupported() {
+      return input.markSupported();
+    }
+
+    @Override
+    public boolean ready() throws IOException {
+      return input.ready();
+    }
+
+    @Override
+    public void reset() throws IOException {
+      input.reset();
    }
  }
  
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianCharFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianCharFilter.java
@ -0,0 +1,48 @@
+package org.apache.lucene.analysis.fa;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+
+public class TestPersianCharFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      return new TokenStreamComponents(new MockTokenizer(reader));
+    }
+
+    @Override
+    protected Reader initReader(String fieldName, Reader reader) {
+      return new PersianCharFilter(reader);
+    }
+  };
+  
+  public void testBasics() throws Exception {
+    assertAnalyzesTo(analyzer, "this is a\u200Ctest",
+        new String[] { "this", "is", "a", "test" });
+  }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java
@ -17,7 +17,7 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

-import java.io.FilterReader;
+import java.io.IOException;
 import java.io.Reader;

 /**
@ -25,17 +25,39 @@ import java.io.Reader;
 * They can be used as {@link java.io.Reader} with additional offset
 * correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
 * if a CharFilter subclass is used.
+ * <p>
+ * This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
+ * transforming the input in some way from {@link #input}, and {@link #correct(int)}
+ * to adjust the offsets to match the originals.
+ * <p>
+ * You can optionally provide more efficient implementations of additional methods 
+ * like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
+ * but this is not required.
 */
-public abstract class CharFilter extends FilterReader {
+// the way java.io.FilterReader should work!
+public abstract class CharFilter extends Reader {
+  /** 
+   * The underlying character-input stream. 
+   */
+  protected final Reader input;

  /**
   * Create a new CharFilter wrapping the provided reader.
-   * @param in a Reader, can also be a CharFilter for chaining.
+   * @param input a Reader, can also be a CharFilter for chaining.
   */
-  public CharFilter(Reader in) {
-    super(in);
+  public CharFilter(Reader input) {
+    super(input);
+    this.input = input;
  }
  
+  /** 
+   * Closes the underlying input stream.
+   */
+  @Override
+  public void close() throws IOException {
+    input.close();
+  }
+
  /**
   * Subclasses override to correct the current offset.
   *
@ -50,6 +72,6 @@ public abstract class CharFilter extends FilterReader {
   */
  public final int correctOffset(int currentOff) {
    final int corrected = correct(currentOff);
-    return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected;
+    return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
  }
 }
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java
@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
 * limitations under the License.
 */

+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;

@ -50,6 +51,11 @@ public class TestCharFilter extends LuceneTestCase {
      super(in);
    }

+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      return input.read(cbuf, off, len);
+    }
+
    @Override
    protected int correct(int currentOff) {
      return currentOff + 1;
@ -61,6 +67,11 @@ public class TestCharFilter extends LuceneTestCase {
    protected CharFilter2(Reader in) {
      super(in);
    }
+    
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      return input.read(cbuf, off, len);
+    }

    @Override
    protected int correct(int currentOff) {
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
@ -42,11 +42,6 @@ public class MockCharFilter extends CharFilter {
  public MockCharFilter(Reader in) {
    this(in, 0);
  }
-
-  @Override
-  public void close() throws IOException {
-    in.close();
-  }
  
  int currentOffset = -1;
  int delta = 0;
@ -66,7 +61,7 @@ public class MockCharFilter extends CharFilter {
    }
    
    // otherwise actually read one    
-    int ch = in.read();
+    int ch = input.read();
    if (ch < 0)
      return ch;
    
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@ -19,12 +19,16 @@ package org.apache.lucene.analysis;

 import java.io.IOException;
 import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.Random;

 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;

+import com.carrotsearch.randomizedtesting.RandomizedContext;
+
 /**
 * Tokenizer for testing.
 * <p>
@ -77,6 +81,9 @@ public class MockTokenizer extends Tokenizer {
  private int lastOffset = 0; // only for asserting
  private boolean enableChecks = true;
  
+  // evil: but we don't change the behavior with this random, we only switch up how we read
+  private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
+  
  public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
    super(factory, input);
    this.runAutomaton = runAutomaton;
@ -139,14 +146,14 @@ public class MockTokenizer extends Tokenizer {
  }

  protected int readCodePoint() throws IOException {
-    int ch = input.read();
+    int ch = readChar();
    if (ch < 0) {
      return ch;
    } else {
      assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
      off++;
      if (Character.isHighSurrogate((char) ch)) {
-        int ch2 = input.read();
+        int ch2 = readChar();
        if (ch2 >= 0) {
          off++;
          assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
@ -158,6 +165,33 @@ public class MockTokenizer extends Tokenizer {
      return ch;
    }
  }
+  
+  protected int readChar() throws IOException {
+    switch(random.nextInt(10)) {
+      case 0: {
+        // read(char[])
+        char c[] = new char[1];
+        int ret = input.read(c);
+        return ret < 0 ? ret : c[0];
+      }
+      case 1: {
+        // read(char[], int, int)
+        char c[] = new char[2];
+        int ret = input.read(c, 1, 1);
+        return ret < 0 ? ret : c[1];
+      }
+      case 2: {
+        // read(CharBuffer)
+        char c[] = new char[1];
+        CharBuffer cb = CharBuffer.wrap(c);
+        int ret = input.read(cb);
+        return ret < 0 ? ret : c[0];
+      }
+      default: 
+        // read()
+        return input.read();
+    }
+  }

  protected boolean isTokenChar(int c) {
    state = runAutomaton.step(state, c);
--- a/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
+++ b/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
@ -103,7 +103,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
      return ch;
    }
    numRead++;
-    return in.read();
+    return input.read();
  }

  private int nextSkipWS() throws IOException {
@ -118,7 +118,7 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {
      return pushed.charAt(len-1);
    }
    numRead++;
-    int ch = in.read();
+    int ch = input.read();
    push(ch);
    return ch;
  }
@ -180,11 +180,11 @@ public class LegacyHTMLStripCharFilter extends BaseCharFilter {

  private void saveState() throws IOException {
    lastMark = numRead;
-    in.mark(readAheadLimit);
+    input.mark(readAheadLimit);
  }

  private void restoreState() throws IOException {
-    in.reset();
+    input.reset();
    pushed.setLength(0);
  }