diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java index 37cad99854e..60c62bf745a 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java @@ -33,6 +33,9 @@ import java.io.Reader; * You can optionally provide more efficient implementations of additional methods * like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)}, * but this is not required. + *

+ * For examples and integration with {@link Analyzer}, see the + * {@link org.apache.lucene.analysis Analysis package documentation}. */ // the way java.io.FilterReader should work! public abstract class CharFilter extends Reader { diff --git a/lucene/core/src/java/org/apache/lucene/analysis/package.html b/lucene/core/src/java/org/apache/lucene/analysis/package.html index 0495e488486..b298263dd7f 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/package.html +++ b/lucene/core/src/java/org/apache/lucene/analysis/package.html @@ -817,5 +817,30 @@ As a small hint, this is how the new Attribute class could begin: ... +

Adding a CharFilter chain

+Analyzers take Java {@link java.io.Reader}s as input. You can, of course, wrap your Readers with {@link java.io.FilterReader}s +to manipulate content, but this has the major disadvantage that character offsets may no longer match those of your original +text. +

+{@link org.apache.lucene.analysis.CharFilter} is designed to allow you to pre-process input like a FilterReader would, but also +preserve the original offsets associated with those characters. This way, mechanisms like highlighting still work correctly. +CharFilters can be chained: each CharFilter wraps the Reader (or CharFilter) that precedes it in the chain. +

+Example: +

+public class MyAnalyzer extends Analyzer {
+
+  {@literal @Override}
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    return new TokenStreamComponents(new MyTokenizer(reader));
+  }
+  
+  {@literal @Override}
+  protected Reader initReader(String fieldName, Reader reader) {
+    // wrap the Reader in a CharFilter chain.
+    return new SecondCharFilter(new FirstCharFilter(reader));
+  }
+}
+