From 46664898578f967dc959eb3b55e4d547c5407c93 Mon Sep 17 00:00:00 2001
From: Uwe Schindler
Date: Fri, 11 Sep 2009 06:12:13 +0000
Subject: [PATCH] LUCENE-1906: Fix backwards problems with CharStream and Tokenizers with custom reset(Reader) method.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@813671 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   | 10 +++---
 common-build.xml                              |  2 +-
 .../lucene/analysis/cjk/CJKTokenizer.java     |  2 +-
 .../lucene/analysis/cn/ChineseTokenizer.java  |  2 +-
 .../analysis/ngram/EdgeNGramTokenizer.java    |  2 +-
 .../lucene/analysis/ngram/NGramTokenizer.java |  2 +-
 .../analysis/cn/smart/SentenceTokenizer.java  |  2 +-
 .../vectorhighlight/AbstractTestCase.java     |  2 +-
 .../analysis/WikipediaTokenizer.java          | 13 +++----
 .../lucene/analysis/BaseCharFilter.java       |  2 +-
 .../apache/lucene/analysis/CharFilter.java    |  3 ++
 .../apache/lucene/analysis/CharStream.java    | 11 +++---
 .../apache/lucene/analysis/CharTokenizer.java |  4 +--
 .../lucene/analysis/KeywordTokenizer.java     |  4 +--
 .../lucene/analysis/MappingCharFilter.java    |  8 +++++
 .../org/apache/lucene/analysis/Tokenizer.java | 36 ++++++-------------
 .../analysis/standard/StandardTokenizer.java  | 13 +++----
 .../analysis/TestMappingCharFilter.java       | 20 +++++------
 18 files changed, 63 insertions(+), 75 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9ac6e9f0656..d2f5ac0171f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -350,10 +350,12 @@ API Changes
    a top level reader and docID.  (Shai Erera, Chris Hostetter, Martin
    Ruckli, Mark Miller via Mike McCandless)
 
- * LUCENE-1466: Changed Tokenizer.input to be a CharStream; added
-   CharFilter and MappingCharFilter, which allows chaining & mapping
-   of characters before tokenizers run.  (Koji Sekiguchi via Mike
-   McCandless)
+ * LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allow
+   chaining & mapping of characters before tokenizers run. CharStream (a subclass
+   of Reader) is the base class for custom java.io.Readers that support offset
+   correction. Tokenizers now have an additional method correctOffset() that is
+   passed down to the underlying CharStream if the input is a subclass of
+   CharStream/CharFilter. (Koji Sekiguchi via Mike McCandless, Uwe Schindler)
 
  * LUCENE-1703: Add IndexWriter.waitForMerges.
   (Tim Smith via Mike McCandless)

diff --git a/common-build.xml b/common-build.xml
index 64533e3dfbc..186f4948486 100644
--- a/common-build.xml
+++ b/common-build.xml
@@ -42,7 +42,7 @@
-
+
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
index aa01d218141..a1489eb374e 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
@@ -285,7 +285,7 @@ public final class CJKTokenizer extends Tokenizer {
 
         if (length > 0) {
             termAtt.setTermBuffer(buffer, 0, length);
-            offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+            offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
             typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
             return true;
         } else if (dataLen == -1) {
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
index 378162fed45..2507cacec06 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
@@ -104,7 +104,7 @@ public final class ChineseTokenizer extends Tokenizer {
         if (length > 0) {
             //System.out.println(new String(buffer, 0,
             //length));
            termAtt.setTermBuffer(buffer, 0, length);
-            offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+            offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
            return true;
         }
         else
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
index ad37a9b11c4..47e599505a8 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
@@ -207,7 +207,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
     int start = side == Side.FRONT ? 0 : inLen - gramSize;
     int end = start + gramSize;
     termAtt.setTermBuffer(inStr, start, gramSize);
-    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(end));
     gramSize++;
     return true;
   }
diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
index 8ad9ea85264..ce2acb9bea1 100644
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@@ -124,7 +124,7 @@ public class NGramTokenizer extends Tokenizer {
     int oldPos = pos;
     pos++;
     termAtt.setTermBuffer(inStr, oldPos, gramSize);
-    offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+    offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
     return true;
   }
diff --git a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
index 21195c31444..dea031733e8 100644
--- a/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
+++ b/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
@@ -116,7 +116,7 @@ public final class SentenceTokenizer extends Tokenizer {
       return false;
     else {
       termAtt.setTermBuffer(buffer.toString());
-      offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
+      offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
       typeAtt.setType("sentence");
       return true;
     }
diff --git a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
index f65a731379f..7356a31079a 100644
--- a/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
+++ b/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java
@@ -202,7 +202,7 @@ public abstract class AbstractTestCase extends TestCase {
         return false;
 
       termAtt.setTermBuffer(snippet, startTerm, lenTerm);
-      offsetAtt.setOffset(startOffset, startOffset + lenTerm);
+      offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
 
       return true;
     }
diff --git a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
index 7b9efbde048..e02aa193c1b 100644
--- a/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
+++ b/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
@@ -17,7 +17,6 @@
 
 package org.apache.lucene.wikipedia.analysis;
 
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
@@ -127,10 +126,6 @@ public class WikipediaTokenizer extends Tokenizer {
   private TermAttribute termAtt;
   private FlagsAttribute flagsAtt;
 
-  void setInput(Reader reader) {
-    this.input = CharReader.get(reader);
-  }
-
   /**
   * Creates a new instance of the {@link WikipediaTokenizer}.  Attaches the
   * input to a newly created JFlex scanner.
@@ -267,7 +262,7 @@ public class WikipediaTokenizer extends Tokenizer {
         //trim the buffer
         String s = buffer.toString().trim();
         termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
-        offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+        offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
         flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
         //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
         if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -305,7 +300,7 @@ public class WikipediaTokenizer extends Tokenizer {
         //trim the buffer
         String s = buffer.toString().trim();
         termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
-        offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
+        offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
         flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
         //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
         if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
@@ -318,7 +313,7 @@ public class WikipediaTokenizer extends Tokenizer {
   private void setupToken() {
     scanner.getText(termAtt);
     final int start = scanner.yychar();
-    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
   }
 
   /*
@@ -332,7 +327,7 @@ public class WikipediaTokenizer extends Tokenizer {
   }
 
   public void reset(Reader reader) throws IOException {
-    setInput(reader);
+    super.reset(reader);
     reset();
   }
 
diff --git a/src/java/org/apache/lucene/analysis/BaseCharFilter.java b/src/java/org/apache/lucene/analysis/BaseCharFilter.java
index 7f2a000c86c..8ea83684f75 100644
--- a/src/java/org/apache/lucene/analysis/BaseCharFilter.java
+++ b/src/java/org/apache/lucene/analysis/BaseCharFilter.java
@@ -43,7 +43,7 @@ public abstract class BaseCharFilter extends CharFilter {
   /** Retrieve the corrected offset.  Note that this method
    *  is slow, if you correct positions far before the most
    *  recently added position, as it's a simple linear
-   *  searhc backwards through all offset corrections added
+   *  search backwards through all offset corrections added
    *  by {@link #addOffCorrectMap}. */
   protected int correct(int currentOff) {
     if (pcmList == null || pcmList.isEmpty()) {
diff --git a/src/java/org/apache/lucene/analysis/CharFilter.java b/src/java/org/apache/lucene/analysis/CharFilter.java
index f0012fdf7c7..4ff0482eaa6 100644
--- a/src/java/org/apache/lucene/analysis/CharFilter.java
+++ b/src/java/org/apache/lucene/analysis/CharFilter.java
@@ -21,6 +21,9 @@ import java.io.IOException;
 
 /**
  * Subclasses of CharFilter can be chained to filter CharStream.
+ * They can be used as a {@link java.io.Reader} with additional offset
+ * correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
+ * if a CharFilter/CharStream subclass is used.
  *
  * @version $Id$
  *
diff --git a/src/java/org/apache/lucene/analysis/CharStream.java b/src/java/org/apache/lucene/analysis/CharStream.java
index 24ae0de142f..3a112535660 100644
--- a/src/java/org/apache/lucene/analysis/CharStream.java
+++ b/src/java/org/apache/lucene/analysis/CharStream.java
@@ -20,12 +20,11 @@ package org.apache.lucene.analysis;
 import java.io.Reader;
 
 /**
- * CharStream adds correctOffset
- * functionality over Reader.  All Tokenizers accept a
- * CharStream as input, which enables arbitrary character
- * based filtering before tokenization.  The {@link
- * #correctOffset} method fixed offsets to account for
+ * CharStream adds {@link #correctOffset}
+ * functionality over {@link Reader}. All Tokenizers accept a
+ * CharStream instead of {@link Reader} as input, which enables
+ * arbitrary character based filtering before tokenization.
+ * The {@link #correctOffset} method fixes offsets to account for
  * removal or insertion of characters, so that the offsets
  * reported in the tokens match the character offsets of the
  * original Reader.
diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java
index 7f8038983e5..1689585afce 100644
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@@ -104,13 +104,13 @@ public abstract class CharTokenizer extends Tokenizer {
     }
 
     termAtt.setTermLength(length);
-    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
 
     return true;
   }
 
   public final void end() {
     // set final offset
-    int finalOffset = input.correctOffset(offset);
+    int finalOffset = correctOffset(offset);
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
diff --git a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
index 6caa8f2ee3b..2363bb85e55 100644
--- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
@@ -76,8 +76,8 @@ public class KeywordTokenizer extends Tokenizer {
         buffer = termAtt.resizeTermBuffer(1+buffer.length);
       }
       termAtt.setTermLength(upto);
-      finalOffset = input.correctOffset(upto);
-      offsetAtt.setOffset(input.correctOffset(0), finalOffset);
+      finalOffset = correctOffset(upto);
+      offsetAtt.setOffset(correctOffset(0), finalOffset);
       return true;
     }
     return false;
diff --git a/src/java/org/apache/lucene/analysis/MappingCharFilter.java b/src/java/org/apache/lucene/analysis/MappingCharFilter.java
index a558cd1154e..2b5a93fafce 100644
--- a/src/java/org/apache/lucene/analysis/MappingCharFilter.java
+++ b/src/java/org/apache/lucene/analysis/MappingCharFilter.java
@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.LinkedList;
 
 /**
@@ -35,11 +36,18 @@ public class MappingCharFilter extends BaseCharFilter {
   private int charPointer;
   private int nextCharCounter;
 
+  /** Constructor that takes a {@link CharStream}. */
   public MappingCharFilter(NormalizeCharMap normMap, CharStream in) {
     super(in);
     this.normMap = normMap;
   }
 
+  /** Convenience constructor that takes a {@link Reader}. */
+  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
+    super(CharReader.get(in));
+    this.normMap = normMap;
+  }
+
   public int read() throws IOException {
     while(true) {
       if (replacement != null && charPointer < replacement.length()) {
diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java
index 7b63a62a2d9..962d9f69c81 100644
--- a/src/java/org/apache/lucene/analysis/Tokenizer.java
+++ b/src/java/org/apache/lucene/analysis/Tokenizer.java
@@ -40,7 +40,7 @@ import java.io.IOException;
 
 public abstract class Tokenizer extends TokenStream {
   /** The text source for this Tokenizer. */
-  protected CharStream input;
+  protected Reader input;
 
   /** Construct a tokenizer with null input. */
   protected Tokenizer() {}
@@ -49,11 +49,6 @@ public abstract class Tokenizer extends TokenStream {
   protected Tokenizer(Reader input) {
     this.input = CharReader.get(input);
   }
-
-  /** Construct a token stream processing the given input. */
-  protected Tokenizer(CharStream input) {
-    this.input = input;
-  }
 
   /** Construct a tokenizer with null input using the given AttributeFactory. */
   protected Tokenizer(AttributeFactory factory) {
@@ -65,12 +60,6 @@ public abstract class Tokenizer extends TokenStream {
     super(factory);
     this.input = CharReader.get(input);
   }
-
-  /** Construct a token stream processing the given input using the given AttributeFactory. */
-  protected Tokenizer(AttributeFactory factory, CharStream input) {
-    super(factory);
-    this.input = input;
-  }
 
   /** Construct a token stream processing the given input using the given AttributeSource. */
   protected Tokenizer(AttributeSource source) {
@@ -83,28 +72,25 @@ public abstract class Tokenizer extends TokenStream {
     this.input = CharReader.get(input);
   }
 
-  /** Construct a token stream processing the given input using the given AttributeSource. */
-  protected Tokenizer(AttributeSource source, CharStream input) {
-    super(source);
-    this.input = input;
-  }
-
   /** By default, closes the input Reader. */
   public void close() throws IOException {
     input.close();
   }
+
+  /** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass,
+   * this method calls {@link CharStream#correctOffset}; otherwise returns currentOff.
+   * @param currentOff offset as seen in the output
+   * @return corrected offset based on the input
+   * @see CharStream#correctOffset
+   */
+  protected final int correctOffset(int currentOff) {
+    return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff;
+  }
 
   /** Expert: Reset the tokenizer to a new reader.  Typically, an
    *  analyzer (in its reusableTokenStream method) will use
    *  this to re-use a previously created tokenizer. */
   public void reset(Reader input) throws IOException {
-    this.input = CharReader.get(input);
-  }
-
-  /** Expert: Reset the tokenizer to a new CharStream.  Typically, an
-   *  analyzer (in its reusableTokenStream method) will use
-   *  this to re-use a previously created tokenizer. */
-  public void reset(CharStream input) throws IOException {
     this.input = input;
   }
 }
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index b5c670b4606..c0e3bdcaa0b 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.standard;
 import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -92,10 +91,6 @@ public class StandardTokenizer extends Tokenizer {
    */
   private boolean replaceInvalidAcronym;
 
-  void setInput(Reader reader) {
-    input = CharReader.get(reader);
-  }
-
   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 
   /** Set the max allowed token length.  Any token longer
@@ -152,7 +147,7 @@
 
   private void init(Reader input, boolean replaceInvalidAcronym) {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
-    setInput(input);
+    this.input = input;
     termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -186,7 +181,7 @@ public class StandardTokenizer extends Tokenizer {
         posIncrAtt.setPositionIncrement(posIncr);
         scanner.getText(termAtt);
         final int start = scanner.yychar();
-        offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+termAtt.termLength()));
+        offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength()));
         // This 'if' should be removed in the next release. For now, it converts
         // invalid acronyms to HOST. When removed, only the 'else' part should
         // remain.
@@ -210,7 +205,7 @@
 
   public final void end() {
     // set final offset
-    int finalOffset = input.correctOffset(scanner.yychar() + scanner.yylength());
+    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
     offsetAtt.setOffset(finalOffset, finalOffset);
   }
 
@@ -237,7 +232,7 @@
   }
 
   public void reset(Reader reader) throws IOException {
-    setInput(reader);
+    super.reset(reader);
     reset();
   }
 
diff --git a/src/test/org/apache/lucene/analysis/TestMappingCharFilter.java b/src/test/org/apache/lucene/analysis/TestMappingCharFilter.java
index 3f64d1eafad..91e541db6eb 100644
--- a/src/test/org/apache/lucene/analysis/TestMappingCharFilter.java
+++ b/src/test/org/apache/lucene/analysis/TestMappingCharFilter.java
@@ -41,7 +41,7 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   }
 
   public void testReaderReset() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     char[] buf = new char[10];
     int len = cs.read(buf, 0, 10);
     assertEquals( 1, len );
@@ -57,55 +57,55 @@ public class TestMappingCharFilter extends BaseTokenStreamTestCase {
   }
 
   public void testNothingChange() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to2() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "j" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1});
   }
 
   public void test1to3() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "k" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1});
   }
 
   public void test2to4() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "ll" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2});
   }
 
   public void test2to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aa" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2});
   }
 
   public void test3to1() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "bbb" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3});
   }
 
   public void test4to2() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "cccc" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4});
   }
 
   public void test5to0() throws Exception {
-    CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "empty" ) ) );
+    CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
     assertTokenStreamContents(ts, new String[0]);
   }
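--
Usage note (illustrative, not part of the patch): with the new MappingCharFilter(NormalizeCharMap, Reader) constructor and the protected Tokenizer.correctOffset(int), callers no longer write CharReader.get(...) or input.correctOffset(...) themselves. A minimal, self-contained sketch against the post-patch trunk API follows; the class name, mapping entry, and input string are made up for illustration:

import java.io.StringReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class CorrectOffsetExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical mapping: expand "ss" from "ß" before tokenization.
    NormalizeCharMap map = new NormalizeCharMap();
    map.add("ß", "ss");
    // New convenience constructor: the Reader is wrapped via CharReader.get() internally.
    CharStream cs = new MappingCharFilter(map, new StringReader("weiß fluß"));
    // WhitespaceTokenizer sees the expanded text; its offsets are mapped back to
    // positions in the original Reader through the protected Tokenizer.correctOffset(),
    // which delegates to CharStream.correctOffset() because the input is a CharFilter.
    TokenStream ts = new WhitespaceTokenizer(cs);
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      // Offsets refer to the original input, not the expanded form.
      System.out.println(termAtt.term() + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
    }
    ts.end();
    ts.close();
  }
}

A Tokenizer subclass written against this API never touches input.correctOffset() directly; it calls correctOffset(int), which falls back to returning the offset unchanged when the input is a plain java.io.Reader.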