From e670e324717e6f0098cc4f1211ba276f6cd2989e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 23 Oct 2012 20:32:27 +0000 Subject: [PATCH] throw a best-effort NPE from the jflex-based tokenizers if you don't consume the TS correctly git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1401449 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/standard/ClassicTokenizer.java | 2 +- .../lucene/analysis/standard/StandardTokenizer.java | 2 +- .../analysis/standard/UAX29URLEmailTokenizer.java | 10 +++++----- .../lucene/analysis/wikipedia/WikipediaTokenizer.java | 6 +++--- .../org/apache/lucene/analysis/util/TestElision.java | 3 +++ .../analysis/morfologik/TestMorfologikAnalyzer.java | 7 +++++++ 6 files changed, 20 insertions(+), 10 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java index 93192d9e2b4..c837ab3e8d8 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizer.java @@ -120,7 +120,7 @@ public final class ClassicTokenizer extends Tokenizer { } private void init(Version matchVersion) { - this.scanner = new ClassicTokenizerImpl(input); + this.scanner = new ClassicTokenizerImpl(null); // best effort NPE if you dont call reset } // this tokenizer generates three attributes: diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index 97e512d1c8e..ed83d9e8739 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -134,7 +134,7 @@ public final class StandardTokenizer extends Tokenizer { } private final void init(Version matchVersion) { - this.scanner = new StandardTokenizerImpl(input); + this.scanner = new StandardTokenizerImpl(null); // best effort NPE if you dont call reset } // this tokenizer generates three attributes: diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java index d5442534fa8..6d3251befcd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java @@ -98,7 +98,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { */ public UAX29URLEmailTokenizer(Version matchVersion, Reader input) { super(input); - this.scanner = getScannerFor(matchVersion, input); + this.scanner = getScannerFor(matchVersion); } /** @@ -106,7 +106,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { */ public UAX29URLEmailTokenizer(Version matchVersion, AttributeSource source, Reader input) { super(source, input); - this.scanner = getScannerFor(matchVersion, input); + this.scanner = getScannerFor(matchVersion); } /** @@ -114,11 +114,11 @@ public final class UAX29URLEmailTokenizer extends Tokenizer { */ public UAX29URLEmailTokenizer(Version matchVersion, AttributeFactory factory, Reader input) { super(factory, input); - this.scanner = getScannerFor(matchVersion, input); + this.scanner = getScannerFor(matchVersion); } - private static StandardTokenizerInterface getScannerFor(Version matchVersion, Reader input) { - return new UAX29URLEmailTokenizerImpl(input); + private static StandardTokenizerInterface getScannerFor(Version matchVersion) { + return new UAX29URLEmailTokenizerImpl(null); // best effort NPE if you dont call reset } // this tokenizer generates three attributes: diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java index 644fe7fad39..0d8029af649 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java @@ -143,7 +143,7 @@ public final class WikipediaTokenizer extends Tokenizer { */ public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) { super(input); - this.scanner = new WikipediaTokenizerImpl(input); + this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset init(tokenOutput, untokenizedTypes); } @@ -156,7 +156,7 @@ public final class WikipediaTokenizer extends Tokenizer { */ public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set untokenizedTypes) { super(factory, input); - this.scanner = new WikipediaTokenizerImpl(input); + this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset init(tokenOutput, untokenizedTypes); } @@ -169,7 +169,7 @@ public final class WikipediaTokenizer extends Tokenizer { */ public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set untokenizedTypes) { super(source, input); - this.scanner = new WikipediaTokenizerImpl(input); + this.scanner = new WikipediaTokenizerImpl(null); // best effort NPE if you dont call reset init(tokenOutput, untokenizedTypes); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java index c48c86c47c8..0d5cead735f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestElision.java @@ -52,9 +52,12 @@ public class TestElision extends BaseTokenStreamTestCase { private List filter(TokenFilter filter) throws IOException { List tas = new ArrayList(); CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class); + filter.reset(); while (filter.incrementToken()) { tas.add(termAtt.toString()); } + filter.end(); + filter.close(); return tas; } diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java index 0bb14762678..c26d403127b 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java @@ -62,12 +62,16 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase { ts_1.reset(); ts_1.incrementToken(); assertEquals("first stream", "liście", termAtt_1.toString()); + ts_1.end(); + ts_1.close(); TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych")); CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class); ts_2.reset(); ts_2.incrementToken(); assertEquals("second stream", "dany", termAtt_2.toString()); + ts_2.end(); + ts_2.close(); } /** Test stemming of mixed-case tokens. */ @@ -110,6 +114,7 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase { public final void testPOSAttribute() throws IOException { TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście")); + ts.reset(); assertPOSToken(ts, "liście", "subst:sg:acc:n2", "subst:sg:nom:n2", @@ -127,6 +132,8 @@ public class TestMorfologikAnalyzer extends BaseTokenStreamTestCase { assertPOSToken(ts, "lista", "subst:sg:dat:f", "subst:sg:loc:f"); + ts.end(); + ts.close(); } /** blast some random strings through the analyzer */