LUCENE-5240: additional safety in Tokenizer state machine

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1529482 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2013-10-05 16:41:28 +00:00
parent 2cd31e54b5
commit 8fba601ed1
14 changed files with 76 additions and 245 deletions


@@ -80,6 +80,10 @@ New Features
on best effort which was not user-friendly.
(Uwe Schindler, Robert Muir)
* LUCENE-5240: Tokenizers now throw an IllegalStateException if the
consumer neglects to call close() on the previous stream before consuming
the next one. (Uwe Schindler, Robert Muir)
* LUCENE-5214: Add new FreeTextSuggester, to predict the next word
using a simple ngram language model. This is useful for the "long
tail" suggestions, when a primary suggester fails to find a


@@ -48,6 +48,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
assertEquals("the", term.toString());
assertTrue(cgf.incrementToken());
assertEquals("the_s", term.toString());
cgf.close();
wt.setReader(new StringReader(input));
cgf.reset();
@@ -67,6 +68,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
assertEquals("How_the", term.toString());
assertTrue(nsf.incrementToken());
assertEquals("the_s", term.toString());
nsf.close();
wt.setReader(new StringReader(input));
nsf.reset();


@@ -240,6 +240,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
assertTrue(tf.incrementToken());
assertEquals("Rind", termAtt.toString());
tf.end();
tf.close();
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
tf.reset();
assertTrue(tf.incrementToken());


@@ -59,6 +59,8 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
while (ts.incrementToken()) {
assertTrue(UnicodeUtil.validUTF16String(termAtt));
}
ts.end();
ts.close();
}
}


@@ -217,6 +217,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
ts.reset();
while (ts.incrementToken()) {
}
ts.end();
ts.close();
}
}
@@ -240,6 +242,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
while (ts.incrementToken()) {
assertTrue(UnicodeUtil.validUTF16String(termAtt));
}
ts.end();
ts.close();
}
}
@@ -630,6 +634,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
final TokenStream ts = analyzer.tokenStream("ignored", line);
ts.reset();
while(ts.incrementToken());
ts.end();
ts.close();
}
String[] sentences = line.split("、|。");
if (VERBOSE) {
@@ -642,6 +648,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
final TokenStream ts = analyzer.tokenStream("ignored", sentence);
ts.reset();
while(ts.incrementToken());
ts.end();
ts.close();
}
}
if (VERBOSE) {


@@ -90,6 +90,8 @@ public class ReadTokensTask extends PerfTask {
termAtt.fillBytesRef();
tokenCount++;
}
stream.end();
stream.close();
}
totalTokenCount += tokenCount;
return tokenCount;


@@ -85,8 +85,9 @@ public abstract class Tokenizer extends TokenStream {
public final void setReader(Reader input) throws IOException {
if (input == null) {
throw new NullPointerException("input must not be null");
} else if (this.input != ILLEGAL_STATE_READER) {
throw new IllegalStateException("TokenStream contract violation: close() call missing");
}
this.input = ILLEGAL_STATE_READER;
this.inputPending = input;
assert setReaderTestPoint();
}
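
The hunk above is the heart of the change: setReader() now refuses a new Reader unless the previous stream was closed. Below is a minimal consumer-side sketch of the resulting workflow, assuming the Tokenizer API at this revision (setReader(Reader), as in the hunk above); the consumeTwice helper and its arguments are hypothetical names used only to illustrate the pattern the test hunks were updated to follow:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper: consumes two inputs with one reused Tokenizer.
static void consumeTwice(Tokenizer tok, String first, String second) throws IOException {
  CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);

  tok.setReader(new StringReader(first));
  tok.reset();
  while (tok.incrementToken()) {
    // read attributes, e.g. term.toString()
  }
  tok.end();
  tok.close();  // mandatory: omitting this now makes the next setReader() throw IllegalStateException

  tok.setReader(new StringReader(second));  // legal again, because close() reset the state machine
  tok.reset();
  while (tok.incrementToken()) {
    // read attributes
  }
  tok.end();
  tok.close();
}

The test hunks in this commit (CommonGramsFilterTest, TestCompoundWordTokenFilter, CollationTestBase, and so on) were adjusted to follow exactly this reset/incrementToken/end/close cycle before reusing a stream.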


@@ -401,6 +401,20 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.end();
ts.close();
}
// check for a missing close()
ts = a.tokenStream("bogus", input);
ts.reset();
while (ts.incrementToken()) {}
ts.end();
try {
ts = a.tokenStream("bogus", input);
fail("didn't get expected exception when close() not called");
} catch (IllegalStateException expected) {
// ok
} finally {
ts.close();
}
}
// simple utility method for testing stemmers


@@ -266,6 +266,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
termAtt.fillBytesRef();
// ensure we make a copy of the actual bytes too
map.put(term, BytesRef.deepCopyOf(bytes));
assertFalse(ts.incrementToken());
ts.end();
ts.close();
}
Thread threads[] = new Thread[numThreads];
@@ -284,6 +287,9 @@
assertTrue(ts.incrementToken());
termAtt.fillBytesRef();
assertEquals(expected, bytes);
assertFalse(ts.incrementToken());
ts.end();
ts.close();
}
} catch (IOException e) {
throw new RuntimeException(e);


@@ -1,177 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.DateField;
import static org.apache.solr.schema.TrieField.TrieTypes;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
/**
* Tokenizer for trie fields. It uses NumericTokenStream to create multiple trie encoded string per number.
* Each string created by this tokenizer for a given number differs from the previous by the given precisionStep.
* For query time token streams that only contain the highest precision term, use 32/64 as precisionStep.
* <p/>
* Refer to {@link org.apache.lucene.search.NumericRangeQuery} for more details.
*
*
* @see org.apache.lucene.search.NumericRangeQuery
* @see org.apache.solr.schema.TrieField
* @since solr 1.4
*/
public class TrieTokenizerFactory extends TokenizerFactory {
protected final int precisionStep;
protected final TrieTypes type;
public TrieTokenizerFactory(TrieTypes type, int precisionStep) {
super(new HashMap<String,String>());
this.type = type;
this.precisionStep = precisionStep;
}
@Override
public TrieTokenizer create(AttributeFactory factory, Reader input) {
return new TrieTokenizer(input, type, TrieTokenizer.getNumericTokenStream(factory, precisionStep));
}
}
final class TrieTokenizer extends Tokenizer {
protected static final DateField dateField = new DateField();
protected final TrieTypes type;
protected final NumericTokenStream ts;
// NumericTokenStream does not support CharTermAttribute so keep it local
private final CharTermAttribute termAtt = new CharTermAttributeImpl();
protected final OffsetAttribute ofsAtt = addAttribute(OffsetAttribute.class);
protected int startOfs, endOfs;
protected boolean hasValue;
static NumericTokenStream getNumericTokenStream(AttributeFactory factory, int precisionStep) {
return new NumericTokenStream(factory, precisionStep);
}
public TrieTokenizer(Reader input, TrieTypes type, final NumericTokenStream ts) {
// Häckidy-Hick-Hack: must share the attributes with the NumericTokenStream we delegate to, so we create a fake factory:
super(new AttributeFactory() {
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
return (AttributeImpl) ts.addAttribute(attClass);
}
}, input);
// add all attributes:
for (Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator(); it.hasNext();) {
addAttribute(it.next());
}
this.type = type;
this.ts = ts;
// dates tend to be longer, especially when math is involved
termAtt.resizeBuffer( type == TrieTypes.DATE ? 128 : 32 );
}
@Override
public void reset() throws IOException {
super.reset();
try {
int upto = 0;
char[] buf = termAtt.buffer();
while (true) {
final int length = input.read(buf, upto, buf.length-upto);
if (length == -1) break;
upto += length;
if (upto == buf.length)
buf = termAtt.resizeBuffer(1+buf.length);
}
termAtt.setLength(upto);
this.startOfs = correctOffset(0);
this.endOfs = correctOffset(upto);
if (upto == 0) {
hasValue = false;
return;
}
final String v = new String(buf, 0, upto);
try {
switch (type) {
case INTEGER:
ts.setIntValue(Integer.parseInt(v));
break;
case FLOAT:
ts.setFloatValue(Float.parseFloat(v));
break;
case LONG:
ts.setLongValue(Long.parseLong(v));
break;
case DOUBLE:
ts.setDoubleValue(Double.parseDouble(v));
break;
case DATE:
ts.setLongValue(dateField.parseMath(null, v).getTime());
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
}
} catch (NumberFormatException nfe) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Invalid Number: " + v);
}
hasValue = true;
ts.reset();
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
}
}
@Override
public void close() throws IOException {
super.close();
if (hasValue) {
ts.close();
}
}
@Override
public boolean incrementToken() {
if (hasValue && ts.incrementToken()) {
ofsAtt.setOffset(startOfs, endOfs);
return true;
}
return false;
}
@Override
public void end() throws IOException {
super.end();
if (hasValue) {
ts.end();
}
ofsAtt.setOffset(endOfs, endOfs);
}
}


@@ -30,6 +30,7 @@ import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
@@ -138,9 +139,10 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
* @param analyzer The analyzer to use.
*/
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
TokenStream tokenStream = null;
try {
final Set<BytesRef> tokens = new HashSet<BytesRef>();
final TokenStream tokenStream = analyzer.tokenStream("", query);
tokenStream = analyzer.tokenStream("", query);
final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
final BytesRef bytes = bytesAtt.getBytesRef();
@@ -152,10 +154,11 @@
}
tokenStream.end();
tokenStream.close();
return tokens;
} catch (IOException ioe) {
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
}
@@ -181,8 +184,11 @@
trackerAtt.setActPosition(position);
tokens.add(tokenStream.cloneAttributes());
}
tokenStream.end();
} catch (IOException ioe) {
throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
return tokens;


@@ -24,8 +24,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FieldType.NumericType;
@@ -51,8 +49,6 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.mutable.MutableValueDate;
import org.apache.lucene.util.mutable.MutableValueLong;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TrieTokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
@@ -111,12 +107,6 @@ public class TrieField extends PrimitiveFieldType {
"Invalid type specified in schema.xml for field: " + args.get("name"), e);
}
}
CharFilterFactory[] filterFactories = new CharFilterFactory[0];
TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
analyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, precisionStep), tokenFilterFactories);
// for query time we only need one token, so we use the biggest possible precisionStep:
queryAnalyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, Integer.MAX_VALUE), tokenFilterFactories);
}
@Override
@@ -223,7 +213,7 @@
@Override
public boolean isTokenized() {
return true;
return false;
}
@Override
@@ -382,24 +372,29 @@
@Override
public void readableToIndexed(CharSequence val, BytesRef result) {
String s = val.toString();
switch (type) {
case INTEGER:
NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
break;
case FLOAT:
NumericUtils.intToPrefixCodedBytes(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
break;
case LONG:
NumericUtils.longToPrefixCodedBytes(Long.parseLong(s), 0, result);
break;
case DOUBLE:
NumericUtils.longToPrefixCodedBytes(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
break;
case DATE:
NumericUtils.longToPrefixCodedBytes(dateField.parseMath(null, s).getTime(), 0, result);
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
try {
switch (type) {
case INTEGER:
NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
break;
case FLOAT:
NumericUtils.intToPrefixCodedBytes(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
break;
case LONG:
NumericUtils.longToPrefixCodedBytes(Long.parseLong(s), 0, result);
break;
case DOUBLE:
NumericUtils.longToPrefixCodedBytes(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
break;
case DATE:
NumericUtils.longToPrefixCodedBytes(dateField.parseMath(null, s).getTime(), 0, result);
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
}
} catch (NumberFormatException nfe) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Invalid Number: " + val);
}
}


@@ -16,8 +16,6 @@
*/
package org.apache.solr;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.FieldType;
@@ -49,38 +47,6 @@ public class TestTrie extends SolrTestCaseJ4 {
clearIndex();
super.tearDown();
}
@Test
public void testTokenizer() throws Exception {
FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
assertTrue(type instanceof TrieField);
String value = String.valueOf(random().nextInt());
TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
int count = 0;
while (ts.incrementToken()) {
count++;
assertEquals(0, ofsAtt.startOffset());
assertEquals(value.length(), ofsAtt.endOffset());
}
final int precStep = ((TrieField) type).getPrecisionStep();
assertEquals( (32 + precStep - 1) / precStep, count);
ts.end();
assertEquals(value.length(), ofsAtt.startOffset());
assertEquals(value.length(), ofsAtt.endOffset());
ts.close();
// Test empty one:
ts = type.getAnalyzer().tokenStream("dummy", "");
ts.reset();
assertFalse(ts.incrementToken());
ts.end();
assertEquals(0, ofsAtt.startOffset());
assertEquals(0, ofsAtt.endOffset());
ts.close();
}
@Test
public void testTrieIntRangeSearch() throws Exception {


@@ -39,7 +39,7 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
"/response/lst[@name='fieldType']/bool[@name='omitPositions'] = 'false'",
"/response/lst[@name='fieldType']/bool[@name='storeOffsetsWithPositions'] = 'false'",
"/response/lst[@name='fieldType']/bool[@name='multiValued'] = 'false'",
"/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'true'",
"/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'false'",
"/response/lst[@name='fieldType']/arr[@name='fields']/str = 'weight'",
"/response/lst[@name='fieldType']/arr[@name='dynamicFields']/str = '*_f'");
}
@@ -69,7 +69,7 @@
"/fieldType/omitPositions==false",
"/fieldType/storeOffsetsWithPositions==false",
"/fieldType/multiValued==false",
"/fieldType/tokenized==true",
"/fieldType/tokenized==false",
"/fieldType/fields==['weight']",
"/fieldType/dynamicFields==['*_f']");
}