mirror of https://github.com/apache/lucene.git
LUCENE-5240: additional safety in Tokenizer state machine
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1529482 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 2cd31e54b5
commit 8fba601ed1
@@ -80,6 +80,10 @@ New Features
  on best effort which was not user-friendly.
  (Uwe Schindler, Robert Muir)

* LUCENE-5240: Tokenizers now throw an IllegalStateException if the
  consumer neglects to call close() on the previous stream before consuming
  the next one. (Uwe Schindler, Robert Muir)

* LUCENE-5214: Add new FreeTextSuggester, to predict the next word
  using a simple ngram language model. This is useful for the "long
  tail" suggestions, when a primary suggester fails to find a
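The CHANGES entry above describes the contract this commit starts enforcing: every TokenStream obtained from an Analyzer must be fully consumed and closed before the next one is requested. The following is only a rough sketch (not part of this commit; the analyzer, field name and texts are placeholders) of that consume/close cycle, mirroring the test changes further down:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/** Sketch only: the consume/close cycle the new check enforces. */
final class ConsumerContractSketch {
  static void consumeTwice(Analyzer analyzer) throws IOException {
    for (String text : new String[] {"first text", "second text"}) {
      TokenStream ts = analyzer.tokenStream("body", text); // field name and texts are illustrative
      try {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                        // required before the first incrementToken()
        while (ts.incrementToken()) {
          System.out.println(term.toString());
        }
        ts.end();                          // records end-of-stream state (e.g. final offset)
      } finally {
        ts.close();                        // skip this, and the second tokenStream() call
      }                                    // above now throws IllegalStateException
    }
  }
}

If close() is skipped after the first stream, the next tokenStream() call reuses the same Tokenizer and trips the new state check shown below in Tokenizer.setReader().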
@@ -48,6 +48,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    assertEquals("the", term.toString());
    assertTrue(cgf.incrementToken());
    assertEquals("the_s", term.toString());
    cgf.close();

    wt.setReader(new StringReader(input));
    cgf.reset();
@@ -67,6 +68,7 @@ public class CommonGramsFilterTest extends BaseTokenStreamTestCase {
    assertEquals("How_the", term.toString());
    assertTrue(nsf.incrementToken());
    assertEquals("the_s", term.toString());
    nsf.close();

    wt.setReader(new StringReader(input));
    nsf.reset();
@@ -240,6 +240,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
    assertTrue(tf.incrementToken());
    assertEquals("Rind", termAtt.toString());
    tf.end();
    tf.close();
    wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
    tf.reset();
    assertTrue(tf.incrementToken());
@@ -59,6 +59,8 @@ public class TestExtendedMode extends BaseTokenStreamTestCase {
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
      ts.close();
    }
  }
@@ -217,6 +217,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
      ts.reset();
      while (ts.incrementToken()) {
      }
      ts.end();
      ts.close();
    }
  }
@@ -240,6 +242,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
      while (ts.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(termAtt));
      }
      ts.end();
      ts.close();
    }
  }
@@ -630,6 +634,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
        final TokenStream ts = analyzer.tokenStream("ignored", line);
        ts.reset();
        while(ts.incrementToken());
        ts.end();
        ts.close();
      }
      String[] sentences = line.split("、|。");
      if (VERBOSE) {
@@ -642,6 +648,8 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
          final TokenStream ts = analyzer.tokenStream("ignored", sentence);
          ts.reset();
          while(ts.incrementToken());
          ts.end();
          ts.close();
        }
      }
      if (VERBOSE) {
@@ -90,6 +90,8 @@ public class ReadTokensTask extends PerfTask {
        termAtt.fillBytesRef();
        tokenCount++;
      }
      stream.end();
      stream.close();
    }
    totalTokenCount += tokenCount;
    return tokenCount;
@@ -85,8 +85,9 @@ public abstract class Tokenizer extends TokenStream {
  public final void setReader(Reader input) throws IOException {
    if (input == null) {
      throw new NullPointerException("input must not be null");
    } else if (this.input != ILLEGAL_STATE_READER) {
      throw new IllegalStateException("TokenStream contract violation: close() call missing");
    }
    this.input = ILLEGAL_STATE_READER;
    this.inputPending = input;
    assert setReaderTestPoint();
  }
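To make the setReader() guard concrete, here is a hedged sketch (not taken from this commit) of the reuse cycle the stricter state machine expects, assuming `tok` is a freshly constructed Tokenizer subclass such as WhitespaceTokenizer:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;

/** Illustrative only: the reuse cycle and the missing-close misuse the guard rejects. */
final class TokenizerReuseSketch {
  static void reuse(Tokenizer tok) throws IOException {
    tok.setReader(new StringReader("first document"));
    tok.reset();
    while (tok.incrementToken()) { /* read attributes */ }
    tok.end();
    tok.close();                                         // required before reuse
    tok.setReader(new StringReader("second document"));  // legal: previous stream was closed

    // Had close() been skipped above, this second setReader() call would now throw
    // IllegalStateException("TokenStream contract violation: close() call missing").
  }
}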
@@ -401,6 +401,20 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
      ts.end();
      ts.close();
    }

    // check for a missing close()
    ts = a.tokenStream("bogus", input);
    ts.reset();
    while (ts.incrementToken()) {}
    ts.end();
    try {
      ts = a.tokenStream("bogus", input);
      fail("didn't get expected exception when close() not called");
    } catch (IllegalStateException expected) {
      // ok
    } finally {
      ts.close();
    }
  }

  // simple utility method for testing stemmers
@@ -266,6 +266,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
      termAtt.fillBytesRef();
      // ensure we make a copy of the actual bytes too
      map.put(term, BytesRef.deepCopyOf(bytes));
      assertFalse(ts.incrementToken());
      ts.end();
      ts.close();
    }

    Thread threads[] = new Thread[numThreads];
@@ -284,6 +287,9 @@ public abstract class CollationTestBase extends LuceneTestCase {
            assertTrue(ts.incrementToken());
            termAtt.fillBytesRef();
            assertEquals(expected, bytes);
            assertFalse(ts.incrementToken());
            ts.end();
            ts.close();
          }
        } catch (IOException e) {
          throw new RuntimeException(e);
@@ -1,177 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.schema.DateField;
import static org.apache.solr.schema.TrieField.TrieTypes;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;

/**
 * Tokenizer for trie fields. It uses NumericTokenStream to create multiple trie encoded string per number.
 * Each string created by this tokenizer for a given number differs from the previous by the given precisionStep.
 * For query time token streams that only contain the highest precision term, use 32/64 as precisionStep.
 * <p/>
 * Refer to {@link org.apache.lucene.search.NumericRangeQuery} for more details.
 *
 *
 * @see org.apache.lucene.search.NumericRangeQuery
 * @see org.apache.solr.schema.TrieField
 * @since solr 1.4
 */
public class TrieTokenizerFactory extends TokenizerFactory {
  protected final int precisionStep;
  protected final TrieTypes type;

  public TrieTokenizerFactory(TrieTypes type, int precisionStep) {
    super(new HashMap<String,String>());
    this.type = type;
    this.precisionStep = precisionStep;
  }

  @Override
  public TrieTokenizer create(AttributeFactory factory, Reader input) {
    return new TrieTokenizer(input, type, TrieTokenizer.getNumericTokenStream(factory, precisionStep));
  }
}

final class TrieTokenizer extends Tokenizer {
  protected static final DateField dateField = new DateField();
  protected final TrieTypes type;
  protected final NumericTokenStream ts;

  // NumericTokenStream does not support CharTermAttribute so keep it local
  private final CharTermAttribute termAtt = new CharTermAttributeImpl();
  protected final OffsetAttribute ofsAtt = addAttribute(OffsetAttribute.class);
  protected int startOfs, endOfs;
  protected boolean hasValue;

  static NumericTokenStream getNumericTokenStream(AttributeFactory factory, int precisionStep) {
    return new NumericTokenStream(factory, precisionStep);
  }

  public TrieTokenizer(Reader input, TrieTypes type, final NumericTokenStream ts) {
    // Häckidy-Hick-Hack: must share the attributes with the NumericTokenStream we delegate to, so we create a fake factory:
    super(new AttributeFactory() {
      @Override
      public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
        return (AttributeImpl) ts.addAttribute(attClass);
      }
    }, input);
    // add all attributes:
    for (Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator(); it.hasNext();) {
      addAttribute(it.next());
    }
    this.type = type;
    this.ts = ts;
    // dates tend to be longer, especially when math is involved
    termAtt.resizeBuffer( type == TrieTypes.DATE ? 128 : 32 );
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    try {
      int upto = 0;
      char[] buf = termAtt.buffer();
      while (true) {
        final int length = input.read(buf, upto, buf.length-upto);
        if (length == -1) break;
        upto += length;
        if (upto == buf.length)
          buf = termAtt.resizeBuffer(1+buf.length);
      }
      termAtt.setLength(upto);
      this.startOfs = correctOffset(0);
      this.endOfs = correctOffset(upto);

      if (upto == 0) {
        hasValue = false;
        return;
      }

      final String v = new String(buf, 0, upto);
      try {
        switch (type) {
          case INTEGER:
            ts.setIntValue(Integer.parseInt(v));
            break;
          case FLOAT:
            ts.setFloatValue(Float.parseFloat(v));
            break;
          case LONG:
            ts.setLongValue(Long.parseLong(v));
            break;
          case DOUBLE:
            ts.setDoubleValue(Double.parseDouble(v));
            break;
          case DATE:
            ts.setLongValue(dateField.parseMath(null, v).getTime());
            break;
          default:
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field");
        }
      } catch (NumberFormatException nfe) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "Invalid Number: " + v);
      }
      hasValue = true;
      ts.reset();
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unable to create TrieIndexTokenizer", e);
    }
  }

  @Override
  public void close() throws IOException {
    super.close();
    if (hasValue) {
      ts.close();
    }
  }

  @Override
  public boolean incrementToken() {
    if (hasValue && ts.incrementToken()) {
      ofsAtt.setOffset(startOfs, endOfs);
      return true;
    }
    return false;
  }

  @Override
  public void end() throws IOException {
    super.end();
    if (hasValue) {
      ts.end();
    }
    ofsAtt.setOffset(endOfs, endOfs);
  }
}
@@ -30,6 +30,7 @@ import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
@@ -138,9 +139,10 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
   * @param analyzer The analyzer to use.
   */
  protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
    TokenStream tokenStream = null;
    try {
      final Set<BytesRef> tokens = new HashSet<BytesRef>();
      final TokenStream tokenStream = analyzer.tokenStream("", query);
      tokenStream = analyzer.tokenStream("", query);
      final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
      final BytesRef bytes = bytesAtt.getBytesRef();
@@ -152,10 +154,11 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
      }

      tokenStream.end();
      tokenStream.close();
      return tokens;
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }
  }
@@ -181,8 +184,11 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
        trackerAtt.setActPosition(position);
        tokens.add(tokenStream.cloneAttributes());
      }
      tokenStream.end();
    } catch (IOException ioe) {
      throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
      IOUtils.closeWhileHandlingException(tokenStream);
    }

    return tokens;
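The two hunks above declare the stream before the try block, keep end() inside it, and route cleanup through IOUtils.closeWhileHandlingException() in finally, so the stream is closed even when consumption fails. Assembled into a standalone sketch (the method name, field name, and messages here are illustrative, not the handler's actual code):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.IOUtils;

/** Sketch only: the exception-safe consumption shape used by the handler changes above. */
final class SafeConsumeSketch {
  static int countTokens(Analyzer analyzer, String text) {
    TokenStream tokenStream = null;
    try {
      tokenStream = analyzer.tokenStream("field", text);
      tokenStream.reset();
      int count = 0;
      while (tokenStream.incrementToken()) {
        count++;
      }
      tokenStream.end();
      return count;
    } catch (IOException ioe) {
      throw new RuntimeException("error while consuming the TokenStream", ioe);
    } finally {
      // Closes even when reset()/incrementToken() threw, so the next tokenStream()
      // call on this Analyzer does not hit the new IllegalStateException.
      IOUtils.closeWhileHandlingException(tokenStream);
    }
  }
}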
@@ -24,8 +24,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FieldType.NumericType;
@@ -51,8 +49,6 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.mutable.MutableValueDate;
import org.apache.lucene.util.mutable.MutableValueLong;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TrieTokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
@@ -111,12 +107,6 @@ public class TrieField extends PrimitiveFieldType {
          "Invalid type specified in schema.xml for field: " + args.get("name"), e);
      }
    }

    CharFilterFactory[] filterFactories = new CharFilterFactory[0];
    TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
    analyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, precisionStep), tokenFilterFactories);
    // for query time we only need one token, so we use the biggest possible precisionStep:
    queryAnalyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(type, Integer.MAX_VALUE), tokenFilterFactories);
  }

  @Override
@@ -223,7 +213,7 @@ public class TrieField extends PrimitiveFieldType {

  @Override
  public boolean isTokenized() {
    return true;
    return false;
  }

  @Override
@@ -382,6 +372,7 @@ public class TrieField extends PrimitiveFieldType {
  @Override
  public void readableToIndexed(CharSequence val, BytesRef result) {
    String s = val.toString();
    try {
      switch (type) {
        case INTEGER:
          NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
@@ -401,6 +392,10 @@ public class TrieField extends PrimitiveFieldType {
        default:
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
      }
    } catch (NumberFormatException nfe) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Invalid Number: " + val);
    }
  }

  @Override
@@ -16,8 +16,6 @@
 */
package org.apache.solr;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.FieldType;
@@ -50,38 +48,6 @@ public class TestTrie extends SolrTestCaseJ4 {
    super.tearDown();
  }

  @Test
  public void testTokenizer() throws Exception {
    FieldType type = h.getCore().getLatestSchema().getFieldType("tint");
    assertTrue(type instanceof TrieField);

    String value = String.valueOf(random().nextInt());
    TokenStream ts = type.getAnalyzer().tokenStream("dummy", value);
    OffsetAttribute ofsAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
      count++;
      assertEquals(0, ofsAtt.startOffset());
      assertEquals(value.length(), ofsAtt.endOffset());
    }
    final int precStep = ((TrieField) type).getPrecisionStep();
    assertEquals( (32 + precStep - 1) / precStep, count);
    ts.end();
    assertEquals(value.length(), ofsAtt.startOffset());
    assertEquals(value.length(), ofsAtt.endOffset());
    ts.close();

    // Test empty one:
    ts = type.getAnalyzer().tokenStream("dummy", "");
    ts.reset();
    assertFalse(ts.incrementToken());
    ts.end();
    assertEquals(0, ofsAtt.startOffset());
    assertEquals(0, ofsAtt.endOffset());
    ts.close();
  }

  @Test
  public void testTrieIntRangeSearch() throws Exception {
    for (int i = 0; i < 10; i++) {
@@ -39,7 +39,7 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
         "/response/lst[@name='fieldType']/bool[@name='omitPositions'] = 'false'",
         "/response/lst[@name='fieldType']/bool[@name='storeOffsetsWithPositions'] = 'false'",
         "/response/lst[@name='fieldType']/bool[@name='multiValued'] = 'false'",
         "/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'true'",
         "/response/lst[@name='fieldType']/bool[@name='tokenized'] = 'false'",
         "/response/lst[@name='fieldType']/arr[@name='fields']/str = 'weight'",
         "/response/lst[@name='fieldType']/arr[@name='dynamicFields']/str = '*_f'");
  }
@@ -69,7 +69,7 @@ public class TestFieldTypeResource extends SolrRestletTestBase {
         "/fieldType/omitPositions==false",
         "/fieldType/storeOffsetsWithPositions==false",
         "/fieldType/multiValued==false",
         "/fieldType/tokenized==true",
         "/fieldType/tokenized==false",
         "/fieldType/fields==['weight']",
         "/fieldType/dynamicFields==['*_f']");
  }