Fix NumericTokenizer.

NumericTokenizer is a simple wrapper around a NumericTokenStream. However, its
implementation had a few issues: its reset() method was not idempotent, so
calling reset() twice threw an exception (the cause of #3211), and it exposed
no attributes, meaning that the only thing consumers could do with it was count
the number of generated tokens. The reason indexing numeric data worked anyway
is that the mapper's parseCreateField generates a NumericTokenStream directly,
bypassing the analyzer.

This commit makes NumericTokenizer.reset idempotent and makes consuming a
NumericTokenizer behave the same way as consuming the underlying
NumericTokenStream.
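As a hedged illustration of the new behavior (the value "42.42" and precision step 4 are made up; NumericDoubleTokenizer and NumericTermAttribute appear in the diffs below), consuming the tokenizer now looks just like consuming a NumericTokenStream:

    // Sketch only: exception handling omitted; assumes the post-commit constructors.
    NumericDoubleTokenizer tokenizer = new NumericDoubleTokenizer(new java.io.StringReader("42.42"), 4);
    // Attributes are now exposed, shared with the delegate NumericTokenStream.
    NumericTokenStream.NumericTermAttribute term = tokenizer.addAttribute(NumericTokenStream.NumericTermAttribute.class);
    tokenizer.reset();
    tokenizer.reset(); // reset() is idempotent now; this second call used to throw (#3211)
    while (tokenizer.incrementToken()) {
        System.out.println(term.getRawValue() + " shift=" + term.getShift());
    }
    tokenizer.end();
    tokenizer.close();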
Author: Adrien Grand, 2013-06-21 14:55:40 +02:00
commit 432628086f (parent 58e68db148)
12 changed files with 222 additions and 46 deletions


@@ -19,14 +19,12 @@
package org.elasticsearch.common.io;
import com.google.common.base.Charsets;
import org.elasticsearch.common.Preconditions;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.CachedStreamOutput;
import com.google.common.base.Charsets;
import java.io.*;
import java.nio.charset.Charset;
/**
* Simple utility methods for file and stream copying.
@@ -273,4 +271,36 @@ public abstract class Streams {
}
return copyToByteArray(is);
}
public static int readFully(Reader reader, char[] dest) throws IOException {
return readFully(reader, dest, 0, dest.length);
}
public static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
public static int readFully(InputStream reader, byte[] dest) throws IOException {
return readFully(reader, dest, 0, dest.length);
}
public static int readFully(InputStream reader, byte[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
}
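A quick hedged usage sketch of the new helper (the "3.14" input is hypothetical): unlike a single Reader.read call, which may legally return fewer chars than requested even when more input is pending, readFully only returns a short count once EOF has been reached.

    char[] buf = new char[32];
    int n = Streams.readFully(new java.io.StringReader("3.14"), buf);
    // n == 4: the reader hit EOF after four chars, so n < buf.length signals end of input
    String value = new String(buf, 0, n);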


@@ -19,6 +19,8 @@
package org.elasticsearch.common.io.stream;
import org.elasticsearch.common.io.Streams;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
@@ -46,12 +48,9 @@ public class InputStreamStreamInput extends StreamInput {
public void readBytes(byte[] b, int offset, int len) throws IOException {
if (len < 0)
throw new IndexOutOfBoundsException();
int n = 0;
while (n < len) {
int count = is.read(b, offset + n, len - n);
if (count < 0)
throw new EOFException();
n += count;
final int read = Streams.readFully(is, b, offset, len);
if (read != len) {
throw new EOFException();
}
}


@@ -30,10 +30,6 @@ import java.io.Reader;
*/
public class NumericDateTokenizer extends NumericTokenizer {
public NumericDateTokenizer(Reader reader, int precisionStep, DateTimeFormatter dateTimeFormatter) throws IOException {
super(reader, new NumericTokenStream(precisionStep), dateTimeFormatter);
}
public NumericDateTokenizer(Reader reader, int precisionStep, char[] buffer, DateTimeFormatter dateTimeFormatter) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, dateTimeFormatter);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericDoubleTokenizer extends NumericTokenizer {
public NumericDoubleTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericDoubleTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericFloatTokenizer extends NumericTokenizer {
public NumericFloatTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericFloatTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericIntegerTokenizer extends NumericTokenizer {
public NumericIntegerTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericIntegerTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericLongTokenizer extends NumericTokenizer {
public NumericLongTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericLongTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -21,46 +21,79 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.elasticsearch.common.io.Streams;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
/**
*
*/
public abstract class NumericTokenizer extends Tokenizer {
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
return new AttributeFactory() {
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
return (AttributeImpl) source.addAttribute(attClass);
}
};
}
private final NumericTokenStream numericTokenStream;
private final char[] buffer;
protected final Object extra;
protected NumericTokenizer(Reader reader, NumericTokenStream numericTokenStream, Object extra) throws IOException {
this(reader, numericTokenStream, new char[32], extra);
}
private boolean started;
protected NumericTokenizer(Reader reader, NumericTokenStream numericTokenStream, char[] buffer, Object extra) throws IOException {
super(reader);
super(delegatingAttributeFactory(numericTokenStream), reader);
this.numericTokenStream = numericTokenStream;
// Add attributes from the numeric token stream, this works fine because the attribute factory delegates to numericTokenStream
for (Iterator<Class<? extends Attribute>> it = numericTokenStream.getAttributeClassesIterator(); it.hasNext();) {
addAttribute(it.next());
}
this.extra = extra;
this.buffer = buffer;
started = true;
}
@Override
public void reset() throws IOException {
reset(buffer);
}
public void reset(char[] buffer) throws IOException {
int len = input.read(buffer);
String value = new String(buffer, 0, len);
setValue(numericTokenStream, value);
numericTokenStream.reset();
super.reset();
started = false;
}
@Override
public final boolean incrementToken() throws IOException {
if (!started) {
// reset() must be idempotent, this is why we read data in incrementToken
final int len = Streams.readFully(input, buffer);
if (len == buffer.length && input.read() != -1) {
throw new IOException("Cannot read numeric data larger than " + buffer.length + " chars");
}
setValue(numericTokenStream, new String(buffer, 0, len));
numericTokenStream.reset();
started = true;
}
return numericTokenStream.incrementToken();
}
@Override
public void end() throws IOException {
super.end();
numericTokenStream.end();
}
@Override
public void close() throws IOException {
super.close();
numericTokenStream.close();
}
protected abstract void setValue(NumericTokenStream tokenStream, String value);
}
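To make the delegation concrete, here is a minimal hedged sketch of the same pattern outside a Tokenizer (assuming the Lucene 4.x API, where AttributeFactory is a nested class of AttributeSource): because createAttributeInstance funnels into the delegate's addAttribute, both sources hold the very same attribute instances, so advancing the delegate updates what consumers of the wrapper see.

    final NumericTokenStream delegate = new NumericTokenStream(4);
    AttributeSource wrapper = new AttributeSource(new AttributeSource.AttributeFactory() {
        @Override
        public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
            // Register the attribute on the delegate and reuse that instance in the wrapper.
            return (AttributeImpl) delegate.addAttribute(attClass);
        }
    });
    NumericTokenStream.NumericTermAttribute viaWrapper =
            wrapper.addAttribute(NumericTokenStream.NumericTermAttribute.class);
    NumericTokenStream.NumericTermAttribute viaDelegate =
            delegate.addAttribute(NumericTokenStream.NumericTermAttribute.class);
    assert viaWrapper == viaDelegate; // shared instance: the two sources stay in sync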


@@ -342,10 +342,6 @@ public class IpFieldMapper extends NumberFieldMapper<Long> {
public static class NumericIpTokenizer extends NumericTokenizer {
public NumericIpTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericIpTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -23,6 +23,8 @@ import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.util.CollectionUtil;
@@ -126,6 +128,10 @@ public class PlainHighlighter implements Highlighter {
Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers().indexAnalyzer();
TokenStream tokenStream = analyzer.tokenStream(mapper.names().indexName(), new FastStringReader(text));
tokenStream.reset();
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
// can't perform highlighting if the stream has no terms (binary token stream) or no offsets
continue;
}
TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (bestTextFragment != null && bestTextFragment.getScore() > 0) {


@@ -1431,4 +1431,70 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
assertThat(response.getHits().hits()[0].highlightFields().isEmpty(), equalTo(true));
}
@Test
// https://github.com/elasticsearch/elasticsearch/issues/3211
public void testNumericHighlighting() throws Exception {
wipeIndex("test");
prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("text")
.field("type", "string")
.field("index", "analyzed")
.endObject()
.startObject("byte")
.field("type", "byte")
.endObject()
.startObject("short")
.field("type", "short")
.endObject()
.startObject("int")
.field("type", "integer")
.endObject()
.startObject("long")
.field("type", "long")
.endObject()
.startObject("float")
.field("type", "float")
.endObject()
.startObject("double")
.field("type", "double")
.endObject()
.endObject()
.endObject()
.endObject())
.execute().actionGet();
ensureGreen();
client().prepareIndex("test", "type1", "1")
.setSource(jsonBuilder().startObject()
.field("text", "elasticsearch test")
.field("byte", 25)
.field("short", 42)
.field("int", 100)
.field("long", -1)
.field("float", 3.2f)
.field("double", 42.42)
.endObject())
.setRefresh(true).execute().actionGet();
SearchResponse response = client().prepareSearch("test")
.setQuery(QueryBuilders.matchQuery("text", "test").type(MatchQueryBuilder.Type.BOOLEAN))
.addHighlightedField("text")
.addHighlightedField("byte")
.addHighlightedField("short")
.addHighlightedField("int")
.addHighlightedField("long")
.addHighlightedField("float")
.addHighlightedField("double")
.execute().actionGet();
assertThat(response.getHits().totalHits(), equalTo(1L));
// Highlighting of numeric fields is not supported, but it should not raise errors
// (this behavior is consistent with version 0.20)
assertThat(response.getFailedShards(), equalTo(0));
}
}


@@ -0,0 +1,66 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.index.analysis.NumericDoubleAnalyzer;
import org.testng.annotations.Test;
import java.io.IOException;
import java.io.StringReader;
import java.util.Random;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
public class NumericAnalyzerTests {
@Test
public void testAttributeEqual() throws IOException {
final int precisionStep = 8;
final double value = new Random().nextDouble();
NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);
final TokenStream ts1 = analyzer.tokenStream("dummy", new StringReader(String.valueOf(value)));
final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
ts2.setDoubleValue(value);
final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);
ts1.reset();
ts2.reset();
while (ts1.incrementToken()) {
assertThat(ts2.incrementToken(), is(true));
assertThat(posInc1, equalTo(posInc2));
// can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
}
assertThat(ts2.incrementToken(), is(false));
ts1.end();
ts2.end();
}
}