Fix NumericTokenizer.

NumericTokenizer is a simple wrapper around a NumericTokenStream. However, its
implementation had a few issues: its reset() method was not idempotent, so
calling reset() twice threw an exception (the cause of #3211), and it exposed
no attributes, meaning that the only thing consumers could do with it was count
the number of generated tokens. The reason indexing numeric data worked anyway
is that the mapper's parseCreateField generates a NumericTokenStream directly,
bypassing the analyzer.

This commit makes NumericTokenizer.reset idempotent and makes consuming a
NumericTokenizer behave the same way as consuming the underlying
NumericTokenStream.
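As a hedged illustration of the new behavior (the value "42.42" and precision step 4 are made up; NumericDoubleTokenizer and NumericTermAttribute appear in the diffs below), consuming the tokenizer now looks just like consuming a NumericTokenStream:

    // Sketch only: exception handling omitted; assumes the post-commit constructors.
    NumericDoubleTokenizer tokenizer = new NumericDoubleTokenizer(new java.io.StringReader("42.42"), 4);
    // Attributes are now exposed, shared with the delegate NumericTokenStream.
    NumericTokenStream.NumericTermAttribute term = tokenizer.addAttribute(NumericTokenStream.NumericTermAttribute.class);
    tokenizer.reset();
    tokenizer.reset(); // reset() is idempotent now; this second call used to throw (#3211)
    while (tokenizer.incrementToken()) {
        System.out.println(term.getRawValue() + " shift=" + term.getShift());
    }
    tokenizer.end();
    tokenizer.close();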
Author: Adrien Grand, 2013-06-21 14:55:40 +02:00
commit 432628086f (parent 58e68db148)
12 changed files with 222 additions and 46 deletions


@@ -19,14 +19,12 @@
package org.elasticsearch.common.io;
import com.google.common.base.Charsets;
import org.elasticsearch.common.Preconditions;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.CachedStreamOutput;
import com.google.common.base.Charsets;
import java.io.*;
import java.nio.charset.Charset;
/**
* Simple utility methods for file and stream copying.
@@ -273,4 +271,36 @@ public abstract class Streams {
}
return copyToByteArray(is);
}
public static int readFully(Reader reader, char[] dest) throws IOException {
return readFully(reader, dest, 0, dest.length);
}
public static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
public static int readFully(InputStream reader, byte[] dest) throws IOException {
return readFully(reader, dest, 0, dest.length);
}
public static int readFully(InputStream reader, byte[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
}
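A quick hedged usage sketch of the new helper (the "3.14" input is hypothetical): unlike a single Reader.read call, which may legally return fewer chars than requested even when more input is pending, readFully only returns a short count once EOF has been reached.

    char[] buf = new char[32];
    int n = Streams.readFully(new java.io.StringReader("3.14"), buf);
    // n == 4: the reader hit EOF after four chars, so n < buf.length signals end of input
    String value = new String(buf, 0, n);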


@@ -19,6 +19,8 @@
package org.elasticsearch.common.io.stream;
import org.elasticsearch.common.io.Streams;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
@@ -46,12 +48,9 @@ public class InputStreamStreamInput extends StreamInput {
public void readBytes(byte[] b, int offset, int len) throws IOException {
if (len < 0)
throw new IndexOutOfBoundsException();
int n = 0;
while (n < len) {
int count = is.read(b, offset + n, len - n);
if (count < 0)
throw new EOFException();
n += count;
final int read = Streams.readFully(is, b, offset, len);
if (read != len) {
throw new EOFException();
}
}


@@ -30,10 +30,6 @@ import java.io.Reader;
*/
public class NumericDateTokenizer extends NumericTokenizer {
public NumericDateTokenizer(Reader reader, int precisionStep, DateTimeFormatter dateTimeFormatter) throws IOException {
super(reader, new NumericTokenStream(precisionStep), dateTimeFormatter);
}
public NumericDateTokenizer(Reader reader, int precisionStep, char[] buffer, DateTimeFormatter dateTimeFormatter) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, dateTimeFormatter);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericDoubleTokenizer extends NumericTokenizer {
public NumericDoubleTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericDoubleTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericFloatTokenizer extends NumericTokenizer {
public NumericFloatTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericFloatTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericIntegerTokenizer extends NumericTokenizer {
public NumericIntegerTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericIntegerTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -29,10 +29,6 @@ import java.io.Reader;
*/
public class NumericLongTokenizer extends NumericTokenizer {
public NumericLongTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericLongTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -21,46 +21,79 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.elasticsearch.common.io.Streams;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;
/**
*
*/
public abstract class NumericTokenizer extends Tokenizer {
/** Make this tokenizer get attributes from the delegate token stream. */
private static final AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
return new AttributeFactory() {
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
return (AttributeImpl) source.addAttribute(attClass);
}
};
}
private final NumericTokenStream numericTokenStream;
private final char[] buffer;
protected final Object extra;
protected NumericTokenizer(Reader reader, NumericTokenStream numericTokenStream, Object extra) throws IOException {
this(reader, numericTokenStream, new char[32], extra);
}
private boolean started;
protected NumericTokenizer(Reader reader, NumericTokenStream numericTokenStream, char[] buffer, Object extra) throws IOException {
super(reader);
super(delegatingAttributeFactory(numericTokenStream), reader);
this.numericTokenStream = numericTokenStream;
// Add attributes from the numeric token stream, this works fine because the attribute factory delegates to numericTokenStream
for (Iterator<Class<? extends Attribute>> it = numericTokenStream.getAttributeClassesIterator(); it.hasNext();) {
addAttribute(it.next());
}
this.extra = extra;
this.buffer = buffer;
started = true;
}
@Override
public void reset() throws IOException {
reset(buffer);
}
public void reset(char[] buffer) throws IOException {
int len = input.read(buffer);
String value = new String(buffer, 0, len);
setValue(numericTokenStream, value);
numericTokenStream.reset();
super.reset();
started = false;
}
@Override
public final boolean incrementToken() throws IOException {
if (!started) {
// reset() must be idempotent, this is why we read data in incrementToken
final int len = Streams.readFully(input, buffer);
if (len == buffer.length && input.read() != -1) {
throw new IOException("Cannot read numeric data larger than " + buffer.length + " chars");
}
setValue(numericTokenStream, new String(buffer, 0, len));
numericTokenStream.reset();
started = true;
}
return numericTokenStream.incrementToken();
}
@Override
public void end() throws IOException {
super.end();
numericTokenStream.end();
}
@Override
public void close() throws IOException {
super.close();
numericTokenStream.close();
}
protected abstract void setValue(NumericTokenStream tokenStream, String value);
}
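To make the delegation concrete, here is a minimal hedged sketch of the same pattern outside a Tokenizer (assuming the Lucene 4.x API, where AttributeFactory is a nested class of AttributeSource): because createAttributeInstance funnels into the delegate's addAttribute, both sources hold the very same attribute instances, so advancing the delegate updates what consumers of the wrapper see.

    final NumericTokenStream delegate = new NumericTokenStream(4);
    AttributeSource wrapper = new AttributeSource(new AttributeSource.AttributeFactory() {
        @Override
        public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
            // Register the attribute on the delegate and reuse that instance in the wrapper.
            return (AttributeImpl) delegate.addAttribute(attClass);
        }
    });
    NumericTokenStream.NumericTermAttribute viaWrapper =
            wrapper.addAttribute(NumericTokenStream.NumericTermAttribute.class);
    NumericTokenStream.NumericTermAttribute viaDelegate =
            delegate.addAttribute(NumericTokenStream.NumericTermAttribute.class);
    assert viaWrapper == viaDelegate; // shared instance: the two sources stay in sync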


@@ -342,10 +342,6 @@ public class IpFieldMapper extends NumberFieldMapper<Long> {
public static class NumericIpTokenizer extends NumericTokenizer {
public NumericIpTokenizer(Reader reader, int precisionStep) throws IOException {
super(reader, new NumericTokenStream(precisionStep), null);
}
public NumericIpTokenizer(Reader reader, int precisionStep, char[] buffer) throws IOException {
super(reader, new NumericTokenStream(precisionStep), buffer, null);
}


@@ -23,6 +23,8 @@ import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.util.CollectionUtil;
@@ -126,6 +128,10 @@ public class PlainHighlighter implements Highlighter {
Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers().indexAnalyzer();
TokenStream tokenStream = analyzer.tokenStream(mapper.names().indexName(), new FastStringReader(text));
tokenStream.reset();
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
// can't perform highlighting if the stream has no terms (binary token stream) or no offsets
continue;
}
TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (bestTextFragment != null && bestTextFragment.getScore() > 0) {


@@ -1431,4 +1431,70 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
assertThat(response.getHits().hits()[0].highlightFields().isEmpty(), equalTo(true));
}
@Test
// https://github.com/elasticsearch/elasticsearch/issues/3211
public void testNumericHighlighting() throws Exception {
wipeIndex("test");
prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("text")
.field("type", "string")
.field("index", "analyzed")
.endObject()
.startObject("byte")
.field("type", "byte")
.endObject()
.startObject("short")
.field("type", "short")
.endObject()
.startObject("int")
.field("type", "integer")
.endObject()
.startObject("long")
.field("type", "long")
.endObject()
.startObject("float")
.field("type", "float")
.endObject()
.startObject("double")
.field("type", "double")
.endObject()
.endObject()
.endObject()
.endObject())
.execute().actionGet();
ensureGreen();
client().prepareIndex("test", "type1", "1")
.setSource(jsonBuilder().startObject()
.field("text", "elasticsearch test")
.field("byte", 25)
.field("short", 42)
.field("int", 100)
.field("long", -1)
.field("float", 3.2f)
.field("double", 42.42)
.endObject())
.setRefresh(true).execute().actionGet();
SearchResponse response = client().prepareSearch("test")
.setQuery(QueryBuilders.matchQuery("text", "test").type(MatchQueryBuilder.Type.BOOLEAN))
.addHighlightedField("text")
.addHighlightedField("byte")
.addHighlightedField("short")
.addHighlightedField("int")
.addHighlightedField("long")
.addHighlightedField("float")
.addHighlightedField("double")
.execute().actionGet();
assertThat(response.getHits().totalHits(), equalTo(1L));
// Highlighting of numeric fields is not supported, but it should not raise errors
// (this behavior is consistent with version 0.20)
assertThat(response.getFailedShards(), equalTo(0));
}
}


@@ -0,0 +1,66 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.index.analysis.NumericDoubleAnalyzer;
import org.testng.annotations.Test;
import java.io.IOException;
import java.io.StringReader;
import java.util.Random;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
public class NumericAnalyzerTests {
@Test
public void testAttributeEqual() throws IOException {
final int precisionStep = 8;
final double value = new Random().nextDouble();
NumericDoubleAnalyzer analyzer = new NumericDoubleAnalyzer(precisionStep);
final TokenStream ts1 = analyzer.tokenStream("dummy", new StringReader(String.valueOf(value)));
final NumericTokenStream ts2 = new NumericTokenStream(precisionStep);
ts2.setDoubleValue(value);
final NumericTermAttribute numTerm1 = ts1.addAttribute(NumericTermAttribute.class);
final NumericTermAttribute numTerm2 = ts2.addAttribute(NumericTermAttribute.class);
final PositionIncrementAttribute posInc1 = ts1.addAttribute(PositionIncrementAttribute.class);
final PositionIncrementAttribute posInc2 = ts2.addAttribute(PositionIncrementAttribute.class);
ts1.reset();
ts2.reset();
while (ts1.incrementToken()) {
assertThat(ts2.incrementToken(), is(true));
assertThat(posInc1, equalTo(posInc2));
// can't use equalTo directly on the numeric attribute cause it doesn't implement equals (LUCENE-5070)
assertThat(numTerm1.getRawValue(), equalTo(numTerm2.getRawValue()));
assertThat(numTerm1.getShift(), equalTo(numTerm2.getShift()));
}
assertThat(ts2.incrementToken(), is(false));
ts1.end();
ts2.end();
}
}