Reuse non-analyzed token stream for string types
so a heavyweight token stream won't be created each time.
parent fc35fd8a29
commit c65d5a77c4
@@ -20,6 +20,9 @@
 package org.elasticsearch.index.mapper.core;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
@@ -294,7 +297,7 @@ public class StringFieldMapper extends AbstractFieldMapper<String> implements Al
             context.ignoredValue(names.indexName(), value);
             return null;
         }
-        Field field = new Field(names.indexName(), value, fieldType);
+        Field field = new StringField(names.indexName(), value, fieldType);
         field.setBoost(boost);
         return field;
     }
@@ -364,4 +367,87 @@ public class StringFieldMapper extends AbstractFieldMapper<String> implements Al
             builder.field("ignore_above", ignoreAbove);
         }
     }
+
+    /**
+     * Extension of {@link Field} supporting reuse of a cached TokenStream for not-tokenized values.
+     */
+    static class StringField extends Field {
+
+        public StringField(String name, String value, FieldType fieldType) {
+            super(name, value, fieldType);
+        }
+
+        @Override
+        public TokenStream tokenStream(Analyzer analyzer) throws IOException {
+            if (!fieldType().indexed()) {
+                return null;
+            }
+            // Only use the cached TokenStream if the value is indexed and not-tokenized
+            if (fieldType().tokenized()) {
+                return super.tokenStream(analyzer);
+            }
+            return NOT_ANALYZED_TOKENSTREAM.get().setValue((String) fieldsData);
+        }
+    }
+
+    private static final ThreadLocal<StringTokenStream> NOT_ANALYZED_TOKENSTREAM = new ThreadLocal<StringTokenStream>() {
+        @Override
+        protected StringTokenStream initialValue() {
+            return new StringTokenStream();
+        }
+    };
+
+
+    // Copied from Field.java
+    static final class StringTokenStream extends TokenStream {
+        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+        private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+        private boolean used = false;
+        private String value = null;
+
+        /**
+         * Creates a new TokenStream that returns a String as single token.
+         * <p>Warning: Does not initialize the value, you must call
+         * {@link #setValue(String)} afterwards!
+         */
+        StringTokenStream() {
+        }
+
+        /**
+         * Sets the string value.
+         */
+        StringTokenStream setValue(String value) {
+            this.value = value;
+            return this;
+        }
+
+        @Override
+        public boolean incrementToken() {
+            if (used) {
+                return false;
+            }
+            clearAttributes();
+            termAttribute.append(value);
+            offsetAttribute.setOffset(0, value.length());
+            used = true;
+            return true;
+        }
+
+        @Override
+        public void end() {
+            final int finalOffset = value.length();
+            offsetAttribute.setOffset(finalOffset, finalOffset);
+            value = null;
+        }
+
+        @Override
+        public void reset() {
+            used = false;
+        }
+
+        @Override
+        public void close() {
+            value = null;
+        }
+    }
 }
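
For reference, below is a minimal, standalone sketch of the per-thread reuse pattern from the consumer side. It is not part of this commit; ReuseDemo and ReusableStringTokenStream are illustrative names, and only the Lucene TokenStream/attribute calls mirror what the patch relies on. It assumes Lucene is on the classpath.

// Standalone sketch of the per-thread reuse pattern introduced above (not part of
// this commit). ReuseDemo and ReusableStringTokenStream are illustrative names.
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;

public class ReuseDemo {

    // One single-token stream per thread, reused across all not-analyzed values.
    private static final ThreadLocal<ReusableStringTokenStream> STREAM =
            new ThreadLocal<ReusableStringTokenStream>() {
                @Override
                protected ReusableStringTokenStream initialValue() {
                    return new ReusableStringTokenStream();
                }
            };

    // Same shape as the StringTokenStream added in the patch: emits its value as one token.
    static final class ReusableStringTokenStream extends TokenStream {
        private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
        private boolean used = false;
        private String value;

        ReusableStringTokenStream setValue(String value) {
            this.value = value;
            return this;
        }

        @Override
        public boolean incrementToken() {
            if (used) {
                return false;
            }
            clearAttributes();
            term.append(value);
            offset.setOffset(0, value.length());
            used = true;
            return true;
        }

        @Override
        public void reset() {
            used = false;
        }
    }

    public static void main(String[] args) throws IOException {
        // Per value only setValue(...) and the usual reset/increment/end/close cycle run;
        // no new TokenStream (with its attribute instances) is allocated.
        for (String value : new String[]{"alpha", "beta"}) {
            ReusableStringTokenStream ts = STREAM.get().setValue(value);
            ts.reset();
            while (ts.incrementToken()) {
                // term and offset attributes now hold the single, untokenized value
                System.out.println("emitted one token for value: " + value);
            }
            ts.end();
            ts.close();
        }
    }
}

The saving is that the TokenStream and its attribute instances stay alive per thread, so indexing a not-analyzed string value only pays for setValue(...) plus the standard reset/incrementToken/end/close cycle instead of a fresh TokenStream allocation per field.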