More resource efficient analysis wrapping usage

Today, we take great care to share the same analyzer instances across shards and indices (global analyzers). The idea is that, by sharing an analyzer, its thread-local resources are not allocated per analyzer instance per thread. The problem is that AnalyzerWrapper keeps its resources in its own per-thread storage, and with the per-field reuse strategy this results in token stream components being held per field, per thread. This is very evident with the StandardTokenizer, which uses a buffer... This came out of a test with "many fields", where the majority of a 1GB heap was consumed by StandardTokenizer instances... closes #6714
2014-07-03 16:20:00 +02:00 · 2014-07-03 16:20:00 +02:00 · 5249005578
parent 388fddb3d9
commit 5249005578
5 changed files with 73 additions and 104 deletions
--- a/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java
+++ b/src/main/java/org/apache/lucene/analysis/CustomAnalyzerWrapper.java
@ -1,77 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.lucene.analysis;
-
-import java.io.Reader;
-
-/**
- * Similar to Lucene {@link AnalyzerWrapper} but actually allows to set the reuse strategy....
- * //TODO add to lucene the ability to set it...
- */
-public abstract class CustomAnalyzerWrapper extends Analyzer {
-
-    /**
-     * Creates a new CustomAnalyzerWrapper.  Since the {@link Analyzer.ReuseStrategy} of
-     * the wrapped Analyzers are unknown, {@link Analyzer.PerFieldReuseStrategy} is assumed
-     */
-    protected CustomAnalyzerWrapper(ReuseStrategy reuseStrategy) {
-        super(reuseStrategy);
-    }
-
-    /**
-     * Retrieves the wrapped Analyzer appropriate for analyzing the field with
-     * the given name
-     *
-     * @param fieldName Name of the field which is to be analyzed
-     * @return Analyzer for the field with the given name.  Assumed to be non-null
-     */
-    protected abstract Analyzer getWrappedAnalyzer(String fieldName);
-
-    /**
-     * Wraps / alters the given TokenStreamComponents, taken from the wrapped
-     * Analyzer, to form new components.  It is through this method that new
-     * TokenFilters can be added by AnalyzerWrappers.
-     *
-     * @param fieldName  Name of the field which is to be analyzed
-     * @param components TokenStreamComponents taken from the wrapped Analyzer
-     * @return Wrapped / altered TokenStreamComponents.
-     */
-    protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components);
-
-    @Override
-    protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) {
-        return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader));
-    }
-
-    @Override
-    public int getPositionIncrementGap(String fieldName) {
-        return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
-    }
-
-    @Override
-    public int getOffsetGap(String fieldName) {
-        return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
-    }
-
-    @Override
-    public final Reader initReader(String fieldName, Reader reader) {
-        return getWrappedAnalyzer(fieldName).initReader(fieldName, reader);
-    }
-}
--- a/src/main/java/org/apache/lucene/analysis/SimpleAnalyzerWrapper.java
+++ b/src/main/java/org/apache/lucene/analysis/SimpleAnalyzerWrapper.java
@ -0,0 +1,66 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis;
+
+import java.io.Reader;
+
+/**
+ * A simple analyzer wrapper that does not allow wrapping the components or the reader. Because
+ * wrapping is disallowed, the thread-local resources can be delegated to the wrapped analyzer
+ * instead of also being allocated on this analyzer.
+ * <p>
+ * This solves the problem with per-field analyzer wrappers, which otherwise maintain their own
+ * thread-local, per-field token stream components even though they could safely delegate to the
+ * wrapped analyzers and avoid holding these data structures, which can become expensive memory-wise.
+ */
+public abstract class SimpleAnalyzerWrapper extends AnalyzerWrapper {
+    /**
+     * Installs a {@link DelegatingReuseStrategy} as this analyzer's reuse strategy and then
+     * points that strategy back at this wrapper. The back-reference is assigned after the
+     * super call because {@code this} cannot be referenced while the arguments of the super
+     * constructor are being evaluated.
+     */
+    public SimpleAnalyzerWrapper() {
+        super(new DelegatingReuseStrategy());
+        ((DelegatingReuseStrategy) getReuseStrategy()).wrapper = this;
+    }
+    /**
+     * Declared {@code final} so that subclasses cannot alter the components of the wrapped
+     * analyzer; the call is passed straight through to the {@link AnalyzerWrapper}
+     * implementation.
+     *
+     * @param fieldName  name of the field being analyzed
+     * @param components components obtained from the wrapped analyzer
+     * @return the result of the {@link AnalyzerWrapper} implementation for these arguments
+     */
+    @Override
+    protected final TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+        return super.wrapComponents(fieldName, components);
+    }
+    /**
+     * Declared {@code final} so that subclasses cannot wrap the reader; the call is passed
+     * straight through to the {@link AnalyzerWrapper} implementation.
+     *
+     * @param fieldName name of the field being analyzed
+     * @param reader    reader supplied for that field
+     * @return the result of the {@link AnalyzerWrapper} implementation for these arguments
+     */
+    @Override
+    protected final Reader wrapReader(String fieldName, Reader reader) {
+        return super.wrapReader(fieldName, reader);
+    }
+    /**
+     * Reuse strategy that keeps no components of its own: both operations resolve the
+     * analyzer that {@link #wrapper} selects for the field and forward to that analyzer's
+     * own reuse strategy.
+     */
+    private static class DelegatingReuseStrategy extends ReuseStrategy {
+        /** Owning wrapper; assigned by the {@code SimpleAnalyzerWrapper} constructor, hence not final. */
+        AnalyzerWrapper wrapper;
+        /**
+         * Looks up reusable components through the wrapped analyzer's reuse strategy.
+         *
+         * @param analyzer  not used; the analyzer is resolved through {@link #wrapper} instead
+         * @param fieldName name of the field, used to pick the wrapped analyzer
+         * @return whatever the wrapped analyzer's reuse strategy returns for that analyzer and field
+         */
+        @Override
+        public TokenStreamComponents getReusableComponents(Analyzer analyzer, String fieldName) {
+            Analyzer wrappedAnalyzer = wrapper.getWrappedAnalyzer(fieldName);
+            return wrappedAnalyzer.getReuseStrategy().getReusableComponents(wrappedAnalyzer, fieldName);
+        }
+        /**
+         * Forwards the components to the reuse strategy of the wrapped analyzer.
+         *
+         * @param analyzer   not used; the analyzer is resolved through {@link #wrapper} instead
+         * @param fieldName  name of the field, used to pick the wrapped analyzer
+         * @param components components to hand to the wrapped analyzer's reuse strategy
+         */
+        @Override
+        public void setReusableComponents(Analyzer analyzer, String fieldName, TokenStreamComponents components) {
+            Analyzer wrappedAnalyzer = wrapper.getWrappedAnalyzer(fieldName);
+            wrappedAnalyzer.getReuseStrategy().setReusableComponents(wrappedAnalyzer, fieldName, components);
+        }
+    }
+}
--- a/src/main/java/org/elasticsearch/index/analysis/FieldNameAnalyzer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/FieldNameAnalyzer.java
@ -20,13 +20,13 @@
 package org.elasticsearch.index.analysis;

 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.SimpleAnalyzerWrapper;
 import org.elasticsearch.common.collect.ImmutableOpenMap;

 /**
 *
 */
-public final class FieldNameAnalyzer extends AnalyzerWrapper {
+public final class FieldNameAnalyzer extends SimpleAnalyzerWrapper {

    private final ImmutableOpenMap<String, Analyzer> analyzers;

@ -50,11 +50,6 @@ public final class FieldNameAnalyzer extends AnalyzerWrapper {
        return getAnalyzer(fieldName);
    }

-    @Override
-    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
-        return components;
-    }
-
    private Analyzer getAnalyzer(String name) {
        Analyzer analyzer = analyzers.get(name);
        if (analyzer != null) {
--- a/src/main/java/org/elasticsearch/index/analysis/NamedAnalyzer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/NamedAnalyzer.java
@ -20,13 +20,13 @@
 package org.elasticsearch.index.analysis;

 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CustomAnalyzerWrapper;
+import org.apache.lucene.analysis.SimpleAnalyzerWrapper;

 /**
 * Named analyzer is an analyzer wrapper around an actual analyzer ({@link #analyzer} that is associated
 * with a name ({@link #name()}.
 */
-public class NamedAnalyzer extends CustomAnalyzerWrapper {
+public class NamedAnalyzer extends SimpleAnalyzerWrapper {

    private final String name;
    private final AnalyzerScope scope;
@ -46,7 +46,6 @@ public class NamedAnalyzer extends CustomAnalyzerWrapper {
    }

    public NamedAnalyzer(String name, AnalyzerScope scope, Analyzer analyzer, int positionOffsetGap) {
-        super(analyzer.getReuseStrategy());
        this.name = name;
        this.scope = scope;
        this.analyzer = analyzer;
@ -79,11 +78,6 @@ public class NamedAnalyzer extends CustomAnalyzerWrapper {
        return this.analyzer;
    }

-    @Override
-    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
-        return components;
-    }
-
    @Override
    public int getPositionIncrementGap(String fieldName) {
        if (positionOffsetGap != Integer.MIN_VALUE) {
--- a/src/main/java/org/elasticsearch/index/mapper/MapperService.java
+++ b/src/main/java/org/elasticsearch/index/mapper/MapperService.java
@ -24,6 +24,7 @@ import com.google.common.base.Charsets;
 import com.google.common.collect.*;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.SimpleAnalyzerWrapper;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.FilterClause;
 import org.apache.lucene.queries.TermFilter;
@ -976,7 +977,7 @@ public class MapperService extends AbstractIndexComponent implements Iterable<Do
        }
    }

-    final class SmartIndexNameSearchAnalyzer extends AnalyzerWrapper {
+    final class SmartIndexNameSearchAnalyzer extends SimpleAnalyzerWrapper {

        private final Analyzer defaultAnalyzer;

@ -1005,14 +1006,9 @@ public class MapperService extends AbstractIndexComponent implements Iterable<Do
            }
            return defaultAnalyzer;
        }
-
-        @Override
-        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
-            return components;
-        }
    }

-    final class SmartIndexNameSearchQuoteAnalyzer extends AnalyzerWrapper {
+    final class SmartIndexNameSearchQuoteAnalyzer extends SimpleAnalyzerWrapper {

        private final Analyzer defaultAnalyzer;

@ -1041,11 +1037,6 @@ public class MapperService extends AbstractIndexComponent implements Iterable<Do
            }
            return defaultAnalyzer;
        }
-
-        @Override
-        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
-            return components;
-        }
    }

    class InternalFieldMapperListener extends FieldMapperListener {