Security for _field_names field should not override field statistics (#33261)

In Lucene 8 the statistics for a field (doc_count, sum_doc_count, ...) are checked and invalid values (v < 0) are rejected. For the _field_names field, however, we hide the statistics of the field if security is enabled, since some terms (field names) may be filtered. These statistics are never used: this field is not used for ranking and cannot be used to generate term vectors. For these reasons, this commit restores the original statistics for the field in order to be compliant with Lucene 8.
2018-09-03 09:36:39 +02:00 · 2018-09-03 09:36:39 +02:00 · f0a61b6dec
parent 713c07e14d
commit f0a61b6dec
2 changed files with 159 additions and 151 deletions
--- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReader.java
+++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/security/authz/accesscontrol/FieldSubsetReader.java
@ -35,6 +35,7 @@ import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
 import org.elasticsearch.index.mapper.SourceFieldMapper;

 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
@ -70,7 +71,11 @@ public final class FieldSubsetReader extends FilterLeafReader {
            super(in, new FilterDirectoryReader.SubReaderWrapper() {
                @Override
                public LeafReader wrap(LeafReader reader) {
+                    try {
                        return new FieldSubsetReader(reader, filter);
+                    } catch (IOException e) {
+                        throw new UncheckedIOException(e);
+                    }
                }
            });
            this.filter = filter;
@ -109,11 +114,13 @@ public final class FieldSubsetReader extends FilterLeafReader {
    private final FieldInfos fieldInfos;
    /** An automaton that only accepts authorized fields. */
    private final CharacterRunAutomaton filter;
+    /** {@link Terms} cache with filtered stats for the {@link FieldNamesFieldMapper} field. */
+    private final Terms fieldNamesFilterTerms;

    /**
     * Wrap a single segment, exposing a subset of its fields.
     */
-    FieldSubsetReader(LeafReader in, CharacterRunAutomaton filter) {
+    FieldSubsetReader(LeafReader in, CharacterRunAutomaton filter) throws IOException {
        super(in);
        ArrayList<FieldInfo> filteredInfos = new ArrayList<>();
        for (FieldInfo fi : in.getFieldInfos()) {
@ -123,6 +130,8 @@ public final class FieldSubsetReader extends FilterLeafReader {
        }
        fieldInfos = new FieldInfos(filteredInfos.toArray(new FieldInfo[filteredInfos.size()]));
        this.filter = filter;
+        final Terms fieldNameTerms = super.terms(FieldNamesFieldMapper.NAME);
+        this.fieldNamesFilterTerms = fieldNameTerms == null ? null : new FieldNamesTerms(fieldNameTerms);
    }

    /** returns true if this field is allowed. */
@ -346,21 +355,14 @@ public final class FieldSubsetReader extends FilterLeafReader {
        }
    }

-    private Terms wrapTerms(Terms terms, String field) {
+    private Terms wrapTerms(Terms terms, String field) throws IOException {
        if (!hasField(field)) {
            return null;
        } else if (FieldNamesFieldMapper.NAME.equals(field)) {
            // for the _field_names field, fields for the document
            // are encoded as postings, where term is the field.
            // so we hide terms for fields we filter out.
-            if (terms != null) {
-                // check for null, in case term dictionary is not a ghostbuster
-                // So just because its in fieldinfos and "indexed=true" doesn't mean you can go grab a Terms for it.
-                // It just means at one point there was a document with that field indexed...
-                // The fields infos isn't updates/removed even if no docs refer to it
-                terms = new FieldNamesTerms(terms);
-            }
-            return terms;
+            return fieldNamesFilterTerms;
        } else {
            return terms;
        }
@ -371,9 +373,22 @@ public final class FieldSubsetReader extends FilterLeafReader {
     * representing fields that should not be visible in this reader.
     */
    class FieldNamesTerms extends FilterTerms {
+        final long size;
+        final long sumDocFreq;

-        FieldNamesTerms(Terms in) {
+        FieldNamesTerms(Terms in) throws IOException {
            super(in);
+            assert in.hasFreqs() == false;
+            // re-compute the stats for the field to take
+            // into account the filtered terms.
+            final TermsEnum e = iterator();
+            long size = 0, sumDocFreq = 0;
+            while (e.next() != null) {
+                size ++;
+                sumDocFreq += e.docFreq();
+            }
+            this.size = size;
+            this.sumDocFreq = sumDocFreq;
        }

        @Override
@ -381,27 +396,20 @@ public final class FieldSubsetReader extends FilterLeafReader {
            return new FieldNamesTermsEnum(in.iterator());
        }

-        // we don't support field statistics (since we filter out terms)
-        // but this isn't really a big deal: _field_names is not used for ranking.
-
        @Override
-        public int getDocCount() throws IOException {
-            return -1;
+        public long size() throws IOException {
+            return size;
        }

        @Override
        public long getSumDocFreq() throws IOException {
-            return -1;
+            return sumDocFreq;
        }

        @Override
-        public long getSumTotalTermFreq() throws IOException {
-            return -1;
-        }
-
-        @Override
-        public long size() throws IOException {
-            return -1;
+        public int getDocCount() throws IOException {
+            // it is costly to recompute this value so we assume that docCount == maxDoc.
+            return maxDoc();
        }
    }