From 96150badce8234cac00a23c2d5da55545e0be958 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 5 Sep 2017 18:34:05 +0200
Subject: [PATCH] LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.

---
 lucene/CHANGES.txt                                 |   3 +
 .../icu/ICUNormalizer2CharFilter.java              |  35 +++--
 .../icu/TestICUNormalizer2CharFilter.java          |  21 +++
 .../org/apache/lucene/search/DisiWrapper.java      |  12 ++
 .../apache/lucene/search/TermInSetQuery.java       | 143 ++++--------------
 5 files changed, 89 insertions(+), 125 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a819916f275..02b22315940 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -195,6 +195,9 @@ Bug Fixes
 * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even
   if possible). (Dawid Weiss)
 
+* LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
+  (Adrien Grand)
+
 Improvements
 
 * LUCENE-7489: Better storage of sparse doc-values fields with the default
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
index 706550a0f71..c529f74ed08 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.Objects;
 
+import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
 
 import com.ibm.icu.text.Normalizer2;
@@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
     super(in);
     this.normalizer = Objects.requireNonNull(normalizer);
-    this.tmpBuffer = new char[bufferSize];
+    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
   }
 
   @Override
@@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
     return -1;
   }
 
-  private final char[] tmpBuffer;
+  private final CharacterUtils.CharacterBuffer tmpBuffer;
 
-  private int readInputToBuffer() throws IOException {
-    final int len = input.read(tmpBuffer);
-    if (len == -1) {
-      inputFinished = true;
-      return 0;
+  private void readInputToBuffer() throws IOException {
+    while (true) {
+      // CharacterUtils.fill is supplementary char aware
+      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
+
+      assert tmpBuffer.getOffset() == 0;
+      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
+      if (hasRemainingChars == false) {
+        inputFinished = true;
+        break;
+      }
+
+      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
+      if (normalizer.isInert(lastCodePoint)) {
+        // we require an inert char so that we can normalize content before and
+        // after this character independently
+        break;
+      }
+    }
     }
-    inputBuffer.append(tmpBuffer, 0, len);
 
     // if checkedInputBoundary was at the end of a buffer, we need to check that char again
     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
-    // this loop depends on 'isInert' (changes under normalization) but looks only at characters.
-    // so we treat all surrogates as non-inert for simplicity
-    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isSurrogate(tmpBuffer[len-1])) {
-      return len;
-    } else return len + readInputToBuffer();
   }
 
   private int readAndNormalizeFromInput() {
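The new readInputToBuffer() above replaces unbounded recursion with a loop: chunks are appended to inputBuffer until the buffered text ends on an inert code point (or the input is exhausted), so a long run of non-inert characters, like the one in the test added below, can no longer grow the call stack. A minimal standalone sketch of that buffering pattern follows; the class and method names are made up, it is not the Lucene code, it reads with a plain Reader.read(char[]) and skips the surrogate-pair handling that CharacterUtils.fill provides in the actual change.

import java.io.IOException;
import java.io.Reader;

import com.ibm.icu.text.Normalizer2;

// Hypothetical helper, for illustration only.
class InertBoundaryBuffer {
  private final StringBuilder pending = new StringBuilder();
  private final char[] chunk = new char[128];

  /**
   * Appends input to 'pending' until the buffered text ends on an inert code
   * point, so that everything buffered so far can be normalized independently
   * of whatever follows. Returns false once the reader is exhausted.
   */
  boolean fillToInertBoundary(Reader in, Normalizer2 normalizer) throws IOException {
    while (true) { // a loop rather than recursion: no stack growth on long non-inert runs
      final int len = in.read(chunk);
      if (len == -1) {
        return false; // input finished
      }
      pending.append(chunk, 0, len);
      // NOTE: unlike CharacterUtils.fill, Reader.read may split a surrogate pair at the
      // chunk boundary; the real patch avoids that by filling whole code points.
      final int lastCodePoint = Character.codePointBefore(pending, pending.length());
      if (normalizer.isInert(lastCodePoint)) {
        return true; // safe boundary reached
      }
    }
  }
}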
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
index 438a93179c2..822466f0192 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.util.TestUtil;
 
@@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
     }
     a.close();
   }
+
+  // https://issues.apache.org/jira/browse/LUCENE-7956
+  public void testVeryLargeInputOfNonInertChars() throws Exception {
+    char[] text = new char[1000000];
+    Arrays.fill(text, 'a');
+    try (Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        return new TokenStreamComponents(new KeywordTokenizer());
+      }
+
+      @Override
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+      }
+    }) {
+      checkAnalysisConsistency(random(), a, false, new String(text));
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
index f2543409f1d..28ba989be62 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
@@ -60,6 +60,18 @@ public class DisiWrapper {
     }
   }
 
+  // For TermInSetQuery
+  public DisiWrapper(DocIdSetIterator iterator) {
+    this.scorer = null;
+    this.spans = null;
+    this.iterator = iterator;
+    this.cost = iterator.cost();
+    this.doc = -1;
+    this.twoPhaseView = null;
+    this.approximation = iterator;
+    this.matchCost = 0f;
+  }
+
   public DisiWrapper(Spans spans) {
     this.scorer = null;
     this.spans = spans;
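The DisiWrapper(DocIdSetIterator) constructor added above exists so that plain iterators can be placed into a DisiPriorityQueue, which is exactly what the TermInSetQuery change below does. A rough usage sketch, assuming the caller lives in the org.apache.lucene.search package (several of these helpers are package-private) and using made-up names (IteratorDisjunction, union):

package org.apache.lucene.search;

import java.util.List;

// Hypothetical helper, for illustration only.
final class IteratorDisjunction {

  /** Builds a single iterator over the union of the given iterators. */
  static DocIdSetIterator union(List<DocIdSetIterator> iterators) {
    DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
    for (DocIdSetIterator it : iterators) {
      // the constructor added by this patch; cost() is taken from the wrapped iterator
      queue.add(new DisiWrapper(it));
    }
    return new DisjunctionDISIApproximation(queue);
  }
}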
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 9b64d379174..5a6676fca90 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -17,12 +17,9 @@ package org.apache.lucene.search;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
-import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
 
@@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.PrefixCodedTerms;
 import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause.Occur;
@@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
 
 /**
@@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements Accountable {
     return Collections.emptyList();
   }
 
-  private static class TermAndState {
-    final String field;
-    final TermsEnum termsEnum;
-    final BytesRef term;
-    final TermState state;
-    final int docFreq;
-    final long totalTermFreq;
-
-    TermAndState(String field, TermsEnum termsEnum) throws IOException {
-      this.field = field;
-      this.termsEnum = termsEnum;
-      this.term = BytesRef.deepCopyOf(termsEnum.term());
-      this.state = termsEnum.termState();
-      this.docFreq = termsEnum.docFreq();
-      this.totalTermFreq = termsEnum.totalTermFreq();
-    }
-  }
-
-  private static class WeightOrDocIdSet {
-    final Weight weight;
-    final DocIdSet set;
-
-    WeightOrDocIdSet(Weight weight) {
-      this.weight = Objects.requireNonNull(weight);
-      this.set = null;
-    }
-
-    WeightOrDocIdSet(DocIdSet bitset) {
-      this.set = bitset;
-      this.weight = null;
-    }
-  }
-
   @Override
   public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
     return new ConstantScoreWeight(this, boost) {
@@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements Accountable {
         // order to protect highlighters
       }
 
-      /**
-       * On the given leaf context, try to either rewrite to a disjunction if
-       * there are few matching terms, or build a bitset containing matching docs.
-       */
-      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
         final LeafReader reader = context.reader();
 
         Terms terms = reader.terms(field);
@@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements Accountable {
         PostingsEnum docs = null;
         TermIterator iterator = termData.iterator();
 
-        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
-        // if there are two many terms, we will fall back to building the 'builder'
-        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
-        assert termData.size() > threshold : "Query should have been rewritten";
-        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
-        DocIdSetBuilder builder = null;
+        // Here we partition postings based on cost: longer ones will be consumed
+        // lazily while shorter ones are consumed eagerly into a bitset. Compared to
+        // putting everything into a bitset, this should help skip over unnecessary doc
+        // ids in the longer postings lists. This should be especially useful if
+        // document frequencies have a zipfian distribution.
+        final PriorityQueue<PostingsEnum> longestPostingsLists = new PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
+          @Override
+          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
+            return a.cost() < b.cost();
+          }
+        };
+        DocIdSetBuilder shortestPostingsLists = null;
         for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
           assert field.equals(iterator.field());
           if (termsEnum.seekExact(term)) {
-            if (matchingTerms == null) {
-              docs = termsEnum.postings(docs, PostingsEnum.NONE);
-              builder.add(docs);
-            } else if (matchingTerms.size() < threshold) {
-              matchingTerms.add(new TermAndState(field, termsEnum));
-            } else {
-              assert matchingTerms.size() == threshold;
-              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
-              docs = termsEnum.postings(docs, PostingsEnum.NONE);
-              builder.add(docs);
-              for (TermAndState t : matchingTerms) {
-                t.termsEnum.seekExact(t.term, t.state);
-                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
-                builder.add(docs);
+            docs = termsEnum.postings(docs, PostingsEnum.NONE);
+            docs = longestPostingsLists.insertWithOverflow(docs);
+            if (docs != null) { // the pq is full
+              if (shortestPostingsLists == null) {
+                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
               }
-              matchingTerms = null;
+              shortestPostingsLists.add(docs);
             }
           }
         }
-        if (matchingTerms != null) {
-          assert builder == null;
-          BooleanQuery.Builder bq = new BooleanQuery.Builder();
-          for (TermAndState t : matchingTerms) {
-            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
-            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
-            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
-          }
-          Query q = new ConstantScoreQuery(bq.build());
-          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
-          return new WeightOrDocIdSet(weight);
-        } else {
-          assert builder != null;
-          return new WeightOrDocIdSet(builder.build());
-        }
-      }
-      private Scorer scorer(DocIdSet set) throws IOException {
-        if (set == null) {
+        final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 0 : 1);
+        if (numClauses == 0) {
           return null;
         }
-        final DocIdSetIterator disi = set.iterator();
-        if (disi == null) {
-          return null;
-        }
-        return new ConstantScoreScorer(this, score(), disi);
-      }
-      @Override
-      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
-          return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.bulkScorer(context);
-        } else {
-          final Scorer scorer = scorer(weightOrBitSet.set);
-          if (scorer == null) {
-            return null;
-          }
-          return new DefaultBulkScorer(scorer);
+        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
+        for (PostingsEnum postings : longestPostingsLists) {
+          queue.add(new DisiWrapper(postings));
         }
-      }
-
-      @Override
-      public Scorer scorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
-          return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.scorer(context);
-        } else {
-          return scorer(weightOrBitSet.set);
+        if (shortestPostingsLists != null) {
+          queue.add(new DisiWrapper(shortestPostingsLists.build().iterator()));
         }
+        final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue);
+        return new ConstantScoreScorer(this, boost, disi);
       }
     };
   }
 }
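The new scorer above keeps only the BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD most expensive postings as lazy iterators and collapses everything cheaper into a single bitset. The key building block is org.apache.lucene.util.PriorityQueue.insertWithOverflow: once the queue is full it returns the evicted (cheapest) element, which can then be drained into a DocIdSetBuilder. A small sketch of that partitioning step on its own, with made-up names and plain DocIdSetIterators instead of PostingsEnums:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.PriorityQueue;

// Hypothetical helper, for illustration only.
final class CostPartition {
  final List<DocIdSetIterator> expensive = new ArrayList<>(); // consumed lazily
  final DocIdSetBuilder cheap;                                // consumed eagerly into a bitset

  CostPartition(int keepLazy, int maxDoc, Iterable<DocIdSetIterator> iterators) throws IOException {
    // min-heap on cost: the cheapest of the retained iterators sits on top
    PriorityQueue<DocIdSetIterator> pq = new PriorityQueue<DocIdSetIterator>(keepLazy) {
      @Override
      protected boolean lessThan(DocIdSetIterator a, DocIdSetIterator b) {
        return a.cost() < b.cost();
      }
    };
    cheap = new DocIdSetBuilder(maxDoc);
    for (DocIdSetIterator it : iterators) {
      // once the queue holds keepLazy elements, insertWithOverflow returns the evicted cheapest one
      DocIdSetIterator evicted = pq.insertWithOverflow(it);
      if (evicted != null) {
        cheap.add(evicted); // exhaust it into the bitset right away
      }
    }
    for (DocIdSetIterator it : pq) { // PriorityQueue is iterable, which the loop in the patch also relies on
      expensive.add(it);
    }
  }
}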