From 96150badce8234cac00a23c2d5da55545e0be958 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 5 Sep 2017 18:34:05 +0200
Subject: [PATCH] LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.

---
 lucene/CHANGES.txt                                 |   3 +
 .../icu/ICUNormalizer2CharFilter.java              |  35 +++--
 .../icu/TestICUNormalizer2CharFilter.java          |  21 +++
 .../org/apache/lucene/search/DisiWrapper.java      |  12 ++
 .../apache/lucene/search/TermInSetQuery.java       | 143 ++++--------------
 5 files changed, 89 insertions(+), 125 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a819916f275..02b22315940 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -195,6 +195,9 @@ Bug Fixes
 * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even
   if possible). (Dawid Weiss)
 
+* LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
+  (Adrien Grand)
+
 Improvements
 
 * LUCENE-7489: Better storage of sparse doc-values fields with the default
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
index 706550a0f71..c529f74ed08 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.Objects;
 
+import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
 
 import com.ibm.icu.text.Normalizer2;
@@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
     super(in);
     this.normalizer = Objects.requireNonNull(normalizer);
-    this.tmpBuffer = new char[bufferSize];
+    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
   }
 
   @Override
@@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
     return -1;
   }
 
-  private final char[] tmpBuffer;
+  private final CharacterUtils.CharacterBuffer tmpBuffer;
 
-  private int readInputToBuffer() throws IOException {
-    final int len = input.read(tmpBuffer);
-    if (len == -1) {
-      inputFinished = true;
-      return 0;
+  private void readInputToBuffer() throws IOException {
+    while (true) {
+      // CharacterUtils.fill is supplementary char aware
+      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
+
+      assert tmpBuffer.getOffset() == 0;
+      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
+      if (hasRemainingChars == false) {
+        inputFinished = true;
+        break;
+      }
+
+      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
+      if (normalizer.isInert(lastCodePoint)) {
+        // we require an inert char so that we can normalize content before and
+        // after this character independently
+        break;
+      }
+    }
     }
-    inputBuffer.append(tmpBuffer, 0, len);
 
     // if checkedInputBoundary was at the end of a buffer, we need to check that char again
     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
-    // this loop depends on 'isInert' (changes under normalization) but looks only at characters.
-    // so we treat all surrogates as non-inert for simplicity
-    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isSurrogate(tmpBuffer[len-1])) {
-      return len;
-    } else return len + readInputToBuffer();
   }
 
   private int readAndNormalizeFromInput() {
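The new readInputToBuffer() above replaces unbounded recursion with a loop: chunks are appended to inputBuffer until the buffered text ends on an inert code point (or the input is exhausted), so a long run of non-inert characters, like the one in the test added below, can no longer grow the call stack. A minimal standalone sketch of that buffering pattern follows; the class and method names are made up, it is not the Lucene code, it reads with a plain Reader.read(char[]) and skips the surrogate-pair handling that CharacterUtils.fill provides in the actual change.

import java.io.IOException;
import java.io.Reader;

import com.ibm.icu.text.Normalizer2;

// Hypothetical helper, for illustration only.
class InertBoundaryBuffer {
  private final StringBuilder pending = new StringBuilder();
  private final char[] chunk = new char[128];

  /**
   * Appends input to 'pending' until the buffered text ends on an inert code
   * point, so that everything buffered so far can be normalized independently
   * of whatever follows. Returns false once the reader is exhausted.
   */
  boolean fillToInertBoundary(Reader in, Normalizer2 normalizer) throws IOException {
    while (true) { // a loop rather than recursion: no stack growth on long non-inert runs
      final int len = in.read(chunk);
      if (len == -1) {
        return false; // input finished
      }
      pending.append(chunk, 0, len);
      // NOTE: unlike CharacterUtils.fill, Reader.read may split a surrogate pair at the
      // chunk boundary; the real patch avoids that by filling whole code points.
      final int lastCodePoint = Character.codePointBefore(pending, pending.length());
      if (normalizer.isInert(lastCodePoint)) {
        return true; // safe boundary reached
      }
    }
  }
}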
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
index 438a93179c2..822466f0192 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.util.TestUtil;
 
@@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
     }
     a.close();
   }
+
+  // https://issues.apache.org/jira/browse/LUCENE-7956
+  public void testVeryLargeInputOfNonInertChars() throws Exception {
+    char[] text = new char[1000000];
+    Arrays.fill(text, 'a');
+    try (Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        return new TokenStreamComponents(new KeywordTokenizer());
+      }
+
+      @Override
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+      }
+    }) {
+      checkAnalysisConsistency(random(), a, false, new String(text));
+    }
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
index f2543409f1d..28ba989be62 100644
--- a/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java
@@ -60,6 +60,18 @@ public class DisiWrapper {
     }
   }
 
+  // For TermInSetQuery
+  public DisiWrapper(DocIdSetIterator iterator) {
+    this.scorer = null;
+    this.spans = null;
+    this.iterator = iterator;
+    this.cost = iterator.cost();
+    this.doc = -1;
+    this.twoPhaseView = null;
+    this.approximation = iterator;
+    this.matchCost = 0f;
+  }
+
   public DisiWrapper(Spans spans) {
     this.scorer = null;
     this.spans = spans;
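The DisiWrapper(DocIdSetIterator) constructor added above exists so that plain iterators can be placed into a DisiPriorityQueue, which is exactly what the TermInSetQuery change below does. A rough usage sketch, assuming the caller lives in the org.apache.lucene.search package (several of these helpers are package-private) and using made-up names (IteratorDisjunction, union):

package org.apache.lucene.search;

import java.util.List;

// Hypothetical helper, for illustration only.
final class IteratorDisjunction {

  /** Builds a single iterator over the union of the given iterators. */
  static DocIdSetIterator union(List<DocIdSetIterator> iterators) {
    DisiPriorityQueue queue = new DisiPriorityQueue(iterators.size());
    for (DocIdSetIterator it : iterators) {
      // the constructor added by this patch; cost() is taken from the wrapped iterator
      queue.add(new DisiWrapper(it));
    }
    return new DisjunctionDISIApproximation(queue);
  }
}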
diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
index 9b64d379174..5a6676fca90 100644
--- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java
@@ -17,12 +17,9 @@ package org.apache.lucene.search;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
-import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
 
@@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.PrefixCodedTerms;
 import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause.Occur;
@@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
 
 /**
@@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements Accountable {
     return Collections.emptyList();
   }
 
-  private static class TermAndState {
-    final String field;
-    final TermsEnum termsEnum;
-    final BytesRef term;
-    final TermState state;
-    final int docFreq;
-    final long totalTermFreq;
-
-    TermAndState(String field, TermsEnum termsEnum) throws IOException {
-      this.field = field;
-      this.termsEnum = termsEnum;
-      this.term = BytesRef.deepCopyOf(termsEnum.term());
-      this.state = termsEnum.termState();
-      this.docFreq = termsEnum.docFreq();
-      this.totalTermFreq = termsEnum.totalTermFreq();
-    }
-  }
-
-  private static class WeightOrDocIdSet {
-    final Weight weight;
-    final DocIdSet set;
-
-    WeightOrDocIdSet(Weight weight) {
-      this.weight = Objects.requireNonNull(weight);
-      this.set = null;
-    }
-
-    WeightOrDocIdSet(DocIdSet bitset) {
-      this.set = bitset;
-      this.weight = null;
-    }
-  }
-
   @Override
   public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
     return new ConstantScoreWeight(this, boost) {
@@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements Accountable {
         // order to protect highlighters
       }
 
-      /**
-       * On the given leaf context, try to either rewrite to a disjunction if
-       * there are few matching terms, or build a bitset containing matching docs.
-       */
-      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
         final LeafReader reader = context.reader();
 
         Terms terms = reader.terms(field);
@@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements Accountable {
         PostingsEnum docs = null;
         TermIterator iterator = termData.iterator();
 
-        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
-        // if there are two many terms, we will fall back to building the 'builder'
-        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
-        assert termData.size() > threshold : "Query should have been rewritten";
-        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
-        DocIdSetBuilder builder = null;
+        // Here we partition postings based on cost: longer ones will be consumed
+        // lazily while shorter ones are consumed eagerly into a bitset. Compared to
+        // putting everything into a bitset, this should help skip over unnecessary doc
+        // ids in the longer postings lists. This should be especially useful if
+        // document frequencies have a zipfian distribution.
+        final PriorityQueue<PostingsEnum> longestPostingsLists = new PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
+          @Override
+          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
+            return a.cost() < b.cost();
+          }
+        };
+        DocIdSetBuilder shortestPostingsLists = null;
         for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
           assert field.equals(iterator.field());
           if (termsEnum.seekExact(term)) {
-            if (matchingTerms == null) {
-              docs = termsEnum.postings(docs, PostingsEnum.NONE);
-              builder.add(docs);
-            } else if (matchingTerms.size() < threshold) {
-              matchingTerms.add(new TermAndState(field, termsEnum));
-            } else {
-              assert matchingTerms.size() == threshold;
-              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
-              docs = termsEnum.postings(docs, PostingsEnum.NONE);
-              builder.add(docs);
-              for (TermAndState t : matchingTerms) {
-                t.termsEnum.seekExact(t.term, t.state);
-                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
-                builder.add(docs);
+            docs = termsEnum.postings(docs, PostingsEnum.NONE);
+            docs = longestPostingsLists.insertWithOverflow(docs);
+            if (docs != null) { // the pq is full
+              if (shortestPostingsLists == null) {
+                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
               }
-              matchingTerms = null;
+              shortestPostingsLists.add(docs);
             }
           }
         }
-        if (matchingTerms != null) {
-          assert builder == null;
-          BooleanQuery.Builder bq = new BooleanQuery.Builder();
-          for (TermAndState t : matchingTerms) {
-            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
-            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
-            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
-          }
-          Query q = new ConstantScoreQuery(bq.build());
-          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
-          return new WeightOrDocIdSet(weight);
-        } else {
-          assert builder != null;
-          return new WeightOrDocIdSet(builder.build());
-        }
-      }
-      private Scorer scorer(DocIdSet set) throws IOException {
-        if (set == null) {
+        final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 0 : 1);
+        if (numClauses == 0) {
           return null;
         }
-        final DocIdSetIterator disi = set.iterator();
-        if (disi == null) {
-          return null;
-        }
-        return new ConstantScoreScorer(this, score(), disi);
-      }
-      @Override
-      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
-          return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.bulkScorer(context);
-        } else {
-          final Scorer scorer = scorer(weightOrBitSet.set);
-          if (scorer == null) {
-            return null;
-          }
-          return new DefaultBulkScorer(scorer);
+        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
+        for (PostingsEnum postings : longestPostingsLists) {
+          queue.add(new DisiWrapper(postings));
         }
-      }
-
-      @Override
-      public Scorer scorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
-          return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.scorer(context);
-        } else {
-          return scorer(weightOrBitSet.set);
+        if (shortestPostingsLists != null) {
+          queue.add(new DisiWrapper(shortestPostingsLists.build().iterator()));
         }
+        final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue);
+        return new ConstantScoreScorer(this, boost, disi);
       }
     };
   }
 }
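The new scorer above keeps only the BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD most expensive postings as lazy iterators and collapses everything cheaper into a single bitset. The key building block is org.apache.lucene.util.PriorityQueue.insertWithOverflow: once the queue is full it returns the evicted (cheapest) element, which can then be drained into a DocIdSetBuilder. A small sketch of that partitioning step on its own, with made-up names and plain DocIdSetIterators instead of PostingsEnums:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.PriorityQueue;

// Hypothetical helper, for illustration only.
final class CostPartition {
  final List<DocIdSetIterator> expensive = new ArrayList<>(); // consumed lazily
  final DocIdSetBuilder cheap;                                // consumed eagerly into a bitset

  CostPartition(int keepLazy, int maxDoc, Iterable<DocIdSetIterator> iterators) throws IOException {
    // min-heap on cost: the cheapest of the retained iterators sits on top
    PriorityQueue<DocIdSetIterator> pq = new PriorityQueue<DocIdSetIterator>(keepLazy) {
      @Override
      protected boolean lessThan(DocIdSetIterator a, DocIdSetIterator b) {
        return a.cost() < b.cost();
      }
    };
    cheap = new DocIdSetBuilder(maxDoc);
    for (DocIdSetIterator it : iterators) {
      // once the queue holds keepLazy elements, insertWithOverflow returns the evicted cheapest one
      DocIdSetIterator evicted = pq.insertWithOverflow(it);
      if (evicted != null) {
        cheap.add(evicted); // exhaust it into the bitset right away
      }
    }
    for (DocIdSetIterator it : pq) { // PriorityQueue is iterable, which the loop in the patch also relies on
      expensive.add(it);
    }
  }
}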