mirror of https://github.com/apache/lucene.git
LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
This commit is contained in:
parent 5a8eb5388d
commit 96150badce
CHANGES.txt

@@ -195,6 +195,9 @@ Bug Fixes
 * LUCENE-7864: IndexMergeTool is not using intermediate hard links (even
   if possible). (Dawid Weiss)
 
+* LUCENE-7956: Fixed potential stack overflow error in ICUNormalizer2CharFilter.
+  (Adrien Grand)
+
 Improvements
 
 * LUCENE-7489: Better storage of sparse doc-values fields with the default
ICUNormalizer2CharFilter.java

@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.Objects;
 
+import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;
 
 import com.ibm.icu.text.Normalizer2;
@@ -61,7 +62,7 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
   ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
     super(in);
     this.normalizer = Objects.requireNonNull(normalizer);
-    this.tmpBuffer = new char[bufferSize];
+    this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
   }
 
   @Override
@@ -94,23 +95,31 @@ public final class ICUNormalizer2CharFilter extends BaseCharFilter {
     return -1;
   }
 
-  private final char[] tmpBuffer;
+  private final CharacterUtils.CharacterBuffer tmpBuffer;
 
-  private int readInputToBuffer() throws IOException {
-    final int len = input.read(tmpBuffer);
-    if (len == -1) {
-      inputFinished = true;
-      return 0;
-    }
-    inputBuffer.append(tmpBuffer, 0, len);
+  private void readInputToBuffer() throws IOException {
+    while (true) {
+      // CharacterUtils.fill is supplementary char aware
+      final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);
+
+      assert tmpBuffer.getOffset() == 0;
+      inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
+
+      if (hasRemainingChars == false) {
+        inputFinished = true;
+        break;
+      }
+
+      final int lastCodePoint = Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength());
+      if (normalizer.isInert(lastCodePoint)) {
+        // we require an inert char so that we can normalize content before and
+        // after this character independently
+        break;
+      }
+    }
+
     // if checkedInputBoundary was at the end of a buffer, we need to check that char again
     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
-    // this loop depends on 'isInert' (changes under normalization) but looks only at characters.
-    // so we treat all surrogates as non-inert for simplicity
-    if (normalizer.isInert(tmpBuffer[len - 1]) && !Character.isSurrogate(tmpBuffer[len-1])) {
-      return len;
-    } else return len + readInputToBuffer();
   }
 
   private int readAndNormalizeFromInput() {
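Why the change above removes the stack overflow risk: the old readInputToBuffer() ended in "return len + readInputToBuffer();", recursing once per buffer refill whenever the last character read was not inert under the normalizer, so a long enough run of non-inert characters could exhaust the call stack. The new version performs the same refills inside a while (true) loop and breaks once the input is exhausted or an inert code point is seen, keeping stack depth constant. Below is a minimal, self-contained sketch of that recursion-to-loop pattern; it is illustrative only (plain Reader chunking, none of the normalizer logic) and the class and method names are made up.

// Illustrative sketch of the recursion-to-loop refactoring (not code from this patch).
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

public class RecursionToLoopSketch {

  // Old shape: one stack frame per chunk read, so recursion depth grows with input size.
  static int readAllRecursive(Reader in, char[] buf) throws IOException {
    final int len = in.read(buf);
    if (len == -1) {
      return 0;
    }
    return len + readAllRecursive(in, buf); // depth ~ inputLength / buf.length
  }

  // New shape: same result, constant stack depth.
  static int readAllIterative(Reader in, char[] buf) throws IOException {
    int total = 0;
    while (true) {
      final int len = in.read(buf);
      if (len == -1) {
        return total;
      }
      total += len;
    }
  }

  public static void main(String[] args) throws IOException {
    char[] text = new char[1000000];
    Arrays.fill(text, 'a');
    char[] buf = new char[128];
    // readAllRecursive would need about 7,800 nested calls here; with smaller buffers
    // or larger inputs this kind of recursion can end in StackOverflowError.
    System.out.println(readAllIterative(new StringReader(new String(text)), buf)); // prints 1000000
  }
}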
TestICUNormalizer2CharFilter.java

@@ -20,12 +20,14 @@ package org.apache.lucene.analysis.icu;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.util.TestUtil;
 
@@ -418,4 +420,23 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
     }
     a.close();
   }
+
+  // https://issues.apache.org/jira/browse/LUCENE-7956
+  public void testVeryLargeInputOfNonInertChars() throws Exception {
+    char[] text = new char[1000000];
+    Arrays.fill(text, 'a');
+    try (Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        return new TokenStreamComponents(new KeywordTokenizer());
+      }
+
+      @Override
+      protected Reader initReader(String fieldName, Reader reader) {
+        return new ICUNormalizer2CharFilter(reader, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
+      }
+    }) {
+      checkAnalysisConsistency(random(), a, false, new String(text));
+    }
+  }
 }
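The new test exercises the char filter the way it is normally wired, through Analyzer.initReader(). For reference, a minimal standalone usage sketch is shown below; it assumes the Lucene ICU analysis module and ICU4J are on the classpath, the sample input and expected output are illustrative, and the class name is made up.

// Minimal standalone usage sketch (assumptions as stated in the lead-in above).
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;

import com.ibm.icu.text.Normalizer2;

public class NormalizeReaderSketch {
  public static void main(String[] args) throws Exception {
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    // Wrap any Reader; normalization happens incrementally as characters are read.
    try (Reader reader = new ICUNormalizer2CharFilter(new StringReader("Ⅻ ＡＢＣ"), normalizer)) {
      StringBuilder sb = new StringBuilder();
      int c;
      while ((c = reader.read()) != -1) {
        sb.append((char) c);
      }
      System.out.println(sb); // expected roughly "xii abc" after NFKC normalization and case folding
    }
  }
}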
DisiWrapper.java

@@ -60,6 +60,18 @@ public class DisiWrapper {
     }
   }
 
+  // For TermInSetQuery
+  public DisiWrapper(DocIdSetIterator iterator) {
+    this.scorer = null;
+    this.spans = null;
+    this.iterator = iterator;
+    this.cost = iterator.cost();
+    this.doc = -1;
+    this.twoPhaseView = null;
+    this.approximation = iterator;
+    this.matchCost = 0f;
+  }
+
   public DisiWrapper(Spans spans) {
     this.scorer = null;
     this.spans = spans;
TermInSetQuery.java

@@ -17,12 +17,9 @@
 package org.apache.lucene.search;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.List;
-import java.util.Objects;
 import java.util.Set;
 import java.util.SortedSet;
 
@@ -33,8 +30,6 @@ import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.PrefixCodedTerms;
 import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
-import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause.Occur;
@@ -43,6 +38,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
 
 /**
@@ -171,39 +167,6 @@ public class TermInSetQuery extends Query implements Accountable {
     return Collections.emptyList();
   }
 
-  private static class TermAndState {
-    final String field;
-    final TermsEnum termsEnum;
-    final BytesRef term;
-    final TermState state;
-    final int docFreq;
-    final long totalTermFreq;
-
-    TermAndState(String field, TermsEnum termsEnum) throws IOException {
-      this.field = field;
-      this.termsEnum = termsEnum;
-      this.term = BytesRef.deepCopyOf(termsEnum.term());
-      this.state = termsEnum.termState();
-      this.docFreq = termsEnum.docFreq();
-      this.totalTermFreq = termsEnum.totalTermFreq();
-    }
-  }
-
-  private static class WeightOrDocIdSet {
-    final Weight weight;
-    final DocIdSet set;
-
-    WeightOrDocIdSet(Weight weight) {
-      this.weight = Objects.requireNonNull(weight);
-      this.set = null;
-    }
-
-    WeightOrDocIdSet(DocIdSet bitset) {
-      this.set = bitset;
-      this.weight = null;
-    }
-  }
-
   @Override
   public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
     return new ConstantScoreWeight(this, boost) {
@@ -216,11 +179,8 @@ public class TermInSetQuery extends Query implements Accountable {
         // order to protect highlighters
       }
 
-      /**
-       * On the given leaf context, try to either rewrite to a disjunction if
-       * there are few matching terms, or build a bitset containing matching docs.
-       */
-      private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
         final LeafReader reader = context.reader();
 
         Terms terms = reader.terms(field);
@@ -231,90 +191,49 @@ public class TermInSetQuery extends Query implements Accountable {
         PostingsEnum docs = null;
         TermIterator iterator = termData.iterator();
 
-        // We will first try to collect up to 'threshold' terms into 'matchingTerms'
-        // if there are two many terms, we will fall back to building the 'builder'
-        final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
-        assert termData.size() > threshold : "Query should have been rewritten";
-        List<TermAndState> matchingTerms = new ArrayList<>(threshold);
-        DocIdSetBuilder builder = null;
+        // Here we partition postings based on cost: longer ones will be consumed
+        // lazily while shorter ones are consumed eagerly into a bitset. Compared to
+        // putting everything into a bitset, this should help skip over unnecessary doc
+        // ids in the longer postings lists. This should be especially useful if
+        // document frequencies have a zipfian distribution.
+        final PriorityQueue<PostingsEnum> longestPostingsLists = new PriorityQueue<PostingsEnum>(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD) {
+          @Override
+          protected boolean lessThan(PostingsEnum a, PostingsEnum b) {
+            return a.cost() < b.cost();
+          }
+        };
+        DocIdSetBuilder shortestPostingsLists = null;
 
         for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
           assert field.equals(iterator.field());
           if (termsEnum.seekExact(term)) {
-            if (matchingTerms == null) {
-              docs = termsEnum.postings(docs, PostingsEnum.NONE);
-              builder.add(docs);
-            } else if (matchingTerms.size() < threshold) {
-              matchingTerms.add(new TermAndState(field, termsEnum));
-            } else {
-              assert matchingTerms.size() == threshold;
-              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
-              docs = termsEnum.postings(docs, PostingsEnum.NONE);
-              builder.add(docs);
-              for (TermAndState t : matchingTerms) {
-                t.termsEnum.seekExact(t.term, t.state);
-                docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
-                builder.add(docs);
-              }
-              matchingTerms = null;
+            docs = termsEnum.postings(docs, PostingsEnum.NONE);
+            docs = longestPostingsLists.insertWithOverflow(docs);
+            if (docs != null) { // the pq is full
+              if (shortestPostingsLists == null) {
+                shortestPostingsLists = new DocIdSetBuilder(reader.maxDoc());
+              }
+              shortestPostingsLists.add(docs);
             }
           }
         }
-        if (matchingTerms != null) {
-          assert builder == null;
-          BooleanQuery.Builder bq = new BooleanQuery.Builder();
-          for (TermAndState t : matchingTerms) {
-            final TermContext termContext = new TermContext(searcher.getTopReaderContext());
-            termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
-            bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
-          }
-          Query q = new ConstantScoreQuery(bq.build());
-          final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
-          return new WeightOrDocIdSet(weight);
-        } else {
-          assert builder != null;
-          return new WeightOrDocIdSet(builder.build());
-        }
-      }
-
-      private Scorer scorer(DocIdSet set) throws IOException {
-        if (set == null) {
-          return null;
-        }
-        final DocIdSetIterator disi = set.iterator();
-        if (disi == null) {
-          return null;
-        }
-        return new ConstantScoreScorer(this, score(), disi);
-      }
-
-      @Override
-      public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
-          return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.bulkScorer(context);
-        } else {
-          final Scorer scorer = scorer(weightOrBitSet.set);
-          if (scorer == null) {
-            return null;
-          }
-          return new DefaultBulkScorer(scorer);
-        }
-      }
-
-      @Override
-      public Scorer scorer(LeafReaderContext context) throws IOException {
-        final WeightOrDocIdSet weightOrBitSet = rewrite(context);
-        if (weightOrBitSet == null) {
-          return null;
-        } else if (weightOrBitSet.weight != null) {
-          return weightOrBitSet.weight.scorer(context);
-        } else {
-          return scorer(weightOrBitSet.set);
-        }
+
+        final int numClauses = longestPostingsLists.size() + (shortestPostingsLists == null ? 0 : 1);
+        if (numClauses == 0) {
+          return null;
+        }
+
+        DisiPriorityQueue queue = new DisiPriorityQueue(numClauses);
+        for (PostingsEnum postings : longestPostingsLists) {
+          queue.add(new DisiWrapper(postings));
+        }
+        if (shortestPostingsLists != null) {
+          queue.add(new DisiWrapper(shortestPostingsLists.build().iterator()));
+        }
+        final DocIdSetIterator disi = new DisjunctionDISIApproximation(queue);
+        return new ConstantScoreScorer(this, boost, disi);
       }
     };
   }
 
 }
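The comment in the new scorer describes the strategy: keep only the most expensive (longest) postings lists as lazily consumed disjunction clauses in a bounded priority queue, and fold whatever overflows out of that queue eagerly into a single DocIdSetBuilder bitset. Below is a small self-contained sketch of that bounded-heap partitioning; it uses java.util.PriorityQueue and plain long costs rather than Lucene's org.apache.lucene.util.PriorityQueue and PostingsEnum, and the threshold and cost values are made up.

// Illustrative sketch of cost-based partitioning with a bounded min-heap (not Lucene code).
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;

public class CostPartitionSketch {
  public static void main(String[] args) {
    final int k = 3;                                      // stand-in for BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD
    long[] costs = {5, 900, 12, 3, 250, 7, 60};           // made-up postings-list lengths
    PriorityQueue<Long> longest = new PriorityQueue<>();  // min-heap keyed on cost
    List<Long> eagerTail = new ArrayList<>();             // would be added to a DocIdSetBuilder
    for (long cost : costs) {
      longest.add(cost);
      if (longest.size() > k) {
        // the cheapest of the current k+1 overflows and is consumed eagerly
        eagerTail.add(longest.poll());
      }
    }
    System.out.println("kept as lazy clauses: " + longest);   // the 3 largest costs remain: 60, 250, 900
    System.out.println("folded into bitset:   " + eagerTail); // [3, 5, 7, 12]
  }
}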