diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 2876fa72ad2..0f13dd3babd 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -189,6 +189,9 @@ Bug Fixes:
 
 * LUCENE-8398: TieredMergePolicy.getMaxMergedSegmentMB has rounding error (Erick Erickson)
 
+* LUCENE-8429: DaciukMihovAutomatonBuilder is no longer prone to stack
+  overflows by enforcing a maximum term length. (Adrien Grand)
+
 Changes in Runtime Behavior:
 
 * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
index 60ec8659346..3757e82a3cb 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java
@@ -33,7 +33,14 @@ import org.apache.lucene.util.UnicodeUtil;
  * @see Automata#makeStringUnion(Collection)
  */
 public final class DaciukMihovAutomatonBuilder {
-  
+
+  /**
+   * Maximum accepted term length, in chars. Longer terms are rejected because
+   * this builder uses recursion based on the length of the term, which might
+   * cause stack overflows.
+   */
+  static final int MAX_TERM_LENGTH = 1_000;
+
   /**
    * The default constructor is private. Use static methods directly.
    */
@@ -220,6 +227,10 @@ public final class DaciukMihovAutomatonBuilder {
    * to this automaton (the input must be sorted).
    */
   public void add(CharsRef current) {
+    // Report only the length: echoing a term this long would bloat the exception message.
+    if (current.length > MAX_TERM_LENGTH) {
+      throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got a term of length " + current.length);
+    }
     assert stateRegistry != null : "Automaton already built.";
     assert previous == null
         || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
new file mode 100644
index 00000000000..fa8f15479db
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestDaciukMihovAutomatonBuilder.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.automaton;
+
+import java.util.Arrays;
+import java.util.Collections;
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** Tests for {@link DaciukMihovAutomatonBuilder}. */
+public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {
+
+  /** Terms above MAX_TERM_LENGTH are rejected; a term of exactly that length is accepted. */
+  public void testLargeTerms() {
+    byte[] b10k = new byte[10_000];
+    Arrays.fill(b10k, (byte) 'a');
+    IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+        () -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b10k))));
+    assertTrue(e.getMessage().startsWith("This builder doesn't allow terms that are larger than 1,000 characters"));
+
+    // Smallest rejected length: one char over the limit.
+    byte[] tooLong = ArrayUtil.copyOfSubArray(b10k, 0, DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH + 1);
+    expectThrows(IllegalArgumentException.class,
+        () -> DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(tooLong))));
+
+    // Largest accepted length: exactly at the limit.
+    byte[] b1k = ArrayUtil.copyOfSubArray(b10k, 0, DaciukMihovAutomatonBuilder.MAX_TERM_LENGTH);
+    DaciukMihovAutomatonBuilder.build(Collections.singleton(new BytesRef(b1k))); // no exception
+  }
+
+}