LUCENE-8429: Avoid stack overflows in DaciukMihovAutomatonBuilder.

This commit is contained in:
Adrien Grand 2018-07-27 11:11:00 +02:00
parent 534204890a
commit d78feb2236
3 changed files with 53 additions and 1 deletions

View File

@ -189,6 +189,9 @@ Bug Fixes:
* LUCENE-8398: TieredMergePolicy.getMaxMergedSegmentMB has rounding error (Erick Erickson)
* LUCENE-8429: DaciukMihovAutomatonBuilder now enforces a maximum term length
so that it is no longer prone to stack overflows. (Adrien Grand)
Changes in Runtime Behavior:
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing

View File

@ -33,7 +33,14 @@ import org.apache.lucene.util.UnicodeUtil;
* @see Automata#makeStringUnion(Collection)
*/
public final class DaciukMihovAutomatonBuilder {
/**
* Maximum length, in chars, of a term accepted by this builder. Longer terms
* are rejected with an {@link IllegalArgumentException}, because the builder
* uses recursion based on the length of the term, which might cause stack
* overflows.
*/
static final int MAX_TERM_LENGTH = 1_000;
/**
* The default constructor is private. Use static methods directly.
*/
@ -220,6 +227,9 @@ public final class DaciukMihovAutomatonBuilder {
* to this automaton (the input must be sorted).
*/
public void add(CharsRef current) {
if (current.length > MAX_TERM_LENGTH) {
throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got " + current);
}
assert stateRegistry != null : "Automaton already built.";
assert previous == null
|| comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "

View File

@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.automaton;
import java.util.Arrays;
import java.util.Collections;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests for the term-length limit enforced by {@link DaciukMihovAutomatonBuilder}.
 */
public class TestDaciukMihovAutomatonBuilder extends LuceneTestCase {

  /**
   * Building from a single 10,000-byte term must fail with an
   * {@link IllegalArgumentException} carrying the expected message prefix, while
   * building from a single 1,000-byte term must complete without an exception.
   */
  public void testLargeTerms() {
    final byte[] oversizedBytes = new byte[10_000];
    Arrays.fill(oversizedBytes, (byte) 'a');
    final BytesRef oversizedTerm = new BytesRef(oversizedBytes);

    final IllegalArgumentException failure = expectThrows(
        IllegalArgumentException.class,
        () -> DaciukMihovAutomatonBuilder.build(Collections.singleton(oversizedTerm)));
    final String expectedPrefix =
        "This builder doesn't allow terms that are larger than 1,000 characters";
    assertTrue(failure.getMessage().startsWith(expectedPrefix));

    // A term of exactly 1,000 bytes must be accepted: no exception expected here.
    final byte[] acceptedBytes = ArrayUtil.copyOfSubArray(oversizedBytes, 0, 1000);
    final BytesRef acceptedTerm = new BytesRef(acceptedBytes);
    DaciukMihovAutomatonBuilder.build(Collections.singleton(acceptedTerm));
  }
}