From fb8183332bc5fa74a8eb36105be1026ae7aa0a8e Mon Sep 17 00:00:00 2001
From: Jakub Slowinski <32519034+slow-J@users.noreply.github.com>
Date: Thu, 17 Aug 2023 06:45:20 +0100
Subject: [PATCH] Fix stack overflow in RegExp for long string (#12462)
---
lucene/CHANGES.txt | 2 +
.../apache/lucene/util/automaton/RegExp.java | 41 ++++++++++++++-----
.../lucene/util/automaton/TestRegExp.java | 4 ++
3 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 266a9958ba2..09a616cf515 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -184,6 +184,8 @@ Bug Fixes
* GITHUB#9660: Throw an ArithmeticException when the offset overflows in a ByteBlockPool. (Stefan Vodita)
+* GITHUB#11537: Fix stack overflow in RegExp for long strings by reducing recursion. (Jakub Slowinski)
+
* GITHUB#12388: JoinUtil queries were ignoring boosts. (Alan Woodward)
* GITHUB#12413: Fix HNSW graph search bug that potentially leaked unapproved docs (Ben Trent).
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index 7dc4ee45793..4806e75bcf7 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -36,6 +36,8 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.function.BooleanSupplier;
+import java.util.function.Supplier;
/**
* Regular Expression extension to Automaton
.
@@ -1067,22 +1069,39 @@ public class RegExp {
}
final RegExp parseUnionExp() throws IllegalArgumentException {
- RegExp e = parseInterExp();
- if (match('|')) e = makeUnion(flags, e, parseUnionExp());
- return e;
+ // Union level: parse one operand with parseInterExp, then keep parsing further operands for as
+ // long as match('|') succeeds, combining them left to right with makeUnion. The loop lives in
+ // iterativeParseExp and replaces the self-recursive call removed above.
+ return iterativeParseExp(this::parseInterExp, () -> match('|'), RegExp::makeUnion);
}
final RegExp parseInterExp() throws IllegalArgumentException {
- RegExp e = parseConcatExp();
- if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e, parseInterExp());
- return e;
+ // Intersection level: operands come from parseConcatExp and are combined left to right with
+ // makeIntersection. match('&') is only attempted when check(INTERSECTION) is true
+ // (presumably: the INTERSECTION syntax flag is enabled -- confirm against check()).
+ return iterativeParseExp(
+ this::parseConcatExp, () -> check(INTERSECTION) && match('&'), RegExp::makeIntersection);
}
final RegExp parseConcatExp() throws IllegalArgumentException {
- RegExp e = parseRepeatExp();
- if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&")))
- e = makeConcatenation(flags, e, parseConcatExp());
- return e;
+ // Concatenation level: operands come from parseRepeatExp and are combined left to right with
+ // makeConcatenation. The continuation test is the same condition the removed recursive version
+ // used: more() is true, peek(")|") is false, and peek("&") is false whenever
+ // check(INTERSECTION) is true. Unlike the other two levels, this test only peeks.
+ return iterativeParseExp(
+ this::parseRepeatExp,
+ () -> (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))),
+ RegExp::makeConcatenation);
+ }
+
+ /**
+ * Functional interface for methods that combine two expressions into one, i.e. methods with the
+ * signature {@code RegExp (int int1, RegExp exp1, RegExp exp2)}. In this file it is implemented
+ * by the method references {@code RegExp::makeUnion}, {@code RegExp::makeIntersection} and
+ * {@code RegExp::makeConcatenation}, and it is invoked with the {@code flags} field as the int
+ * argument.
+ */
+ @FunctionalInterface
+ private interface MakeRegexGroup {
+ RegExp get(int int1, RegExp exp1, RegExp exp2);
+ }
+
+ /**
+ * Parses a chain of operands with a loop instead of recursion and folds them into one
+ * expression, so the call depth does not grow with the number of operands.
+ *
+ * <p>The operands are combined left to right: {@code reduce(reduce(a, b), c)}. The recursive
+ * code this replaces nested to the right, so {@code associativeReduce} is expected to be
+ * associative.
+ *
+ * @param gather parses and returns the next operand
+ * @param stop despite its name, returns {@code true} while another operand follows and {@code
+ *     false} when the chain has ended
+ * @param associativeReduce combines the expression built so far with the next operand; it is
+ *     called with {@code flags} as its first argument
+ * @return the single operand if the chain has only one, otherwise the combined expression
+ * @throws IllegalArgumentException declared for callers; this method does not throw it directly
+ */
+ final RegExp iterativeParseExp(
+ Supplier<RegExp> gather, BooleanSupplier stop, MakeRegexGroup associativeReduce)
+ throws IllegalArgumentException {
+ RegExp result = gather.get();
+ // Evaluated before every further operand; it may consume input (e.g. match('|')).
+ while (stop.getAsBoolean()) {
+ RegExp e = gather.get();
+ result = associativeReduce.get(flags, result, e);
+ }
+ return result;
}
final RegExp parseRepeatExp() throws IllegalArgumentException {
@@ -1216,7 +1235,7 @@ public class RegExp {
if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-'))
throw new NumberFormatException();
String smin = s.substring(0, i);
- String smax = s.substring(i + 1, s.length());
+ String smax = s.substring(i + 1);
int imin = Integer.parseInt(smin);
int imax = Integer.parseInt(smax);
int digits;
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index 4cb145b432b..c960e736304 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -245,4 +245,8 @@ public class TestRegExp extends LuceneTestCase {
}
return regexPattern;
}
+
+ public void testRegExpNoStackOverflow() {
+ // Regression test for the stack overflow on long expressions: builds a RegExp from 50,000
+ // "(a)|" alternatives followed by a final "(a)". There is deliberately no assertion; the test
+ // passes as long as the constructor returns without throwing.
+ new RegExp("(a)|".repeat(50000) + "(a)");
+ }
}