From fb8183332bc5fa74a8eb36105be1026ae7aa0a8e Mon Sep 17 00:00:00 2001 From: Jakub Slowinski <32519034+slow-J@users.noreply.github.com> Date: Thu, 17 Aug 2023 06:45:20 +0100 Subject: [PATCH] Fix stack overflow in RegExp for long string (#12462) --- lucene/CHANGES.txt | 2 + .../apache/lucene/util/automaton/RegExp.java | 41 ++++++++++++++----- .../lucene/util/automaton/TestRegExp.java | 4 ++ 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 266a9958ba2..09a616cf515 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -184,6 +184,8 @@ Bug Fixes * GITHUB#9660: Throw an ArithmeticException when the offset overflows in a ByteBlockPool. (Stefan Vodita) +* GITHUB#11537: Fix stack overflow in RegExp for long strings by reducing recursion. (Jakub Slowinski) + * GITHUB#12388: JoinUtil queries were ignoring boosts. (Alan Woodward) * GITHUB#12413: Fix HNSW graph search bug that potentially leaked unapproved docs (Ben Trent). diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index 7dc4ee45793..4806e75bcf7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -36,6 +36,8 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.BooleanSupplier; +import java.util.function.Supplier; /** * Regular Expression extension to Automaton. 
@@ -1067,22 +1069,39 @@ public class RegExp {
   }
 
   final RegExp parseUnionExp() throws IllegalArgumentException {
-    RegExp e = parseInterExp();
-    if (match('|')) e = makeUnion(flags, e, parseUnionExp());
-    return e;
+    return iterativeParseExp(this::parseInterExp, () -> match('|'), RegExp::makeUnion);
   }
 
   final RegExp parseInterExp() throws IllegalArgumentException {
-    RegExp e = parseConcatExp();
-    if (check(INTERSECTION) && match('&')) e = makeIntersection(flags, e, parseInterExp());
-    return e;
+    return iterativeParseExp(
+        this::parseConcatExp, () -> check(INTERSECTION) && match('&'), RegExp::makeIntersection);
   }
 
   final RegExp parseConcatExp() throws IllegalArgumentException {
-    RegExp e = parseRepeatExp();
-    if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&")))
-      e = makeConcatenation(flags, e, parseConcatExp());
-    return e;
+    return iterativeParseExp(
+        this::parseRepeatExp,
+        () -> (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))),
+        RegExp::makeConcatenation);
+  }
+
+  /**
+   * Functional interface for methods shaped like {@code RegExp f(int flags, RegExp exp1, RegExp
+   * exp2)}, such as {@code RegExp::makeUnion}.
+   */
+  @FunctionalInterface
+  private interface MakeRegexGroup {
+    RegExp get(int int1, RegExp exp1, RegExp exp2);
+  }
+
+  final RegExp iterativeParseExp(
+      Supplier<RegExp> gather, BooleanSupplier stop, MakeRegexGroup associativeReduce)
+      throws IllegalArgumentException {
+    RegExp result = gather.get(); // first operand
+    while (stop.getAsBoolean()) { // "stop" is a misnomer: true means another operand follows
+      RegExp e = gather.get();
+      result = associativeReduce.get(flags, result, e); // folds left-to-right
+    }
+    return result;
   }
 
   final RegExp parseRepeatExp() throws IllegalArgumentException {
@@ -1216,7 +1235,7 @@ public class RegExp {
           if (i == 0 || i == s.length() - 1 || i != s.lastIndexOf('-'))
             throw new NumberFormatException();
           String smin = s.substring(0, i);
-          String smax = s.substring(i + 1, s.length());
+          String smax = s.substring(i + 1);
           int imin = Integer.parseInt(smin);
           int imax = Integer.parseInt(smax);
           int digits;
diff --git 
a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java index 4cb145b432b..c960e736304 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java @@ -245,4 +245,8 @@ public class TestRegExp extends LuceneTestCase { } return regexPattern; } + + public void testRegExpNoStackOverflow() { /* Regression test: constructs a RegExp from 50,001 "(a)" groups joined by '|' and expects no stack overflow; no other assertion is made. */ + new RegExp("(a)|".repeat(50000) + "(a)"); + } }