diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5c7e12fc579..d02ed269f72 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -248,6 +248,11 @@ Bug Fixes
 * LUCENE-6872: IndexWriter handles any VirtualMachineError, not just OOM,
   as tragic. (Robert Muir)
 
+* LUCENE-6814: PatternTokenizer no longer hangs onto heap sized to the
+  maximum input string it's ever seen, which can be a large memory
+  "leak" if you tokenize large strings with many threads across many
+  indices (Alex Chow via Mike McCandless)
+
 Other
 
 * LUCENE-6478: Test execution can hang with java.security.debug. (Dawid Weiss)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
index faa7e91d63a..e25a7b9f1a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
@@ -136,10 +136,20 @@ public final class PatternTokenizer extends Tokenizer {
     offsetAtt.setOffset(ofs, ofs);
   }
 
+  @Override
+  public void close() throws IOException {
+    try {
+      super.close();
+    } finally {
+      str.setLength(0);
+      str.trimToSize();
+    }
+  }
+
   @Override
   public void reset() throws IOException {
     super.reset();
-    fillBuffer(str, input);
+    fillBuffer(input);
     matcher.reset(str);
     index = 0;
   }
@@ -147,11 +157,11 @@
   // TODO: we should see if we can make this tokenizer work without reading
   // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
   final char[] buffer = new char[8192];
-  private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
+  private void fillBuffer(Reader input) throws IOException {
     int len;
-    sb.setLength(0);
+    str.setLength(0);
     while ((len = input.read(buffer)) > 0) {
-      sb.append(buffer, 0, len);
+      str.append(buffer, 0, len);
     }
   }
 }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
index 316fbdb5b6f..30badb8c14e 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
@@ -146,4 +146,37 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
     checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
     b.close();
   }
+
+  // LUCENE-6814
+  public void testHeapFreedAfterClose() throws Exception {
+    // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?
+
+    // Build a 1MB string:
+    StringBuilder b = new StringBuilder();
+    for(int i=0;i<1024;i++) {
+      // 1023 spaces, then an x
+      for(int j=0;j<1023;j++) {
+        b.append(' ');
+      }
+      b.append('x');
+    }
+
+    String big = b.toString();
+
+    Pattern x = Pattern.compile("x");
+
+    List<Tokenizer> tokenizers = new ArrayList<>();
+    for(int i=0;i<512;i++) {
+      Tokenizer stream = new PatternTokenizer(x, -1);
+      tokenizers.add(stream);
+      stream.setReader(new StringReader(big));
+      stream.reset();
+      for(int j=0;j<1024;j++) {
+        assertTrue(stream.incrementToken());
+      }
+      assertFalse(stream.incrementToken());
+      stream.end();
+      stream.close();
+    }
+  }
 }
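
Note: a minimal standalone sketch (not part of the patch; the class and
variable names are invented for illustration) of the java.lang.StringBuilder
behavior the new close() works around. setLength(0) clears the characters but
keeps the grown backing char[], so a reused PatternTokenizer retains heap
sized to the largest input it ever buffered until trimToSize() shrinks the
array back to the current length:

    public class StringBuilderRetentionSketch {
      public static void main(String[] args) {
        StringBuilder str = new StringBuilder();
        str.append(new char[1024 * 1024]);   // grow the backing array to 1M chars (~2 MB)

        str.setLength(0);                    // clears the content ...
        System.out.println(str.capacity());  // ... but capacity is still >= 1048576

        str.trimToSize();                    // reallocate the backing array to fit length()
        System.out.println(str.capacity());  // prints 0; the ~2 MB buffer is now reclaimable
      }
    }

Doing the cleanup in a finally block, as the patch does, releases the buffer
even if super.close() throws.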