LUCENE-6814: release heap in PatternTokenizer.close

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1712865 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2015-11-05 20:10:59 +00:00
parent 7fed8ccb25
commit 04b3b96a94
3 changed files with 52 additions and 4 deletions

View File

@@ -248,6 +248,11 @@ Bug Fixes
* LUCENE-6872: IndexWriter handles any VirtualMachineError, not just OOM,
as tragic. (Robert Muir)
* LUCENE-6814: PatternTokenizer no longer hangs onto heap sized to the
maximum input string it's ever seen, which can be a large memory
"leak" if you tokenize large strings with many threads across many
indices (Alex Chow via Mike McCandless)
Other
* LUCENE-6478: Test execution can hang with java.security.debug. (Dawid Weiss)

View File

@@ -136,10 +136,20 @@ public final class PatternTokenizer extends Tokenizer {
offsetAtt.setOffset(ofs, ofs);
}
@Override
public void close() throws IOException {
  try {
    super.close();
  } finally {
    // LUCENE-6814: release the accumulated input so a reused tokenizer does
    // not hang onto heap sized to the largest document it has ever seen.
    // setLength(0) must run before trimToSize(): trimToSize shrinks the
    // backing char[] down to the current length, so clearing first lets it
    // drop the buffer entirely.
    str.setLength(0);
    str.trimToSize();
  }
}
@Override
public void reset() throws IOException {
  super.reset();
  // Read the new input fully into the str field (fillBuffer clears str
  // first), then point the matcher at it and restart tokenization.
  fillBuffer(input);
  matcher.reset(str);
  index = 0;
}
@@ -147,11 +157,11 @@ public final class PatternTokenizer extends Tokenizer {
// TODO: we should see if we can make this tokenizer work without reading
// the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
final char[] buffer = new char[8192];
private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
private void fillBuffer(Reader input) throws IOException {
int len;
sb.setLength(0);
str.setLength(0);
while ((len = input.read(buffer)) > 0) {
sb.append(buffer, 0, len);
str.append(buffer, 0, len);
}
}
}

View File

@@ -146,4 +146,37 @@ public class TestPatternTokenizer extends BaseTokenStreamTestCase
checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
b.close();
}
// LUCENE-6814
public void testHeapFreedAfterClose() throws Exception {
  // TODO: can we move this to BaseTSTC to catch other "hangs onto heap"ers?

  // Build a 1 MB input: 1024 chunks, each 1023 spaces followed by one 'x'.
  StringBuilder builder = new StringBuilder(1024 * 1024);
  for (int chunk = 0; chunk < 1024; chunk++) {
    for (int col = 0; col < 1023; col++) {
      builder.append(' ');
    }
    builder.append('x');
  }
  String big = builder.toString();

  Pattern x = Pattern.compile("x");

  // Tokenize the large input many times, keeping every tokenizer alive, so
  // any per-instance heap retained after close() would accumulate.
  List<Tokenizer> tokenizers = new ArrayList<>();
  for (int iter = 0; iter < 512; iter++) {
    Tokenizer stream = new PatternTokenizer(x, -1);
    tokenizers.add(stream);
    stream.setReader(new StringReader(big));
    stream.reset();
    // Splitting on "x" yields exactly 1024 all-space tokens, then EOF:
    for (int tok = 0; tok < 1024; tok++) {
      assertTrue(stream.incrementToken());
    }
    assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
  }
}
}