From e7cc2239fdf17db296292e1ae82d1f3d06e06247 Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Thu, 6 Dec 2012 09:54:22 +0000 Subject: [PATCH] LUCENE-4588: EnwikiContentSource fixes: lost last wiki doc, thread leak in 'forever' mode. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1417788 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../byTask/feeds/EnwikiContentSource.java | 58 ++++-- .../byTask/feeds/EnwikiContentSourceTest.java | 179 ++++++++++++++++++ 3 files changed, 219 insertions(+), 21 deletions(-) create mode 100644 lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSourceTest.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2df7ddaabe9..89afe3108c1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -202,6 +202,9 @@ Bug Fixes * LUCENE-4009: Improve TermsFilter.toString (Tim Costermans via Chris Male, Mike McCandless) +* LUCENE-4588: Benchmark's EnwikiContentSource was discarding last wiki + document and had leaking threads in 'forever' mode. (Doron Cohen) + Changes in Runtime Behavior * LUCENE-4586: Change default ResultMode of FacetRequest to PER_NODE_IN_TREE. diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java index 7ca1fc56d00..4a0dde0920d 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -53,6 +53,7 @@ public class EnwikiContentSource extends ContentSource { private class Parser extends DefaultHandler implements Runnable { private Thread t; private boolean threadDone; + private boolean stopped = false; private String[] tuple; private NoMoreDataException nmde; private StringBuilder contents = new StringBuilder(); @@ -70,31 +71,31 @@ public class EnwikiContentSource extends ContentSource { } String[] result; synchronized(this){ - while(tuple == null && nmde == null && !threadDone) { + while(tuple == null && nmde == null && !threadDone && !stopped) { try { wait(); } catch (InterruptedException ie) { throw new ThreadInterruptedException(ie); } } + if (tuple != null) { + result = tuple; + tuple = null; + notify(); + return result; + } if (nmde != null) { // Set to null so we will re-start thread in case // we are re-used: t = null; throw nmde; } - if (t != null && threadDone) { - // The thread has exited yet did not hit end of - // data, so this means it hit an exception. We - // throw NoMorDataException here to force - // benchmark to stop the current alg: - throw new NoMoreDataException(); - } - result = tuple; - tuple = null; - notify(); + // The thread has exited yet did not hit end of + // data, so this means it hit an exception. We + // throw NoMorDataException here to force + // benchmark to stop the current alg: + throw new NoMoreDataException(); } - return result; } String time(String original) { @@ -132,7 +133,7 @@ public class EnwikiContentSource extends ContentSource { tmpTuple[BODY] = body.replaceAll("[\t\n]", " "); tmpTuple[ID] = id; synchronized(this) { - while (tuple != null) { + while (tuple != null && !stopped) { try { wait(); } catch (InterruptedException ie) { @@ -175,7 +176,7 @@ public class EnwikiContentSource extends ContentSource { XMLReader reader = XMLReaderFactory.createXMLReader(); reader.setContentHandler(this); reader.setErrorHandler(this); - while(true){ + while(!stopped){ final InputStream localFileIS = is; try { // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader. @@ -186,8 +187,7 @@ public class EnwikiContentSource extends ContentSource { } catch (IOException ioe) { synchronized(EnwikiContentSource.this) { if (localFileIS != is) { - // fileIS was closed on us, so, just fall - // through + // fileIS was closed on us, so, just fall through } else // Exception is real throw ioe; @@ -200,7 +200,7 @@ public class EnwikiContentSource extends ContentSource { return; } else if (localFileIS == is) { // If file is not already re-opened then re-open it now - is = StreamUtils.inputStream(file); + is = openInputStream(); } } } @@ -238,6 +238,17 @@ public class EnwikiContentSource extends ContentSource { // this element should be discarded. } } + + private void stop() { + synchronized (this) { + stopped = true; + if (tuple != null) { + tuple = null; + notify(); + } + } + } + } private static final Map ELEMENTS = new HashMap(); @@ -284,6 +295,7 @@ public class EnwikiContentSource extends ContentSource { is.close(); is = null; } + parser.stop(); } } @@ -301,7 +313,12 @@ public class EnwikiContentSource extends ContentSource { @Override public void resetInputs() throws IOException { super.resetInputs(); - is = StreamUtils.inputStream(file); + is = openInputStream(); + } + + /** Open the input stream. */ + protected InputStream openInputStream() throws IOException { + return StreamUtils.inputStream(file); } @Override @@ -309,10 +326,9 @@ public class EnwikiContentSource extends ContentSource { super.setConfig(config); keepImages = config.get("keep.image.only.docs", true); String fileName = config.get("docs.file", null); - if (fileName == null) { - throw new IllegalArgumentException("docs.file must be set"); + if (fileName != null) { + file = new File(fileName).getAbsoluteFile(); } - file = new File(fileName).getAbsoluteFile(); } } diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSourceTest.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSourceTest.java new file mode 100644 index 00000000000..be72ce724d1 --- /dev/null +++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSourceTest.java @@ -0,0 +1,179 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.util.Properties; + +import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Ignore; +import org.junit.Test; + +public class EnwikiContentSourceTest extends LuceneTestCase { + + /** An EnwikiContentSource which works on a String and not files. */ + private static class StringableEnwikiSource extends EnwikiContentSource { + + private final String docs; + + public StringableEnwikiSource(String docs) { + this.docs = docs; + } + + @SuppressWarnings("deprecation") // fine for the characters used in this test + @Override + protected InputStream openInputStream() throws IOException { + return new java.io.StringBufferInputStream(docs); + } + + } + + private void assertDocData(DocData dd, String expName, String expTitle, String expBody, String expDate) + throws ParseException { + assertNotNull(dd); + assertEquals(expName, dd.getName()); + assertEquals(expTitle, dd.getTitle()); + assertEquals(expBody, dd.getBody()); + assertEquals(expDate, dd.getDate()); + } + + private void assertNoMoreDataException(EnwikiContentSource stdm) throws Exception { + try { + stdm.getNextDocData(null); + fail("Expecting NoMoreDataException"); + } catch (NoMoreDataException e) { + // expected + } + } + + private final String PAGE1 = + " \r\n" + + " Title1\r\n" + + " 0\r\n" + + " 1\r\n" + + " \r\n" + + " 11\r\n" + + " 111\r\n" + + " 2011-09-14T11:35:09Z\r\n" + + " \r\n" + + " Mister1111\r\n" + + " 1111\r\n" + + " \r\n" + + " \r\n" + + " /* Never mind */\r\n" + + " Some text 1 here\r\n" + + " \r\n" + + " \r\n"; + + private final String PAGE2 = + " \r\n" + + " Title2\r\n" + + " 0\r\n" + + " 2\r\n" + + " \r\n" + + " 22\r\n" + + " 222\r\n" + + " 2022-09-14T22:35:09Z\r\n" + + " \r\n" + + " Mister2222\r\n" + + " 2222\r\n" + + " \r\n" + + " \r\n" + + " /* Never mind */\r\n" + + " Some text 2 here\r\n" + + " \r\n" + + " \r\n"; + + @Test + public void testOneDocument() throws Exception { + String docs = + "\r\n" + + PAGE1 + + ""; + + EnwikiContentSource source = createContentSource(docs, false); + + DocData dd = source.getNextDocData(new DocData()); + assertDocData(dd, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000"); + + assertNoMoreDataException(source); + } + + private EnwikiContentSource createContentSource(String docs, boolean forever) throws IOException { + + Properties props = new Properties(); + props.setProperty("print.props", "false"); + props.setProperty("content.source.forever", Boolean.toString(forever)); + Config config = new Config(props); + + EnwikiContentSource source = new StringableEnwikiSource(docs); + source.setConfig(config); + + // doc-maker just for initiating content source inputs + DocMaker docMaker = new DocMaker(); + docMaker.setConfig(config, source); + docMaker.resetInputs(); + return source; + } + + @Test + public void testTwoDocuments() throws Exception { + String docs = + "\r\n" + + PAGE1 + + PAGE2 + + ""; + + EnwikiContentSource source = createContentSource(docs, false); + + DocData dd1 = source.getNextDocData(new DocData()); + assertDocData(dd1, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000"); + + DocData dd2 = source.getNextDocData(new DocData()); + assertDocData(dd2, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000"); + + assertNoMoreDataException(source); + } + + @Test + public void testForever() throws Exception { + String docs = + "\r\n" + + PAGE1 + + PAGE2 + + ""; + + EnwikiContentSource source = createContentSource(docs, true); + + // same documents several times + for (int i=0; i<3; i++) { + DocData dd1 = source.getNextDocData(new DocData()); + assertDocData(dd1, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000"); + + DocData dd2 = source.getNextDocData(new DocData()); + assertDocData(dd2, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000"); + // Don't test that NoMoreDataException is thrown, since the forever flag is turned on. + } + + source.close(); + } + +}