mirror of https://github.com/apache/lucene.git
LUCENE-4588: EnwikiContentSource fixes: lost last wiki doc, thread leak in 'forever' mode.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1417788 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bfde8ec0e1
commit
e7cc2239fd
|
@ -202,6 +202,9 @@ Bug Fixes
|
|||
* LUCENE-4009: Improve TermsFilter.toString (Tim Costermans via Chris
|
||||
Male, Mike McCandless)
|
||||
|
||||
* LUCENE-4588: Benchmark's EnwikiContentSource was discarding last wiki
|
||||
document and had leaking threads in 'forever' mode. (Doron Cohen)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-4586: Change default ResultMode of FacetRequest to PER_NODE_IN_TREE.
|
||||
|
|
|
@ -53,6 +53,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
private class Parser extends DefaultHandler implements Runnable {
|
||||
private Thread t;
|
||||
private boolean threadDone;
|
||||
private boolean stopped = false;
|
||||
private String[] tuple;
|
||||
private NoMoreDataException nmde;
|
||||
private StringBuilder contents = new StringBuilder();
|
||||
|
@ -70,31 +71,31 @@ public class EnwikiContentSource extends ContentSource {
|
|||
}
|
||||
String[] result;
|
||||
synchronized(this){
|
||||
while(tuple == null && nmde == null && !threadDone) {
|
||||
while(tuple == null && nmde == null && !threadDone && !stopped) {
|
||||
try {
|
||||
wait();
|
||||
} catch (InterruptedException ie) {
|
||||
throw new ThreadInterruptedException(ie);
|
||||
}
|
||||
}
|
||||
if (tuple != null) {
|
||||
result = tuple;
|
||||
tuple = null;
|
||||
notify();
|
||||
return result;
|
||||
}
|
||||
if (nmde != null) {
|
||||
// Set to null so we will re-start thread in case
|
||||
// we are re-used:
|
||||
t = null;
|
||||
throw nmde;
|
||||
}
|
||||
if (t != null && threadDone) {
|
||||
// The thread has exited yet did not hit end of
|
||||
// data, so this means it hit an exception. We
|
||||
// throw NoMorDataException here to force
|
||||
// benchmark to stop the current alg:
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
result = tuple;
|
||||
tuple = null;
|
||||
notify();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
String time(String original) {
|
||||
|
@ -132,7 +133,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
tmpTuple[BODY] = body.replaceAll("[\t\n]", " ");
|
||||
tmpTuple[ID] = id;
|
||||
synchronized(this) {
|
||||
while (tuple != null) {
|
||||
while (tuple != null && !stopped) {
|
||||
try {
|
||||
wait();
|
||||
} catch (InterruptedException ie) {
|
||||
|
@ -175,7 +176,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
XMLReader reader = XMLReaderFactory.createXMLReader();
|
||||
reader.setContentHandler(this);
|
||||
reader.setErrorHandler(this);
|
||||
while(true){
|
||||
while(!stopped){
|
||||
final InputStream localFileIS = is;
|
||||
try {
|
||||
// To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
|
||||
|
@ -186,8 +187,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
} catch (IOException ioe) {
|
||||
synchronized(EnwikiContentSource.this) {
|
||||
if (localFileIS != is) {
|
||||
// fileIS was closed on us, so, just fall
|
||||
// through
|
||||
// fileIS was closed on us, so, just fall through
|
||||
} else
|
||||
// Exception is real
|
||||
throw ioe;
|
||||
|
@ -200,7 +200,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
return;
|
||||
} else if (localFileIS == is) {
|
||||
// If file is not already re-opened then re-open it now
|
||||
is = StreamUtils.inputStream(file);
|
||||
is = openInputStream();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -238,6 +238,17 @@ public class EnwikiContentSource extends ContentSource {
|
|||
// this element should be discarded.
|
||||
}
|
||||
}
|
||||
|
||||
private void stop() {
|
||||
synchronized (this) {
|
||||
stopped = true;
|
||||
if (tuple != null) {
|
||||
tuple = null;
|
||||
notify();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>();
|
||||
|
@ -284,6 +295,7 @@ public class EnwikiContentSource extends ContentSource {
|
|||
is.close();
|
||||
is = null;
|
||||
}
|
||||
parser.stop();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -301,7 +313,12 @@ public class EnwikiContentSource extends ContentSource {
|
|||
@Override
|
||||
public void resetInputs() throws IOException {
|
||||
super.resetInputs();
|
||||
is = StreamUtils.inputStream(file);
|
||||
is = openInputStream();
|
||||
}
|
||||
|
||||
/** Open the input stream. */
|
||||
protected InputStream openInputStream() throws IOException {
|
||||
return StreamUtils.inputStream(file);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -309,10 +326,9 @@ public class EnwikiContentSource extends ContentSource {
|
|||
super.setConfig(config);
|
||||
keepImages = config.get("keep.image.only.docs", true);
|
||||
String fileName = config.get("docs.file", null);
|
||||
if (fileName == null) {
|
||||
throw new IllegalArgumentException("docs.file must be set");
|
||||
}
|
||||
if (fileName != null) {
|
||||
file = new File(fileName).getAbsoluteFile();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.text.ParseException;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
public class EnwikiContentSourceTest extends LuceneTestCase {
|
||||
|
||||
/** An EnwikiContentSource which works on a String and not files. */
|
||||
private static class StringableEnwikiSource extends EnwikiContentSource {
|
||||
|
||||
private final String docs;
|
||||
|
||||
public StringableEnwikiSource(String docs) {
|
||||
this.docs = docs;
|
||||
}
|
||||
|
||||
@SuppressWarnings("deprecation") // fine for the characters used in this test
|
||||
@Override
|
||||
protected InputStream openInputStream() throws IOException {
|
||||
return new java.io.StringBufferInputStream(docs);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void assertDocData(DocData dd, String expName, String expTitle, String expBody, String expDate)
|
||||
throws ParseException {
|
||||
assertNotNull(dd);
|
||||
assertEquals(expName, dd.getName());
|
||||
assertEquals(expTitle, dd.getTitle());
|
||||
assertEquals(expBody, dd.getBody());
|
||||
assertEquals(expDate, dd.getDate());
|
||||
}
|
||||
|
||||
private void assertNoMoreDataException(EnwikiContentSource stdm) throws Exception {
|
||||
try {
|
||||
stdm.getNextDocData(null);
|
||||
fail("Expecting NoMoreDataException");
|
||||
} catch (NoMoreDataException e) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
|
||||
private final String PAGE1 =
|
||||
" <page>\r\n" +
|
||||
" <title>Title1</title>\r\n" +
|
||||
" <ns>0</ns>\r\n" +
|
||||
" <id>1</id>\r\n" +
|
||||
" <revision>\r\n" +
|
||||
" <id>11</id>\r\n" +
|
||||
" <parentid>111</parentid>\r\n" +
|
||||
" <timestamp>2011-09-14T11:35:09Z</timestamp>\r\n" +
|
||||
" <contributor>\r\n" +
|
||||
" <username>Mister1111</username>\r\n" +
|
||||
" <id>1111</id>\r\n" +
|
||||
" </contributor>\r\n" +
|
||||
" <minor />\r\n" +
|
||||
" <comment>/* Never mind */</comment>\r\n" +
|
||||
" <text>Some text 1 here</text>\r\n" +
|
||||
" </revision>\r\n" +
|
||||
" </page>\r\n";
|
||||
|
||||
private final String PAGE2 =
|
||||
" <page>\r\n" +
|
||||
" <title>Title2</title>\r\n" +
|
||||
" <ns>0</ns>\r\n" +
|
||||
" <id>2</id>\r\n" +
|
||||
" <revision>\r\n" +
|
||||
" <id>22</id>\r\n" +
|
||||
" <parentid>222</parentid>\r\n" +
|
||||
" <timestamp>2022-09-14T22:35:09Z</timestamp>\r\n" +
|
||||
" <contributor>\r\n" +
|
||||
" <username>Mister2222</username>\r\n" +
|
||||
" <id>2222</id>\r\n" +
|
||||
" </contributor>\r\n" +
|
||||
" <minor />\r\n" +
|
||||
" <comment>/* Never mind */</comment>\r\n" +
|
||||
" <text>Some text 2 here</text>\r\n" +
|
||||
" </revision>\r\n" +
|
||||
" </page>\r\n";
|
||||
|
||||
@Test
|
||||
public void testOneDocument() throws Exception {
|
||||
String docs =
|
||||
"<mediawiki>\r\n" +
|
||||
PAGE1 +
|
||||
"</mediawiki>";
|
||||
|
||||
EnwikiContentSource source = createContentSource(docs, false);
|
||||
|
||||
DocData dd = source.getNextDocData(new DocData());
|
||||
assertDocData(dd, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");
|
||||
|
||||
assertNoMoreDataException(source);
|
||||
}
|
||||
|
||||
private EnwikiContentSource createContentSource(String docs, boolean forever) throws IOException {
|
||||
|
||||
Properties props = new Properties();
|
||||
props.setProperty("print.props", "false");
|
||||
props.setProperty("content.source.forever", Boolean.toString(forever));
|
||||
Config config = new Config(props);
|
||||
|
||||
EnwikiContentSource source = new StringableEnwikiSource(docs);
|
||||
source.setConfig(config);
|
||||
|
||||
// doc-maker just for initiating content source inputs
|
||||
DocMaker docMaker = new DocMaker();
|
||||
docMaker.setConfig(config, source);
|
||||
docMaker.resetInputs();
|
||||
return source;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoDocuments() throws Exception {
|
||||
String docs =
|
||||
"<mediawiki>\r\n" +
|
||||
PAGE1 +
|
||||
PAGE2 +
|
||||
"</mediawiki>";
|
||||
|
||||
EnwikiContentSource source = createContentSource(docs, false);
|
||||
|
||||
DocData dd1 = source.getNextDocData(new DocData());
|
||||
assertDocData(dd1, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");
|
||||
|
||||
DocData dd2 = source.getNextDocData(new DocData());
|
||||
assertDocData(dd2, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");
|
||||
|
||||
assertNoMoreDataException(source);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForever() throws Exception {
|
||||
String docs =
|
||||
"<mediawiki>\r\n" +
|
||||
PAGE1 +
|
||||
PAGE2 +
|
||||
"</mediawiki>";
|
||||
|
||||
EnwikiContentSource source = createContentSource(docs, true);
|
||||
|
||||
// same documents several times
|
||||
for (int i=0; i<3; i++) {
|
||||
DocData dd1 = source.getNextDocData(new DocData());
|
||||
assertDocData(dd1, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");
|
||||
|
||||
DocData dd2 = source.getNextDocData(new DocData());
|
||||
assertDocData(dd2, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");
|
||||
// Don't test that NoMoreDataException is thrown, since the forever flag is turned on.
|
||||
}
|
||||
|
||||
source.close();
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue