LUCENE-849: configurable HTML Parser; external classes; exhaustive doc maker - '*';

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@522569 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doron Cohen 2007-03-26 16:46:33 +00:00
parent eee9d52886
commit 031f50c4e7
17 changed files with 513 additions and 106 deletions

View File

@ -4,6 +4,13 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
3/25/07
LUCENE-849:
1. The HTML parser to use is now configurable with the html.parser property.
2. External classes added to classpath with -Dbenchmark.ext.classpath=path.
3. '*' as repeating number now means "exhaust doc maker - no repetitions".
3/22/07
-Moved withRetrieve() call out of the loop in ReadTask

View File

@ -97,6 +97,7 @@
<path id="run.classpath">
<path refid="classpath"/>
<pathelement location="${build.dir}/classes/java"/>
<pathelement location="${benchmark.ext.classpath}"/>
</path>
<target name="run-standard" depends="compile,check-files,get-files" description="Run the standard baseline">

View File

@ -52,6 +52,7 @@ public class Benchmark {
try {
runData = new PerfRunData(new Config(algReader));
} catch (Exception e) {
e.printStackTrace();
throw new Exception("Error: cannot init PerfRunData!",e);
}

View File

@ -23,6 +23,7 @@ import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.feeds.HTMLParser;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.benchmark.byTask.stats.Points;
import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
@ -58,6 +59,7 @@ public class PerfRunData {
private Directory directory;
private Analyzer analyzer;
private DocMaker docMaker;
private HTMLParser htmlParser;
// we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
private HashMap readTaskQueryMaker;
@ -79,7 +81,10 @@ public class PerfRunData {
docMaker.setConfig(config);
// query makers
readTaskQueryMaker = new HashMap();
qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
// html parser, used for some doc makers
htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance();
docMaker.setHTMLParser(htmlParser);
// index stuff
reinit(false);
@ -229,4 +234,11 @@ public class PerfRunData {
return qm;
}
/**
* @return Returns the htmlParser.
*/
public HTMLParser getHtmlParser() {
return htmlParser;
}
}

View File

@ -26,9 +26,7 @@ import org.apache.lucene.document.Field;
import java.io.File;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Properties;
/**
@ -47,15 +45,8 @@ public abstract class BasicDocMaker implements DocMaker {
private int numDocsCreated = 0;
private boolean storeBytes = false;
protected boolean forever;
static class DocData {
String name;
Date date;
String title;
String body;
Properties props;
}
private static class LeftOver {
private DocData docdata;
private int cnt;
@ -80,10 +71,14 @@ public abstract class BasicDocMaker implements DocMaker {
/**
* Return the data of the next document.
* All current implementations can create docs forever.
* When the input data is exhausted, input files are iterated.
* This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
* @return data of the next document.
* @exception if cannot create the next doc data
* @exception NoMoreDataException if data is exhausted (and 'forever' set to false).
*/
protected abstract DocData getNextDocData() throws Exception;
protected abstract DocData getNextDocData() throws NoMoreDataException, Exception;
/*
* (non-Javadoc)
@ -103,32 +98,32 @@ public abstract class BasicDocMaker implements DocMaker {
int docid = incrNumDocsCreated();
Document doc = new Document();
doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
if (docData.name!=null) {
String name = (cnt<0 ? docData.name : docData.name+"_"+cnt);
if (docData.getName()!=null) {
String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
}
if (docData.date!=null) {
String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND);
if (docData.getDate()!=null) {
String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
}
if (docData.title!=null) {
doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal));
if (docData.getTitle()!=null) {
doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
}
if (docData.body!=null && docData.body.length()>0) {
if (docData.getBody()!=null && docData.getBody().length()>0) {
String bdy;
if (size<=0 || size>=docData.body.length()) {
bdy = docData.body; // use all
docData.body = ""; // nothing left
if (size<=0 || size>=docData.getBody().length()) {
bdy = docData.getBody(); // use all
docData.setBody(""); // nothing left
} else {
// attempt not to break words - if whitespace found within next 20 chars...
for (int n=size-1; n<size+20 && n<docData.body.length(); n++) {
if (Character.isWhitespace(docData.body.charAt(n))) {
for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
if (Character.isWhitespace(docData.getBody().charAt(n))) {
size = n;
break;
}
}
bdy = docData.body.substring(0,size); // use part
docData.body = docData.body.substring(size); // some left
bdy = docData.getBody().substring(0,size); // use part
docData.setBody(docData.getBody().substring(size)); // some left
}
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
if (storeBytes == true) {
@ -136,13 +131,13 @@ public abstract class BasicDocMaker implements DocMaker {
}
}
if (docData.props!=null) {
for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
if (docData.getProps()!=null) {
for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
String key = (String) it.next();
String val = (String) docData.props.get(key);
String val = (String) docData.getProps().get(key);
doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
}
docData.props = null;
docData.setProps(null);
}
//System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
return doc;
@ -154,19 +149,19 @@ public abstract class BasicDocMaker implements DocMaker {
*/
public Document makeDocument(int size) throws Exception {
LeftOver lvr = (LeftOver) leftovr.get();
if (lvr==null || lvr.docdata==null || lvr.docdata.body==null || lvr.docdata.body.length()==0) {
if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
resetLeftovers();
}
DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
int cnt = (lvr==null ? 0 : lvr.cnt);
while (dd.body==null || dd.body.length()<size) {
while (dd.getBody()==null || dd.getBody().length()<size) {
DocData dd2 = dd;
dd = getNextDocData();
cnt = 0;
dd.body = dd2.body + dd.body;
dd.setBody(dd2.getBody() + dd.getBody());
}
Document doc = createDocument(dd,size,cnt);
if (dd.body==null || dd.body.length()==0) {
if (dd.getBody()==null || dd.getBody().length()==0) {
resetLeftovers();
} else {
if (lvr == null) {
@ -195,6 +190,7 @@ public abstract class BasicDocMaker implements DocMaker {
indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
storeBytes = config.get("doc.store.body.bytes", false);
forever = config.get("doc.maker.forever",true);
}
/*
@ -247,6 +243,8 @@ public abstract class BasicDocMaker implements DocMaker {
private int lastPrintedNumUniqueTexts = 0;
private long lastPrintedNumUniqueBytes = 0;
private int printNum = 0;
private HTMLParser htmlParser;
public void printDocStatistics() {
boolean print = false;
String col = " ";
@ -277,6 +275,7 @@ public abstract class BasicDocMaker implements DocMaker {
}
protected void collectFiles(File f, ArrayList inputFiles) {
//System.out.println("Collect: "+f.getAbsolutePath());
if (!f.canRead()) {
return;
}
@ -291,5 +290,20 @@ public abstract class BasicDocMaker implements DocMaker {
addUniqueBytes(f.length());
}
/* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#setHTMLParser(org.apache.lucene.benchmark.byTask.feeds.HTMLParser)
*/
public void setHTMLParser(HTMLParser htmlParser) {
this.htmlParser = htmlParser;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#getHtmlParser()
*/
public HTMLParser getHtmlParser() {
return htmlParser;
}
}

View File

@ -0,0 +1,82 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.Properties;
/**
 * HTML Parser that is based on Lucene's demo HTML parser.
 * <p>
 * The demo parser supplies the title, the meta tags and a reader over the
 * body text; these are packed into a {@link DocData}.
 * <p>
 * Note: the default date format held by this class is a
 * {@link SimpleDateFormat}, which is not synchronized. It is used only when
 * a caller passes a null date format.
 */
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {

  /** Default date format, used when the caller does not provide one. */
  DateFormat dateFormat;

  /** Creates a parser with a lenient, US-locale default date format. */
  public DemoHTMLParser () {
    dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US);  //Tue, 09 Dec 2003 22:39:08 GMT
    dateFormat.setLenient(true);
  }

  /**
   * Parse the html text read from <code>reader</code>.
   *
   * @param name name assigned to the resulting doc data (passed through as is).
   * @param date date of the resulting doc data. If null and the parsed meta
   *        tags contain a "date" property, that property is parsed instead;
   *        if it cannot be parsed the current time is assigned.
   * @param reader html text to parse.
   * @param dateFormat format used for parsing the "date" meta tag. If null,
   *        the default format of this parser is used.
   * @return doc data holding name, body text, title, meta tags and date.
   * @throws IOException if reading the parsed body fails.
   * @throws InterruptedException declared by the demo parser calls used here.
   * @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat)
   */
  public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
    org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);

    // title
    String title = p.getTitle();
    // properties
    Properties props = p.getMetaTags();
    // body
    Reader r = p.getReader();
    StringBuffer bodyBuf = new StringBuffer();
    try {
      char c[] = new char[1024];
      int n;
      while ((n = r.read(c)) >= 0) {
        if (n>0) {
          bodyBuf.append(c,0,n);
        }
      }
    } finally {
      // release the body reader even if reading failed half way
      r.close();
    }

    String dateStr = props.getProperty("date");
    if (date == null && dateStr != null) {
      // the parameter shadows the field: fall back to the field when no format was given
      DateFormat format = (dateFormat != null ? dateFormat : this.dateFormat);
      try {
        date = format.parse(dateStr.trim());
      } catch (ParseException e) {
        // do not fail test just because a date could not be parsed
        System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
        date = new Date(); // now
      }
    }

    return new DocData(name, bodyBuf.toString(), title, props, date);
  }

  /**
   * Parse the html text held in <code>inputText</code> by wrapping it in a
   * {@link StringReader} and delegating to
   * {@link #parse(String, Date, Reader, DateFormat)}.
   */
  public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
    return parse(name, date, new StringReader(inputText.toString()), dateFormat);
  }

}

View File

@ -0,0 +1,113 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Date;
import java.util.Properties;
/**
 * Holder for the result of parsing (e.g. HTML parsing) an input document:
 * its name, body text, title, date and additional properties.
 * All parts are plain mutable references; none of them is copied or validated.
 */
public class DocData {

  private String name;
  private String body;
  private String title;
  private Properties props;
  private Date date;

  /**
   * Create doc data from its parts. Any of the arguments may be null.
   */
  public DocData(String name, String body, String title, Properties props, Date date) {
    this.name = name;
    this.body = body;
    this.title = title;
    this.props = props;
    this.date = date;
  }

  // ---- name ----

  /** @return the name of this doc data. */
  public String getName() {
    return this.name;
  }

  /** @param name the new name. */
  public void setName(String name) {
    this.name = name;
  }

  // ---- body ----

  /** @return the body text. */
  public String getBody() {
    return this.body;
  }

  /** @param body the new body text. */
  public void setBody(String body) {
    this.body = body;
  }

  // ---- title ----

  /** @return the title. */
  public String getTitle() {
    return this.title;
  }

  /** @param title the new title. */
  public void setTitle(String title) {
    this.title = title;
  }

  // ---- properties ----

  /** @return the additional properties (the same instance that was set). */
  public Properties getProps() {
    return this.props;
  }

  /** @param props the new additional properties. */
  public void setProps(Properties props) {
    this.props = props;
  }

  // ---- date ----

  /** @return the date (the same instance that was set). */
  public Date getDate() {
    return this.date;
  }

  /** @param date the new date. */
  public void setDate(Date date) {
    this.date = date;
  }

}

View File

@ -61,4 +61,11 @@ public interface DocMaker {
/** Print some statistics on docs available/added/etc. */
public void printDocStatistics();
}
/** Set the html parser to use, when appropriate */
public void setHTMLParser(HTMLParser htmlParser);
/** Returns the htmlParser. */
public HTMLParser getHtmlParser();
}

View File

@ -0,0 +1,51 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.text.DateFormat;
import java.util.Date;
/**
 * HTML parsing interface for test purposes.
 */
public interface HTMLParser {

  /**
   * Parse html text supplied by a Reader and return the resulting DocData.
   * A name or date given by the caller is used for the result; when one is
   * null, an attempt is made to set it from the parsed data.
   *
   * @param name name of the result doc data. If null, attempt to set by parsed data.
   * @param date date of the result doc data. If null, attempt to set by parsed data.
   * @param reader reader of the html text to parse.
   * @param dateFormat date formatter to use for extracting the date.
   * @return Parsed doc data.
   * @throws IOException
   * @throws InterruptedException
   */
  DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;

  /**
   * Parse html text held in a StringBuffer and return the resulting DocData.
   *
   * @param inputText the html text to parse.
   * @see #parse(String, Date, Reader, DateFormat)
   */
  DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;

}

View File

@ -0,0 +1,27 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Exception indicating there is no more data.
 * Thrown by doc makers if doc.maker.forever is false and the doc sources of that maker were exhausted.
 * This is useful for iterating over all documents of a source, in case we don't know in advance how many docs there are.
 */
public class NoMoreDataException extends Exception {
}

View File

@ -25,6 +25,7 @@ import java.io.FileReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
@ -66,13 +67,16 @@ public class ReutersDocMaker extends BasicDocMaker {
File f = null;
String name = null;
synchronized (this) {
f = (File) inputFiles.get(nextFile++);
name = f.getCanonicalPath()+"_"+iteration;
if (nextFile >= inputFiles.size()) {
// exhausted files, start a new round
// exhausted files, start a new round, unless forever set to false.
if (!forever) {
throw new NoMoreDataException();
}
nextFile = 0;
iteration++;
}
f = (File) inputFiles.get(nextFile++);
name = f.getCanonicalPath()+"_"+iteration;
}
BufferedReader reader = new BufferedReader(new FileReader(f));
@ -90,13 +94,9 @@ public class ReutersDocMaker extends BasicDocMaker {
addBytes(f.length());
DocData dd = new DocData();
dd.date = dateFormat.parse(dateStr.trim());
dd.name = name;
dd.title = title;
dd.body = bodyBuf.toString();
return dd;
Date date = dateFormat.parse(dateStr.trim());
return new DocData(name, bodyBuf.toString(), title, null, date);
}

View File

@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
*/
/**
* Create documents for the test
* Create documents for the test.
*/
public class SimpleDocMaker extends BasicDocMaker {
@ -58,12 +58,12 @@ public class SimpleDocMaker extends BasicDocMaker {
return 0; // not applicable
}
protected DocData getNextDocData() {
DocData dd = new DocData();
dd.body = DOC_TEXT;
dd.name = "doc"+newdocid();
protected DocData getNextDocData() throws NoMoreDataException {
if (docID>0 && !forever) {
throw new NoMoreDataException();
}
addBytes(DOC_TEXT.length());
return dd;
return new DocData("doc"+newdocid(),DOC_TEXT, null, null, null);
}
}

View File

@ -23,19 +23,15 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.demo.html.HTMLParser;
/**
@ -45,7 +41,7 @@ public class TrecDocMaker extends BasicDocMaker {
private static final String newline = System.getProperty("line.separator");
private DateFormat dateFormat;
private DateFormat dateFormat [];
private File dataDir = null;
private ArrayList inputFiles = new ArrayList();
private int nextFile = 0;
@ -53,6 +49,13 @@ public class TrecDocMaker extends BasicDocMaker {
private BufferedReader reader;
private GZIPInputStream zis;
private static final String DATE_FORMATS [] = {
"EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
"EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST
"EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
"EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
};
/* (non-Javadoc)
* @see SimpleDocMaker#setConfig(java.util.Properties)
*/
@ -65,34 +68,44 @@ public class TrecDocMaker extends BasicDocMaker {
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
}
// date format: 30-MAR-1987 14:22:36.87
dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US); //Tue, 09 Dec 2003 22:39:08 GMT
dateFormat.setLenient(true);
}
dateFormat = new SimpleDateFormat[DATE_FORMATS.length];
for (int i = 0; i < dateFormat.length; i++) {
dateFormat[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US);
dateFormat[i].setLenient(true);
}
}
private void openNextFile() throws Exception {
private void openNextFile() throws NoMoreDataException, Exception {
closeInputs();
int retries = 0;
while (retries<20) {
while (true) {
File f = null;
synchronized (this) {
f = (File) inputFiles.get(nextFile++);
if (nextFile >= inputFiles.size()) {
// exhausted files, start a new round
// exhausted files, start a new round, unless forever set to false.
if (!forever) {
throw new NoMoreDataException();
}
nextFile = 0;
iteration++;
}
f = (File) inputFiles.get(nextFile++);
}
System.out.println("opening: "+f+" length: "+f.length());
try {
zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f)));
break;
reader = new BufferedReader(new InputStreamReader(zis));
return;
} catch (Exception e) {
retries++;
System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
continue;
if (retries<20) {
System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
continue;
} else {
throw new NoMoreDataException();
}
}
}
reader = new BufferedReader(new InputStreamReader(zis));
}
private void closeInputs() {
@ -142,7 +155,7 @@ public class TrecDocMaker extends BasicDocMaker {
return sb;
}
protected DocData getNextDocData() throws Exception {
protected DocData getNextDocData() throws NoMoreDataException, Exception {
if (reader==null) {
openNextFile();
}
@ -162,39 +175,27 @@ public class TrecDocMaker extends BasicDocMaker {
// 6. collect until end of doc
sb = read("</DOC>",null,false,true);
// this is the next document, so parse it
// TODO use a more robust html parser (current one aborts parsing quite easily).
HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
// title
String title = p.getTitle();
// properties
Properties props = p.getMetaTags();
// body
Reader r = p.getReader();
char c[] = new char[1024];
StringBuffer bodyBuf = new StringBuffer();
int n;
while ((n = r.read(c)) >= 0) {
if (n>0) {
bodyBuf.append(c,0,n);
Date date = parseDate(dateStr);
HTMLParser p = getHtmlParser();
DocData docData = p.parse(name, date, sb, dateFormat[0]);
addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
return docData;
}
private Date parseDate(String dateStr) {
Date date = null;
for (int i=0; i<dateFormat.length; i++) {
try {
date = dateFormat[i].parse(dateStr.trim());
return date;
} catch (ParseException e) {
}
}
r.close();
addBytes(bodyBuf.length());
DocData dd = new DocData();
try {
dd.date = dateFormat.parse(dateStr.trim());
} catch (ParseException e) {
// do not fail test just because a date could not be parsed
System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
dd.date = new Date(); // now
}
dd.name = name;
dd.title = title;
dd.body = bodyBuf.toString();
dd.props = props;
return dd;
// do not fail test just because a date could not be parsed
System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
date = new Date(); // now
return date;
}

View File

@ -149,6 +149,19 @@ Assume you added the class "WonderfulTask" - doing so also enables the
command "Wonderful" to be used in the algorithm.
</p>
<p>
<u>External classes</u>: It is sometimes useful to invoke the benchmark
package with your external alg file that configures the use of your own
doc/query maker and/or html parser. You can work this out without
modifying the benchmark package code, by passing your class path
with the benchmark.ext.classpath property:
<ul>
<li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
<font color="#FF0000">-Dbenchmark.ext.classpath=/mydir/classes
</font> -Dtask.mem=512M</li>
</ul>
</p>
<a name="algorithm"></a>
<h2>Benchmark "algorithm"</h2>
@ -198,6 +211,14 @@ The following is an informal description of the supported syntax.
30 times in a row.
<br>Example - <font color="#FF0066">{ AddDoc AddDoc } : 30</font> - would do
addDoc 60 times in a row.
<br><b>Exhaustive repeating</b>: use <font color="#FF0066">*</font> instead of
a number to repeat until the doc maker is exhausted (set doc.maker.forever=false,
otherwise the doc maker never runs out of docs).
This is sometimes useful for adding as many files as a doc maker can create,
without iterating over the same files again, when the exact
number of files is not known in advance. For instance, TREC files extracted
from a zip file.
<br>Example - <font color="#FF0066">{ AddDoc } : *</font> - would add docs
until the doc maker is "exhausted".
</li>
<li>
<b>Command parameter</b>: a command can optionally take a single parameter.
@ -487,6 +508,8 @@ Here is a list of currently defined properties:
<li><b>Docs and queries creation:</b></li>
<ul><li>analyzer
</li><li>doc.maker
</li><li>doc.maker.forever
</li><li>html.parser
</li><li>doc.stored
</li><li>doc.tokenized
</li><li>doc.term.vector

View File

@ -21,11 +21,13 @@ import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
/**
* Sequence of parallel or sequential tasks.
*/
public class TaskSequence extends PerfTask {
public static int REPEAT_EXHAUST = -2;
private ArrayList tasks;
private int repetitions = 1;
private boolean parallel;
@ -61,9 +63,13 @@ public class TaskSequence extends PerfTask {
/**
* @param repetitions The repetitions to set.
* @throws Exception
*/
public void setRepetitions(int repetitions) {
public void setRepetitions(int repetitions) throws Exception {
this.repetitions = repetitions;
if (repetitions==REPEAT_EXHAUST && isParallel()) {
throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks");
}
setSequenceName();
}
@ -88,10 +94,15 @@ public class TaskSequence extends PerfTask {
}
int count = 0;
for (int k=0; k<repetitions; k++) {
boolean exhausted = false;
for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
for (Iterator it = tasks.iterator(); it.hasNext();) {
PerfTask task = (PerfTask) it.next();
count += task.runAndMaybeStats(letChildReport);
try {
count += task.runAndMaybeStats(letChildReport);
} catch (NoMoreDataException e) {
exhausted = true;
}
}
}
return count;
@ -101,7 +112,8 @@ public class TaskSequence extends PerfTask {
long delayStep = (perMin ? 60000 : 1000) /rate;
long nextStartTime = System.currentTimeMillis();
int count = 0;
for (int k=0; k<repetitions; k++) {
boolean exhausted = false;
for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
for (Iterator it = tasks.iterator(); it.hasNext();) {
PerfTask task = (PerfTask) it.next();
long waitMore = nextStartTime - System.currentTimeMillis();
@ -110,7 +122,11 @@ public class TaskSequence extends PerfTask {
Thread.sleep(waitMore);
}
nextStartTime += delayStep; // this aims at avarage rate.
count += task.runAndMaybeStats(letChildReport);
try {
count += task.runAndMaybeStats(letChildReport);
} catch (NoMoreDataException e) {
exhausted = true;
}
}
}
return count;
@ -198,6 +214,9 @@ public class TaskSequence extends PerfTask {
if (repetitions>1) {
sb.append(" * " + repetitions);
}
if (repetitions==REPEAT_EXHAUST) {
sb.append(" * EXHAUST");
}
if (rate>0) {
sb.append(", rate: " + rate+"/"+(perMin?"min":"sec"));
}
@ -237,7 +256,9 @@ public class TaskSequence extends PerfTask {
private void setSequenceName() {
seqName = super.getName();
if (repetitions>1) {
if (repetitions==REPEAT_EXHAUST) {
seqName += "_Exhaust";
} else if (repetitions>1) {
seqName += "_"+repetitions;
}
if (rate>0) {

View File

@ -117,8 +117,12 @@ public class Algorithm {
colonOk = false;
// get repetitions number
stok.nextToken();
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
((TaskSequence)prevTask).setRepetitions((int)stok.nval);
if ((char)stok.ttype == '*') {
((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
} else {
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
((TaskSequence)prevTask).setRepetitions((int)stok.nval);
}
// check for rate specification (ops/min)
stok.nextToken();
if (stok.ttype!=':') {

View File

@ -81,6 +81,49 @@ public class TestPerfTasksLogic extends TestCase {
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
}
/**
* Test Exhausting Doc Maker logic
*/
public void testExhaustDocMaker() throws Exception {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"# ----- properties ",
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
"doc.add.log.step=1",
"doc.term.vector=false",
"doc.maker.forever=false",
"directory=RAMDirectory",
"doc.stored=false",
"doc.tokenized=false",
"# ----- alg ",
"CreateIndex",
"{ AddDoc } : * ",
"Optimize",
"CloseIndex",
"OpenReader",
"{ CountingSearchTest } : 100",
"CloseReader",
"[ CountingSearchTest > : 30",
"[ CountingSearchTest > : 9",
};
// 2. we test this value later
CountingSearchTestTask.numSearches = 0;
// 3. execute the algorithm (required in every "logic" test)
Benchmark benchmark = execBenchmark(algLines);
// 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
// now we should be able to open the index for write.
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
}
// create the benchmark and execute it.
private Benchmark execBenchmark(String[] algLines) throws Exception {
String algText = algLinesToText(algLines);