mirror of https://github.com/apache/lucene.git
LUCENE-849: configurable HTML Parser; external classes; exhaustive doc maker - '*';
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@522569 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
eee9d52886
commit
031f50c4e7
|
@ -4,6 +4,13 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
3/25/07
|
||||
|
||||
LUCENE-849:
|
||||
1. which HTML Parser is used is configurable with html.parser property.
|
||||
2. External classes added to classpath with -Dbenchmark.ext.classpath=path.
|
||||
3. '*' as repeating number now means "exhaust doc maker - no repetitions".
|
||||
|
||||
3/22/07
|
||||
|
||||
-Moved withRetrieve() call out of the loop in ReadTask
|
||||
|
|
|
@ -97,6 +97,7 @@
|
|||
<path id="run.classpath">
|
||||
<path refid="classpath"/>
|
||||
<pathelement location="${build.dir}/classes/java"/>
|
||||
<pathelement location="${benchmark.ext.classpath}"/>
|
||||
</path>
|
||||
|
||||
<target name="run-standard" depends="compile,check-files,get-files" description="Run the standard baseline">
|
||||
|
|
|
@ -52,6 +52,7 @@ public class Benchmark {
|
|||
try {
|
||||
runData = new PerfRunData(new Config(algReader));
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
throw new Exception("Error: cannot init PerfRunData!",e);
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Iterator;
|
|||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.HTMLParser;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
|
||||
import org.apache.lucene.benchmark.byTask.stats.Points;
|
||||
import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
|
||||
|
@ -58,6 +59,7 @@ public class PerfRunData {
|
|||
private Directory directory;
|
||||
private Analyzer analyzer;
|
||||
private DocMaker docMaker;
|
||||
private HTMLParser htmlParser;
|
||||
|
||||
// we use separate (identical) instances for each "read" task type, so each can iterate the queries separately.
|
||||
private HashMap readTaskQueryMaker;
|
||||
|
@ -79,7 +81,10 @@ public class PerfRunData {
|
|||
docMaker.setConfig(config);
|
||||
// query makers
|
||||
readTaskQueryMaker = new HashMap();
|
||||
qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
|
||||
qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker"));
|
||||
// html parser, used for some doc makers
|
||||
htmlParser = (HTMLParser) Class.forName(config.get("html.parser","org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser")).newInstance();
|
||||
docMaker.setHTMLParser(htmlParser);
|
||||
|
||||
// index stuff
|
||||
reinit(false);
|
||||
|
@ -229,4 +234,11 @@ public class PerfRunData {
|
|||
return qm;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the htmlParser.
|
||||
*/
|
||||
public HTMLParser getHtmlParser() {
|
||||
return htmlParser;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -26,9 +26,7 @@ import org.apache.lucene.document.Field;
|
|||
import java.io.File;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.Properties;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -47,15 +45,8 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
|
||||
private int numDocsCreated = 0;
|
||||
private boolean storeBytes = false;
|
||||
protected boolean forever;
|
||||
|
||||
static class DocData {
|
||||
String name;
|
||||
Date date;
|
||||
String title;
|
||||
String body;
|
||||
Properties props;
|
||||
}
|
||||
|
||||
private static class LeftOver {
|
||||
private DocData docdata;
|
||||
private int cnt;
|
||||
|
@ -80,10 +71,14 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
|
||||
/**
|
||||
* Return the data of the next document.
|
||||
* All current implementations can create docs forever.
|
||||
* When the input data is exhausted, input files are iterated.
|
||||
* This re-iteration can be avoided by setting doc.maker.forever to false (default is true).
|
||||
* @return data of the next document.
|
||||
* @exception Exception if cannot create the next doc data
|
||||
* @exception NoMoreDataException if data is exhausted (and 'forever' set to false).
|
||||
*/
|
||||
protected abstract DocData getNextDocData() throws Exception;
|
||||
protected abstract DocData getNextDocData() throws NoMoreDataException, Exception;
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
|
@ -103,32 +98,32 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
int docid = incrNumDocsCreated();
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
|
||||
if (docData.name!=null) {
|
||||
String name = (cnt<0 ? docData.name : docData.name+"_"+cnt);
|
||||
if (docData.getName()!=null) {
|
||||
String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
|
||||
doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
|
||||
}
|
||||
if (docData.date!=null) {
|
||||
String dateStr = DateTools.dateToString(docData.date, DateTools.Resolution.SECOND);
|
||||
if (docData.getDate()!=null) {
|
||||
String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
|
||||
doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
|
||||
}
|
||||
if (docData.title!=null) {
|
||||
doc.add(new Field("doctitle", docData.title, storeVal, indexVal, termVecVal));
|
||||
if (docData.getTitle()!=null) {
|
||||
doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
|
||||
}
|
||||
if (docData.body!=null && docData.body.length()>0) {
|
||||
if (docData.getBody()!=null && docData.getBody().length()>0) {
|
||||
String bdy;
|
||||
if (size<=0 || size>=docData.body.length()) {
|
||||
bdy = docData.body; // use all
|
||||
docData.body = ""; // nothing left
|
||||
if (size<=0 || size>=docData.getBody().length()) {
|
||||
bdy = docData.getBody(); // use all
|
||||
docData.setBody(""); // nothing left
|
||||
} else {
|
||||
// attempt not to break words - if whitespace found within next 20 chars...
|
||||
for (int n=size-1; n<size+20 && n<docData.body.length(); n++) {
|
||||
if (Character.isWhitespace(docData.body.charAt(n))) {
|
||||
for (int n=size-1; n<size+20 && n<docData.getBody().length(); n++) {
|
||||
if (Character.isWhitespace(docData.getBody().charAt(n))) {
|
||||
size = n;
|
||||
break;
|
||||
}
|
||||
}
|
||||
bdy = docData.body.substring(0,size); // use part
|
||||
docData.body = docData.body.substring(size); // some left
|
||||
bdy = docData.getBody().substring(0,size); // use part
|
||||
docData.setBody(docData.getBody().substring(size)); // some left
|
||||
}
|
||||
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
|
||||
if (storeBytes == true) {
|
||||
|
@ -136,13 +131,13 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
}
|
||||
}
|
||||
|
||||
if (docData.props!=null) {
|
||||
for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {
|
||||
if (docData.getProps()!=null) {
|
||||
for (Iterator it = docData.getProps().keySet().iterator(); it.hasNext(); ) {
|
||||
String key = (String) it.next();
|
||||
String val = (String) docData.props.get(key);
|
||||
String val = (String) docData.getProps().get(key);
|
||||
doc.add(new Field(key, val, storeVal, indexVal, termVecVal));
|
||||
}
|
||||
docData.props = null;
|
||||
docData.setProps(null);
|
||||
}
|
||||
//System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
|
||||
return doc;
|
||||
|
@ -154,19 +149,19 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
*/
|
||||
public Document makeDocument(int size) throws Exception {
|
||||
LeftOver lvr = (LeftOver) leftovr.get();
|
||||
if (lvr==null || lvr.docdata==null || lvr.docdata.body==null || lvr.docdata.body.length()==0) {
|
||||
if (lvr==null || lvr.docdata==null || lvr.docdata.getBody()==null || lvr.docdata.getBody().length()==0) {
|
||||
resetLeftovers();
|
||||
}
|
||||
DocData dd = (lvr==null ? getNextDocData() : lvr.docdata);
|
||||
int cnt = (lvr==null ? 0 : lvr.cnt);
|
||||
while (dd.body==null || dd.body.length()<size) {
|
||||
while (dd.getBody()==null || dd.getBody().length()<size) {
|
||||
DocData dd2 = dd;
|
||||
dd = getNextDocData();
|
||||
cnt = 0;
|
||||
dd.body = dd2.body + dd.body;
|
||||
dd.setBody(dd2.getBody() + dd.getBody());
|
||||
}
|
||||
Document doc = createDocument(dd,size,cnt);
|
||||
if (dd.body==null || dd.body.length()==0) {
|
||||
if (dd.getBody()==null || dd.getBody().length()==0) {
|
||||
resetLeftovers();
|
||||
} else {
|
||||
if (lvr == null) {
|
||||
|
@ -195,6 +190,7 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
|
||||
termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
|
||||
storeBytes = config.get("doc.store.body.bytes", false);
|
||||
forever = config.get("doc.maker.forever",true);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -247,6 +243,8 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
private int lastPrintedNumUniqueTexts = 0;
|
||||
private long lastPrintedNumUniqueBytes = 0;
|
||||
private int printNum = 0;
|
||||
private HTMLParser htmlParser;
|
||||
|
||||
public void printDocStatistics() {
|
||||
boolean print = false;
|
||||
String col = " ";
|
||||
|
@ -277,6 +275,7 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
}
|
||||
|
||||
protected void collectFiles(File f, ArrayList inputFiles) {
|
||||
//System.out.println("Collect: "+f.getAbsolutePath());
|
||||
if (!f.canRead()) {
|
||||
return;
|
||||
}
|
||||
|
@ -291,5 +290,20 @@ public abstract class BasicDocMaker implements DocMaker {
|
|||
addUniqueBytes(f.length());
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#setHTMLParser(org.apache.lucene.benchmark.byTask.feeds.HTMLParser)
|
||||
*/
|
||||
public void setHTMLParser(HTMLParser htmlParser) {
|
||||
this.htmlParser = htmlParser;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see org.apache.lucene.benchmark.byTask.feeds.DocMaker#getHtmlParser()
|
||||
*/
|
||||
public HTMLParser getHtmlParser() {
|
||||
return htmlParser;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* HTML Parser that is based on Lucene's demo HTML parser.
|
||||
*/
|
||||
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
|
||||
|
||||
DateFormat dateFormat;
|
||||
|
||||
public DemoHTMLParser () {
|
||||
dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US); //Tue, 09 Dec 2003 22:39:08 GMT
|
||||
dateFormat.setLenient(true);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see org.apache.lucene.benchmark.byTask.feeds.HTMLParser#parse(java.lang.String, java.util.Date, java.io.Reader, java.text.DateFormat)
|
||||
*/
|
||||
public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
|
||||
org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
|
||||
|
||||
// title
|
||||
String title = p.getTitle();
|
||||
// properties
|
||||
Properties props = p.getMetaTags();
|
||||
// body
|
||||
Reader r = p.getReader();
|
||||
char c[] = new char[1024];
|
||||
StringBuffer bodyBuf = new StringBuffer();
|
||||
int n;
|
||||
while ((n = r.read(c)) >= 0) {
|
||||
if (n>0) {
|
||||
bodyBuf.append(c,0,n);
|
||||
}
|
||||
}
|
||||
r.close();
|
||||
if (date == null && props.getProperty("date")!=null) {
|
||||
try {
|
||||
date = dateFormat.parse(props.getProperty("date").trim());
|
||||
} catch (ParseException e) {
|
||||
// do not fail test just because a date could not be parsed
|
||||
System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
|
||||
date = new Date(); // now
|
||||
}
|
||||
}
|
||||
|
||||
return new DocData(name, bodyBuf.toString(), title, props, date);
|
||||
}
|
||||
|
||||
public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException {
|
||||
// TODO Auto-generated method stub
|
||||
return parse(name, date, new StringReader(inputText.toString()), dateFormat);
|
||||
}
|
||||
|
||||
}
|
113
contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
Executable file
113
contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
Executable file
|
@ -0,0 +1,113 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
 * Holder for the result of parsing (e.g. HTML parsing) one input document:
 * its name, body text, title, date and additional properties.
 * All values are stored and returned by reference, without copying or validation.
 */
public class DocData {

  private String name;
  private String body;
  private String title;
  private Date date;
  private Properties props;

  /**
   * Create doc data from its five parts. Any argument may be null.
   *
   * @param name name of the document.
   * @param body body text of the document.
   * @param title title of the document.
   * @param props additional properties of the document.
   * @param date date of the document.
   */
  public DocData(String name, String body, String title, Properties props, Date date) {
    this.name = name;
    this.body = body;
    this.title = title;
    this.props = props;
    this.date = date;
  }

  // ---- name ----

  /** @return the document name. */
  public String getName() {
    return this.name;
  }

  /** @param name the document name to store. */
  public void setName(String name) {
    this.name = name;
  }

  // ---- body ----

  /** @return the document body text. */
  public String getBody() {
    return this.body;
  }

  /** @param body the document body text to store. */
  public void setBody(String body) {
    this.body = body;
  }

  // ---- title ----

  /** @return the document title. */
  public String getTitle() {
    return this.title;
  }

  /** @param title the document title to store. */
  public void setTitle(String title) {
    this.title = title;
  }

  // ---- date ----

  /** @return the document date. */
  public Date getDate() {
    return this.date;
  }

  /** @param date the document date to store. */
  public void setDate(Date date) {
    this.date = date;
  }

  // ---- props ----

  /** @return the additional document properties. */
  public Properties getProps() {
    return this.props;
  }

  /** @param props the additional document properties to store. */
  public void setProps(Properties props) {
    this.props = props;
  }

}
|
|
@ -61,4 +61,11 @@ public interface DocMaker {
|
|||
|
||||
/** Print some statistics on docs available/added/etc. */
|
||||
public void printDocStatistics();
|
||||
}
|
||||
|
||||
/** Set the html parser to use, when appropriate */
|
||||
public void setHTMLParser(HTMLParser htmlParser);
|
||||
|
||||
/** Returns the htmlParser. */
|
||||
public HTMLParser getHtmlParser();
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
/**
|
||||
* HTML Parsing Interfacew for test purposes
|
||||
*/
|
||||
public interface HTMLParser {
|
||||
|
||||
/**
|
||||
* Parse the input Reader and return DocData.
|
||||
* A provided name or date is used for the result, otherwise an attempt is
|
||||
* made to set them from the parsed data.
|
||||
* @param dateFormat date formatter to use for extracting the date.
|
||||
* @param name name of the result doc data. If null, attempt to set by parsed data.
|
||||
* @param date date of the result doc data. If null, attempt to set by parsed data.
|
||||
* @param reader of html text to parse.
|
||||
* @return Parsed doc data.
|
||||
* @throws IOException
|
||||
* @throws InterruptedException
|
||||
*/
|
||||
public DocData parse(String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
|
||||
|
||||
/**
|
||||
* Parse the inputText and return DocData.
|
||||
* @param inputText the html text to parse.
|
||||
* @see #parse(String, Date, Reader, DateFormat)
|
||||
*/
|
||||
public DocData parse(String name, Date date, StringBuffer inputText, DateFormat dateFormat) throws IOException, InterruptedException;
|
||||
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package org.apache.lucene.benchmark.byTask.feeds;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
 * Exception indicating there is no more data.
 * Thrown by doc makers if doc.maker.forever is false and the doc sources of that maker were exhausted.
 * This is useful for iterating over all documents of a source, in case we don't know in advance how many docs there are.
 */
public class NoMoreDataException extends Exception {

  /** Explicit serial version, so the serialized form does not depend on compiler-generated defaults. */
  private static final long serialVersionUID = 1L;

}
|
|
@ -25,6 +25,7 @@ import java.io.FileReader;
|
|||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
|
||||
|
||||
|
@ -66,13 +67,16 @@ public class ReutersDocMaker extends BasicDocMaker {
|
|||
File f = null;
|
||||
String name = null;
|
||||
synchronized (this) {
|
||||
f = (File) inputFiles.get(nextFile++);
|
||||
name = f.getCanonicalPath()+"_"+iteration;
|
||||
if (nextFile >= inputFiles.size()) {
|
||||
// exhausted files, start a new round
|
||||
// exhausted files, start a new round, unless forever set to false.
|
||||
if (!forever) {
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
nextFile = 0;
|
||||
iteration++;
|
||||
}
|
||||
f = (File) inputFiles.get(nextFile++);
|
||||
name = f.getCanonicalPath()+"_"+iteration;
|
||||
}
|
||||
|
||||
BufferedReader reader = new BufferedReader(new FileReader(f));
|
||||
|
@ -90,13 +94,9 @@ public class ReutersDocMaker extends BasicDocMaker {
|
|||
|
||||
addBytes(f.length());
|
||||
|
||||
DocData dd = new DocData();
|
||||
|
||||
dd.date = dateFormat.parse(dateStr.trim());
|
||||
dd.name = name;
|
||||
dd.title = title;
|
||||
dd.body = bodyBuf.toString();
|
||||
return dd;
|
||||
Date date = dateFormat.parse(dateStr.trim());
|
||||
return new DocData(name, bodyBuf.toString(), title, null, date);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTask.feeds;
|
|||
*/
|
||||
|
||||
/**
|
||||
* Create documents for the test
|
||||
* Create documents for the test.
|
||||
*/
|
||||
public class SimpleDocMaker extends BasicDocMaker {
|
||||
|
||||
|
@ -58,12 +58,12 @@ public class SimpleDocMaker extends BasicDocMaker {
|
|||
return 0; // not applicable
|
||||
}
|
||||
|
||||
protected DocData getNextDocData() {
|
||||
DocData dd = new DocData();
|
||||
dd.body = DOC_TEXT;
|
||||
dd.name = "doc"+newdocid();
|
||||
protected DocData getNextDocData() throws NoMoreDataException {
|
||||
if (docID>0 && !forever) {
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
addBytes(DOC_TEXT.length());
|
||||
return dd;
|
||||
return new DocData("doc"+newdocid(),DOC_TEXT, null, null, null);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,19 +23,15 @@ import java.io.File;
|
|||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
import java.util.Properties;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.demo.html.HTMLParser;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -45,7 +41,7 @@ public class TrecDocMaker extends BasicDocMaker {
|
|||
|
||||
private static final String newline = System.getProperty("line.separator");
|
||||
|
||||
private DateFormat dateFormat;
|
||||
private DateFormat dateFormat [];
|
||||
private File dataDir = null;
|
||||
private ArrayList inputFiles = new ArrayList();
|
||||
private int nextFile = 0;
|
||||
|
@ -53,6 +49,13 @@ public class TrecDocMaker extends BasicDocMaker {
|
|||
private BufferedReader reader;
|
||||
private GZIPInputStream zis;
|
||||
|
||||
private static final String DATE_FORMATS [] = {
|
||||
"EEE, dd MMM yyyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
|
||||
"EEE MMM dd kk:mm:ss yyyy z", //Tue Dec 09 16:45:08 2003 EST
|
||||
"EEE, dd-MMM-':'y kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
|
||||
"EEE, dd-MMM-yyy kk:mm:ss z", //Tue, 09 Dec 2003 22:39:08 GMT
|
||||
};
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see SimpleDocMaker#setConfig(java.util.Properties)
|
||||
*/
|
||||
|
@ -65,34 +68,44 @@ public class TrecDocMaker extends BasicDocMaker {
|
|||
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
|
||||
}
|
||||
// date format: 30-MAR-1987 14:22:36.87
|
||||
dateFormat = new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ",Locale.US); //Tue, 09 Dec 2003 22:39:08 GMT
|
||||
dateFormat.setLenient(true);
|
||||
}
|
||||
dateFormat = new SimpleDateFormat[DATE_FORMATS.length];
|
||||
for (int i = 0; i < dateFormat.length; i++) {
|
||||
dateFormat[i] = new SimpleDateFormat(DATE_FORMATS[i],Locale.US);
|
||||
dateFormat[i].setLenient(true);
|
||||
}
|
||||
}
|
||||
|
||||
private void openNextFile() throws Exception {
|
||||
private void openNextFile() throws NoMoreDataException, Exception {
|
||||
closeInputs();
|
||||
int retries = 0;
|
||||
while (retries<20) {
|
||||
while (true) {
|
||||
File f = null;
|
||||
synchronized (this) {
|
||||
f = (File) inputFiles.get(nextFile++);
|
||||
if (nextFile >= inputFiles.size()) {
|
||||
// exhausted files, start a new round
|
||||
// exhausted files, start a new round, unless forever set to false.
|
||||
if (!forever) {
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
nextFile = 0;
|
||||
iteration++;
|
||||
}
|
||||
f = (File) inputFiles.get(nextFile++);
|
||||
}
|
||||
System.out.println("opening: "+f+" length: "+f.length());
|
||||
try {
|
||||
zis = new GZIPInputStream(new BufferedInputStream(new FileInputStream(f)));
|
||||
break;
|
||||
reader = new BufferedReader(new InputStreamReader(zis));
|
||||
return;
|
||||
} catch (Exception e) {
|
||||
retries++;
|
||||
System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
|
||||
continue;
|
||||
if (retries<20) {
|
||||
System.out.println("Skipping 'bad' file "+f.getAbsolutePath()+" #retries="+retries);
|
||||
continue;
|
||||
} else {
|
||||
throw new NoMoreDataException();
|
||||
}
|
||||
}
|
||||
}
|
||||
reader = new BufferedReader(new InputStreamReader(zis));
|
||||
}
|
||||
|
||||
private void closeInputs() {
|
||||
|
@ -142,7 +155,7 @@ public class TrecDocMaker extends BasicDocMaker {
|
|||
return sb;
|
||||
}
|
||||
|
||||
protected DocData getNextDocData() throws Exception {
|
||||
protected DocData getNextDocData() throws NoMoreDataException, Exception {
|
||||
if (reader==null) {
|
||||
openNextFile();
|
||||
}
|
||||
|
@ -162,39 +175,27 @@ public class TrecDocMaker extends BasicDocMaker {
|
|||
// 6. collect until end of doc
|
||||
sb = read("</DOC>",null,false,true);
|
||||
// this is the next document, so parse it
|
||||
// TODO use a more robust html parser (current one aborts parsing quite easily).
|
||||
HTMLParser p = new HTMLParser(new StringReader(sb.toString()));
|
||||
// title
|
||||
String title = p.getTitle();
|
||||
// properties
|
||||
Properties props = p.getMetaTags();
|
||||
// body
|
||||
Reader r = p.getReader();
|
||||
char c[] = new char[1024];
|
||||
StringBuffer bodyBuf = new StringBuffer();
|
||||
int n;
|
||||
while ((n = r.read(c)) >= 0) {
|
||||
if (n>0) {
|
||||
bodyBuf.append(c,0,n);
|
||||
Date date = parseDate(dateStr);
|
||||
HTMLParser p = getHtmlParser();
|
||||
DocData docData = p.parse(name, date, sb, dateFormat[0]);
|
||||
addBytes(sb.length()); // count char length of parsed html text (larger than the plain doc body text).
|
||||
|
||||
return docData;
|
||||
}
|
||||
|
||||
private Date parseDate(String dateStr) {
|
||||
Date date = null;
|
||||
for (int i=0; i<dateFormat.length; i++) {
|
||||
try {
|
||||
date = dateFormat[i].parse(dateStr.trim());
|
||||
return date;
|
||||
} catch (ParseException e) {
|
||||
}
|
||||
}
|
||||
r.close();
|
||||
addBytes(bodyBuf.length());
|
||||
|
||||
DocData dd = new DocData();
|
||||
|
||||
try {
|
||||
dd.date = dateFormat.parse(dateStr.trim());
|
||||
} catch (ParseException e) {
|
||||
// do not fail test just because a date could not be parsed
|
||||
System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
|
||||
dd.date = new Date(); // now
|
||||
}
|
||||
dd.name = name;
|
||||
dd.title = title;
|
||||
dd.body = bodyBuf.toString();
|
||||
dd.props = props;
|
||||
return dd;
|
||||
// do not fail test just because a date could not be parsed
|
||||
System.out.println("ignoring date parse exception (assigning 'now') for: "+dateStr);
|
||||
date = new Date(); // now
|
||||
return date;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -149,6 +149,19 @@ Assume you added the class "WonderfulTask" - doing so also enables the
|
|||
command "Wonderful" to be used in the algorithm.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
<u>External classes</u>: It is sometimes useful to invoke the benchmark
|
||||
package with your external alg file that configures the use of your own
|
||||
doc/query maker and/or html parser. You can work this out without
|
||||
modifying the benchmark package code, by passing your class path
|
||||
with the benchmark.ext.classpath property:
|
||||
<ul>
|
||||
<li>ant run-task -Dtask.alg=[full-path-to-your-alg-file]
|
||||
<font color="#FF0000">-Dbenchmark.ext.classpath=/mydir/classes
|
||||
</font> -Dtask.mem=512M</li>
|
||||
</ul>
|
||||
</p>
|
||||
|
||||
<a name="algorithm"></a>
|
||||
<h2>Benchmark "algorithm"</h2>
|
||||
|
||||
|
@ -198,6 +211,14 @@ The following is an informal description of the supported syntax.
|
|||
30 times in a row.
|
||||
<br>Example - <font color="#FF0066">{ AddDoc AddDoc } : 30</font> - would do
|
||||
addDoc 60 times in a row.
|
||||
<br><b>Exhaustive repeating</b>: use <font color="#FF0066">*</font> instead of
|
||||
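a number to repeat until the doc maker is exhausted (this requires doc.maker.forever=false; with the default of true the doc maker never runs out and the sequence repeats forever).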
|
||||
This is sometimes useful, for adding as many files as a doc maker can create,
|
||||
without iterating over the same files again, but in the case that the exact
|
||||
number of files is not known in advance. For instance, TREC files extracted
|
||||
from a zip file.
|
||||
<br>Example - <font color="#FF0066">{ AddDoc } : *</font> - would add docs
|
||||
until the doc maker is "exhausted".
|
||||
</li>
|
||||
<li>
|
||||
<b>Command parameter</b>: a command can optionally take a single parameter.
|
||||
|
@ -487,6 +508,8 @@ Here is a list of currently defined properties:
|
|||
<li><b>Docs and queries creation:</b></li>
|
||||
<ul><li>analyzer
|
||||
</li><li>doc.maker
|
||||
</li><li>doc.maker.forever
|
||||
</li><li>html.parser
|
||||
</li><li>doc.stored
|
||||
</li><li>doc.tokenized
|
||||
</li><li>doc.term.vector
|
||||
|
|
|
@ -21,11 +21,13 @@ import java.util.ArrayList;
|
|||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.PerfRunData;
|
||||
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
|
||||
|
||||
/**
|
||||
* Sequence of parallel or sequential tasks.
|
||||
*/
|
||||
public class TaskSequence extends PerfTask {
|
||||
public static int REPEAT_EXHAUST = -2;
|
||||
private ArrayList tasks;
|
||||
private int repetitions = 1;
|
||||
private boolean parallel;
|
||||
|
@ -61,9 +63,13 @@ public class TaskSequence extends PerfTask {
|
|||
|
||||
/**
|
||||
* @param repetitions The repetitions to set.
|
||||
* @throws Exception
|
||||
*/
|
||||
public void setRepetitions(int repetitions) {
|
||||
public void setRepetitions(int repetitions) throws Exception {
|
||||
this.repetitions = repetitions;
|
||||
if (repetitions==REPEAT_EXHAUST && isParallel()) {
|
||||
throw new Exception("REPEAT_EXHAUST is not allowed for parallel tasks");
|
||||
}
|
||||
setSequenceName();
|
||||
}
|
||||
|
||||
|
@ -88,10 +94,15 @@ public class TaskSequence extends PerfTask {
|
|||
}
|
||||
|
||||
int count = 0;
|
||||
for (int k=0; k<repetitions; k++) {
|
||||
boolean exhausted = false;
|
||||
for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
|
||||
for (Iterator it = tasks.iterator(); it.hasNext();) {
|
||||
PerfTask task = (PerfTask) it.next();
|
||||
count += task.runAndMaybeStats(letChildReport);
|
||||
try {
|
||||
count += task.runAndMaybeStats(letChildReport);
|
||||
} catch (NoMoreDataException e) {
|
||||
exhausted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
|
@ -101,7 +112,8 @@ public class TaskSequence extends PerfTask {
|
|||
long delayStep = (perMin ? 60000 : 1000) /rate;
|
||||
long nextStartTime = System.currentTimeMillis();
|
||||
int count = 0;
|
||||
for (int k=0; k<repetitions; k++) {
|
||||
boolean exhausted = false;
|
||||
for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k<repetitions; k++) {
|
||||
for (Iterator it = tasks.iterator(); it.hasNext();) {
|
||||
PerfTask task = (PerfTask) it.next();
|
||||
long waitMore = nextStartTime - System.currentTimeMillis();
|
||||
|
@ -110,7 +122,11 @@ public class TaskSequence extends PerfTask {
|
|||
Thread.sleep(waitMore);
|
||||
}
|
||||
nextStartTime += delayStep; // this aims at average rate.
|
||||
count += task.runAndMaybeStats(letChildReport);
|
||||
try {
|
||||
count += task.runAndMaybeStats(letChildReport);
|
||||
} catch (NoMoreDataException e) {
|
||||
exhausted = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return count;
|
||||
|
@ -198,6 +214,9 @@ public class TaskSequence extends PerfTask {
|
|||
if (repetitions>1) {
|
||||
sb.append(" * " + repetitions);
|
||||
}
|
||||
if (repetitions==REPEAT_EXHAUST) {
|
||||
sb.append(" * EXHAUST");
|
||||
}
|
||||
if (rate>0) {
|
||||
sb.append(", rate: " + rate+"/"+(perMin?"min":"sec"));
|
||||
}
|
||||
|
@ -237,7 +256,9 @@ public class TaskSequence extends PerfTask {
|
|||
|
||||
private void setSequenceName() {
|
||||
seqName = super.getName();
|
||||
if (repetitions>1) {
|
||||
if (repetitions==REPEAT_EXHAUST) {
|
||||
seqName += "_Exhaust";
|
||||
} else if (repetitions>1) {
|
||||
seqName += "_"+repetitions;
|
||||
}
|
||||
if (rate>0) {
|
||||
|
|
|
@ -117,8 +117,12 @@ public class Algorithm {
|
|||
colonOk = false;
|
||||
// get repetitions number
|
||||
stok.nextToken();
|
||||
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
|
||||
((TaskSequence)prevTask).setRepetitions((int)stok.nval);
|
||||
if ((char)stok.ttype == '*') {
|
||||
((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST);
|
||||
} else {
|
||||
if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expexted repetitions number: - "+stok.toString());
|
||||
((TaskSequence)prevTask).setRepetitions((int)stok.nval);
|
||||
}
|
||||
// check for rate specification (ops/min)
|
||||
stok.nextToken();
|
||||
if (stok.ttype!=':') {
|
||||
|
|
|
@ -81,6 +81,49 @@ public class TestPerfTasksLogic extends TestCase {
|
|||
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Exhausting Doc Maker logic
|
||||
*/
|
||||
public void testExhaustDocMaker() throws Exception {
|
||||
// 1. alg definition (required in every "logic" test)
|
||||
String algLines[] = {
|
||||
"# ----- properties ",
|
||||
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker",
|
||||
"doc.add.log.step=1",
|
||||
"doc.term.vector=false",
|
||||
"doc.maker.forever=false",
|
||||
"directory=RAMDirectory",
|
||||
"doc.stored=false",
|
||||
"doc.tokenized=false",
|
||||
"# ----- alg ",
|
||||
"CreateIndex",
|
||||
"{ AddDoc } : * ",
|
||||
"Optimize",
|
||||
"CloseIndex",
|
||||
"OpenReader",
|
||||
"{ CountingSearchTest } : 100",
|
||||
"CloseReader",
|
||||
"[ CountingSearchTest > : 30",
|
||||
"[ CountingSearchTest > : 9",
|
||||
};
|
||||
|
||||
// 2. we test this value later
|
||||
CountingSearchTestTask.numSearches = 0;
|
||||
|
||||
// 3. execute the algorithm (required in every "logic" test)
|
||||
Benchmark benchmark = execBenchmark(algLines);
|
||||
|
||||
// 4. test specific checks after the benchmark run completed.
|
||||
assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches);
|
||||
assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory()));
|
||||
// now we should be able to open the index for write.
|
||||
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
|
||||
iw.close();
|
||||
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
|
||||
assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
|
||||
}
|
||||
|
||||
|
||||
// create the benchmark and execute it.
|
||||
private Benchmark execBenchmark(String[] algLines) throws Exception {
|
||||
String algText = algLinesToText(algLines);
|
||||
|
|
Loading…
Reference in New Issue