diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt
index 7e20b421831..e50d747585f 100644
--- a/contrib/benchmark/CHANGES.txt
+++ b/contrib/benchmark/CHANGES.txt
@@ -4,6 +4,11 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
+7/24/07
+ LUCENE-947: Add support for creating and indexing "one document
+ per line" files built from a large text file, which reduces the
+ per-document overhead of opening a single file for each document.
+
6/30/07
LUCENE-848: Added support for Wikipedia benchmarking.
diff --git a/contrib/benchmark/conf/createLineFile.alg b/contrib/benchmark/conf/createLineFile.alg
new file mode 100644
index 00000000000..dae602811ea
--- /dev/null
+++ b/contrib/benchmark/conf/createLineFile.alg
@@ -0,0 +1,43 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg will process the Reuters documents feed to produce a
+# single file that contains all documents, one per line.
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+# ant run-task -Dtask.alg=conf/createLineFile.alg
+#
+# Then, to index the documents in the line file, see
+# indexLineFile.alg.
+#
+
+# Where to get documents from:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker
+
+# Where to write the line file output:
+line.file.out=work/reuters.lines.txt
+
+# Stop after processing the document feed once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents, appending each one to the line file:
+{WriteLineDoc()}: *
diff --git a/contrib/benchmark/conf/indexLineFile.alg b/contrib/benchmark/conf/indexLineFile.alg
new file mode 100644
index 00000000000..52c4af1e0c6
--- /dev/null
+++ b/contrib/benchmark/conf/indexLineFile.alg
@@ -0,0 +1,53 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This file indexes documents contained in a single text file, one per
+# line. See createLineFile.alg for how to create this file. The
+# benefit of this is that it removes the IO cost of opening one file
+# per document, letting you more accurately measure time spent
+# analyzing and indexing your documents vs. time spent creating them.
+#
+# To use this, you must first run the createLineFile.alg, then cd to
+# contrib/benchmark and then run:
+#
+# ant run-task -Dtask.alg=conf/indexLineFile.alg
+#
+
+analyzer=org.apache.lucene.analysis.SimpleAnalyzer
+
+# Feed that knows how to process the line file format:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker
+
+# File that contains one document per line:
+docs.file=work/reuters.lines.txt
+
+# Process documents only once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Reset the system, create a new index, index all docs from the line
+# file, close the index, produce a report.
+
+ResetSystemErase
+CreateIndex
+{AddDoc}: *
+CloseIndex
+
+RepSumByPref AddDoc
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
index 9e6bd9ba1ae..8af30f16e27 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
@@ -70,6 +70,7 @@ public class PerfRunData {
private IndexReader indexReader;
private IndexWriter indexWriter;
private Config config;
+ private long startTimeMillis;
// constructor
public PerfRunData (Config config) throws Exception {
@@ -136,6 +137,15 @@ public class PerfRunData {
// release unused stuff
System.runFinalization();
System.gc();
+
+ startTimeMillis = System.currentTimeMillis();
+ }
+
+ /**
+ * @return Start time in milliseconds
+ */
+ public long getStartTimeMillis() {
+ return startTimeMillis;
}
/**
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
index 6e16f33b152..1fe45a01a25 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
@@ -39,6 +39,8 @@ import java.util.Iterator;
* doc.stored=true|FALSE
* doc.tokenized=TRUE|false
* doc.term.vector=true|FALSE
+ * doc.term.vector.positions=true|FALSE
+ * doc.term.vector.offsets=true|FALSE
* doc.store.body.bytes=true|FALSE //Store the body contents raw UTF-8 bytes as a field
*/
public abstract class BasicDocMaker implements DocMaker {
@@ -55,7 +57,13 @@ public abstract class BasicDocMaker implements DocMaker {
// leftovers are thread local, because it is unsafe to share residues between threads
private ThreadLocal leftovr = new ThreadLocal();
- static final String BODY_FIELD = "body";
+ public static final String BODY_FIELD = "body";
+ public static final String TITLE_FIELD = "doctitle";
+ public static final String DATE_FIELD = "docdate";
+ public static final String ID_FIELD = "docid";
+ public static final String BYTES_FIELD = "bytes";
+ public static final String NAME_FIELD = "docname";
+
private long numBytes = 0;
private long numUniqueBytes = 0;
@@ -97,17 +105,17 @@ public abstract class BasicDocMaker implements DocMaker {
private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
int docid = incrNumDocsCreated();
Document doc = new Document();
- doc.add(new Field("docid", "doc"+docid, storeVal, indexVal, termVecVal));
+ doc.add(new Field(ID_FIELD, "doc"+docid, storeVal, indexVal, termVecVal));
if (docData.getName()!=null) {
String name = (cnt<0 ? docData.getName() : docData.getName()+"_"+cnt);
- doc.add(new Field("docname", name, storeVal, indexVal, termVecVal));
+ doc.add(new Field(NAME_FIELD, name, storeVal, indexVal, termVecVal));
}
if (docData.getDate()!=null) {
String dateStr = DateTools.dateToString(docData.getDate(), DateTools.Resolution.SECOND);
- doc.add(new Field("docdate", dateStr, storeVal, indexVal, termVecVal));
+ doc.add(new Field(DATE_FIELD, dateStr, storeVal, indexVal, termVecVal));
}
if (docData.getTitle()!=null) {
- doc.add(new Field("doctitle", docData.getTitle(), storeVal, indexVal, termVecVal));
+ doc.add(new Field(TITLE_FIELD, docData.getTitle(), storeVal, indexVal, termVecVal));
}
if (docData.getBody()!=null && docData.getBody().length()>0) {
String bdy;
@@ -127,7 +135,7 @@ public abstract class BasicDocMaker implements DocMaker {
}
doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
if (storeBytes == true) {
- doc.add(new Field("bytes", bdy.getBytes("UTF-8"), Field.Store.YES));
+ doc.add(new Field(BYTES_FIELD, bdy.getBytes("UTF-8"), Field.Store.YES));
}
}
@@ -188,7 +196,18 @@ public abstract class BasicDocMaker implements DocMaker {
boolean termVec = config.get("doc.term.vector",false);
storeVal = (stored ? Field.Store.YES : Field.Store.NO);
indexVal = (tokenized ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED);
- termVecVal = (termVec ? Field.TermVector.YES : Field.TermVector.NO);
+ boolean termVecPositions = config.get("doc.term.vector.positions",false);
+ boolean termVecOffsets = config.get("doc.term.vector.offsets",false);
+ if (termVecPositions && termVecOffsets)
+ termVecVal = Field.TermVector.WITH_POSITIONS_OFFSETS;
+ else if (termVecPositions)
+ termVecVal = Field.TermVector.WITH_POSITIONS;
+ else if (termVecOffsets)
+ termVecVal = Field.TermVector.WITH_OFFSETS;
+ else if (termVec)
+ termVecVal = Field.TermVector.YES;
+ else
+ termVecVal = Field.TermVector.NO;
storeBytes = config.get("doc.store.body.bytes", false);
forever = config.get("doc.maker.forever",true);
}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
index 8c7861cba9d..651a828eed7 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirDocMaker.java
@@ -40,7 +40,7 @@ import java.util.Stack;
*/
public class DirDocMaker extends BasicDocMaker {
- private DateFormat dateFormat;
+ private ThreadLocal dateFormat = new ThreadLocal();
private File dataDir = null;
private int iteration=0;
@@ -148,11 +148,21 @@ public class DirDocMaker extends BasicDocMaker {
if (inputFiles==null) {
throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
}
- // date format: 30-MAR-1987 14:22:36
- dateFormat = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss",Locale.US);
- dateFormat.setLenient(true);
}
+  // Get or lazily initialize the thread-local date format (required
+  // because SimpleDateFormat is not thread-safe).
+ protected DateFormat getDateFormat () {
+ DateFormat df = (DateFormat) dateFormat.get();
+ if (df == null) {
+ // date format: 30-MAR-1987 14:22:36.87
+ df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
+ df.setLenient(true);
+ dateFormat.set(df);
+ }
+ return df;
+ }
+
protected DocData getNextDocData() throws Exception {
File f = null;
String name = null;
@@ -184,7 +194,7 @@ public class DirDocMaker extends BasicDocMaker {
reader.close();
addBytes(f.length());
- Date date = dateFormat.parse(dateStr.trim());
+ Date date = getDateFormat().parse(dateStr.trim());
return new DocData(name, bodyBuf.toString(), title, null, date);
}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
index 4956acca8e3..b357f676760 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
@@ -46,7 +46,7 @@ public class FileBasedQueryMaker extends AbstractQueryMaker implements QueryMake
Analyzer anlzr = (Analyzer) Class.forName(config.get("analyzer",
"org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
- String defaultField = config.get("file.query.maker.default.field", "body");
+ String defaultField = config.get("file.query.maker.default.field", BasicDocMaker.BODY_FIELD);
QueryParser qp = new QueryParser(defaultField, anlzr);
List qq = new ArrayList();
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
new file mode 100644
index 00000000000..0bdc6787626
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
@@ -0,0 +1,159 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.FileReader;
+
+/**
+ * A DocMaker reading one line at a time as a Document from
+ * a single file. This saves IO cost (over DirDocMaker) of
+ * recursing through a directory and opening a new file for
+ * every document. It also re-uses its Document and Field
+ * instance to improve indexing speed.
+ *
+ * Config properties:
+ * docs.file=&lt;path to the file&gt;
+ */
+public class LineDocMaker extends BasicDocMaker {
+
+ private BufferedReader fileIn;
+ private ThreadLocal docState = new ThreadLocal();
+ private String fileName;
+
+ private static int READER_BUFFER_BYTES = 64*1024;
+
+ private class DocState {
+ Document doc;
+ Field bodyField;
+ Field titleField;
+ Field dateField;
+
+ public DocState() {
+
+ bodyField = new Field(BasicDocMaker.BODY_FIELD,
+ "",
+ storeVal,
+ Field.Index.TOKENIZED,
+ termVecVal);
+ titleField = new Field(BasicDocMaker.TITLE_FIELD,
+ "",
+ storeVal,
+ Field.Index.TOKENIZED,
+ termVecVal);
+      dateField = new Field(BasicDocMaker.DATE_FIELD,
+ "",
+ storeVal,
+ Field.Index.TOKENIZED,
+ termVecVal);
+
+ doc = new Document();
+ doc.add(bodyField);
+ doc.add(titleField);
+ doc.add(dateField);
+ }
+
+ final static String SEP = WriteLineDocTask.SEP;
+
+ public Document setFields(String line) {
+ // title date body
+ int spot = line.indexOf(SEP);
+ titleField.setValue(line.substring(0, spot));
+ int spot2 = line.indexOf(SEP, 1+spot);
+ dateField.setValue(line.substring(1+spot, spot2));
+ bodyField.setValue(line.substring(1+spot2, line.length()));
+ return doc;
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see SimpleDocMaker#setConfig(java.util.Properties)
+ */
+ public void setConfig(Config config) {
+ super.setConfig(config);
+ resetInputs();
+ }
+
+ protected DocData getNextDocData() throws Exception {
+ throw new RuntimeException("not implemented");
+ }
+
+ private DocState getDocState() {
+ DocState ds = (DocState) docState.get();
+ if (ds == null) {
+ ds = new DocState();
+ docState.set(ds);
+ }
+ return ds;
+ }
+
+ public Document makeDocument() throws Exception {
+
+ String line;
+ synchronized(this) {
+ while(true) {
+ line = fileIn.readLine();
+ if (line == null) {
+ if (!forever)
+ throw new NoMoreDataException();
+ else {
+ // Reset the file
+ openFile();
+ }
+ } else {
+ break;
+ }
+ }
+ }
+
+ return getDocState().setFields(line);
+ }
+
+ public Document makeDocument(int size) throws Exception {
+ throw new RuntimeException("cannot change document size with LineDocMaker; please use DirDocMaker instead");
+ }
+
+ public synchronized void resetInputs() {
+ super.resetInputs();
+ fileName = config.get("docs.file", null);
+ if (fileName == null)
+ throw new RuntimeException("docs.file must be set");
+ openFile();
+ }
+
+ private void openFile() {
+ try {
+ if (fileIn != null)
+ fileIn.close();
+ fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public int numUniqueTexts() {
+ return -1;
+ }
+}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
index 5c2d940e5b2..0b0081b4e44 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersQueryMaker.java
@@ -71,7 +71,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker
* @return array of Lucene queries
*/
private static Query[] createQueries(List qs, Analyzer a) {
- QueryParser qp = new QueryParser("body", a);
+ QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD, a);
List queries = new ArrayList();
for (int i = 0; i < qs.size(); i++) {
try {
@@ -107,7 +107,7 @@ public class ReutersQueryMaker extends AbstractQueryMaker implements QueryMaker
List queryList = new ArrayList(20);
queryList.addAll(Arrays.asList(STANDARD_QUERIES));
- queryList.addAll(Arrays.asList(getPrebuiltQueries("body")));
+ queryList.addAll(Arrays.asList(getPrebuiltQueries(BasicDocMaker.BODY_FIELD)));
return createQueries(queryList, anlzr);
}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
index 6468e2e0708..638fbd01438 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/SimpleQueryMaker.java
@@ -45,11 +45,11 @@ public class SimpleQueryMaker extends AbstractQueryMaker implements QueryMaker {
Analyzer anlzr= (Analyzer) Class.forName(config.get("analyzer",
"org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance();
- QueryParser qp = new QueryParser("body",anlzr);
+ QueryParser qp = new QueryParser(BasicDocMaker.BODY_FIELD,anlzr);
ArrayList qq = new ArrayList();
- Query q1 = new TermQuery(new Term("docid","doc2"));
+ Query q1 = new TermQuery(new Term(BasicDocMaker.ID_FIELD,"doc2"));
qq.add(q1);
- Query q2 = new TermQuery(new Term("body","simple"));
+ Query q2 = new TermQuery(new Term(BasicDocMaker.BODY_FIELD,"simple"));
qq.add(q2);
BooleanQuery bq = new BooleanQuery();
bq.add(q1,Occur.MUST);
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
index ea5bc00fd68..a19c6baee06 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html
@@ -519,6 +519,8 @@ Here is a list of currently defined properties:
doc.stored
doc.tokenized
doc.term.vector
+ doc.term.vector.positions
+ doc.term.vector.offsets
doc.store.body.bytes
docs.dir
query.maker
@@ -540,6 +542,8 @@ Here is a list of currently defined properties:
merge.factor
max.buffered
directory
+ ram.flush.mb
+ autocommit
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
index e7c023d249d..4af39963641 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
+import java.text.NumberFormat;
/**
@@ -81,7 +82,10 @@ public class AddDocTask extends PerfTask {
logStep = getRunData().getConfig().get("doc.add.log.step",DEFAULT_ADD_DOC_LOG_STEP);
}
if (logStep>0 && (count%logStep)==0) {
- System.out.println("--> "+Thread.currentThread().getName()+" processed (add) "+count+" docs");
+ double seconds = (System.currentTimeMillis() - getRunData().getStartTimeMillis())/1000.0;
+ NumberFormat nf = NumberFormat.getInstance();
+ nf.setMaximumFractionDigits(2);
+ System.out.println("--> "+nf.format(seconds) + " sec: " + Thread.currentThread().getName()+" processed (add) "+count+" docs");
}
}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
index 5093012678c..4c3cd48fccd 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
@@ -30,7 +30,8 @@ import java.io.IOException;
* Create an index.
*
Other side effects: index writer object in perfRunData is set.
*
Relevant properties: merge.factor, max.buffered,
- * max.field.length
.
+ * max.field.length, ram.flush.mb [default 0], autocommit
+ * [default true].
*/
public class CreateIndexTask extends PerfTask {
@@ -42,19 +43,23 @@ public class CreateIndexTask extends PerfTask {
Directory dir = getRunData().getDirectory();
Analyzer analyzer = getRunData().getAnalyzer();
- IndexWriter iw = new IndexWriter(dir, analyzer, true);
-
Config config = getRunData().getConfig();
boolean cmpnd = config.get("compound",true);
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
+ double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+ boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
+ IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
+
iw.setUseCompoundFile(cmpnd);
iw.setMergeFactor(mrgf);
iw.setMaxBufferedDocs(mxbf);
iw.setMaxFieldLength(mxfl);
+ if (flushAtRAMUsage > 0)
+ iw.setRAMBufferSizeMB(flushAtRAMUsage);
getRunData().setIndexWriter(iw);
return 1;
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
index e60edb86f9d..7e1b1b2ea1b 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
@@ -30,14 +30,16 @@ import java.io.IOException;
* Open an index writer.
*
Other side effects: index writer object in perfRunData is set.
*
Relevant properties: merge.factor, max.buffered,
- * max.field.length
.
-.
+ * max.field.length, ram.flush.mb [default 0], autocommit
+ * [default true].
*/
public class OpenIndexTask extends PerfTask {
public static final int DEFAULT_MAX_BUFFERED = 10;
public static final int DEFAULT_MAX_FIELD_LENGTH = 10000;
public static final int DEFAULT_MERGE_PFACTOR = 10;
+ public static final int DEFAULT_RAM_FLUSH_MB = 0;
+ public static final boolean DEFAULT_AUTO_COMMIT = true;
public OpenIndexTask(PerfRunData runData) {
super(runData);
@@ -46,7 +48,6 @@ public class OpenIndexTask extends PerfTask {
public int doLogic() throws IOException {
Directory dir = getRunData().getDirectory();
Analyzer analyzer = getRunData().getAnalyzer();
- IndexWriter writer = new IndexWriter(dir, analyzer, false);
Config config = getRunData().getConfig();
@@ -54,12 +55,17 @@ public class OpenIndexTask extends PerfTask {
int mrgf = config.get("merge.factor",DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",DEFAULT_MAX_FIELD_LENGTH);
+ double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+ boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
+ IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false);
// must update params for newly opened writer
writer.setMaxBufferedDocs(mxbf);
writer.setMaxFieldLength(mxfl);
writer.setMergeFactor(mrgf);
writer.setUseCompoundFile(cmpnd); // this one redundant?
+ if (flushAtRAMUsage > 0)
+ writer.setRAMBufferSizeMB(flushAtRAMUsage);
getRunData().setIndexWriter(writer);
return 1;
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
new file mode 100644
index 00000000000..97c57c11ee0
--- /dev/null
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
@@ -0,0 +1,137 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
+import org.apache.lucene.benchmark.byTask.feeds.BasicDocMaker;
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+
+public class WriteLineDocTask extends PerfTask {
+
+ /**
+   * Default value for property doc.writeline.log.step - indicating how
+   * often a progress message should be logged.
+ */
+ public static final int DEFAULT_WRITELINE_DOC_LOG_STEP = 1000;
+
+ public WriteLineDocTask(PerfRunData runData) {
+ super(runData);
+ }
+
+ private int logStep = -1;
+ private int docSize = 0;
+ int count = 0;
+ private BufferedWriter lineFileOut=null;
+ private DocMaker docMaker;
+
+ public final static String SEP = "\t";
+
+ /*
+ * (non-Javadoc)
+ * @see PerfTask#setup()
+ */
+ public void setup() throws Exception {
+ super.setup();
+ if (lineFileOut==null) {
+ Config config = getRunData().getConfig();
+ String fileName = config.get("line.file.out", null);
+ if (fileName == null)
+ throw new Exception("line.file.out must be set");
+ lineFileOut = new BufferedWriter(new FileWriter(fileName));
+ }
+ docMaker = getRunData().getDocMaker();
+ }
+
+ public void tearDown() throws Exception {
+ log(++count);
+ super.tearDown();
+ }
+
+ public int doLogic() throws Exception {
+ Document doc;
+ if (docSize > 0) {
+ doc = docMaker.makeDocument(docSize);
+ } else {
+ doc = docMaker.makeDocument();
+ }
+
+ Field f = doc.getField(BasicDocMaker.BODY_FIELD);
+
+ String body, title, date;
+ if (f != null)
+ body = f.stringValue().replace('\t', ' ');
+ else
+ body = null;
+
+ f = doc.getField(BasicDocMaker.TITLE_FIELD);
+ if (f != null)
+ title = f.stringValue().replace('\t', ' ');
+ else
+ title = "";
+
+ f = doc.getField(BasicDocMaker.DATE_FIELD);
+ if (f != null)
+ date = f.stringValue().replace('\t', ' ');
+ else
+ date = "";
+
+ if (body != null) {
+ lineFileOut.write(title, 0, title.length());
+ lineFileOut.write(SEP);
+ lineFileOut.write(date, 0, date.length());
+ lineFileOut.write(SEP);
+ lineFileOut.write(body, 0, body.length());
+ lineFileOut.newLine();
+ lineFileOut.flush();
+ }
+ return 1;
+ }
+
+ private void log (int count) {
+ if (logStep<0) {
+ // init once per instance
+ logStep = getRunData().getConfig().get("doc.writeline.log.step", DEFAULT_WRITELINE_DOC_LOG_STEP);
+ }
+ if (logStep>0 && (count%logStep)==0) {
+      System.out.println("--> "+Thread.currentThread().getName()+" processed (write line) "+count+" docs");
+ }
+ }
+
+ /**
+ * Set the params (docSize only)
+ * @param params docSize, or 0 for no limit.
+ */
+ public void setParams(String params) {
+ super.setParams(params);
+ docSize = (int) Float.parseFloat(params);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
+ */
+ public boolean supportsParams() {
+ return true;
+ }
+}
diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
index af6f0b161bd..798786b53b9 100644
--- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
+++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
@@ -22,6 +22,8 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
+import java.util.List;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Properties;
@@ -110,7 +112,9 @@ public class Config {
private void printProps() {
System.out.println("------------> config properties:");
- for (Iterator it = props.keySet().iterator(); it.hasNext();) {
+ List propKeys = new ArrayList(props.keySet());
+ Collections.sort(propKeys);
+ for (Iterator it = propKeys.iterator(); it.hasNext();) {
String propName = (String) it.next();
System.out.println(propName + " = " + props.getProperty(propName));
}
diff --git a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
index 907c9902e76..2cd86d4641f 100755
--- a/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@@ -18,6 +18,9 @@
package org.apache.lucene.benchmark.byTask;
import java.io.StringReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.BufferedReader;
import org.apache.lucene.benchmark.byTask.Benchmark;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
@@ -79,6 +82,7 @@ public class TestPerfTasksLogic extends TestCase {
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
+ ir.close();
}
/**
@@ -121,6 +125,7 @@ public class TestPerfTasksLogic extends TestCase {
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs());
+ ir.close();
}
/**
@@ -150,6 +155,69 @@ public class TestPerfTasksLogic extends TestCase {
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
int ndocsExpected = 21578; // that's how many docs there are in the Reuters collecton.
assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
+ ir.close();
+ }
+
+ /**
+ * Test WriteLineDoc and LineDocMaker.
+ */
+ public void testLineDocFile() throws Exception {
+ File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt");
+
+ // We will call WriteLineDocs this many times
+ final int NUM_TRY_DOCS = 500;
+
+ // Creates a line file with first 500 docs from reuters
+ String algLines1[] = {
+ "# ----- properties ",
+ "doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker",
+ "doc.maker.forever=false",
+ "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'),
+ "# ----- alg ",
+ "{WriteLineDoc()}:" + NUM_TRY_DOCS,
+ };
+
+ // Run algo
+ Benchmark benchmark = execBenchmark(algLines1);
+
+ // Verify we got somewhere between 1-500 lines (some
+ // Reuters docs have no body, which WriteLineDoc task
+ // skips).
+ BufferedReader r = new BufferedReader(new FileReader(lineFile));
+ int numLines = 0;
+ while(r.readLine() != null)
+ numLines++;
+ r.close();
+ assertTrue("did not see the right number of docs; should be > 0 and <= " + NUM_TRY_DOCS + " but was " + numLines, numLines > 0 && numLines <= NUM_TRY_DOCS);
+
+ // Index the line docs
+ String algLines2[] = {
+ "# ----- properties ",
+ "analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
+ "doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
+ "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
+ "doc.maker.forever=false",
+ "autocommit=false",
+ "ram.flush.mb=4",
+ "# ----- alg ",
+ "ResetSystemErase",
+ "CreateIndex",
+ "{AddDoc}: *",
+ "CloseIndex",
+ };
+
+ // Run algo
+ benchmark = execBenchmark(algLines2);
+
+ // now we should be able to open the index for write.
+ IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false);
+ iw.close();
+
+ IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
+    assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
+ ir.close();
+
+ lineFile.delete();
}
// create the benchmark and execute it.