LCUENE-1716: allow control over storage of norms (body norms), info stream and whether docs properties should be indexed as fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@788777 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-06-26 17:26:54 +00:00
parent d820e34a26
commit c7f865a4c7
9 changed files with 339 additions and 31 deletions

View File

@ -3,6 +3,16 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
$Id:$
6/26/2009
LUCENE-1716: Added the following support:
doc.tokenized.norms: specifies whether to store norms
doc.body.tokenized.norms: special attribute for the body field
doc.index.props: specifies whether DocMaker should index the properties set on
DocData
writer.info.stream: specifies the info stream to set on IndexWriter (supported
values are: SystemOut, SystemErr and a file name). (Shai Erera via Mike McCandless)
6/23/09
LUCENE-1714: WriteLineDocTask incorrectly normalized text, by replacing only
occurrences of "\t" with a space. It now replaces "\r\n" in addition to that,

View File

@ -43,6 +43,12 @@ import org.apache.lucene.document.Field.TermVector;
* <b>false</b>).
* <li><b>doc.tokenized</b> - specifies whether fields should be tokenized
* (default <b>true</b>).
* <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
* the index or not. (default <b>false</b>).
* <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
* stored in the index for the body field. This can be set to true, while
* <code>doc.tokenized.norms</code> is set to false, to allow norms storing just
* for the body field. (default <b>true</b>).
* <li><b>doc.term.vector</b> - specifies whether term vectors should be stored
* for fields (default <b>false</b>).
* <li><b>doc.term.vector.positions</b> - specifies whether term vectors should
@ -53,6 +59,8 @@ import org.apache.lucene.document.Field.TermVector;
* the document's content in the document (default <b>false</b>).
* <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
* should be reused (default <b>true</b>).
* <li><b>doc.index.props</b> - specifies whether the properties returned by
* {@link DocData#getProps()} will be indexed. (default <b>false</b>).
* </ul>
*/
public class DocMaker {
@ -69,7 +77,7 @@ public class DocMaker {
Document doc;
DocData docData = new DocData();
public DocState(boolean reuseFields, Store store, Index index, TermVector termVector) {
public DocState(boolean reuseFields, Store store, Index index, Index bodyIndex, TermVector termVector) {
this.reuseFields = reuseFields;
@ -77,7 +85,7 @@ public class DocMaker {
fields = new HashMap();
// Initialize the map with the default fields.
fields.put(BODY_FIELD, new Field(BODY_FIELD, "", store, index, termVector));
fields.put(BODY_FIELD, new Field(BODY_FIELD, "", store, bodyIndex, termVector));
fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", store, index, termVector));
fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
@ -123,12 +131,14 @@ public class DocMaker {
protected Config config;
protected Store storeVal = Store.NO;
protected Index indexVal = Index.ANALYZED;
protected Index indexVal = Index.ANALYZED_NO_NORMS;
protected Index bodyIndexVal = Index.ANALYZED;
protected TermVector termVecVal = TermVector.NO;
protected ContentSource source;
protected boolean reuseFields;
protected DocState localDocState;
protected boolean indexProperties;
private int lastPrintedNumUniqueTexts = 0;
@ -146,7 +156,7 @@ public class DocMaker {
doc.getFields().clear();
// Set ID_FIELD
Field idField = ds.getField(ID_FIELD, storeVal, indexVal, termVecVal);
Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
idField.setValue("doc" + docid);
doc.add(idField);
@ -190,7 +200,7 @@ public class DocMaker {
bdy = body.substring(0, size); // use part
docData.setBody(body.substring(size)); // some left
}
Field bodyField = ds.getField(BODY_FIELD, storeVal, indexVal, termVecVal);
Field bodyField = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
bodyField.setValue(bdy);
doc.add(bodyField);
@ -201,16 +211,19 @@ public class DocMaker {
}
}
Properties props = docData.getProps();
if (props != null) {
for (Iterator iterator = props.entrySet().iterator(); iterator.hasNext();) {
Entry entry = (Entry) iterator.next();
Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
f.setValue((String) entry.getValue());
doc.add(f);
if (indexProperties) {
Properties props = docData.getProps();
if (props != null) {
for (Iterator iterator = props.entrySet().iterator(); iterator.hasNext();) {
Entry entry = (Entry) iterator.next();
Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
f.setValue((String) entry.getValue());
doc.add(f);
}
docData.setProps(null);
}
docData.setProps(null);
}
//System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
return doc;
}
@ -222,7 +235,7 @@ public class DocMaker {
protected DocState getDocState() {
DocState ds = (DocState) docState.get();
if (ds == null) {
ds = new DocState(true, storeVal, indexVal, termVecVal);
ds = new DocState(true, storeVal, indexVal, bodyIndexVal, termVecVal);
docState.set(ds);
}
return ds;
@ -357,9 +370,17 @@ public class DocMaker {
boolean stored = config.get("doc.stored", false);
boolean tokenized = config.get("doc.tokenized", true);
boolean norms = config.get("doc.tokenized.norms", false);
boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
boolean termVec = config.get("doc.term.vector", false);
storeVal = (stored ? Field.Store.YES : Field.Store.NO);
indexVal = (tokenized ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED);
if (tokenized) {
indexVal = norms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
bodyIndexVal = bodyNorms ? Index.ANALYZED : Index.ANALYZED_NO_NORMS;
} else {
indexVal = norms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;
bodyIndexVal = bodyNorms ? Index.NOT_ANALYZED : Index.NOT_ANALYZED_NO_NORMS;
}
boolean termVecPositions = config.get("doc.term.vector.positions", false);
boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
if (termVecPositions && termVecOffsets) {
@ -377,13 +398,15 @@ public class DocMaker {
reuseFields = config.get("doc.reuse.fields", true);
if (!reuseFields) {
localDocState = new DocState(false, storeVal, indexVal, termVecVal);
localDocState = new DocState(false, storeVal, indexVal, bodyIndexVal, termVecVal);
} else {
// In a multi-rounds run, it is important to reset DocState since settings
// of fields may change between rounds, and this is the only way to reset
// the cache of all threads.
docState = new ThreadLocal();
}
indexProperties = config.get("doc.index.props", false);
}
}

View File

@ -37,15 +37,15 @@ public class EnwikiDocMaker extends DocMaker {
Document doc = reuseFields ? ds.doc : new Document();
doc.getFields().clear();
Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal);
Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
body.setValue(dd.getBody());
doc.add(body);
Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal);
Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
title.setValue(dd.getTitle());
doc.add(title);
Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal);
Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
date.setValue(dd.getDate());
doc.add(date);

View File

@ -55,15 +55,15 @@ public class LineDocMaker extends DocMaker {
Document doc = reuseFields ? ds.doc : new Document();
doc.getFields().clear();
Field body = ds.getField(BODY_FIELD, storeVal, Index.ANALYZED, termVecVal);
Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
body.setValue(dd.getBody());
doc.add(body);
Field title = ds.getField(TITLE_FIELD, storeVal, Index.ANALYZED, termVecVal);
Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
title.setValue(dd.getTitle());
doc.add(title);
Field date = ds.getField(DATE_FIELD, storeVal, Index.ANALYZED, termVecVal);
Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
date.setValue(dd.getDate());
doc.add(date);

View File

@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
*/
import java.io.IOException;
import java.io.PrintStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.index.IndexWriter;
@ -37,10 +38,16 @@ public class CloseIndexTask extends PerfTask {
public int doLogic() throws IOException {
IndexWriter iw = getRunData().getIndexWriter();
if (iw!=null) {
if (iw != null) {
// If infoStream was set to output to a file, close it.
PrintStream infoStream = iw.getInfoStream();
if (infoStream != null && infoStream != System.out
&& infoStream != System.err) {
infoStream.close();
}
iw.close(doWait);
getRunData().setIndexWriter(null);
}
getRunData().setIndexWriter(null);
return 1;
}

View File

@ -24,15 +24,31 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.MergePolicy;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
/**
* Create an index.
* <br>Other side effects: index writer object in perfRunData is set.
* <br>Relevant properties: <code>merge.factor, max.buffered,
* Create an index. <br>
* Other side effects: index writer object in perfRunData is set. <br>
* Relevant properties: <code>merge.factor, max.buffered,
* max.field.length, ram.flush.mb [default 0], autocommit
* [default true]</code>.
* <p>
* This task also supports a "writer.info.stream" property with the following
* values:
* <ul>
* <li>SystemOut - sets {@link IndexWriter#setInfoStream(java.io.PrintStream)}
* to {@link System#out}.
* <li>SystemErr - sets {@link IndexWriter#setInfoStream(java.io.PrintStream)}
* to {@link System#err}.
* <li>&lt;file_name&gt; - attempts to create a file given that name and sets
* {@link IndexWriter#setInfoStream(java.io.PrintStream)} to that file. If this
* denotes an invalid file name, or some error occurs, an exception will be
* thrown.
* </ul>
*/
public class CreateIndexTask extends PerfTask {
@ -62,7 +78,6 @@ public class CreateIndexTask extends PerfTask {
final String mergePolicy = config.get("merge.policy",
"org.apache.lucene.index.LogByteSizeMergePolicy");
err = null;
try {
writer.setMergePolicy((MergePolicy) Class.forName(mergePolicy).newInstance());
} catch (IllegalAccessException iae) {
@ -91,6 +106,18 @@ public class CreateIndexTask extends PerfTask {
writer.setMaxBufferedDocs(maxBuffered);
writer.setRAMBufferSizeMB(ramBuffer);
}
String infoStreamVal = config.get("writer.info.stream", null);
if (infoStreamVal != null) {
if (infoStreamVal.equals("SystemOut")) {
writer.setInfoStream(System.out);
} else if (infoStreamVal.equals("SystemErr")) {
writer.setInfoStream(System.err);
} else {
File f = new File(infoStreamVal).getAbsoluteFile();
writer.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f))));
}
}
}
public static IndexDeletionPolicy getIndexDeletionPolicy(Config config) {
@ -124,7 +151,7 @@ public class CreateIndexTask extends PerfTask {
runData.getConfig().get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT),
runData.getAnalyzer(),
true, indexDeletionPolicy);
CreateIndexTask.setIndexWriterConfig(writer, config);
setIndexWriterConfig(writer, config);
runData.setIndexWriter(writer);
return 1;
}

View File

@ -0,0 +1,161 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Properties;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
/** Tests the functionality of {@link DocMaker}. */
public class DocMakerTest extends BenchmarkTestCase {
static final class OneDocSource extends ContentSource {
private boolean finish = false;
public void close() throws IOException {
}
public DocData getNextDocData(DocData docData) throws NoMoreDataException,
IOException {
if (finish) {
throw new NoMoreDataException();
}
docData.setBody("body");
docData.setDate("date");
docData.setTitle("title");
Properties props = new Properties();
props.setProperty("key", "value");
docData.setProps(props);
finish = true;
return docData;
}
}
private void doTestIndexProperties(boolean setIndexProps,
boolean indexPropsVal, int numExpectedResults) throws Exception {
Properties props = new Properties();
// Indexing configuration.
props.setProperty("analyzer", SimpleAnalyzer.class.getName());
props.setProperty("content.source", OneDocSource.class.getName());
props.setProperty("directory", "RAMDirectory");
if (setIndexProps) {
props.setProperty("doc.index.props", Boolean.toString(indexPropsVal));
}
// Create PerfRunData
Config config = new Config(props);
PerfRunData runData = new PerfRunData(config);
TaskSequence tasks = new TaskSequence(runData, getName(), null, false);
tasks.addTask(new CreateIndexTask(runData));
tasks.addTask(new AddDocTask(runData));
tasks.addTask(new CloseIndexTask(runData));
tasks.doLogic();
IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
TopDocs td = searcher.search(new TermQuery(new Term("key", "value")), 10);
assertEquals(numExpectedResults, td.totalHits);
searcher.close();
}
private Document createTestNormsDocument(boolean setNormsProp,
boolean normsPropVal, boolean setBodyNormsProp, boolean bodyNormsVal)
throws Exception {
Properties props = new Properties();
// Indexing configuration.
props.setProperty("analyzer", SimpleAnalyzer.class.getName());
props.setProperty("content.source", OneDocSource.class.getName());
props.setProperty("directory", "RAMDirectory");
if (setNormsProp) {
props.setProperty("doc.tokenized.norms", Boolean.toString(normsPropVal));
}
if (setBodyNormsProp) {
props.setProperty("doc.body.tokenized.norms", Boolean.toString(bodyNormsVal));
}
// Create PerfRunData
Config config = new Config(props);
DocMaker dm = new DocMaker();
dm.setConfig(config);
return dm.makeDocument();
}
/* Tests doc.index.props property. */
public void testIndexProperties() throws Exception {
// default is to not index properties.
doTestIndexProperties(false, false, 0);
// set doc.index.props to false.
doTestIndexProperties(true, false, 0);
// set doc.index.props to true.
doTestIndexProperties(true, true, 1);
}
/* Tests doc.tokenized.norms and doc.body.tokenized.norms properties. */
public void testNorms() throws Exception {
Document doc;
// Don't set anything, use the defaults
doc = createTestNormsDocument(false, false, false, false);
assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms());
assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms());
// Set norms to false
doc = createTestNormsDocument(true, false, false, false);
assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms());
assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms());
// Set norms to true
doc = createTestNormsDocument(true, true, false, false);
assertFalse(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms());
assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms());
// Set body norms to false
doc = createTestNormsDocument(false, false, true, false);
assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms());
assertTrue(doc.getField(DocMaker.BODY_FIELD).getOmitNorms());
// Set body norms to true
doc = createTestNormsDocument(false, false, true, true);
assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms());
assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms());
}
}

View File

@ -91,7 +91,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
tasks.addTask(new CloseIndexTask(runData));
tasks.doLogic();
IndexSearcher searcher = new IndexSearcher(runData.getDirectory());
IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
assertEquals(1, td.totalHits);
assertNotNull(td.scoreDocs[0]);

View File

@ -0,0 +1,80 @@
package org.apache.lucene.benchmark.byTask.tasks;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.PrintStream;
import java.util.Properties;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
/** Tests the functionality of {@link CreateIndexTask}. */
public class CreateIndexTaskTest extends BenchmarkTestCase {
private PerfRunData createPerfRunData(String infoStreamValue) throws Exception {
Properties props = new Properties();
props.setProperty("print.props", "false"); // don't print anything
props.setProperty("directory", "RAMDirectory");
props.setProperty("writer.info.stream", infoStreamValue);
Config config = new Config(props);
return new PerfRunData(config);
}
public void testInfoStream_SystemOutErr() throws Exception {
PrintStream curOut = System.out;
ByteArrayOutputStream baos = new ByteArrayOutputStream();
System.setOut(new PrintStream(baos));
try {
PerfRunData runData = createPerfRunData("SystemOut");
CreateIndexTask cit = new CreateIndexTask(runData);
cit.doLogic();
new CloseIndexTask(runData).doLogic();
assertTrue(baos.size() > 0);
} finally {
System.setOut(curOut);
}
PrintStream curErr = System.err;
baos.reset();
System.setErr(new PrintStream(baos));
try {
PerfRunData runData = createPerfRunData("SystemErr");
CreateIndexTask cit = new CreateIndexTask(runData);
cit.doLogic();
new CloseIndexTask(runData).doLogic();
assertTrue(baos.size() > 0);
} finally {
System.setErr(curErr);
}
}
public void testInfoStream_File() throws Exception {
File outFile = new File(getWorkDir(), "infoStreamTest");
PerfRunData runData = createPerfRunData(outFile.getAbsolutePath());
new CreateIndexTask(runData).doLogic();
new CloseIndexTask(runData).doLogic();
assertTrue(outFile.length() > 0);
}
}