LUCENE-1595: don't use SortField.AUTO; deprecate LineDocMaker & EnwikiDocMaker

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@798096 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-07-27 10:15:03 +00:00
parent 312400842f
commit 094c674c4d
8 changed files with 57 additions and 89 deletions

View File

@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
7/24/2009
LUCENE-1595: Deprecate LineDocMaker and EnwikiDocMaker in favor of
using DocMaker directly, with content.source = LineDocSource or
EnwikiContentSource. NOTE: with this change, the "id" field from
the Wikipedia XML export is now indexed as the "docname" field
(previously it was indexed as "docid"). Additionally, the
SearchWithSort task now accepts all types that SortField can accept
and no longer falls back to SortField.AUTO, which has been
deprecated. (Mike McCandless)
7/20/2009
LUCENE-1755: Fix WriteLineDocTask to output a document if it contains either
a title or body (or both). (Shai Erera via Mark Miller)

View File

@ -24,6 +24,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Map.Entry;
import java.util.Random;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
@ -60,6 +61,9 @@ import org.apache.lucene.document.Field.TermVector;
* <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
* should be reused (default <b>true</b>).
* <li><b>doc.index.props</b> - specifies whether the properties returned by
* <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
* IDs from 0 to this limit. This is useful with UpdateDoc
* for testing performance of IndexWriter.updateDocument.
* {@link DocData#getProps()} will be indexed. (default <b>false</b>).
* </ul>
*/
@ -70,11 +74,14 @@ public class DocMaker {
private int cnt;
}
private Random r;
private int updateDocIDLimit;
static class DocState {
private Map fields;
private boolean reuseFields;
Document doc;
private final Map fields;
private final boolean reuseFields;
final Document doc;
DocData docData = new DocData();
public DocState(boolean reuseFields, Store store, Index index, Index bodyIndex, TermVector termVector) {
@ -92,6 +99,9 @@ public class DocMaker {
fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
doc = new Document();
} else {
fields = null;
doc = null;
}
}
@ -150,14 +160,14 @@ public class DocMaker {
// use only part of the body, modify it to keep the rest (or use all if size==0).
// reset the docdata properties so they are not added more than once.
private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
int docid = incrNumDocsCreated();
DocState ds = reuseFields ? getDocState() : localDocState;
Document doc = reuseFields ? ds.doc : new Document();
final DocState ds = reuseFields ? getDocState() : localDocState;
final Document doc = reuseFields ? ds.doc : new Document();
doc.getFields().clear();
// Set ID_FIELD
Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
idField.setValue("doc" + docid);
idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated()));
doc.add(idField);
// Set NAME_FIELD
@ -407,6 +417,11 @@ public class DocMaker {
}
indexProperties = config.get("doc.index.props", false);
updateDocIDLimit = config.get("doc.random.id.limit", -1);
if (updateDocIDLimit != -1) {
r = new Random(179);
}
}
}

View File

@ -28,43 +28,14 @@ import org.apache.lucene.document.Field.TermVector;
* A {@link DocMaker} which reads the English Wikipedia dump. Uses
* {@link EnwikiContentSource} as its content source, regardless if a different
* content source was defined in the configuration.
* @deprecated Please use {@link DocMaker} instead, with content.source=EnwikiContentSource
*/
public class EnwikiDocMaker extends DocMaker {
// Builds one Document from the next DocData produced by the Wikipedia
// content source. When doc.reuse.fields is enabled (the default) the
// per-thread DocState's cached Field/Document instances are reused to
// avoid allocation; otherwise a fresh Document is created per call.
public Document makeDocument() throws Exception {
DocState ds = reuseFields ? getDocState() : localDocState;
DocData dd = source.getNextDocData(ds.docData);
Document doc = reuseFields ? ds.doc : new Document();
// Clear fields left over from the previous (reused) document.
doc.getFields().clear();
Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
body.setValue(dd.getBody());
doc.add(body);
Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
title.setValue(dd.getTitle());
doc.add(title);
Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
date.setValue(dd.getDate());
doc.add(date);
// The Wikipedia "id" is indexed under ID_FIELD, taken from DocData.getName();
// always stored and un-analyzed so it can be used for exact lookup.
Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
id.setValue(dd.getName());
doc.add(id);
return doc;
}
// Size-limited documents are unsupported for the Wikipedia dump; fail fast
// rather than silently ignoring the requested size.
public Document makeDocument(int size) throws Exception {
throw new RuntimeException("cannot change document size with EnwikiDocMaker");
}
// Forces EnwikiContentSource as the content source, overriding whatever
// content.source the configuration specified, and warns that this class
// is deprecated in favor of DocMaker + content.source=EnwikiContentSource.
public void setConfig(Config config) {
super.setConfig(config);
// Override whatever content source was set in the config
source = new EnwikiContentSource();
source.setConfig(config);
System.out.println("NOTE: EnwikiDocMaker is deprecated; please use DocMaker instead (which is the default if you don't specify doc.maker) with content.source=EnwikiContentSource");
}
}
}

View File

@ -42,51 +42,13 @@ import org.apache.lucene.document.Field.TermVector;
* 0..N; this is useful with UpdateDoc to test updating random documents; if
* this is unspecified or -1, then docid is sequentially assigned
* </ul>
* @deprecated Please use {@link DocMaker} instead, with content.source=LineDocSource
*/
public class LineDocMaker extends DocMaker {
// Non-null only when doc.random.id.limit is set; seeded with a fixed value
// (179) in setConfig so runs are reproducible.
private Random r;
// Value of doc.random.id.limit, i.e. the exclusive upper bound for random
// doc IDs; -1 means IDs are assigned sequentially.
private int numDocs;
// Builds one Document from the next line of the input file (via
// LineDocSource). Reuses the per-thread DocState's cached Field/Document
// instances when doc.reuse.fields is enabled; otherwise allocates fresh ones.
public Document makeDocument() throws Exception {
DocState ds = reuseFields ? getDocState() : localDocState;
DocData dd = source.getNextDocData(ds.docData);
Document doc = reuseFields ? ds.doc : new Document();
// Clear fields left over from the previous (reused) document.
doc.getFields().clear();
Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
body.setValue(dd.getBody());
doc.add(body);
Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
title.setValue(dd.getTitle());
doc.add(title);
Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
date.setValue(dd.getDate());
doc.add(date);
// Random IDs (when doc.random.id.limit is set) are useful with UpdateDoc
// to exercise IndexWriter.updateDocument on random existing docs;
// otherwise IDs increase sequentially.
String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
id.setValue(docID);
doc.add(id);
return doc;
}
// Size-limited documents are unsupported for line files; fail fast rather
// than silently ignoring the requested size.
public Document makeDocument(int size) throws Exception {
throw new RuntimeException("cannot change document size with LineDocMaker");
}
// Forces LineDocSource as the content source (overriding the configuration),
// reads doc.random.id.limit, and warns that this class is deprecated in
// favor of DocMaker + content.source=LineDocSource.
public void setConfig(Config config) {
super.setConfig(config);
source = new LineDocSource();
source.setConfig(config);
numDocs = config.get("doc.random.id.limit", -1);
if (numDocs != -1) {
// Fixed seed so benchmark runs that update random docs are repeatable.
r = new Random(179);
}
System.out.println("NOTE: LineDocMaker is deprecated; please use DocMaker instead (which is the default if you don't specify doc.maker) with content.source=LineDocSource");
}
}

View File

@ -56,6 +56,8 @@ public class SearchWithSortTask extends ReadTask {
SortField sortField0;
if (field.equals("doc")) {
sortField0 = SortField.FIELD_DOC;
} if (field.equals("score")) {
sortField0 = SortField.FIELD_SCORE;
} else if (field.equals("noscore")) {
doScore = false;
continue;
@ -90,14 +92,22 @@ public class SearchWithSortTask extends ReadTask {
int type;
if (typeString.equals("float")) {
type = SortField.FLOAT;
} else if (typeString.equals("double")) {
type = SortField.DOUBLE;
} else if (typeString.equals("byte")) {
type = SortField.BYTE;
} else if (typeString.equals("short")) {
type = SortField.SHORT;
} else if (typeString.equals("int")) {
type = SortField.INT;
} else if (typeString.equals("long")) {
type = SortField.LONG;
} else if (typeString.equals("string")) {
type = SortField.STRING;
} else if (typeString.equals("string_val")) {
type = SortField.STRING_VAL;
} else {
type = SortField.AUTO;
throw new RuntimeException("Unrecognized sort field type " + typeString);
}
return type;
}

View File

@ -299,7 +299,7 @@ public class TestPerfTasksLogic extends TestCase {
}
/**
* Test WriteLineDoc and LineDocMaker.
* Test WriteLineDoc and LineDocSource.
*/
public void testLineDocFile() throws Exception {
File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt");
@ -334,7 +334,7 @@ public class TestPerfTasksLogic extends TestCase {
String algLines2[] = {
"# ----- properties ",
"analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
"content.source.forever=false",
"doc.reuse.fields=false",
@ -355,7 +355,7 @@ public class TestPerfTasksLogic extends TestCase {
iw.close();
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
assertEquals(numLines + " lines were were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
ir.close();
lineFile.delete();

View File

@ -39,8 +39,8 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
/** Tests the functionality of {@link LineDocMaker}. */
public class LineDocMakerTest extends BenchmarkTestCase {
/** Tests the functionality of {@link LineDocSource}. */
public class LineDocSourceTest extends BenchmarkTestCase {
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
@ -70,7 +70,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
Properties props = new Properties();
// LineDocMaker specific settings.
// LineDocSource specific settings.
props.setProperty("docs.file", file.getAbsolutePath());
if (setBZCompress) {
props.setProperty("bzip.compression", bz2CompressVal);
@ -78,7 +78,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
// Indexing configuration.
props.setProperty("analyzer", SimpleAnalyzer.class.getName());
props.setProperty("doc.maker", LineDocMaker.class.getName());
props.setProperty("content.source", LineDocSource.class.getName());
props.setProperty("directory", "RAMDirectory");
// Create PerfRunData
@ -98,7 +98,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
searcher.close();
}
/* Tests LineDocMaker with a bzip2 input stream. */
/* Tests LineDocSource with a bzip2 input stream. */
public void testBZip2() throws Exception {
File file = new File(getWorkDir(), "one-line.bz2");
createBZ2LineFile(file);

View File

@ -172,7 +172,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
public void testCharsReplace() throws Exception {
// WriteLineDocTask replaced only \t characters w/ a space, since that's its
// separator char. However, it didn't replace newline characters, which
// resulted in errors in LineDocMaker.
// resulted in errors in LineDocSource.
File file = new File(getWorkDir(), "one-line");
PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);