mirror of https://github.com/apache/lucene.git
LUCENE-1595: don't use SortField.AUTO; deprecate LineDocMaker & EnwikiDocMaker
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@798096 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
312400842f
commit
094c674c4d
|
@ -4,6 +4,16 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
7/24/2009
|
||||
LUCENE-1595: Deprecate LineDocMaker and EnwikiDocMaker in favor of
|
||||
using DocMaker directly, with content.source = LineDocSource or
|
||||
EnwikiContentSource. NOTE: with this change, the "id" field from
|
||||
the Wikipedia XML export is now indexed as the "docname" field
|
||||
(previously it was indexed as "docid"). Additionally, the
|
||||
SearchWithSort task now accepts all types that SortField can accept
|
||||
and no longer falls back to SortField.AUTO, which has been
|
||||
deprecated. (Mike McCandless)
|
||||
|
||||
7/20/2009
|
||||
LUCENE-1755: Fix WriteLineDocTask to output a document if it contains either
|
||||
a title or body (or both). (Shai Erera via Mark Miller)
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.Iterator;
|
|||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.benchmark.byTask.utils.Config;
|
||||
import org.apache.lucene.benchmark.byTask.utils.Format;
|
||||
|
@ -60,6 +61,9 @@ import org.apache.lucene.document.Field.TermVector;
|
|||
* <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects
|
||||
* should be reused (default <b>true</b>).
|
||||
* <li><b>doc.index.props</b> - specifies whether the properties returned by
|
||||
* {@link DocData#getProps()} will be indexed. (default <b>false</b>).
|
||||
* <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random
|
||||
* IDs from 0 to this limit. This is useful with UpdateDoc
|
||||
* for testing performance of IndexWriter.updateDocument.
|
||||
* </ul>
|
||||
*/
|
||||
|
@ -70,11 +74,14 @@ public class DocMaker {
|
|||
private int cnt;
|
||||
}
|
||||
|
||||
private Random r;
|
||||
private int updateDocIDLimit;
|
||||
|
||||
static class DocState {
|
||||
|
||||
private Map fields;
|
||||
private boolean reuseFields;
|
||||
Document doc;
|
||||
private final Map fields;
|
||||
private final boolean reuseFields;
|
||||
final Document doc;
|
||||
DocData docData = new DocData();
|
||||
|
||||
public DocState(boolean reuseFields, Store store, Index index, Index bodyIndex, TermVector termVector) {
|
||||
|
@ -92,6 +99,9 @@ public class DocMaker {
|
|||
fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
|
||||
|
||||
doc = new Document();
|
||||
} else {
|
||||
fields = null;
|
||||
doc = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -150,14 +160,14 @@ public class DocMaker {
|
|||
// use only part of the body, modify it to keep the rest (or use all if size==0).
|
||||
// reset the docdata properties so they are not added more than once.
|
||||
private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
|
||||
int docid = incrNumDocsCreated();
|
||||
DocState ds = reuseFields ? getDocState() : localDocState;
|
||||
Document doc = reuseFields ? ds.doc : new Document();
|
||||
|
||||
final DocState ds = reuseFields ? getDocState() : localDocState;
|
||||
final Document doc = reuseFields ? ds.doc : new Document();
|
||||
doc.getFields().clear();
|
||||
|
||||
// Set ID_FIELD
|
||||
Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
|
||||
idField.setValue("doc" + docid);
|
||||
idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated()));
|
||||
doc.add(idField);
|
||||
|
||||
// Set NAME_FIELD
|
||||
|
@ -407,6 +417,11 @@ public class DocMaker {
|
|||
}
|
||||
|
||||
indexProperties = config.get("doc.index.props", false);
|
||||
|
||||
updateDocIDLimit = config.get("doc.random.id.limit", -1);
|
||||
if (updateDocIDLimit != -1) {
|
||||
r = new Random(179);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,43 +28,14 @@ import org.apache.lucene.document.Field.TermVector;
|
|||
* A {@link DocMaker} which reads the English Wikipedia dump. Uses
|
||||
* {@link EnwikiContentSource} as its content source, regardless of whether a different
|
||||
* content source was defined in the configuration.
|
||||
* @deprecated Please use {@link DocMaker} instead, with content.source=EnwikiContentSource
|
||||
*/
|
||||
public class EnwikiDocMaker extends DocMaker {
|
||||
|
||||
public Document makeDocument() throws Exception {
|
||||
DocState ds = reuseFields ? getDocState() : localDocState;
|
||||
DocData dd = source.getNextDocData(ds.docData);
|
||||
Document doc = reuseFields ? ds.doc : new Document();
|
||||
doc.getFields().clear();
|
||||
|
||||
Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
|
||||
body.setValue(dd.getBody());
|
||||
doc.add(body);
|
||||
|
||||
Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
|
||||
title.setValue(dd.getTitle());
|
||||
doc.add(title);
|
||||
|
||||
Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
|
||||
date.setValue(dd.getDate());
|
||||
doc.add(date);
|
||||
|
||||
Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
|
||||
id.setValue(dd.getName());
|
||||
doc.add(id);
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
public Document makeDocument(int size) throws Exception {
|
||||
throw new RuntimeException("cannot change document size with EnwikiDocMaker");
|
||||
}
|
||||
|
||||
public void setConfig(Config config) {
|
||||
super.setConfig(config);
|
||||
// Override whatever content source was set in the config
|
||||
source = new EnwikiContentSource();
|
||||
source.setConfig(config);
|
||||
System.out.println("NOTE: EnwikiDocMaker is deprecated; please use DocMaker instead (which is the default if you don't specify doc.maker) with content.source=EnwikiContentSource");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,51 +42,13 @@ import org.apache.lucene.document.Field.TermVector;
|
|||
* 0..N; this is useful with UpdateDoc to test updating random documents; if
|
||||
* this is unspecified or -1, then docid is sequentially assigned
|
||||
* </ul>
|
||||
* @deprecated Please use {@link DocMaker} instead, with content.source=LineDocSource
|
||||
*/
|
||||
public class LineDocMaker extends DocMaker {
|
||||
|
||||
private Random r;
|
||||
private int numDocs;
|
||||
|
||||
public Document makeDocument() throws Exception {
|
||||
|
||||
DocState ds = reuseFields ? getDocState() : localDocState;
|
||||
DocData dd = source.getNextDocData(ds.docData);
|
||||
Document doc = reuseFields ? ds.doc : new Document();
|
||||
doc.getFields().clear();
|
||||
|
||||
Field body = ds.getField(BODY_FIELD, storeVal, bodyIndexVal, termVecVal);
|
||||
body.setValue(dd.getBody());
|
||||
doc.add(body);
|
||||
|
||||
Field title = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
|
||||
title.setValue(dd.getTitle());
|
||||
doc.add(title);
|
||||
|
||||
Field date = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
|
||||
date.setValue(dd.getDate());
|
||||
doc.add(date);
|
||||
|
||||
String docID = "doc" + (r != null ? r.nextInt(numDocs) : incrNumDocsCreated());
|
||||
Field id = ds.getField(ID_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
|
||||
id.setValue(docID);
|
||||
doc.add(id);
|
||||
|
||||
return doc;
|
||||
}
|
||||
|
||||
public Document makeDocument(int size) throws Exception {
|
||||
throw new RuntimeException("cannot change document size with LineDocMaker");
|
||||
}
|
||||
|
||||
public void setConfig(Config config) {
|
||||
super.setConfig(config);
|
||||
source = new LineDocSource();
|
||||
source.setConfig(config);
|
||||
numDocs = config.get("doc.random.id.limit", -1);
|
||||
if (numDocs != -1) {
|
||||
r = new Random(179);
|
||||
}
|
||||
System.out.println("NOTE: LineDocMaker is deprecated; please use DocMaker instead (which is the default if you don't specify doc.maker) with content.source=LineDocSource");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -56,6 +56,8 @@ public class SearchWithSortTask extends ReadTask {
|
|||
SortField sortField0;
|
||||
if (field.equals("doc")) {
|
||||
sortField0 = SortField.FIELD_DOC;
|
||||
} if (field.equals("score")) {
|
||||
sortField0 = SortField.FIELD_SCORE;
|
||||
} else if (field.equals("noscore")) {
|
||||
doScore = false;
|
||||
continue;
|
||||
|
@ -90,14 +92,22 @@ public class SearchWithSortTask extends ReadTask {
|
|||
int type;
|
||||
if (typeString.equals("float")) {
|
||||
type = SortField.FLOAT;
|
||||
} else if (typeString.equals("double")) {
|
||||
type = SortField.DOUBLE;
|
||||
} else if (typeString.equals("byte")) {
|
||||
type = SortField.BYTE;
|
||||
} else if (typeString.equals("short")) {
|
||||
type = SortField.SHORT;
|
||||
} else if (typeString.equals("int")) {
|
||||
type = SortField.INT;
|
||||
} else if (typeString.equals("long")) {
|
||||
type = SortField.LONG;
|
||||
} else if (typeString.equals("string")) {
|
||||
type = SortField.STRING;
|
||||
} else if (typeString.equals("string_val")) {
|
||||
type = SortField.STRING_VAL;
|
||||
} else {
|
||||
type = SortField.AUTO;
|
||||
throw new RuntimeException("Unrecognized sort field type " + typeString);
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
|
|
@ -299,7 +299,7 @@ public class TestPerfTasksLogic extends TestCase {
|
|||
}
|
||||
|
||||
/**
|
||||
* Test WriteLineDoc and LineDocMaker.
|
||||
* Test WriteLineDoc and LineDocSource.
|
||||
*/
|
||||
public void testLineDocFile() throws Exception {
|
||||
File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt");
|
||||
|
@ -334,7 +334,7 @@ public class TestPerfTasksLogic extends TestCase {
|
|||
String algLines2[] = {
|
||||
"# ----- properties ",
|
||||
"analyzer=org.apache.lucene.analysis.SimpleAnalyzer",
|
||||
"doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker",
|
||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
||||
"docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'),
|
||||
"content.source.forever=false",
|
||||
"doc.reuse.fields=false",
|
||||
|
@ -355,7 +355,7 @@ public class TestPerfTasksLogic extends TestCase {
|
|||
iw.close();
|
||||
|
||||
IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory());
|
||||
assertEquals(numLines + " lines were were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
|
||||
assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs());
|
||||
ir.close();
|
||||
|
||||
lineFile.delete();
|
||||
|
|
|
@ -39,8 +39,8 @@ import org.apache.lucene.search.IndexSearcher;
|
|||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
|
||||
/** Tests the functionality of {@link LineDocMaker}. */
|
||||
public class LineDocMakerTest extends BenchmarkTestCase {
|
||||
/** Tests the functionality of {@link LineDocSource}. */
|
||||
public class LineDocSourceTest extends BenchmarkTestCase {
|
||||
|
||||
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
|
@ -70,7 +70,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
|
|||
|
||||
Properties props = new Properties();
|
||||
|
||||
// LineDocMaker specific settings.
|
||||
// LineDocSource specific settings.
|
||||
props.setProperty("docs.file", file.getAbsolutePath());
|
||||
if (setBZCompress) {
|
||||
props.setProperty("bzip.compression", bz2CompressVal);
|
||||
|
@ -78,7 +78,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
|
|||
|
||||
// Indexing configuration.
|
||||
props.setProperty("analyzer", SimpleAnalyzer.class.getName());
|
||||
props.setProperty("doc.maker", LineDocMaker.class.getName());
|
||||
props.setProperty("content.source", LineDocSource.class.getName());
|
||||
props.setProperty("directory", "RAMDirectory");
|
||||
|
||||
// Create PerfRunData
|
||||
|
@ -98,7 +98,7 @@ public class LineDocMakerTest extends BenchmarkTestCase {
|
|||
searcher.close();
|
||||
}
|
||||
|
||||
/* Tests LineDocMaker with a bzip2 input stream. */
|
||||
/* Tests LineDocSource with a bzip2 input stream. */
|
||||
public void testBZip2() throws Exception {
|
||||
File file = new File(getWorkDir(), "one-line.bz2");
|
||||
createBZ2LineFile(file);
|
|
@ -172,7 +172,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
|
|||
public void testCharsReplace() throws Exception {
|
||||
// WriteLineDocTask replaced only \t characters w/ a space, since that's its
|
||||
// separator char. However, it didn't replace newline characters, which
|
||||
// resulted in errors in LineDocMaker.
|
||||
// resulted in errors in LineDocSource.
|
||||
File file = new File(getWorkDir(), "one-line");
|
||||
PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
|
||||
WriteLineDocTask wldt = new WriteLineDocTask(runData);
|
||||
|
|
Loading…
Reference in New Issue