LUCENE-2826: LineDocSource assigns stable docids; add 2 NumericFields derived from date in the line doc file

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1051305 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2010-12-20 22:18:16 +00:00
parent fda2483fb6
commit e18dcbf1b7
6 changed files with 102 additions and 16 deletions

View File

@ -29,6 +29,7 @@ public class DocData {
private String body;
private String title;
private String date;
private int id;
private Properties props;
public void clear() {
@ -37,6 +38,7 @@ public class DocData {
title = null;
date = null;
props = null;
id = -1;
}
public String getBody() {
@ -57,6 +59,10 @@ public class DocData {
return name;
}
/**
 * Returns the numeric id currently stored on this DocData.
 *
 * @return the value of the {@code id} field; {@code clear()} resets it to -1
 */
public int getID() {
return id;
}
/**
 * Returns the properties currently stored on this DocData.
 *
 * @return the value of the {@code props} field; {@code clear()} resets it to null
 */
public Properties getProps() {
return props;
}
@ -85,6 +91,10 @@ public class DocData {
this.name = name;
}
/**
 * Stores the given numeric id on this DocData.
 *
 * @param id the id to store; it is kept as-is, with no validation
 */
public void setID(int id) {
this.id = id;
}
/**
 * Stores the given properties on this DocData.
 *
 * @param props the properties to store; the reference is kept as-is (no copy is made)
 */
public void setProps(Properties props) {
this.props = props;
}

View File

@ -20,14 +20,21 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Calendar;
import java.util.Map;
import java.util.Properties;
import java.util.Locale;
import java.util.Random;
import java.util.Date;
import java.util.concurrent.atomic.AtomicInteger;
import java.text.SimpleDateFormat;
import java.text.ParsePosition;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
@ -82,6 +89,7 @@ public class DocMaker {
static class DocState {
private final Map<String,Field> fields;
private final Map<String,NumericField> numericFields;
private final boolean reuseFields;
final Document doc;
DocData docData = new DocData();
@ -92,6 +100,7 @@ public class DocMaker {
if (reuseFields) {
fields = new HashMap<String,Field>();
numericFields = new HashMap<String,NumericField>();
// Initialize the map with the default fields.
fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
@ -100,8 +109,12 @@ public class DocMaker {
fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));
doc = new Document();
} else {
numericFields = null;
fields = null;
doc = null;
}
@ -124,18 +137,42 @@ public class DocMaker {
}
return f;
}
/**
 * Returns a {@link NumericField} for the given field name.
 * <p>
 * When field reuse is enabled, the instance cached under {@code name} is
 * returned, creating and caching one on first request. When reuse is
 * disabled, a brand-new instance is created on every call.
 *
 * @param name the name of the numeric field
 * @return the cached or newly created field
 */
NumericField getNumericField(String name) {
  if (reuseFields) {
    NumericField cached = numericFields.get(name);
    if (cached == null) {
      // First request for this name: create the field and remember it.
      cached = new NumericField(name);
      numericFields.put(name, cached);
    }
    return cached;
  }
  // Reuse is off, so every caller gets its own instance.
  return new NumericField(name);
}
}
private int numDocsCreated = 0;
private boolean storeBytes = false;
/**
 * Bundles the date helpers used while building a document: a lenient
 * {@link SimpleDateFormat} for the {@code dd-MMM-yyyy HH:mm:ss} pattern in
 * {@link Locale#US}, a {@link Calendar}, and a reusable {@link ParsePosition}.
 * The enclosing class keeps instances in a {@code ThreadLocal}.
 */
private static class DateUtil {
  public SimpleDateFormat parser;
  public Calendar cal;
  public ParsePosition pos;

  public DateUtil() {
    parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
    // Lenient parsing accepts out-of-range field values instead of rejecting them.
    parser.setLenient(true);
    cal = Calendar.getInstance();
    pos = new ParsePosition(0);
  }
}
// leftovers are thread local, because it is unsafe to share residues between threads
private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();
private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<DateUtil>();
public static final String BODY_FIELD = "body";
public static final String TITLE_FIELD = "doctitle";
public static final String DATE_FIELD = "docdate";
public static final String DATE_MSEC_FIELD = "docdatenum";
public static final String TIME_SEC_FIELD = "doctimesecnum";
public static final String ID_FIELD = "docid";
public static final String BYTES_FIELD = "bytes";
public static final String NAME_FIELD = "docname";
@ -155,6 +192,7 @@ public class DocMaker {
private int lastPrintedNumUniqueTexts = 0;
private long lastPrintedNumUniqueBytes = 0;
private final AtomicInteger numDocsCreated = new AtomicInteger();
private int printNum = 0;
@ -169,7 +207,16 @@ public class DocMaker {
// Set ID_FIELD
Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated()));
int id;
if (r != null) {
id = r.nextInt(updateDocIDLimit);
} else {
id = docData.getID();
if (id == -1) {
id = numDocsCreated.getAndIncrement();
}
}
idField.setValue(Integer.toString(id));
doc.add(idField);
// Set NAME_FIELD
@ -181,14 +228,40 @@ public class DocMaker {
doc.add(nameField);
// Set DATE_FIELD
String date = docData.getDate();
if (date == null) {
date = "";
DateUtil util = dateParsers.get();
if (util == null) {
util = new DateUtil();
dateParsers.set(util);
}
Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
dateField.setValue(date);
Date date = null;
String dateString = docData.getDate();
if (dateString != null) {
util.pos.setIndex(0);
date = util.parser.parse(dateString, util.pos);
//System.out.println(dateString + " parsed to " + date);
} else {
dateString = "";
}
Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
dateStringField.setValue(dateString);
doc.add(dateStringField);
if (date == null) {
// just set to right now
date = new Date();
}
NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
dateField.setLongValue(date.getTime());
doc.add(dateField);
util.cal.setTime(date);
final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);
NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
timeSecField.setIntValue(sec);
doc.add(timeSecField);
// Set TITLE_FIELD
String title = docData.getTitle();
Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
@ -252,10 +325,6 @@ public class DocMaker {
return ds;
}
/**
 * Returns the current value of {@code numDocsCreated} and then increments it.
 * The method is synchronized on this instance, so the read and the increment
 * happen as one step with respect to other synchronized methods of this object.
 *
 * @return the counter value before the increment
 */
protected synchronized int incrNumDocsCreated() {
return numDocsCreated++;
}
/**
* Closes the {@link DocMaker}. The base implementation closes the
* {@link ContentSource}, and it can be overridden to do more work (but make
@ -363,7 +432,7 @@ public class DocMaker {
// re-initiate since properties by round may have changed.
setConfig(config);
source.resetInputs();
numDocsCreated = 0;
numDocsCreated.set(0);
resetLeftovers();
}

View File

@ -48,6 +48,7 @@ public class LineDocSource extends ContentSource {
private File file;
private BufferedReader reader;
private int readCount;
private synchronized void openFile() {
try {
@ -71,9 +72,12 @@ public class LineDocSource extends ContentSource {
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
String line;
final String line;
final int myID;
synchronized(this) {
line = reader.readLine();
myID = readCount++;
if (line == null) {
if (!forever) {
throw new NoMoreDataException();
@ -96,6 +100,7 @@ public class LineDocSource extends ContentSource {
}
// The date String was written in the format of DateTools.dateToString.
docData.clear();
docData.setID(myID);
docData.setBody(line.substring(1 + spot2, line.length()));
docData.setTitle(line.substring(0, spot));
docData.setDate(line.substring(1 + spot, spot2));

View File

@ -26,6 +26,7 @@ import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.NumericField;
/**
* Simple task to test performance of tokenizers. It just
@ -67,7 +68,7 @@ public class ReadTokensTask extends PerfTask {
Analyzer analyzer = getRunData().getAnalyzer();
int tokenCount = 0;
for(final Fieldable field : fields) {
if (!field.isTokenized()) continue;
if (!field.isTokenized() || field instanceof NumericField) continue;
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();

View File

@ -475,8 +475,9 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
FieldsEnum fields = MultiFields.getFields(reader).iterator();
String fieldName = null;
while((fieldName = fields.next()) != null) {
if (fieldName == DocMaker.ID_FIELD)
if (fieldName == DocMaker.ID_FIELD || fieldName == DocMaker.DATE_MSEC_FIELD || fieldName == DocMaker.TIME_SEC_FIELD) {
continue;
}
TermsEnum terms = fields.terms();
DocsEnum docs = null;
while(terms.next() != null) {

View File

@ -172,7 +172,7 @@ public final class NumericUtils {
/**
 * Returns the shift value stored in the first byte of a prefix-coded long.
 * The shift is computed as {@code val.bytes[val.offset] - SHIFT_START_LONG}.
 *
 * @param val the prefix-coded bytes; only the byte at {@code val.offset} is read
 * @return the decoded shift, in the range 0..63
 * @throws NumberFormatException if the decoded shift is negative or greater than 63
 */
public static int getPrefixCodedLongShift(final BytesRef val) {
  final int shift = val.bytes[val.offset] - SHIFT_START_LONG;
  if (shift > 63 || shift < 0) {
    // Include the offending value so a mis-typed field is easier to diagnose.
    throw new NumberFormatException("Invalid shift value (" + shift + ") in prefixCoded bytes (is encoded value really an INT?)");
  }
  return shift;
}