LUCENE-1102: EnwikiDocMaker now adds a docid field

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607732 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-12-31 13:07:14 +00:00
parent f39f15ec43
commit 90a735441f
3 changed files with 22 additions and 5 deletions

View File

@ -4,6 +4,10 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
$Id:$
12/31/07
LUCENE-1102: EnwikiDocMaker now indexes the docid field, so benchmark results might not be directly comparable with those obtained before this change,
although this one small field is unlikely to make much difference.
12/13/07
LUCENE-1086: DocMakers setup for the "docs.dir" property
fixed to properly handle absolute paths. (Shai Erera via Doron Cohen)

View File

@ -36,7 +36,8 @@ public class EnwikiDocMaker extends LineDocMaker {
static final int TITLE = 0;
static final int DATE = TITLE+1;
static final int BODY = DATE+1;
static final int LENGTH = BODY+1;
static final int ID = BODY + 1;
static final int LENGTH = ID+1;
static final String[] months = {"JAN", "FEB", "MAR", "APR",
"MAY", "JUN", "JUL", "AUG",
@ -113,8 +114,9 @@ public class EnwikiDocMaker extends LineDocMaker {
String title;
String body;
String time;
String id;
static final int BASE = 10;
public void startElement(String namespace,
String simple,
@ -124,12 +126,15 @@ public class EnwikiDocMaker extends LineDocMaker {
title = null;
body = null;
time = null;
id = null;
} else if (qualified.equals("text")) {
contents.setLength(0);
} else if (qualified.equals("timestamp")) {
contents.setLength(0);
} else if (qualified.equals("title")) {
contents.setLength(0);
} else if (qualified.equals("id")) {
contents.setLength(0);
}
}
@ -148,11 +153,12 @@ public class EnwikiDocMaker extends LineDocMaker {
return buffer.toString();
}
public void create(String title, String time, String body) {
public void create(String title, String time, String body, String id) {
String[] t = new String[LENGTH];
t[TITLE] = title.replace('\t', ' ');
t[DATE] = time.replace('\t', ' ');
t[BODY] = body.replaceAll("[\t\n]", " ");
t[ID] = id;
synchronized(this) {
while(tuple!=null) {
try {
@ -177,9 +183,12 @@ public class EnwikiDocMaker extends LineDocMaker {
}
} else if (qualified.equals("timestamp")) {
time = time(contents.toString());
} else if (qualified.equals("page")) {
} else if (qualified.equals("id") && id == null) {//just get the first id
id = contents.toString();
}
else if (qualified.equals("page")) {
if (body != null) {
create(title, time, body);
create(title, time, body, id);
}
}
}
@ -192,6 +201,7 @@ public class EnwikiDocMaker extends LineDocMaker {
titleField.setValue(tuple[TITLE]);
dateField.setValue(tuple[DATE]);
bodyField.setValue(tuple[BODY]);
idField.setValue(tuple[ID]);
return doc;
}
}

View File

@ -52,6 +52,7 @@ public class LineDocMaker extends BasicDocMaker {
Field bodyField;
Field titleField;
Field dateField;
Field idField;
public DocState() {
@ -70,11 +71,13 @@ public class LineDocMaker extends BasicDocMaker {
storeVal,
Field.Index.TOKENIZED,
termVecVal);
idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NO_NORMS);
doc = new Document();
doc.add(bodyField);
doc.add(titleField);
doc.add(dateField);
doc.add(idField);
}
final static String SEP = WriteLineDocTask.SEP;