mirror of https://github.com/apache/lucene.git
LUCENE-1102: EnwikiDocMaker now adds a docid field
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607732 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f39f15ec43
commit
90a735441f
|
@ -4,6 +4,10 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
12/31/07
|
||||
LUCENE-1102: EnwikiDocMaker now indexes the docid field, so results might not be comparable with results prior to this change, although
|
||||
it is unlikely that this one small field makes much difference.
|
||||
|
||||
12/13/07
|
||||
LUCENE-1086: DocMakers setup for the "docs.dir" property
|
||||
fixed to properly handle absolute paths. (Shai Erera via Doron Cohen)
|
||||
|
|
|
@ -36,7 +36,8 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
static final int TITLE = 0;
|
||||
static final int DATE = TITLE+1;
|
||||
static final int BODY = DATE+1;
|
||||
static final int LENGTH = BODY+1;
|
||||
static final int ID = BODY + 1;
|
||||
static final int LENGTH = ID+1;
|
||||
|
||||
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||
"MAY", "JUN", "JUL", "AUG",
|
||||
|
@ -113,8 +114,9 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
String title;
|
||||
String body;
|
||||
String time;
|
||||
String id;
|
||||
|
||||
|
||||
static final int BASE = 10;
|
||||
|
||||
public void startElement(String namespace,
|
||||
String simple,
|
||||
|
@ -124,12 +126,15 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
title = null;
|
||||
body = null;
|
||||
time = null;
|
||||
id = null;
|
||||
} else if (qualified.equals("text")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("title")) {
|
||||
contents.setLength(0);
|
||||
} else if (qualified.equals("id")) {
|
||||
contents.setLength(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -148,11 +153,12 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
public void create(String title, String time, String body) {
|
||||
public void create(String title, String time, String body, String id) {
|
||||
String[] t = new String[LENGTH];
|
||||
t[TITLE] = title.replace('\t', ' ');
|
||||
t[DATE] = time.replace('\t', ' ');
|
||||
t[BODY] = body.replaceAll("[\t\n]", " ");
|
||||
t[ID] = id;
|
||||
synchronized(this) {
|
||||
while(tuple!=null) {
|
||||
try {
|
||||
|
@ -177,9 +183,12 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
}
|
||||
} else if (qualified.equals("timestamp")) {
|
||||
time = time(contents.toString());
|
||||
} else if (qualified.equals("page")) {
|
||||
} else if (qualified.equals("id") && id == null) {//just get the first id
|
||||
id = contents.toString();
|
||||
}
|
||||
else if (qualified.equals("page")) {
|
||||
if (body != null) {
|
||||
create(title, time, body);
|
||||
create(title, time, body, id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -192,6 +201,7 @@ public class EnwikiDocMaker extends LineDocMaker {
|
|||
titleField.setValue(tuple[TITLE]);
|
||||
dateField.setValue(tuple[DATE]);
|
||||
bodyField.setValue(tuple[BODY]);
|
||||
idField.setValue(tuple[ID]);
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,6 +52,7 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
Field bodyField;
|
||||
Field titleField;
|
||||
Field dateField;
|
||||
Field idField;
|
||||
|
||||
public DocState() {
|
||||
|
||||
|
@ -70,11 +71,13 @@ public class LineDocMaker extends BasicDocMaker {
|
|||
storeVal,
|
||||
Field.Index.TOKENIZED,
|
||||
termVecVal);
|
||||
idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NO_NORMS);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(bodyField);
|
||||
doc.add(titleField);
|
||||
doc.add(dateField);
|
||||
doc.add(idField);
|
||||
}
|
||||
|
||||
final static String SEP = WriteLineDocTask.SEP;
|
||||
|
|
Loading…
Reference in New Issue