mirror of https://github.com/apache/lucene.git
LUCENE-1102: EnwikiDocMaker now adds a docid field
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607732 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f39f15ec43
commit
90a735441f
|
@ -4,6 +4,10 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
||||||
|
|
||||||
$Id:$
|
$Id:$
|
||||||
|
|
||||||
|
12/31/07
|
||||||
|
LUCENE-1102: EnwikiDocMaker now indexes the docid field, so benchmark results might not be directly comparable with results obtained prior to this change, although
|
||||||
|
it is unlikely that this one small field makes much difference.
|
||||||
|
|
||||||
12/13/07
|
12/13/07
|
||||||
LUCENE-1086: DocMakers setup for the "docs.dir" property
|
LUCENE-1086: DocMakers setup for the "docs.dir" property
|
||||||
fixed to properly handle absolute paths. (Shai Erera via Doron Cohen)
|
fixed to properly handle absolute paths. (Shai Erera via Doron Cohen)
|
||||||
|
|
|
@ -36,7 +36,8 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
static final int TITLE = 0;
|
static final int TITLE = 0;
|
||||||
static final int DATE = TITLE+1;
|
static final int DATE = TITLE+1;
|
||||||
static final int BODY = DATE+1;
|
static final int BODY = DATE+1;
|
||||||
static final int LENGTH = BODY+1;
|
static final int ID = BODY + 1;
|
||||||
|
static final int LENGTH = ID+1;
|
||||||
|
|
||||||
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
static final String[] months = {"JAN", "FEB", "MAR", "APR",
|
||||||
"MAY", "JUN", "JUL", "AUG",
|
"MAY", "JUN", "JUL", "AUG",
|
||||||
|
@ -113,8 +114,9 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
String title;
|
String title;
|
||||||
String body;
|
String body;
|
||||||
String time;
|
String time;
|
||||||
|
String id;
|
||||||
|
|
||||||
|
|
||||||
static final int BASE = 10;
|
|
||||||
|
|
||||||
public void startElement(String namespace,
|
public void startElement(String namespace,
|
||||||
String simple,
|
String simple,
|
||||||
|
@ -124,12 +126,15 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
title = null;
|
title = null;
|
||||||
body = null;
|
body = null;
|
||||||
time = null;
|
time = null;
|
||||||
|
id = null;
|
||||||
} else if (qualified.equals("text")) {
|
} else if (qualified.equals("text")) {
|
||||||
contents.setLength(0);
|
contents.setLength(0);
|
||||||
} else if (qualified.equals("timestamp")) {
|
} else if (qualified.equals("timestamp")) {
|
||||||
contents.setLength(0);
|
contents.setLength(0);
|
||||||
} else if (qualified.equals("title")) {
|
} else if (qualified.equals("title")) {
|
||||||
contents.setLength(0);
|
contents.setLength(0);
|
||||||
|
} else if (qualified.equals("id")) {
|
||||||
|
contents.setLength(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,11 +153,12 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
return buffer.toString();
|
return buffer.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void create(String title, String time, String body) {
|
public void create(String title, String time, String body, String id) {
|
||||||
String[] t = new String[LENGTH];
|
String[] t = new String[LENGTH];
|
||||||
t[TITLE] = title.replace('\t', ' ');
|
t[TITLE] = title.replace('\t', ' ');
|
||||||
t[DATE] = time.replace('\t', ' ');
|
t[DATE] = time.replace('\t', ' ');
|
||||||
t[BODY] = body.replaceAll("[\t\n]", " ");
|
t[BODY] = body.replaceAll("[\t\n]", " ");
|
||||||
|
t[ID] = id;
|
||||||
synchronized(this) {
|
synchronized(this) {
|
||||||
while(tuple!=null) {
|
while(tuple!=null) {
|
||||||
try {
|
try {
|
||||||
|
@ -177,9 +183,12 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
}
|
}
|
||||||
} else if (qualified.equals("timestamp")) {
|
} else if (qualified.equals("timestamp")) {
|
||||||
time = time(contents.toString());
|
time = time(contents.toString());
|
||||||
} else if (qualified.equals("page")) {
|
} else if (qualified.equals("id") && id == null) {//just get the first id
|
||||||
|
id = contents.toString();
|
||||||
|
}
|
||||||
|
else if (qualified.equals("page")) {
|
||||||
if (body != null) {
|
if (body != null) {
|
||||||
create(title, time, body);
|
create(title, time, body, id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -192,6 +201,7 @@ public class EnwikiDocMaker extends LineDocMaker {
|
||||||
titleField.setValue(tuple[TITLE]);
|
titleField.setValue(tuple[TITLE]);
|
||||||
dateField.setValue(tuple[DATE]);
|
dateField.setValue(tuple[DATE]);
|
||||||
bodyField.setValue(tuple[BODY]);
|
bodyField.setValue(tuple[BODY]);
|
||||||
|
idField.setValue(tuple[ID]);
|
||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,6 +52,7 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
Field bodyField;
|
Field bodyField;
|
||||||
Field titleField;
|
Field titleField;
|
||||||
Field dateField;
|
Field dateField;
|
||||||
|
Field idField;
|
||||||
|
|
||||||
public DocState() {
|
public DocState() {
|
||||||
|
|
||||||
|
@ -70,11 +71,13 @@ public class LineDocMaker extends BasicDocMaker {
|
||||||
storeVal,
|
storeVal,
|
||||||
Field.Index.TOKENIZED,
|
Field.Index.TOKENIZED,
|
||||||
termVecVal);
|
termVecVal);
|
||||||
|
idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NO_NORMS);
|
||||||
|
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
doc.add(bodyField);
|
doc.add(bodyField);
|
||||||
doc.add(titleField);
|
doc.add(titleField);
|
||||||
doc.add(dateField);
|
doc.add(dateField);
|
||||||
|
doc.add(idField);
|
||||||
}
|
}
|
||||||
|
|
||||||
final static String SEP = WriteLineDocTask.SEP;
|
final static String SEP = WriteLineDocTask.SEP;
|
||||||
|
|
Loading…
Reference in New Issue