From 90a735441f637749796432744528fafa29adb9a5 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Mon, 31 Dec 2007 13:07:14 +0000 Subject: [PATCH] LUCENE-1102: EnwikiDocMaker now adds a docid field git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607732 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/benchmark/CHANGES.txt | 4 ++++ .../byTask/feeds/EnwikiDocMaker.java | 20 ++++++++++++++----- .../benchmark/byTask/feeds/LineDocMaker.java | 3 +++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/contrib/benchmark/CHANGES.txt b/contrib/benchmark/CHANGES.txt index 15d9a5c4ee7..e46edfdb71d 100644 --- a/contrib/benchmark/CHANGES.txt +++ b/contrib/benchmark/CHANGES.txt @@ -4,6 +4,10 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety $Id:$ +12/31/07 + LUCENE-1102: EnwikiDocMaker now indexes the docid field, so results might not be comparable with results prior to this change, although + it is doubted that this one small field makes much difference. + 12/13/07 LUCENE-1086: DocMakers setup for the "docs.dir" property fixed to properly handle absolute paths. (Shai Erera via Doron Cohen) diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java index 36510dfad3c..dcff15b0027 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java @@ -36,7 +36,8 @@ public class EnwikiDocMaker extends LineDocMaker { static final int TITLE = 0; static final int DATE = TITLE+1; static final int BODY = DATE+1; - static final int LENGTH = BODY+1; + static final int ID = BODY + 1; + static final int LENGTH = ID+1; static final String[] months = {"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", @@ -113,8 +114,9 @@ public class EnwikiDocMaker extends LineDocMaker { String title; String body; String time; + String id; + - static final int BASE = 10; public void startElement(String namespace, String simple, @@ -124,12 +126,15 @@ public class EnwikiDocMaker extends LineDocMaker { title = null; body = null; time = null; + id = null; } else if (qualified.equals("text")) { contents.setLength(0); } else if (qualified.equals("timestamp")) { contents.setLength(0); } else if (qualified.equals("title")) { contents.setLength(0); + } else if (qualified.equals("id")) { + contents.setLength(0); } } @@ -148,11 +153,12 @@ public class EnwikiDocMaker extends LineDocMaker { return buffer.toString(); } - public void create(String title, String time, String body) { + public void create(String title, String time, String body, String id) { String[] t = new String[LENGTH]; t[TITLE] = title.replace('\t', ' '); t[DATE] = time.replace('\t', ' '); t[BODY] = body.replaceAll("[\t\n]", " "); + t[ID] = id; synchronized(this) { while(tuple!=null) { try { @@ -177,9 +183,12 @@ public class EnwikiDocMaker extends LineDocMaker { } } else if (qualified.equals("timestamp")) { time = time(contents.toString()); - } else if (qualified.equals("page")) { + } else if (qualified.equals("id") && id == null) {//just get the first id + id = contents.toString(); + } + else if (qualified.equals("page")) { if (body != null) { - create(title, time, body); + create(title, time, body, id); } } } @@ -192,6 +201,7 @@ public class EnwikiDocMaker extends LineDocMaker { titleField.setValue(tuple[TITLE]); dateField.setValue(tuple[DATE]); bodyField.setValue(tuple[BODY]); + idField.setValue(tuple[ID]); return doc; } } diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java index de36bd4e008..d737f4d013c 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java @@ -52,6 +52,7 @@ public class LineDocMaker extends BasicDocMaker { Field bodyField; Field titleField; Field dateField; + Field idField; public DocState() { @@ -70,11 +71,13 @@ public class LineDocMaker extends BasicDocMaker { storeVal, Field.Index.TOKENIZED, termVecVal); + idField = new Field(BasicDocMaker.ID_FIELD, "", Field.Store.YES, Field.Index.NO_NORMS); doc = new Document(); doc.add(bodyField); doc.add(titleField); doc.add(dateField); + doc.add(idField); } final static String SEP = WriteLineDocTask.SEP;