LUCENE-1540: Improvements to contrib.benchmark for TREC collections - port/merge from 3x.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1067359 13f79535-47bb-0310-9956-ffa450edef68
Doron Cohen 2011-02-05 00:35:09 +00:00
parent 2de3b26a09
commit 8c487e588c
17 changed files with 785 additions and 223 deletions

View File

@@ -17,20 +17,13 @@ package org.apache.lucene.index;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.Arrays; import java.util.Arrays;
import java.util.Enumeration;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
@@ -78,39 +71,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
} }
*/ */
/* Unzips zipName --> dirName, removing dirName
first */
public void unzip(File zipName, String destDirName) throws IOException {
ZipFile zipFile = new ZipFile(zipName);
Enumeration<? extends ZipEntry> entries = zipFile.entries();
String dirName = fullDir(destDirName);
File fileDir = new File(dirName);
rmDir(destDirName);
fileDir.mkdir();
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
InputStream in = zipFile.getInputStream(entry);
OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(fileDir, entry.getName())));
byte[] buffer = new byte[8192];
int len;
while((len = in.read(buffer)) >= 0) {
out.write(buffer, 0, len);
}
in.close();
out.close();
}
zipFile.close();
}
/* /*
public void testCreateCFS() throws IOException { public void testCreateCFS() throws IOException {
String dirName = "testindex.cfs"; String dirName = "testindex.cfs";
@@ -153,10 +113,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: index " + unsupportedNames[i]); System.out.println("TEST: index " + unsupportedNames[i]);
} }
unzip(getDataFile("unsupported." + unsupportedNames[i] + ".zip"), unsupportedNames[i]); File oldIndxeDir = _TestUtil.getTempDir(unsupportedNames[i]);
_TestUtil.unzip(getDataFile("unsupported." + unsupportedNames[i] + ".zip"), oldIndxeDir);
String fullPath = fullDir(unsupportedNames[i]); Directory dir = newFSDirectory(oldIndxeDir);
Directory dir = newFSDirectory(new File(fullPath));
IndexReader reader = null; IndexReader reader = null;
IndexWriter writer = null; IndexWriter writer = null;
@@ -200,7 +159,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName())); assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName()));
dir.close(); dir.close();
rmDir(unsupportedNames[i]); _TestUtil.rmDir(oldIndexDir);
} }
} }
@@ -209,10 +168,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println("\nTEST: index=" + oldNames[i]); System.out.println("\nTEST: index=" + oldNames[i]);
} }
unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); File oldIndxeDir = _TestUtil.getTempDir(oldNames[i]);
_TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndxeDir);
String fullPath = fullDir(oldNames[i]); Directory dir = newFSDirectory(oldIndxeDir);
Directory dir = newFSDirectory(new File(fullPath));
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig( IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(
TEST_VERSION_CURRENT, new MockAnalyzer())); TEST_VERSION_CURRENT, new MockAnalyzer()));
@@ -223,15 +181,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
_TestUtil.checkIndex(dir); _TestUtil.checkIndex(dir);
dir.close(); dir.close();
rmDir(oldNames[i]); _TestUtil.rmDir(oldIndexDir);
} }
} }
public void testAddOldIndexes() throws IOException { public void testAddOldIndexes() throws IOException {
for (String name : oldNames) { for (String name : oldNames) {
unzip(getDataFile("index." + name + ".zip"), name); File oldIndxeDir = _TestUtil.getTempDir(name);
String fullPath = fullDir(name); _TestUtil.unzip(getDataFile("index." + name + ".zip"), oldIndxeDir);
Directory dir = newFSDirectory(new File(fullPath)); Directory dir = newFSDirectory(oldIndxeDir);
Directory targetDir = newDirectory(); Directory targetDir = newDirectory();
IndexWriter w = new IndexWriter(targetDir, newIndexWriterConfig( IndexWriter w = new IndexWriter(targetDir, newIndexWriterConfig(
@@ -243,15 +201,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close(); dir.close();
targetDir.close(); targetDir.close();
rmDir(name); _TestUtil.rmDir(oldIndexDir);
} }
} }
public void testAddOldIndexesReader() throws IOException { public void testAddOldIndexesReader() throws IOException {
for (String name : oldNames) { for (String name : oldNames) {
unzip(getDataFile("index." + name + ".zip"), name); File oldIndxeDir = _TestUtil.getTempDir(name);
String fullPath = fullDir(name); _TestUtil.unzip(getDataFile("index." + name + ".zip"), oldIndxeDir);
Directory dir = newFSDirectory(new File(fullPath)); Directory dir = newFSDirectory(oldIndxeDir);
IndexReader reader = IndexReader.open(dir); IndexReader reader = IndexReader.open(dir);
Directory targetDir = newDirectory(); Directory targetDir = newDirectory();
@@ -265,23 +223,25 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close(); dir.close();
targetDir.close(); targetDir.close();
rmDir(name); _TestUtil.rmDir(oldIndexDir);
} }
} }
public void testSearchOldIndex() throws IOException { public void testSearchOldIndex() throws IOException {
for(int i=0;i<oldNames.length;i++) { for(int i=0;i<oldNames.length;i++) {
unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); File oldIndxeDir = _TestUtil.getTempDir(oldNames[i]);
searchIndex(oldNames[i], oldNames[i]); _TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndxeDir);
rmDir(oldNames[i]); searchIndex(oldIndxeDir, oldNames[i]);
_TestUtil.rmDir(oldIndxeDir);
} }
} }
public void testIndexOldIndexNoAdds() throws IOException { public void testIndexOldIndexNoAdds() throws IOException {
for(int i=0;i<oldNames.length;i++) { for(int i=0;i<oldNames.length;i++) {
unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); File oldIndxeDir = _TestUtil.getTempDir(oldNames[i]);
changeIndexNoAdds(random, oldNames[i]); _TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndxeDir);
rmDir(oldNames[i]); changeIndexNoAdds(random, oldIndxeDir);
_TestUtil.rmDir(oldIndxeDir);
} }
} }
@@ -290,9 +250,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: oldName=" + oldNames[i]); System.out.println("TEST: oldName=" + oldNames[i]);
} }
unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); File oldIndxeDir = _TestUtil.getTempDir(oldNames[i]);
changeIndexWithAdds(random, oldNames[i]); _TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndxeDir);
rmDir(oldNames[i]); changeIndexWithAdds(random, oldIndxeDir, oldNames[i]);
_TestUtil.rmDir(oldIndxeDir);
} }
} }
@@ -305,13 +266,11 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
} }
} }
public void searchIndex(String dirName, String oldName) throws IOException { public void searchIndex(File indexDir, String oldName) throws IOException {
//QueryParser parser = new QueryParser("contents", new MockAnalyzer()); //QueryParser parser = new QueryParser("contents", new MockAnalyzer());
//Query query = parser.parse("handle:1"); //Query query = parser.parse("handle:1");
dirName = fullDir(dirName); Directory dir = newFSDirectory(indexDir);
Directory dir = newFSDirectory(new File(dirName));
IndexSearcher searcher = new IndexSearcher(dir, true); IndexSearcher searcher = new IndexSearcher(dir, true);
IndexReader reader = searcher.getIndexReader(); IndexReader reader = searcher.getIndexReader();
@@ -343,7 +302,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
} }
TermFreqVector tfv = reader.getTermFreqVector(i, "utf8"); TermFreqVector tfv = reader.getTermFreqVector(i, "utf8");
assertNotNull("docID=" + i + " index=" + dirName, tfv); assertNotNull("docID=" + i + " index=" + indexDir.getName(), tfv);
assertTrue(tfv instanceof TermPositionVector); assertTrue(tfv instanceof TermPositionVector);
} else } else
// Only ID 7 is deleted // Only ID 7 is deleted
@@ -376,11 +335,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
return v0 - v1; return v0 - v1;
} }
public void changeIndexWithAdds(Random random, String dirName) throws IOException { public void changeIndexWithAdds(Random random, File oldIndexDir, String origOldName) throws IOException {
String origDirName = dirName;
dirName = fullDir(dirName);
Directory dir = newFSDirectory(new File(dirName)); Directory dir = newFSDirectory(oldIndexDir);
// open writer // open writer
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND)); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
writer.setInfoStream(VERBOSE ? System.out : null); writer.setInfoStream(VERBOSE ? System.out : null);
@@ -391,7 +348,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
// make sure writer sees right total -- writer seems not to know about deletes in .del? // make sure writer sees right total -- writer seems not to know about deletes in .del?
final int expected; final int expected;
if (compare(origDirName, "24") < 0) { if (compare(origOldName, "24") < 0) {
expected = 44; expected = 44;
} else { } else {
expected = 45; expected = 45;
@@ -442,11 +399,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close(); dir.close();
} }
public void changeIndexNoAdds(Random random, String dirName) throws IOException { public void changeIndexNoAdds(Random random, File oldIndexDir) throws IOException {
dirName = fullDir(dirName); Directory dir = newFSDirectory(oldIndexDir);
Directory dir = newFSDirectory(new File(dirName));
// make sure searching sees right # hits // make sure searching sees right # hits
IndexSearcher searcher = new IndexSearcher(dir, true); IndexSearcher searcher = new IndexSearcher(dir, true);
@@ -489,13 +444,12 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
dir.close(); dir.close();
} }
public void createIndex(Random random, String dirName, boolean doCFS) throws IOException { public File createIndex(Random random, String dirName, boolean doCFS) throws IOException {
rmDir(dirName); File indexDir = _TestUtil.getTempDir(dirName);
_TestUtil.rmDir(indexDir);
dirName = fullDir(dirName); Directory dir = newFSDirectory(indexDir);
Directory dir = newFSDirectory(new File(dirName));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMaxBufferedDocs(10); IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMaxBufferedDocs(10);
((LogMergePolicy) conf.getMergePolicy()).setUseCompoundFile(doCFS); ((LogMergePolicy) conf.getMergePolicy()).setUseCompoundFile(doCFS);
IndexWriter writer = new IndexWriter(dir, conf); IndexWriter writer = new IndexWriter(dir, conf);
@@ -522,17 +476,21 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
// Set one norm so we get a .s0 file: // Set one norm so we get a .s0 file:
reader.setNorm(21, "content", conf.getSimilarityProvider().get("content").encodeNormValue(1.5f)); reader.setNorm(21, "content", conf.getSimilarityProvider().get("content").encodeNormValue(1.5f));
reader.close(); reader.close();
dir.close();
return indexDir;
} }
/* Verifies that the expected file names were produced */ /* Verifies that the expected file names were produced */
public void testExactFileNames() throws IOException { public void testExactFileNames() throws IOException {
String outputDir = "lucene.backwardscompat0.index"; String outputDirName = "lucene.backwardscompat0.index";
rmDir(outputDir); File outputDir = _TestUtil.getTempDir(outputDirName);
_TestUtil.rmDir(outputDir);
try { try {
Directory dir = newFSDirectory(new File(fullDir(outputDir))); Directory dir = newFSDirectory(outputDir);
LogMergePolicy mergePolicy = newLogMergePolicy(true, 10); LogMergePolicy mergePolicy = newLogMergePolicy(true, 10);
mergePolicy.setNoCFSRatio(1); // This test expects all of its segments to be in CFS mergePolicy.setNoCFSRatio(1); // This test expects all of its segments to be in CFS
@@ -595,7 +553,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
} }
dir.close(); dir.close();
} finally { } finally {
rmDir(outputDir); _TestUtil.rmDir(outputDir);
} }
} }
@@ -636,23 +594,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
writer.addDocument(doc); writer.addDocument(doc);
} }
private void rmDir(String dir) throws IOException {
File fileDir = new File(fullDir(dir));
if (fileDir.exists()) {
File[] files = fileDir.listFiles();
if (files != null) {
for (int i = 0; i < files.length; i++) {
files[i].delete();
}
}
fileDir.delete();
}
}
public static String fullDir(String dirName) throws IOException {
return new File(TEMP_DIR, dirName).getCanonicalPath();
}
private int countDocs(DocsEnum docs) throws IOException { private int countDocs(DocsEnum docs) throws IOException {
int count = 0; int count = 0;
while((docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { while((docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
@@ -664,9 +605,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
// flex: test basics of TermsEnum api on non-flex index // flex: test basics of TermsEnum api on non-flex index
public void testNextIntoWrongField() throws Exception { public void testNextIntoWrongField() throws Exception {
for(int i=0;i<oldNames.length;i++) { for(int i=0;i<oldNames.length;i++) {
unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); File oldIndexDir = _TestUtil.getTempDir(oldNames[i]);
String fullPath = fullDir(oldNames[i]); _TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndexDir);
Directory dir = newFSDirectory(new File(fullPath)); Directory dir = newFSDirectory(oldIndexDir);
IndexReader r = IndexReader.open(dir); IndexReader r = IndexReader.open(dir);
TermsEnum terms = MultiFields.getFields(r).terms("content").iterator(); TermsEnum terms = MultiFields.getFields(r).terms("content").iterator();
BytesRef t = terms.next(); BytesRef t = terms.next();
@@ -703,16 +644,16 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
r.close(); r.close();
dir.close(); dir.close();
rmDir(oldNames[i]); _TestUtil.rmDir(oldIndexDir);
} }
} }
public void testNumericFields() throws Exception { public void testNumericFields() throws Exception {
for(int i=0;i<oldNames.length;i++) { for(int i=0;i<oldNames.length;i++) {
unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); File oldIndexDir = _TestUtil.getTempDir(oldNames[i]);
String fullPath = fullDir(oldNames[i]); _TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndexDir);
Directory dir = newFSDirectory(new File(fullPath)); Directory dir = newFSDirectory(oldIndexDir);
IndexSearcher searcher = new IndexSearcher(dir, true); IndexSearcher searcher = new IndexSearcher(dir, true);
for (int id=10; id<15; id++) { for (int id=10; id<15; id++) {
@@ -747,7 +688,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
searcher.close(); searcher.close();
dir.close(); dir.close();
rmDir(oldNames[i]); _TestUtil.rmDir(oldIndexDir);
} }
} }

View File

@@ -17,13 +17,20 @@ package org.apache.lucene.util;
* limitations under the License. * limitations under the License.
*/ */
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.Enumeration;
import java.util.Random; import java.util.Random;
import java.util.Map; import java.util.Map;
import java.util.HashMap; import java.util.HashMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.junit.Assert; import org.junit.Assert;
@@ -61,6 +68,49 @@ public class _TestUtil {
} }
} }
/**
* Convenience method: unzip the zipName file under destDir, removing destDir first
*/
public static void unzip(File zipName, File destDir) throws IOException {
ZipFile zipFile = new ZipFile(zipName);
Enumeration<? extends ZipEntry> entries = zipFile.entries();
rmDir(destDir);
destDir.mkdir();
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
InputStream in = zipFile.getInputStream(entry);
File targetFile = new File(destDir, entry.getName());
if (entry.isDirectory()) {
// allow unzipping with directory structure
targetFile.mkdirs();
} else {
if (targetFile.getParentFile()!=null) {
// be on the safe side: do not rely on directories always being extracted
// before their children (this makes sense, but is it guaranteed?)
targetFile.getParentFile().mkdirs();
}
OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFile));
byte[] buffer = new byte[8192];
int len;
while((len = in.read(buffer)) >= 0) {
out.write(buffer, 0, len);
}
in.close();
out.close();
}
}
zipFile.close();
}
public static void syncConcurrentMerges(IndexWriter writer) { public static void syncConcurrentMerges(IndexWriter writer) {
syncConcurrentMerges(writer.getConfig().getMergeScheduler()); syncConcurrentMerges(writer.getConfig().getMergeScheduler());
} }
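The new helpers are typically combined with getTempDir and rmDir. A minimal usage sketch (the zip name here is hypothetical; TestBackwardsCompatibility above is the real consumer):

File tmpDir = _TestUtil.getTempDir("someindex");
_TestUtil.unzip(getDataFile("index.someversion.zip"), tmpDir); // wipes tmpDir first
Directory dir = newFSDirectory(tmpDir);
try {
  // ... exercise the unzipped index ...
} finally {
  dir.close();
  _TestUtil.rmDir(tmpDir);
}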

View File

@@ -2,6 +2,12 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
02/05/2011
LUCENE-1540: Improvements to contrib.benchmark for TREC collections.
ContentSource can now process plain text files, gzip files, and bzip2 files.
TREC doc parsing now handles the TREC gov2 collection and TREC disks 4&5-CR
collection (both used by many TREC tasks). (Shai Erera, Doron Cohen)
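For illustration, a minimal sketch of reading such a collection through the new parser selection; the property values below are hypothetical, and the same keys can be set in a benchmark .alg file:

Properties props = new Properties();
props.setProperty("docs.dir", "/data/trec"); // may mix plain text, .gz and .bz2 files
props.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecParserByPath");
props.setProperty("content.source.forever", "false");
TrecContentSource source = new TrecContentSource();
source.setConfig(new Config(props));
source.resetInputs();
DocData dd = new DocData();
try {
  while (true) {
    dd = source.getNextDocData(dd); // next parsed TREC doc
  }
} catch (NoMoreDataException e) {
  // all input files exhausted
}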
01/26/2011 01/26/2011
LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That
way, if a previous extract attempt failed, "ant extract-reuters" will still way, if a previous extract attempt failed, "ant extract-reuters" will still

View File

@@ -56,11 +56,14 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
public abstract class ContentSource { public abstract class ContentSource {
private static final int BZIP = 0; private static final int BZIP = 0;
private static final int OTHER = 1; private static final int GZIP = 1;
private static final int OTHER = 2;
private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>(); private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>();
static { static {
extensionToType.put(".bz2", Integer.valueOf(BZIP)); extensionToType.put(".bz2", Integer.valueOf(BZIP));
extensionToType.put(".bzip", Integer.valueOf(BZIP)); extensionToType.put(".bzip", Integer.valueOf(BZIP));
extensionToType.put(".gz", Integer.valueOf(GZIP));
extensionToType.put(".gzip", Integer.valueOf(GZIP));
} }
protected static final int BUFFER_SIZE = 1 << 16; // 64K protected static final int BUFFER_SIZE = 1 << 16; // 64K
@@ -78,11 +81,13 @@ public abstract class ContentSource {
private CompressorStreamFactory csFactory = new CompressorStreamFactory(); private CompressorStreamFactory csFactory = new CompressorStreamFactory();
/** update count of bytes generated by this source */
protected final synchronized void addBytes(long numBytes) { protected final synchronized void addBytes(long numBytes) {
bytesCount += numBytes; bytesCount += numBytes;
totalBytesCount += numBytes; totalBytesCount += numBytes;
} }
/** update count of documents generated by this source */
protected final synchronized void addDoc() { protected final synchronized void addDoc() {
++docsCount; ++docsCount;
++totalDocsCount; ++totalDocsCount;
@@ -130,21 +135,25 @@ public abstract class ContentSource {
type = typeInt.intValue(); type = typeInt.intValue();
} }
} }
switch (type) {
case BZIP: try {
try { switch (type) {
case BZIP:
// According to BZip2CompressorInputStream's code, it reads the first // According to BZip2CompressorInputStream's code, it reads the first
// two file header chars ('B' and 'Z'). It is important to wrap the // two file header chars ('B' and 'Z'). It is important to wrap the
// underlying input stream with a buffered one since // underlying input stream with a buffered one since
// Bzip2CompressorInputStream uses the read() method exclusively. // Bzip2CompressorInputStream uses the read() method exclusively.
is = csFactory.createCompressorInputStream("bzip2", is); is = csFactory.createCompressorInputStream("bzip2", is);
} catch (CompressorException e) { break;
IOException ioe = new IOException(e.getMessage()); case GZIP:
ioe.initCause(e); is = csFactory.createCompressorInputStream("gz", is);
throw ioe; break;
} default: // Do nothing, stay with FileInputStream
break; }
default: // Do nothing, stay with FileInputStream } catch (CompressorException e) {
IOException ioe = new IOException(e.getMessage());
ioe.initCause(e);
throw ioe;
} }
return is; return is;

View File

@@ -29,11 +29,14 @@ import java.util.Properties;
*/ */
public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser { public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader); org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader);
// title // title
String title = p.getTitle(); if (title==null) {
title = p.getTitle();
}
// properties // properties
Properties props = p.getMetaTags(); Properties props = p.getMetaTags();
// body // body

View File

@@ -29,16 +29,18 @@ public interface HTMLParser {
/** /**
* Parse the input Reader and return DocData. * Parse the input Reader and return DocData.
* A provided name or date is used for the result, otherwise an attempt is * The provided name, title, and date are used for the result, unless they are null,
* made to set them from the parsed data. * in which case an attempt is made to set them from the parsed data.
* @param dateFormat date formatter to use for extracting the date. * @param docData result reused
* @param name name of the result doc data. If null, attempt to set by parsed data. * @param name name of the result doc data.
* @param date date of the result doc data. If null, attempt to set by parsed data. * @param date date of the result doc data. If null, attempt to set by parsed data.
* @param reader of html text to parse. * @param title title of the result doc data. If null, attempt to set by parsed data.
* @param reader reader of html text to parse.
* @param dateFormat date formatter to use for extracting the date.
* @return Parsed doc data. * @return Parsed doc data.
* @throws IOException * @throws IOException
* @throws InterruptedException * @throws InterruptedException
*/ */
public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException; public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
} }
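As a sketch, a caller that already knows the title would invoke the extended method like this (all values hypothetical); passing null for date or title asks the parser to extract them from the HTML itself:

Reader html = new StringReader("<html><head><title>T</title></head><body>B</body></html>");
DocData dd = htmlParser.parse(new DocData(), "doc-001", null, "Known title", html, null);
// dd keeps "Known title"; with title == null it would be taken from <title>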

View File

@@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.feeds;
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.text.DateFormat; import java.text.DateFormat;
@@ -29,8 +29,8 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.Locale; import java.util.Locale;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader; import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.ThreadInterruptedException;
@@ -46,8 +46,10 @@ import org.apache.lucene.util.ThreadInterruptedException;
* <li><b>docs.dir</b> - specifies the directory where the TREC files reside. * <li><b>docs.dir</b> - specifies the directory where the TREC files reside.
* Can be set to a relative path if "work.dir" is also specified * Can be set to a relative path if "work.dir" is also specified
* (<b>default=trec</b>). * (<b>default=trec</b>).
* <li><b>trec.doc.parser</b> - specifies the {@link TrecDocParser} class to use for
* parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>). * parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used. * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
* <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname * <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname
* </ul> * </ul>
@@ -59,22 +61,24 @@ public class TrecContentSource extends ContentSource {
ParsePosition pos; ParsePosition pos;
} }
private static final String DATE = "Date: "; public static final String DOCNO = "<DOCNO>";
private static final String DOCHDR = "<DOCHDR>"; public static final String TERMINATING_DOCNO = "</DOCNO>";
private static final String TERMINATING_DOCHDR = "</DOCHDR>"; public static final String DOC = "<DOC>";
private static final String DOCNO = "<DOCNO>"; public static final String TERMINATING_DOC = "</DOC>";
private static final String TERMINATING_DOCNO = "</DOCNO>";
private static final String DOC = "<DOC>";
private static final String TERMINATING_DOC = "</DOC>";
private static final String NEW_LINE = System.getProperty("line.separator"); /** separator between lines in the buffer */
public static final String NEW_LINE = System.getProperty("line.separator");
private static final String DATE_FORMATS [] = { private static final String DATE_FORMATS [] = {
"EEE, dd MMM yyyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT "EEE, dd MMM yyyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
"EEE MMM dd kk:mm:ss yyyy z", // Tue Dec 09 16:45:08 2003 EST "EEE MMM dd kk:mm:ss yyyy z", // Tue Dec 09 16:45:08 2003 EST
"EEE, dd-MMM-':'y kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT "EEE, dd-MMM-':'y kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
"EEE, dd-MMM-yyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT "EEE, dd-MMM-yyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT
"EEE MMM dd kk:mm:ss yyyy", // Tue Dec 09 16:45:08 2003 "EEE MMM dd kk:mm:ss yyyy", // Tue Dec 09 16:45:08 2003
"dd MMM yyyy", // 1 March 1994
"MMM dd, yyyy", // February 3, 1994
"yyMMdd", // 910513
"hhmm z.z.z. MMM dd, yyyy", // 0901 u.t.c. April 28, 1994
}; };
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>(); private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
@@ -83,7 +87,7 @@ public class TrecContentSource extends ContentSource {
private File dataDir = null; private File dataDir = null;
private ArrayList<File> inputFiles = new ArrayList<File>(); private ArrayList<File> inputFiles = new ArrayList<File>();
private int nextFile = 0; private int nextFile = 0;
private int rawDocSize; private int rawDocSize = 0;
// Use to synchronize threads on reading from the TREC documents. // Use to synchronize threads on reading from the TREC documents.
private Object lock = new Object(); private Object lock = new Object();
@@ -92,7 +96,10 @@ public class TrecContentSource extends ContentSource {
BufferedReader reader; BufferedReader reader;
int iteration = 0; int iteration = 0;
HTMLParser htmlParser; HTMLParser htmlParser;
private boolean excludeDocnameIteration; private boolean excludeDocnameIteration;
private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
ParsePathType currPathType; // not private for tests
private DateFormatInfo getDateFormatInfo() { private DateFormatInfo getDateFormatInfo() {
DateFormatInfo dfi = dateFormats.get(); DateFormatInfo dfi = dateFormats.get();
@@ -118,7 +125,7 @@ public class TrecContentSource extends ContentSource {
return sb; return sb;
} }
private Reader getTrecDocReader(StringBuilder docBuffer) { Reader getTrecDocReader(StringBuilder docBuffer) {
StringBuilderReader r = trecDocReader.get(); StringBuilderReader r = trecDocReader.get();
if (r == null) { if (r == null) {
r = new StringBuilderReader(docBuffer); r = new StringBuilderReader(docBuffer);
@@ -129,10 +136,21 @@ public class TrecContentSource extends ContentSource {
return r; return r;
} }
// read until finding a line that starts with the specified prefix, or a terminating tag has been found. HTMLParser getHtmlParser() {
private void read(StringBuilder buf, String prefix, boolean collectMatchLine, return htmlParser;
boolean collectAll, String terminatingTag) }
throws IOException, NoMoreDataException {
/**
* Read until a line starting with the specified <code>lineStart</code>.
* @param buf buffer for collecting the data if so specified.
* @param lineStart line start to look for, must not be null.
* @param collectMatchLine whether to collect the matching line into <code>buf</code>.
* @param collectAll whether to collect all lines into <code>buf</code>.
* @throws IOException
* @throws NoMoreDataException
*/
private void read(StringBuilder buf, String lineStart,
boolean collectMatchLine, boolean collectAll) throws IOException, NoMoreDataException {
String sep = ""; String sep = "";
while (true) { while (true) {
String line = reader.readLine(); String line = reader.readLine();
@@ -144,20 +162,12 @@ public class TrecContentSource extends ContentSource {
rawDocSize += line.length(); rawDocSize += line.length();
if (line.startsWith(prefix)) { if (lineStart!=null && line.startsWith(lineStart)) {
if (collectMatchLine) { if (collectMatchLine) {
buf.append(sep).append(line); buf.append(sep).append(line);
sep = NEW_LINE; sep = NEW_LINE;
} }
break; return;
}
if (terminatingTag != null && line.startsWith(terminatingTag)) {
// didn't find the prefix that was asked, but the terminating
// tag was found. set the length to 0 to signal no match was
// found.
buf.setLength(0);
break;
} }
if (collectAll) { if (collectAll) {
@@ -169,7 +179,7 @@ public class TrecContentSource extends ContentSource {
void openNextFile() throws NoMoreDataException, IOException { void openNextFile() throws NoMoreDataException, IOException {
close(); close();
int retries = 0; currPathType = null;
while (true) { while (true) {
if (nextFile >= inputFiles.size()) { if (nextFile >= inputFiles.size()) {
// exhausted files, start a new round, unless forever set to false. // exhausted files, start a new round, unless forever set to false.
@@ -184,13 +194,13 @@ public class TrecContentSource extends ContentSource {
System.out.println("opening: " + f + " length: " + f.length()); System.out.println("opening: " + f + " length: " + f.length());
} }
try { try {
GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE); InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension
reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE); reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE);
currPathType = TrecDocParser.pathType(f);
return; return;
} catch (Exception e) { } catch (Exception e) {
retries++; if (verbose) {
if (retries < 20 && verbose) { System.out.println("Skipping 'bad' file " + f.getAbsolutePath()+" due to "+e.getMessage());
System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries);
continue; continue;
} }
throw new NoMoreDataException(); throw new NoMoreDataException();
@@ -198,7 +208,7 @@ public class TrecContentSource extends ContentSource {
} }
} }
Date parseDate(String dateStr) { public Date parseDate(String dateStr) {
dateStr = dateStr.trim(); dateStr = dateStr.trim();
DateFormatInfo dfi = getDateFormatInfo(); DateFormatInfo dfi = getDateFormatInfo();
for (int i = 0; i < dfi.dfs.length; i++) { for (int i = 0; i < dfi.dfs.length; i++) {
@@ -237,70 +247,47 @@ public class TrecContentSource extends ContentSource {
@Override @Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
String dateStr = null, name = null; String name = null;
Reader r = null; StringBuilder docBuf = getDocBuffer();
ParsePathType parsedPathType;
// protect reading from the TREC files by multiple threads. The rest of the // protect reading from the TREC files by multiple threads. The rest of the
// method, i.e., parsing the content and returning the DocData can run // method, i.e., parsing the content and returning the DocData can run unprotected.
// unprotected.
synchronized (lock) { synchronized (lock) {
if (reader == null) { if (reader == null) {
openNextFile(); openNextFile();
} }
StringBuilder docBuf = getDocBuffer();
// 1. skip until doc start // 1. skip until doc start - required for all TREC formats
docBuf.setLength(0); docBuf.setLength(0);
read(docBuf, DOC, false, false, null); read(docBuf, DOC, false, false);
// 2. name // save the file's path type for passing to trecDocParser after the sync block,
// in case another thread opens another file in between.
parsedPathType = currPathType;
// 2. name - required for all TREC formats
docBuf.setLength(0); docBuf.setLength(0);
read(docBuf, DOCNO, true, false, null); read(docBuf, DOCNO, true, false);
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO, name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
DOCNO.length())); DOCNO.length())).trim();
if (!excludeDocnameIteration)
if (!excludeDocnameIteration) {
name = name + "_" + iteration; name = name + "_" + iteration;
// 3. skip until doc header
docBuf.setLength(0);
read(docBuf, DOCHDR, false, false, null);
boolean findTerminatingDocHdr = false;
// 4. date - look for the date only until /DOCHDR
docBuf.setLength(0);
read(docBuf, DATE, true, false, TERMINATING_DOCHDR);
if (docBuf.length() != 0) {
// Date found.
dateStr = docBuf.substring(DATE.length());
findTerminatingDocHdr = true;
} }
// 5. skip until end of doc header // 3. read all until end of doc
if (findTerminatingDocHdr) {
docBuf.setLength(0);
read(docBuf, TERMINATING_DOCHDR, false, false, null);
}
// 6. collect until end of doc
docBuf.setLength(0); docBuf.setLength(0);
read(docBuf, TERMINATING_DOC, false, true, null); read(docBuf, TERMINATING_DOC, false, true);
// 7. Set up a Reader over the read content
r = getTrecDocReader(docBuf);
// Resetting the thread's reader means it will reuse the instance
// allocated as well as re-read from docBuf.
r.reset();
// count char length of parsed html text (larger than the plain doc body text).
addBytes(docBuf.length());
} }
// count char length of text to be parsed (may be larger than the resulting plain doc body text).
addBytes(docBuf.length());
// This code segment relies on HtmlParser being thread safe. When we get // This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe. // here, everything else is already private to that thread, so we're safe.
Date date = dateStr != null ? parseDate(dateStr) : null;
try { try {
docData = htmlParser.parse(docData, name, date, r, null); docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
addDoc(); addDoc();
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie); throw new ThreadInterruptedException(ie);
@@ -322,27 +309,40 @@ public class TrecContentSource extends ContentSource {
@Override @Override
public void setConfig(Config config) { public void setConfig(Config config) {
super.setConfig(config); super.setConfig(config);
// dirs
File workDir = new File(config.get("work.dir", "work")); File workDir = new File(config.get("work.dir", "work"));
String d = config.get("docs.dir", "trec"); String d = config.get("docs.dir", "trec");
dataDir = new File(d); dataDir = new File(d);
if (!dataDir.isAbsolute()) { if (!dataDir.isAbsolute()) {
dataDir = new File(workDir, d); dataDir = new File(workDir, d);
} }
// files
collectFiles(dataDir, inputFiles); collectFiles(dataDir, inputFiles);
if (inputFiles.size() == 0) { if (inputFiles.size() == 0) {
throw new IllegalArgumentException("No files in dataDir: " + dataDir); throw new IllegalArgumentException("No files in dataDir: " + dataDir);
} }
// trec doc parser
try { try {
String parserClassName = config.get("html.parser", String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
"org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser"); trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class).newInstance();
htmlParser = Class.forName(parserClassName).asSubclass(HTMLParser.class).newInstance();
} catch (Exception e) { } catch (Exception e) {
// Should not get here. Throw runtime exception. // Should not get here. Throw runtime exception.
throw new RuntimeException(e); throw new RuntimeException(e);
} }
// html parser
try {
String htmlParserClassName = config.get("html.parser",
"org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class).newInstance();
} catch (Exception e) {
// Should not get here. Throw runtime exception.
throw new RuntimeException(e);
}
// encoding
if (encoding == null) { if (encoding == null) {
encoding = "ISO-8859-1"; encoding = "ISO-8859-1";
} }
// iteration exclusion in doc name
excludeDocnameIteration = config.get("content.source.excludeIteration", false); excludeDocnameIteration = config.get("content.source.excludeIteration", false);
} }

View File

@@ -0,0 +1,135 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* Parser for trec doc content, invoked on doc text excluding <DOC> and <DOCNO>
* which are handled in TrecContentSource. Required to be stateless and hence thread safe.
*/
public abstract class TrecDocParser {
/** Types of trec parse paths. */
public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES }
/** trec parser type used for unknown extensions */
public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2;
static final Map<ParsePathType,TrecDocParser> pathType2parser = new HashMap<ParsePathType,TrecDocParser>();
static {
pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser());
pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser());
pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser());
pathType2parser.put(ParsePathType.FT, new TrecFTParser());
pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser());
}
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
static {
for (ParsePathType ppt : ParsePathType.values()) {
pathName2Type.put(ppt.name(),ppt);
}
}
/** max number of ancestors to walk up from a file when looking for a known path type */
private static final int MAX_PATH_LENGTH = 10;
/**
* Compute the path type of a file by inspecting the names of the file and its parents
*/
public static ParsePathType pathType(File f) {
int pathLength = 0;
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase());
if (ppt!=null) {
return ppt;
}
f = f.getParentFile();
}
return DEFAULT_PATH_TYPE;
}
/**
* Parse the text prepared in docBuf into a result DocData;
* no synchronization is required.
* @param docData reusable result
* @param name name that should be set to the result
* @param trecSrc calling trec content source
* @param docBuf text to parse
* @param pathType type of parsed file, or null if unknown - may be used by
* parsers to alter their behavior according to the file path type.
*/
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
/**
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
* @return text obtained when stripping all tags from <code>buf</code> (Input StringBuilder is unmodified).
*/
public static String stripTags(StringBuilder buf, int start) {
return stripTags(buf.substring(start),0);
}
/**
* strip tags from input.
* @see #stripTags(StringBuilder, int)
*/
public static String stripTags(String buf, int start) {
if (start>0) {
buf = buf.substring(start); // skip the first 'start' chars before stripping
}
return buf.replaceAll("<[^>]*>", " ");
}
/**
* Extract from <code>buf</code> the text of interest within specified tags
* @param buf entire input text
* @param startTag tag marking start of text of interest
* @param endTag tag marking end of text of interest
* @param maxPos if &ge; 0 sets a limit on start of text of interest
* @param noisePrefixes optional prefixes to skip past within the extracted text; may be null
* @return text of interest or null if not found
*/
public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) {
int k1 = buf.indexOf(startTag);
if (k1>=0 && (maxPos<0 || k1<maxPos)) {
k1 += startTag.length();
int k2 = buf.indexOf(endTag,k1);
if (k2>=0 && (maxPos<0 || k2<maxPos)) { // found end tag within the allowed range
if (noisePrefixes != null) {
for (String noise : noisePrefixes) {
int k1a = buf.indexOf(noise,k1);
if (k1a>=0 && k1a<k2) {
k1 = k1a + noise.length();
}
}
}
return buf.substring(k1,k2).trim();
}
}
return null;
}
//public static void main(String[] args) {
// System.out.println(stripTags("is it true that<space>2<<second space>><almost last space>1<one more space>?",0));
//}
}
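A short sketch of how the static helpers above behave on hypothetical inputs:

TrecDocParser.pathType(new File("/work/trec/FBIS/fb396001.gz")); // -> ParsePathType.FBIS, from the parent dir name
TrecDocParser.pathType(new File("/work/trec/misc/doc.txt")); // -> GOV2, the DEFAULT_PATH_TYPE
TrecDocParser.stripTags("a <B>bold</B> word", 0); // -> "a  bold  word" (each tag becomes one blank)
TrecDocParser.extract(new StringBuilder("<TI> Some title </TI>"), "<TI>", "</TI>", -1, null); // -> "Some title" (trimmed)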

View File

@@ -0,0 +1,65 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the FBIS docs in trec disks 4+5 collection format
*/
public class TrecFBISParser extends TrecDocParser {
private static final String HEADER = "<HEADER>";
private static final String HEADER_END = "</HEADER>";
private static final int HEADER_END_LENGTH = HEADER_END.length();
private static final String DATE1 = "<DATE1>";
private static final String DATE1_END = "</DATE1>";
private static final String TI = "<TI>";
private static final String TI_END = "</TI>";
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // how many chars of docBuf to skip
// optionally skip some of the text, set date, title
Date date = null;
String title = null;
int h1 = docBuf.indexOf(HEADER);
if (h1>=0) {
int h2 = docBuf.indexOf(HEADER_END,h1);
mark = h2+HEADER_END_LENGTH;
// date...
String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
// title...
title = extract(docBuf, TI, TI_END, h2, null);
}
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setTitle(title);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}
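For orientation, a sketch of the FBIS fragment shape this parser expects (content hypothetical; <DOC> and <DOCNO> were already consumed by TrecContentSource, and the sibling FT/FR94/LATimes parsers differ mainly in the tags they look for):

<HEADER> ... <DATE1>1 January 1991</DATE1> ... <TI>Some title</TI> ... </HEADER>
free text that becomes the body, with any remaining tags stripped to blanks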

View File

@@ -0,0 +1,66 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the FR94 docs in trec disks 4+5 collection format
*/
public class TrecFR94Parser extends TrecDocParser {
private static final String TEXT = "<TEXT>";
private static final int TEXT_LENGTH = TEXT.length();
private static final String TEXT_END = "</TEXT>";
private static final String DATE = "<DATE>";
private static final String[] DATE_NOISE_PREFIXES = {
"DATE:",
"date:", //TODO improve date extraction for this format
"t.c.",
};
private static final String DATE_END = "</DATE>";
//TODO can we also extract title for this format?
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // how many chars of docBuf to skip
// optionally skip some of the text, set date (no title?)
Date date = null;
int h1 = docBuf.indexOf(TEXT);
if (h1>=0) {
int h2 = docBuf.indexOf(TEXT_END,h1);
mark = h1+TEXT_LENGTH;
// date...
String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
if (dateStr != null) {
dateStr = stripTags(dateStr,0).toString();
date = trecSrc.parseDate(dateStr.trim());
}
}
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -0,0 +1,57 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the FT docs in trec disks 4+5 collection format
*/
public class TrecFTParser extends TrecDocParser {
private static final String DATE = "<DATE>";
private static final String DATE_END = "</DATE>";
private static final String HEADLINE = "<HEADLINE>";
private static final String HEADLINE_END = "</HEADLINE>";
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // how many chars of docBuf to skip
// date...
Date date = null;
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
// title...
String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setTitle(title);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -0,0 +1,59 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Date;
/**
* Parser for the GOV2 collection format
*/
public class TrecGov2Parser extends TrecDocParser {
private static final String DATE = "Date: ";
private static final String DATE_END = TrecContentSource.NEW_LINE;
private static final String DOCHDR = "<DOCHDR>";
private static final String TERMINATING_DOCHDR = "</DOCHDR>";
private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
// Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
Reader r = trecSrc.getTrecDocReader(docBuf);
// skip some of the text, optionally set date
Date date = null;
int h1 = docBuf.indexOf(DOCHDR);
if (h1>=0) {
int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
r.mark(h2+TERMINATING_DOCHDR_LENGTH);
}
r.reset();
HTMLParser htmlParser = trecSrc.getHtmlParser();
return htmlParser.parse(docData, name, date, null, r, null);
}
}

View File

@@ -0,0 +1,71 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
/**
* Parser for the LA Times docs in trec disks 4+5 collection format
*/
public class TrecLATimesParser extends TrecDocParser {
private static final String DATE = "<DATE>";
private static final String DATE_END = "</DATE>";
private static final String DATE_NOISE = "day,"; // anything after the ','
private static final String SUBJECT = "<SUBJECT>";
private static final String SUBJECT_END = "</SUBJECT>";
private static final String HEADLINE = "<HEADLINE>";
private static final String HEADLINE_END = "</HEADLINE>";
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
int mark = 0; // how many chars of docBuf to skip
// date...
Date date = null;
String dateStr = extract(docBuf, DATE, DATE_END, -1, null);
if (dateStr != null) {
int d2a = dateStr.indexOf(DATE_NOISE);
if (d2a > 0) {
dateStr = dateStr.substring(0,d2a+3); // we need the "day" part
}
dateStr = stripTags(dateStr,0).toString();
date = trecSrc.parseDate(dateStr.trim());
}
// title... first try SUBJECT, then HEADLINE
String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null);
if (title==null) {
title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
}
if (title!=null) {
title = stripTags(title,0).toString().trim();
}
docData.clear();
docData.setName(name);
docData.setDate(date);
docData.setTitle(title);
docData.setBody(stripTags(docBuf, mark).toString());
return docData;
}
}

View File

@@ -0,0 +1,33 @@
package org.apache.lucene.benchmark.byTask.feeds;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/**
* Parser for trec docs which selects the parser to apply according
* to the source file's path, defaulting to {@link TrecGov2Parser}.
*/
public class TrecParserByPath extends TrecDocParser {
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
}
}

View File

@@ -158,8 +158,10 @@ public class StringBuilderReader extends Reader {
synchronized (lock) { synchronized (lock) {
this.sb = sb; this.sb = sb;
length = sb.length(); length = sb.length();
next = mark = 0;
} }
} }
@Override @Override
public long skip(long ns) throws IOException { public long skip(long ns) throws IOException {
synchronized (lock) { synchronized (lock) {

View File

@@ -18,14 +18,20 @@ package org.apache.lucene.benchmark.byTask.feeds;
*/ */
import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
import java.text.ParseException; import java.text.ParseException;
import java.util.Arrays;
import java.util.Date; import java.util.Date;
import java.util.HashSet;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.DateTools; import org.apache.lucene.document.DateTools;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TrecContentSourceTest extends LuceneTestCase { public class TrecContentSourceTest extends LuceneTestCase {
@@ -329,5 +335,62 @@ public class TrecContentSourceTest extends LuceneTestCase {
// Don't test that NoMoreDataException is thrown, since the forever flag is // Don't test that NoMoreDataException is thrown, since the forever flag is
// turned on. // turned on.
} }
/**
* Open a trec content source over a directory with files of all trec path types and all
* supported formats - bzip, gzip, txt.
*/
public void testTrecFeedDirAllTypes() throws Exception {
File dataDir = _TestUtil.getTempDir("trecFeedAllTypes");
_TestUtil.unzip(getDataFile("trecdocs.zip"), dataDir);
TrecContentSource tcs = new TrecContentSource();
Properties props = new Properties();
props.setProperty("print.props", "false");
props.setProperty("content.source.verbose", "false");
props.setProperty("content.source.excludeIteration", "true");
props.setProperty("doc.maker.forever", "false");
props.setProperty("docs.dir", dataDir.getCanonicalPath().replace('\\','/'));
props.setProperty("trec.doc.parser", TrecParserByPath.class.getName());
props.setProperty("content.source.forever", "false");
tcs.setConfig(new Config(props));
tcs.resetInputs();
DocData dd = new DocData();
int n = 0;
boolean gotExpectedException = false;
HashSet<ParsePathType> unseenTypes = new HashSet<ParsePathType>(Arrays.asList(ParsePathType.values()));
try {
while (n<100) { // arbitrary limit to prevent looping forever in case of test failure
dd = tcs.getNextDocData(dd);
++n;
assertNotNull("doc data "+n+" should not be null!", dd);
unseenTypes.remove(tcs.currPathType);
switch(tcs.currPathType) {
case GOV2:
assertDocData(dd, "TEST-000", "TEST-000 title", "TEST-000 text", tcs.parseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
break;
case FBIS:
assertDocData(dd, "TEST-001", "TEST-001 Title", "TEST-001 text", tcs.parseDate("1 January 1991"));
break;
case FR94:
// no title extraction in this source for now
assertDocData(dd, "TEST-002", null, "DEPARTMENT OF SOMETHING", tcs.parseDate("February 3, 1994"));
break;
case FT:
assertDocData(dd, "TEST-003", "Test-003 title", "Some pub text", tcs.parseDate("980424"));
break;
case LATIMES:
assertDocData(dd, "TEST-004", "Test-004 Title", "Some paragraph", tcs.parseDate("January 17, 1997, Sunday"));
break;
default:
assertTrue("Should never get here!", false);
}
}
} catch (NoMoreDataException e) {
gotExpectedException = true;
}
assertTrue("Should have gotten NoMoreDataException!", gotExpectedException);
assertEquals("Wrong numbre of documents created by osurce!",5,n);
assertTrue("Did not see all types!",unseenTypes.isEmpty());
}
} }