diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 729c64ff531..39eeef0f81b 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -17,20 +17,13 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.io.BufferedOutputStream; import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; import java.io.PrintStream; import java.util.Arrays; -import java.util.Enumeration; import java.util.List; import java.util.Random; -import java.util.zip.ZipEntry; -import java.util.zip.ZipFile; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -78,39 +71,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase { } */ - /* Unzips zipName --> dirName, removing dirName - first */ - public void unzip(File zipName, String destDirName) throws IOException { - - ZipFile zipFile = new ZipFile(zipName); - - Enumeration entries = zipFile.entries(); - - String dirName = fullDir(destDirName); - - File fileDir = new File(dirName); - rmDir(destDirName); - - fileDir.mkdir(); - - while (entries.hasMoreElements()) { - ZipEntry entry = entries.nextElement(); - - InputStream in = zipFile.getInputStream(entry); - OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(fileDir, entry.getName()))); - - byte[] buffer = new byte[8192]; - int len; - while((len = in.read(buffer)) >= 0) { - out.write(buffer, 0, len); - } - - in.close(); - out.close(); - } - - zipFile.close(); - } /* public void testCreateCFS() throws IOException { String dirName = "testindex.cfs"; @@ -153,10 +113,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase { if (VERBOSE) { 
System.out.println("TEST: index " + unsupportedNames[i]); } - unzip(getDataFile("unsupported." + unsupportedNames[i] + ".zip"), unsupportedNames[i]); - - String fullPath = fullDir(unsupportedNames[i]); - Directory dir = newFSDirectory(new File(fullPath)); + File oldIndxeDir = _TestUtil.getTempDir(unsupportedNames[i]); + _TestUtil.unzip(getDataFile("unsupported." + unsupportedNames[i] + ".zip"), oldIndxeDir); + Directory dir = newFSDirectory(oldIndxeDir); IndexReader reader = null; IndexWriter writer = null; @@ -200,7 +159,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { assertTrue(bos.toString().contains(IndexFormatTooOldException.class.getName())); dir.close(); - rmDir(unsupportedNames[i]); + _TestUtil.rmDir(oldIndxeDir); } } @@ -209,10 +168,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase { if (VERBOSE) { System.out.println("\nTEST: index=" + oldNames[i]); } - unzip(getDataFile("index." + oldNames[i] + ".zip"), oldNames[i]); - - String fullPath = fullDir(oldNames[i]); - Directory dir = newFSDirectory(new File(fullPath)); + File oldIndxeDir = _TestUtil.getTempDir(oldNames[i]); + _TestUtil.unzip(getDataFile("index." + oldNames[i] + ".zip"), oldIndxeDir); + Directory dir = newFSDirectory(oldIndxeDir); IndexWriter w = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer())); @@ -223,15 +181,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase { _TestUtil.checkIndex(dir); dir.close(); - rmDir(oldNames[i]); + _TestUtil.rmDir(oldIndxeDir); } } public void testAddOldIndexes() throws IOException { for (String name : oldNames) { - unzip(getDataFile("index." + name + ".zip"), name); - String fullPath = fullDir(name); - Directory dir = newFSDirectory(new File(fullPath)); + File oldIndxeDir = _TestUtil.getTempDir(name); + _TestUtil.unzip(getDataFile("index." 
+ name + ".zip"), oldIndxeDir); + Directory dir = newFSDirectory(oldIndxeDir); Directory targetDir = newDirectory(); IndexWriter w = new IndexWriter(targetDir, newIndexWriterConfig( @@ -243,15 +201,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase { dir.close(); targetDir.close(); - rmDir(name); + _TestUtil.rmDir(oldIndxeDir); } } public void testAddOldIndexesReader() throws IOException { for (String name : oldNames) { - unzip(getDataFile("index." + name + ".zip"), name); - String fullPath = fullDir(name); - Directory dir = newFSDirectory(new File(fullPath)); + File oldIndxeDir = _TestUtil.getTempDir(name); + _TestUtil.unzip(getDataFile("index." + name + ".zip"), oldIndxeDir); + Directory dir = newFSDirectory(oldIndxeDir); IndexReader reader = IndexReader.open(dir); Directory targetDir = newDirectory(); @@ -265,23 +223,25 @@ public class TestBackwardsCompatibility extends LuceneTestCase { dir.close(); targetDir.close(); - rmDir(name); + _TestUtil.rmDir(oldIndxeDir); } } public void testSearchOldIndex() throws IOException { for(int i=0;i entries = zipFile.entries(); + + rmDir(destDir); + + destDir.mkdir(); + + while (entries.hasMoreElements()) { + ZipEntry entry = entries.nextElement(); + + InputStream in = zipFile.getInputStream(entry); + File targetFile = new File(destDir, entry.getName()); + if (entry.isDirectory()) { + // allow unzipping with directory structure + targetFile.mkdirs(); + } else { + if (targetFile.getParentFile()!=null) { + // be on the safe side: do not rely on that directories are always extracted + // before their children (although this makes sense, but is it guaranteed?) 
+ targetFile.getParentFile().mkdirs(); + } + OutputStream out = new BufferedOutputStream(new FileOutputStream(targetFile)); + + byte[] buffer = new byte[8192]; + int len; + while((len = in.read(buffer)) >= 0) { + out.write(buffer, 0, len); + } + + in.close(); + out.close(); + } + } + + zipFile.close(); + } + public static void syncConcurrentMerges(IndexWriter writer) { syncConcurrentMerges(writer.getConfig().getMergeScheduler()); } diff --git a/modules/benchmark/CHANGES.txt b/modules/benchmark/CHANGES.txt index 3811723e38b..8f5f082e7dd 100644 --- a/modules/benchmark/CHANGES.txt +++ b/modules/benchmark/CHANGES.txt @@ -2,6 +2,12 @@ Lucene Benchmark Contrib Change Log The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. +02/05/2011 + LUCENE-1540: Improvements to contrib.benchmark for TREC collections. + ContentSource can now process plain text files, gzip files, and bzip2 files. + TREC doc parsing now handles the TREC gov2 collection and TREC disks 4&5-CR + collection (both used by many TREC tasks). (Shai Erera, Doron Cohen) + 01/26/2011 LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. 
That way, if a previous extract attempt failed, "ant extract-reuters" will still diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java index 817e57d1c03..b831e69adab 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java @@ -56,11 +56,14 @@ import org.apache.lucene.benchmark.byTask.utils.Config; public abstract class ContentSource { private static final int BZIP = 0; - private static final int OTHER = 1; + private static final int GZIP = 1; + private static final int OTHER = 2; private static final Map extensionToType = new HashMap(); static { extensionToType.put(".bz2", Integer.valueOf(BZIP)); extensionToType.put(".bzip", Integer.valueOf(BZIP)); + extensionToType.put(".gz", Integer.valueOf(GZIP)); + extensionToType.put(".gzip", Integer.valueOf(GZIP)); } protected static final int BUFFER_SIZE = 1 << 16; // 64K @@ -78,11 +81,13 @@ public abstract class ContentSource { private CompressorStreamFactory csFactory = new CompressorStreamFactory(); + /** update count of bytes generated by this source */ protected final synchronized void addBytes(long numBytes) { bytesCount += numBytes; totalBytesCount += numBytes; } + /** update count of documents generated by this source */ protected final synchronized void addDoc() { ++docsCount; ++totalDocsCount; @@ -130,21 +135,25 @@ public abstract class ContentSource { type = typeInt.intValue(); } } - switch (type) { - case BZIP: - try { + + try { + switch (type) { + case BZIP: // According to BZip2CompressorInputStream's code, it reads the first // two file header chars ('B' and 'Z'). It is important to wrap the // underlying input stream with a buffered one since // Bzip2CompressorInputStream uses the read() method exclusively. 
is = csFactory.createCompressorInputStream("bzip2", is); - } catch (CompressorException e) { - IOException ioe = new IOException(e.getMessage()); - ioe.initCause(e); - throw ioe; - } - break; - default: // Do nothing, stay with FileInputStream + break; + case GZIP: + is = csFactory.createCompressorInputStream("gz", is); + break; + default: // Do nothing, stay with FileInputStream + } + } catch (CompressorException e) { + IOException ioe = new IOException(e.getMessage()); + ioe.initCause(e); + throw ioe; } return is; diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java index d57777a0036..873c658a338 100755 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java @@ -29,11 +29,14 @@ import java.util.Properties; */ public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser { - public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { + public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException { org.apache.lucene.demo.html.HTMLParser p = new org.apache.lucene.demo.html.HTMLParser(reader); // title - String title = p.getTitle(); + if (title==null) { + title = p.getTitle(); + } + // properties Properties props = p.getMetaTags(); // body diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java index 6c8b9fa4a87..47eed373e5f 100755 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java +++ 
b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java @@ -29,16 +29,18 @@ public interface HTMLParser { /** * Parse the input Reader and return DocData. - * A provided name or date is used for the result, otherwise an attempt is - * made to set them from the parsed data. - * @param dateFormat date formatter to use for extracting the date. - * @param name name of the result doc data. If null, attempt to set by parsed data. + * The provided name, title and date are used for the result unless they are null, + * in which case an attempt is made to set them from the parsed data. + * @param docData result reused + * @param name name of the result doc data. * @param date date of the result doc data. If null, attempt to set by parsed data. - * @param reader of html text to parse. + * @param title title of the result doc data. If null, attempt to set by parsed data. + * @param reader reader of html text to parse. + * @param dateFormat date formatter to use for extracting the date. * @return Parsed doc data. 
* @throws IOException * @throws InterruptedException */ - public DocData parse(DocData docData, String name, Date date, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException; + public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException; } diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java index 1101e661c91..d60a12ccf90 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java @@ -19,8 +19,8 @@ package org.apache.lucene.benchmark.byTask.feeds; import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.text.DateFormat; @@ -29,8 +29,8 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Locale; -import java.util.zip.GZIPInputStream; +import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader; import org.apache.lucene.util.ThreadInterruptedException; @@ -46,8 +46,10 @@ import org.apache.lucene.util.ThreadInterruptedException; *
  • docs.dir - specifies the directory where the TREC files reside. * Can be set to a relative path if "work.dir" is also specified * (default=trec). + *
  • trec.doc.parser - specifies the {@link TrecDocParser} class to use for + * parsing the TREC documents content (default=TrecGov2Parser). *
  • html.parser - specifies the {@link HTMLParser} class to use for - * parsing the TREC documents content (default=DemoHTMLParser). + * parsing the HTML parts of the TREC documents content (default=DemoHTMLParser). *
  • content.source.encoding - if not specified, ISO-8859-1 is used. *
  • content.source.excludeIteration - if true, do not append iteration number to docname * @@ -59,22 +61,24 @@ public class TrecContentSource extends ContentSource { ParsePosition pos; } - private static final String DATE = "Date: "; - private static final String DOCHDR = ""; - private static final String TERMINATING_DOCHDR = ""; - private static final String DOCNO = ""; - private static final String TERMINATING_DOCNO = ""; - private static final String DOC = ""; - private static final String TERMINATING_DOC = ""; + public static final String DOCNO = ""; + public static final String TERMINATING_DOCNO = ""; + public static final String DOC = ""; + public static final String TERMINATING_DOC = ""; - private static final String NEW_LINE = System.getProperty("line.separator"); + /** separator between lines in the buffer */ + public static final String NEW_LINE = System.getProperty("line.separator"); private static final String DATE_FORMATS [] = { - "EEE, dd MMM yyyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT - "EEE MMM dd kk:mm:ss yyyy z", // Tue Dec 09 16:45:08 2003 EST - "EEE, dd-MMM-':'y kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT - "EEE, dd-MMM-yyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT - "EEE MMM dd kk:mm:ss yyyy", // Tue Dec 09 16:45:08 2003 + "EEE, dd MMM yyyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT + "EEE MMM dd kk:mm:ss yyyy z", // Tue Dec 09 16:45:08 2003 EST + "EEE, dd-MMM-':'y kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT + "EEE, dd-MMM-yyy kk:mm:ss z", // Tue, 09 Dec 2003 22:39:08 GMT + "EEE MMM dd kk:mm:ss yyyy", // Tue Dec 09 16:45:08 2003 + "dd MMM yyyy", // 1 March 1994 + "MMM dd, yyyy", // February 3, 1994 + "yyMMdd", // 910513 + "hhmm z.z.z. MMM dd, yyyy", // 0901 u.t.c. 
April 28, 1994 }; private ThreadLocal dateFormats = new ThreadLocal(); @@ -83,7 +87,7 @@ public class TrecContentSource extends ContentSource { private File dataDir = null; private ArrayList inputFiles = new ArrayList(); private int nextFile = 0; - private int rawDocSize; + private int rawDocSize = 0; // Use to synchronize threads on reading from the TREC documents. private Object lock = new Object(); @@ -92,7 +96,10 @@ public class TrecContentSource extends ContentSource { BufferedReader reader; int iteration = 0; HTMLParser htmlParser; + private boolean excludeDocnameIteration; + private TrecDocParser trecDocParser = new TrecGov2Parser(); // default + ParsePathType currPathType; // not private for tests private DateFormatInfo getDateFormatInfo() { DateFormatInfo dfi = dateFormats.get(); @@ -118,7 +125,7 @@ public class TrecContentSource extends ContentSource { return sb; } - private Reader getTrecDocReader(StringBuilder docBuffer) { + Reader getTrecDocReader(StringBuilder docBuffer) { StringBuilderReader r = trecDocReader.get(); if (r == null) { r = new StringBuilderReader(docBuffer); @@ -129,10 +136,21 @@ public class TrecContentSource extends ContentSource { return r; } - // read until finding a line that starts with the specified prefix, or a terminating tag has been found. - private void read(StringBuilder buf, String prefix, boolean collectMatchLine, - boolean collectAll, String terminatingTag) - throws IOException, NoMoreDataException { + HTMLParser getHtmlParser() { + return htmlParser; + } + + /** + * Read until a line starting with the specified lineStart. + * @param buf buffer for collecting the data if so specified. + * @param lineStart line start to look for, must not be null. + * @param collectMatchLine whether to collect the matching line into buffer. + * @param collectAll whether to collect all lines into buffer. 
+ * @throws IOException + * @throws NoMoreDataException + */ + private void read(StringBuilder buf, String lineStart, + boolean collectMatchLine, boolean collectAll) throws IOException, NoMoreDataException { String sep = ""; while (true) { String line = reader.readLine(); @@ -144,20 +162,12 @@ public class TrecContentSource extends ContentSource { rawDocSize += line.length(); - if (line.startsWith(prefix)) { + if (lineStart!=null && line.startsWith(lineStart)) { if (collectMatchLine) { buf.append(sep).append(line); sep = NEW_LINE; } - break; - } - - if (terminatingTag != null && line.startsWith(terminatingTag)) { - // didn't find the prefix that was asked, but the terminating - // tag was found. set the length to 0 to signal no match was - // found. - buf.setLength(0); - break; + return; } if (collectAll) { @@ -169,7 +179,7 @@ public class TrecContentSource extends ContentSource { void openNextFile() throws NoMoreDataException, IOException { close(); - int retries = 0; + currPathType = null; while (true) { if (nextFile >= inputFiles.size()) { // exhausted files, start a new round, unless forever set to false. 
@@ -184,13 +194,13 @@ public class TrecContentSource extends ContentSource { System.out.println("opening: " + f + " length: " + f.length()); } try { - GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE); - reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE); + InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension + reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE); + currPathType = TrecDocParser.pathType(f); return; } catch (Exception e) { - retries++; - if (retries < 20 && verbose) { - System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " #retries=" + retries); + if (verbose) { + System.out.println("Skipping 'bad' file " + f.getAbsolutePath()+" due to "+e.getMessage()); continue; } throw new NoMoreDataException(); @@ -198,7 +208,7 @@ public class TrecContentSource extends ContentSource { } } - Date parseDate(String dateStr) { + public Date parseDate(String dateStr) { dateStr = dateStr.trim(); DateFormatInfo dfi = getDateFormatInfo(); for (int i = 0; i < dfi.dfs.length; i++) { @@ -237,70 +247,47 @@ public class TrecContentSource extends ContentSource { @Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { - String dateStr = null, name = null; - Reader r = null; + String name = null; + StringBuilder docBuf = getDocBuffer(); + ParsePathType parsedPathType; + // protect reading from the TREC files by multiple threads. The rest of the - // method, i.e., parsing the content and returning the DocData can run - // unprotected. + // method, i.e., parsing the content and returning the DocData can run unprotected. synchronized (lock) { if (reader == null) { openNextFile(); } - - StringBuilder docBuf = getDocBuffer(); - // 1. skip until doc start + // 1. 
skip until doc start - required for all TREC formats docBuf.setLength(0); - read(docBuf, DOC, false, false, null); - - // 2. name + read(docBuf, DOC, false, false); + + // save parsedFile for passing trecDataParser after the sync block, in + // case another thread will open another file in between. + parsedPathType = currPathType; + + // 2. name - required for all TREC formats docBuf.setLength(0); - read(docBuf, DOCNO, true, false, null); + read(docBuf, DOCNO, true, false); name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO, - DOCNO.length())); - if (!excludeDocnameIteration) + DOCNO.length())).trim(); + + if (!excludeDocnameIteration) { name = name + "_" + iteration; - - // 3. skip until doc header - docBuf.setLength(0); - read(docBuf, DOCHDR, false, false, null); - - boolean findTerminatingDocHdr = false; - - // 4. date - look for the date only until /DOCHDR - docBuf.setLength(0); - read(docBuf, DATE, true, false, TERMINATING_DOCHDR); - if (docBuf.length() != 0) { - // Date found. - dateStr = docBuf.substring(DATE.length()); - findTerminatingDocHdr = true; } - // 5. skip until end of doc header - if (findTerminatingDocHdr) { - docBuf.setLength(0); - read(docBuf, TERMINATING_DOCHDR, false, false, null); - } - - // 6. collect until end of doc + // 3. read all until end of doc docBuf.setLength(0); - read(docBuf, TERMINATING_DOC, false, true, null); - - // 7. Set up a Reader over the read content - r = getTrecDocReader(docBuf); - // Resetting the thread's reader means it will reuse the instance - // allocated as well as re-read from docBuf. - r.reset(); - - // count char length of parsed html text (larger than the plain doc body text). - addBytes(docBuf.length()); + read(docBuf, TERMINATING_DOC, false, true); } + + // count char length of text to be parsed (may be larger than the resulted plain doc body text). + addBytes(docBuf.length()); // This code segment relies on HtmlParser being thread safe. 
When we get // here, everything else is already private to that thread, so we're safe. - Date date = dateStr != null ? parseDate(dateStr) : null; try { - docData = htmlParser.parse(docData, name, date, r, null); + docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType); addDoc(); } catch (InterruptedException ie) { throw new ThreadInterruptedException(ie); @@ -322,27 +309,40 @@ public class TrecContentSource extends ContentSource { @Override public void setConfig(Config config) { super.setConfig(config); + // dirs File workDir = new File(config.get("work.dir", "work")); String d = config.get("docs.dir", "trec"); dataDir = new File(d); if (!dataDir.isAbsolute()) { dataDir = new File(workDir, d); } + // files collectFiles(dataDir, inputFiles); if (inputFiles.size() == 0) { throw new IllegalArgumentException("No files in dataDir: " + dataDir); } + // trec doc parser try { - String parserClassName = config.get("html.parser", - "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser"); - htmlParser = Class.forName(parserClassName).asSubclass(HTMLParser.class).newInstance(); + String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser"); + trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class).newInstance(); } catch (Exception e) { // Should not get here. Throw runtime exception. throw new RuntimeException(e); } + // html parser + try { + String htmlParserClassName = config.get("html.parser", + "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser"); + htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class).newInstance(); + } catch (Exception e) { + // Should not get here. Throw runtime exception. 
+ throw new RuntimeException(e); + } + // encoding if (encoding == null) { encoding = "ISO-8859-1"; } + // iteration exclusion in doc name excludeDocnameIteration = config.get("content.source.excludeIteration", false); } diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java new file mode 100644 index 00000000000..d87aa3ab679 --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java @@ -0,0 +1,135 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * Parser for trec doc content, invoked on doc text excluding and + * which are handled in TrecContentSource. Required to be stateless and hence thread safe. 
+ */ +public abstract class TrecDocParser { + + /** Types of trec parse paths, */ + public enum ParsePathType { GOV2, FBIS, FT, FR94, LATIMES } + + /** trec parser type used for unknown extensions */ + public static final ParsePathType DEFAULT_PATH_TYPE = ParsePathType.GOV2; + + static final Map pathType2parser = new HashMap(); + static { + pathType2parser.put(ParsePathType.GOV2, new TrecGov2Parser()); + pathType2parser.put(ParsePathType.FBIS, new TrecFBISParser()); + pathType2parser.put(ParsePathType.FR94, new TrecFR94Parser()); + pathType2parser.put(ParsePathType.FT, new TrecFTParser()); + pathType2parser.put(ParsePathType.LATIMES, new TrecLATimesParser()); + } + + static final Map pathName2Type = new HashMap(); + static { + for (ParsePathType ppt : ParsePathType.values()) { + pathName2Type.put(ppt.name(),ppt); + } + } + + /** max length of walk up from file to its ancestors when looking for a known path type */ + private static final int MAX_PATH_LENGTH = 10; + + /** + * Compute the path type of a file by inspecting name of file and its parents + */ + public static ParsePathType pathType(File f) { + int pathLength = 0; + while (f != null && ++pathLength < MAX_PATH_LENGTH) { + ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase()); + if (ppt!=null) { + return ppt; + } + f = f.getParentFile(); + } + return DEFAULT_PATH_TYPE; + } + + /** + * parse the text prepared in docBuf into a result DocData, + * no synchronization is required. + * @param docData reusable result + * @param name name that should be set to the result + * @param trecSrc calling trec content source + * @param docBuf text to parse + * @param pathType type of parsed file, or null if unknown - may be used by + * parsers to alter their behavior according to the file path type. 
+ */ + public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException; + + /** + * strip tags from buf: each tag is replaced by a single blank. + * @return text obtained when stripping all tags from buf (Input StringBuilder is unmodified). + */ + public static String stripTags(StringBuilder buf, int start) { + return stripTags(buf.substring(start),0); + } + + /** + * strip tags from the part of input that begins at offset start: each tag is replaced by a single blank. + * @see #stripTags(StringBuilder, int) + */ + public static String stripTags(String buf, int start) { + if (start>0) { + buf = buf.substring(start); + } + return buf.replaceAll("<[^>]*>", " "); + } + + /** + * Extract from buf the text of interest within specified tags + * @param buf entire input text + * @param startTag tag marking start of text of interest + * @param endTag tag marking end of text of interest + * @param maxPos if ≥ 0 sets a limit on start of text of interest + * @return text of interest or null if not found + */ + public static String extract(StringBuilder buf, String startTag, String endTag, int maxPos, String noisePrefixes[]) { + int k1 = buf.indexOf(startTag); + if (k1>=0 && (maxPos<0 || k1=0 && (maxPos<0 || k2=0 && k1a2<>1?",0)); + //} + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java new file mode 100644 index 00000000000..8efcd04e91d --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java @@ -0,0 +1,65 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Date; + +/** + * Parser for the FBIS docs in trec disks 4+5 collection format + */ +public class TrecFBISParser extends TrecDocParser { + + private static final String HEADER = "
    "; + private static final String HEADER_END = "
    "; + private static final int HEADER_END_LENGTH = HEADER_END.length(); + + private static final String DATE1 = ""; + private static final String DATE1_END = ""; + + private static final String TI = ""; + private static final String TI_END = ""; + + @Override + public DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { + int mark = 0; // that much is skipped + // optionally skip some of the text, set date, title + Date date = null; + String title = null; + int h1 = docBuf.indexOf(HEADER); + if (h1>=0) { + int h2 = docBuf.indexOf(HEADER_END,h1); + mark = h2+HEADER_END_LENGTH; + // date... + String dateStr = extract(docBuf, DATE1, DATE1_END, h2, null); + if (dateStr != null) { + date = trecSrc.parseDate(dateStr); + } + // title... + title = extract(docBuf, TI, TI_END, h2, null); + } + docData.clear(); + docData.setName(name); + docData.setDate(date); + docData.setTitle(title); + docData.setBody(stripTags(docBuf, mark).toString()); + return docData; + } + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java new file mode 100644 index 00000000000..ce6492120d7 --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java @@ -0,0 +1,66 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Date; + +/** + * Parser for the FR94 docs in trec disks 4+5 collection format + */ +public class TrecFR94Parser extends TrecDocParser { + + private static final String TEXT = ""; + private static final int TEXT_LENGTH = TEXT.length(); + private static final String TEXT_END = ""; + + private static final String DATE = ""; + private static final String[] DATE_NOISE_PREFIXES = { + "DATE:", + "date:", //TODO improve date extraction for this format + "t.c.", + }; + private static final String DATE_END = ""; + + //TODO can we also extract title for this format? + + @Override + public DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { + int mark = 0; // that much is skipped + // optionally skip some of the text, set date (no title?) + Date date = null; + int h1 = docBuf.indexOf(TEXT); + if (h1>=0) { + int h2 = docBuf.indexOf(TEXT_END,h1); + mark = h1+TEXT_LENGTH; + // date... 
+ String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES); + if (dateStr != null) { + dateStr = stripTags(dateStr,0).toString(); + date = trecSrc.parseDate(dateStr.trim()); + } + } + docData.clear(); + docData.setName(name); + docData.setDate(date); + docData.setBody(stripTags(docBuf, mark).toString()); + return docData; + } + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java new file mode 100644 index 00000000000..ab39d9c2860 --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java @@ -0,0 +1,57 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Date; + +/** + * Parser for the FT docs in trec disks 4+5 collection format + */ +public class TrecFTParser extends TrecDocParser { + + private static final String DATE = ""; + private static final String DATE_END = ""; + + private static final String HEADLINE = ""; + private static final String HEADLINE_END = ""; + + @Override + public DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { + int mark = 0; // that much is skipped + + // date... + Date date = null; + String dateStr = extract(docBuf, DATE, DATE_END, -1, null); + if (dateStr != null) { + date = trecSrc.parseDate(dateStr); + } + + // title... + String title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null); + + docData.clear(); + docData.setName(name); + docData.setDate(date); + docData.setTitle(title); + docData.setBody(stripTags(docBuf, mark).toString()); + return docData; + } + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java new file mode 100755 index 00000000000..ef8371d1735 --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java @@ -0,0 +1,59 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.Date; + +/** + * Parser for the GOV2 collection format + */ +public class TrecGov2Parser extends TrecDocParser { + + private static final String DATE = "Date: "; + private static final String DATE_END = TrecContentSource.NEW_LINE; + + private static final String DOCHDR = ""; + private static final String TERMINATING_DOCHDR = ""; + private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length(); + + @Override + public DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { + // Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf + Reader r = trecSrc.getTrecDocReader(docBuf); + + // skip some of the text, optionally set date + Date date = null; + int h1 = docBuf.indexOf(DOCHDR); + if (h1>=0) { + int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1); + String dateStr = extract(docBuf, DATE, DATE_END, h2, null); + if (dateStr != null) { + date = trecSrc.parseDate(dateStr); + } + r.mark(h2+TERMINATING_DOCHDR_LENGTH); + } + + r.reset(); + HTMLParser htmlParser = trecSrc.getHtmlParser(); + return htmlParser.parse(docData, name, date, null, r, null); + } + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java new file mode 100644 index 00000000000..367015bee36 --- /dev/null +++ 
b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java @@ -0,0 +1,71 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Date; + +/** + * Parser for the FT docs in trec disks 4+5 collection format + */ +public class TrecLATimesParser extends TrecDocParser { + + private static final String DATE = ""; + private static final String DATE_END = ""; + private static final String DATE_NOISE = "day,"; // anything aftre the ',' + + private static final String SUBJECT = ""; + private static final String SUBJECT_END = ""; + private static final String HEADLINE = ""; + private static final String HEADLINE_END = ""; + + @Override + public DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { + int mark = 0; // that much is skipped + + // date... 
+ Date date = null; + String dateStr = extract(docBuf, DATE, DATE_END, -1, null); + if (dateStr != null) { + int d2a = dateStr.indexOf(DATE_NOISE); + if (d2a > 0) { + dateStr = dateStr.substring(0,d2a+3); // we need the "day" part + } + dateStr = stripTags(dateStr,0).toString(); + date = trecSrc.parseDate(dateStr.trim()); + } + + // title... first try with SUBJECT, them with HEADLINE + String title = extract(docBuf, SUBJECT, SUBJECT_END, -1, null); + if (title==null) { + title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null); + } + if (title!=null) { + title = stripTags(title,0).toString().trim(); + } + + docData.clear(); + docData.setName(name); + docData.setDate(date); + docData.setTitle(title); + docData.setBody(stripTags(docBuf, mark).toString()); + return docData; + } + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java new file mode 100644 index 00000000000..fc882035a01 --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java @@ -0,0 +1,33 @@ +package org.apache.lucene.benchmark.byTask.feeds; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +/** + * Parser for trec docs which selects the parser to apply according + * to the source files path, defaulting to {@link TrecGov2Parser}. + */ +public class TrecParserByPath extends TrecDocParser { + + @Override + public DocData parse(DocData docData, String name, TrecContentSource trecSrc, + StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException { + return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType); + } + +} diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java index c6e9510e01d..a10d5371c72 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBuilderReader.java @@ -158,8 +158,10 @@ public class StringBuilderReader extends Reader { synchronized (lock) { this.sb = sb; length = sb.length(); + next = mark = 0; } } + @Override public long skip(long ns) throws IOException { synchronized (lock) { diff --git a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java index a178c6a6b1f..8222e5782ff 100644 --- a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java +++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java @@ -18,14 +18,20 @@ package org.apache.lucene.benchmark.byTask.feeds; */ import java.io.BufferedReader; +import java.io.File; import java.io.IOException; import java.io.StringReader; import java.text.ParseException; +import java.util.Arrays; import 
java.util.Date; +import java.util.HashSet; +import java.util.Properties; +import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.DateTools; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; public class TrecContentSourceTest extends LuceneTestCase { @@ -329,5 +335,62 @@ public class TrecContentSourceTest extends LuceneTestCase { // Don't test that NoMoreDataException is thrown, since the forever flag is // turned on. } + + /** + * Open a trec content source over a directory with files of all trec path types and all + * supported formats - bzip, gzip, txt. + */ + public void testTrecFeedDirAllTypes() throws Exception { + File dataDir = _TestUtil.getTempDir("trecFeedAllTypes"); + _TestUtil.unzip(getDataFile("trecdocs.zip"), dataDir); + TrecContentSource tcs = new TrecContentSource(); + Properties props = new Properties(); + props.setProperty("print.props", "false"); + props.setProperty("content.source.verbose", "false"); + props.setProperty("content.source.excludeIteration", "true"); + props.setProperty("doc.maker.forever", "false"); + props.setProperty("docs.dir", dataDir.getCanonicalPath().replace('\\','/')); + props.setProperty("trec.doc.parser", TrecParserByPath.class.getName()); + props.setProperty("content.source.forever", "false"); + tcs.setConfig(new Config(props)); + tcs.resetInputs(); + DocData dd = new DocData(); + int n = 0; + boolean gotExpectedException = false; + HashSet unseenTypes = new HashSet(Arrays.asList(ParsePathType.values())); + try { + while (n<100) { // arbiterary limit to prevent looping forever in case of test failure + dd = tcs.getNextDocData(dd); + ++n; + assertNotNull("doc data "+n+" should not be null!", dd); + unseenTypes.remove(tcs.currPathType); + switch(tcs.currPathType) { + case GOV2: + assertDocData(dd, "TEST-000", "TEST-000 title", "TEST-000 text", tcs.parseDate("Sun, 11 Jan 2009 
08:00:00 GMT")); + break; + case FBIS: + assertDocData(dd, "TEST-001", "TEST-001 Title", "TEST-001 text", tcs.parseDate("1 January 1991")); + break; + case FR94: + // no title extraction in this source for now + assertDocData(dd, "TEST-002", null, "DEPARTMENT OF SOMETHING", tcs.parseDate("February 3, 1994")); + break; + case FT: + assertDocData(dd, "TEST-003", "Test-003 title", "Some pub text", tcs.parseDate("980424")); + break; + case LATIMES: + assertDocData(dd, "TEST-004", "Test-004 Title", "Some paragraph", tcs.parseDate("January 17, 1997, Sunday")); + break; + default: + assertTrue("Should never get here!", false); + } + } + } catch (NoMoreDataException e) { + gotExpectedException = true; + } + assertTrue("Should have gotten NoMoreDataException!", gotExpectedException); + assertEquals("Wrong numbre of documents created by osurce!",5,n); + assertTrue("Did not see all types!",unseenTypes.isEmpty()); + } } diff --git a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip new file mode 100644 index 00000000000..850d5b6aad2 Binary files /dev/null and b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/trecdocs.zip differ