From e76ad0990d2b389aee62246f12e7eeb69e90fa1e Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Wed, 26 Jan 2011 09:10:06 +0000 Subject: [PATCH] LUCENE-929: contrib/benchmark build doesn't handle checking if content is properly extracted (trunk) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1063647 13f79535-47bb-0310-9956-ffa450edef68 --- modules/benchmark/CHANGES.txt | 5 + modules/benchmark/build.xml | 1 - .../benchmark/utils/ExtractReuters.java | 237 ++++++++---------- 3 files changed, 110 insertions(+), 133 deletions(-) diff --git a/modules/benchmark/CHANGES.txt b/modules/benchmark/CHANGES.txt index 82c005c3ea9..3811723e38b 100644 --- a/modules/benchmark/CHANGES.txt +++ b/modules/benchmark/CHANGES.txt @@ -2,6 +2,11 @@ Lucene Benchmark Contrib Change Log The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. +01/26/2011 + LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That + way, if a previous extract attempt failed, "ant extract-reuters" will still + extract the files. (Shai Erera, Doron Cohen, Grant Ingersoll) + 01/24/2011 LUCENE-2885: Add WaitForMerges task (calls IndexWriter.waitForMerges()). (Mike McCandless) diff --git a/modules/benchmark/build.xml b/modules/benchmark/build.xml index f2abf4a3ef7..10d1510fd1d 100644 --- a/modules/benchmark/build.xml +++ b/modules/benchmark/build.xml @@ -87,7 +87,6 @@ - diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java index 3e4104b5b85..395d640fc72 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java @@ -29,146 +29,119 @@ import java.util.regex.Pattern; /** * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body */ -public class ExtractReuters -{ - private File reutersDir; - private File outputDir; - private static final String LINE_SEPARATOR = System.getProperty("line.separator"); - - public ExtractReuters(File reutersDir, File outputDir) - { - this.reutersDir = reutersDir; - this.outputDir = outputDir; - System.out.println("Deleting all files in " + outputDir); - File [] files = outputDir.listFiles(); - for (int i = 0; i < files.length; i++) - { - files[i].delete(); - } +public class ExtractReuters { + private File reutersDir; + private File outputDir; + private static final String LINE_SEPARATOR = System.getProperty("line.separator"); + public ExtractReuters(File reutersDir, File outputDir) { + this.reutersDir = reutersDir; + this.outputDir = outputDir; + System.out.println("Deleting all files in " + outputDir); + for (File f : outputDir.listFiles()) { + f.delete(); } + } - public void extract() - { - File [] sgmFiles = reutersDir.listFiles(new FileFilter() - { - public boolean accept(File file) - { - return file.getName().endsWith(".sgm"); - } - }); - if (sgmFiles != null && sgmFiles.length > 0) - { - for (int i = 0; i < sgmFiles.length; i++) - { - File sgmFile = sgmFiles[i]; - extractFile(sgmFile); + public void extract() { + File[] sgmFiles = reutersDir.listFiles(new FileFilter() { + public boolean accept(File file) { + return file.getName().endsWith(".sgm"); + } + }); + if (sgmFiles != null && sgmFiles.length > 0) { + for (File sgmFile : sgmFiles) { + extractFile(sgmFile); + } + } else { + System.err.println("No .sgm files in " + reutersDir); + } + } + + Pattern EXTRACTION_PATTERN = Pattern + .compile("(.*?)|(.*?)|(.*?)"); + + private static String[] META_CHARS = { "&", "<", ">", "\"", "'" }; + + private static String[] META_CHARS_SERIALIZATIONS = { "&", "<", + ">", """, "'" }; + + /** + * Override if you wish to change what is extracted + * + * @param sgmFile + */ + protected void extractFile(File sgmFile) { + try { + BufferedReader reader = new BufferedReader(new FileReader(sgmFile)); + + StringBuilder buffer = new StringBuilder(1024); + StringBuilder outBuffer = new StringBuilder(1024); + + String line = null; + int docNumber = 0; + while ((line = reader.readLine()) != null) { + // when we see a closing reuters tag, flush the file + + if (line.indexOf("(.*?)|(.*?)|(.*?)"); - - private static String[] META_CHARS - = {"&", "<", ">", "\"", "'"}; - - private static String[] META_CHARS_SERIALIZATIONS - = {"&", "<", ">", """, "'"}; - - /** - * Override if you wish to change what is extracted - * - * @param sgmFile - */ - protected void extractFile(File sgmFile) - { - try - { - BufferedReader reader = new BufferedReader(new FileReader(sgmFile)); - - StringBuilder buffer = new StringBuilder(1024); - StringBuilder outBuffer = new StringBuilder(1024); - - String line = null; - int docNumber = 0; - while ((line = reader.readLine()) != null) - { - //when we see a closing reuters tag, flush the file - - if (line.indexOf(" org.apache.lucene.benchmark.utils.ExtractReuters "); - } + private static void printUsage() { + System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters "); + } + }