From 8d0c1b62af1e5f3206f63951b563ca4e3afaf381 Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Thu, 24 Mar 2011 12:22:13 +0000 Subject: [PATCH] LUCENE-2977: WriteLineDocTask should write gzip/bzip2/txt according to the extension of specified output file name. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1084929 13f79535-47bb-0310-9956-ffa450edef68 --- modules/benchmark/CHANGES.txt | 5 + .../benchmark/byTask/feeds/ContentSource.java | 78 ---------- .../byTask/feeds/EnwikiContentSource.java | 5 +- .../benchmark/byTask/feeds/LineDocSource.java | 5 +- .../byTask/feeds/TrecContentSource.java | 5 +- .../byTask/tasks/WriteLineDocTask.java | 44 ++---- .../benchmark/byTask/utils/StreamUtils.java | 144 ++++++++++++++++++ .../byTask/feeds/LineDocSourceTest.java | 38 ++--- .../byTask/tasks/WriteLineDocTaskTest.java | 60 ++++---- .../StreamUtilsTest.java} | 96 +++++++----- 10 files changed, 278 insertions(+), 202 deletions(-) create mode 100644 modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StreamUtils.java rename modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/{feeds/ContentSourceTest.java => utils/StreamUtilsTest.java} (55%) diff --git a/modules/benchmark/CHANGES.txt b/modules/benchmark/CHANGES.txt index 12ce6b9ac51..00b6a5134c6 100644 --- a/modules/benchmark/CHANGES.txt +++ b/modules/benchmark/CHANGES.txt @@ -2,6 +2,11 @@ Lucene Benchmark Contrib Change Log The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways. +03/24/2011 + LUCENE-2977: WriteLineDocTask now automatically detects how to write - + GZip or BZip2 or Plain-text - according to the output file extension. + Property bzip.compression of WriteLineDocTask was canceled. (Doron Cohen) + 03/23/2011 LUCENE-2980: Benchmark's ContentSource no more requires lower case file suffixes for detecting file type (gzip/bzip2/text). 
As part of this fix worked around an diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java index 4af7dde7623..cfe377c3ffe 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java @@ -17,19 +17,11 @@ package org.apache.lucene.benchmark.byTask.feeds; * limitations under the License. */ -import java.io.BufferedInputStream; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; -import org.apache.commons.compress.compressors.CompressorException; -import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.utils.Config; /** @@ -56,17 +48,6 @@ import org.apache.lucene.benchmark.byTask.utils.Config; */ public abstract class ContentSource { - private static final Map extensionToType = new HashMap(); - static { - // these in are lower case, we will lower case at the test as well - extensionToType.put(".bz2", CompressorStreamFactory.BZIP2); - extensionToType.put(".bzip", CompressorStreamFactory.BZIP2); - extensionToType.put(".gz", CompressorStreamFactory.GZIP); - extensionToType.put(".gzip", CompressorStreamFactory.GZIP); - } - - protected static final int BUFFER_SIZE = 1 << 16; // 64K - private long bytesCount; private long totalBytesCount; private int docsCount; @@ -78,8 +59,6 @@ public abstract class ContentSource { protected boolean verbose; protected String encoding; - private CompressorStreamFactory csFactory = new CompressorStreamFactory(); - /** update count of bytes generated by this source */ protected final synchronized void addBytes(long numBytes) { bytesCount += 
numBytes; @@ -114,63 +93,6 @@ public abstract class ContentSource { } } - /** - * Returns an {@link InputStream} over the requested file. This method - * attempts to identify the appropriate {@link InputStream} instance to return - * based on the file name (e.g., if it ends with .bz2 or .bzip, return a - * 'bzip' {@link InputStream}). - */ - protected InputStream getInputStream(File file) throws IOException { - // First, create a FileInputStream, as this will be required by all types. - // Wrap with BufferedInputStream for better performance - InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE); - - String fileName = file.getName(); - int idx = fileName.lastIndexOf('.'); - String type = null; - if (idx != -1) { - type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ENGLISH)); - } - - if (type!=null) { // bzip or gzip - try { - return closableCompressorInputStream(type,is); - } catch (CompressorException e) { - IOException ioe = new IOException(e.getMessage()); - ioe.initCause(e); - throw ioe; - } - } - - return is; - } - - /** - * Wrap the compressor input stream so that calling close will also close - * the underlying stream - workaround for CommonsCompress bug (COMPRESS-127). 
- */ - private InputStream closableCompressorInputStream(String type, final InputStream is) throws CompressorException { - final InputStream delegee = csFactory.createCompressorInputStream(type, is); - if (!type.equals(CompressorStreamFactory.GZIP)) { - return delegee; //compressor bug affects only gzip - } - return new InputStream() { - @Override public int read() throws IOException { return delegee.read(); } - @Override public int read(byte[] b) throws IOException { return delegee.read(b); } - @Override public int available() throws IOException { return delegee.available(); } - @Override public synchronized void mark(int readlimit) { delegee.mark(readlimit); } - @Override public boolean markSupported() { return delegee.markSupported(); } - @Override public int read(byte[] b, int off, int len) throws IOException { return delegee.read(b, off, len); } - @Override public synchronized void reset() throws IOException { delegee.reset(); } - @Override public long skip(long n) throws IOException { return delegee.skip(n); } - @Override - public void close() throws IOException { - delegee.close(); - is.close(); - } - }; - } - /** * Returns true whether it's time to log a message (depending on verbose and * the number of documents generated). 
diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java index 5c71c5a4024..5153ad0c4eb 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -24,6 +24,7 @@ import java.util.HashMap; import java.util.Map; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.util.ThreadInterruptedException; import org.xml.sax.Attributes; import org.xml.sax.InputSource; @@ -189,7 +190,7 @@ public class EnwikiContentSource extends ContentSource { return; } else if (localFileIS == is) { // If file is not already re-opened then re-open it now - is = getInputStream(file); + is = StreamUtils.inputStream(file); } } } @@ -290,7 +291,7 @@ public class EnwikiContentSource extends ContentSource { @Override public void resetInputs() throws IOException { super.resetInputs(); - is = getInputStream(file); + is = StreamUtils.inputStream(file); } @Override diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java index 443fac5337a..ecc0aed42c1 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java @@ -28,6 +28,7 @@ import java.util.Properties; import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.StreamUtils; /** * A {@link ContentSource} reading one line at a time as a @@ -178,8 +179,8 @@ public class LineDocSource extends ContentSource { 
if (reader != null) { reader.close(); } - InputStream is = getInputStream(file); - reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE); + InputStream is = StreamUtils.inputStream(file); + reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE); if (skipHeaderLine) { reader.readLine(); // skip one line - the header line - already handled that info } diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java index d60a12ccf90..3069c27463f 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java @@ -32,6 +32,7 @@ import java.util.Locale; import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader; import org.apache.lucene.util.ThreadInterruptedException; @@ -194,8 +195,8 @@ public class TrecContentSource extends ContentSource { System.out.println("opening: " + f + " length: " + f.length()); } try { - InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension - reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE); + InputStream inputStream = StreamUtils.inputStream(f); // support either gzip, bzip2, or regular text file, by extension + reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE); currPathType = TrecDocParser.pathType(f); return; } catch (Exception e) { diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java 
b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java index 3369e30e3dc..197fe45c153 100644 --- a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java @@ -17,9 +17,8 @@ package org.apache.lucene.benchmark.byTask.tasks; * limitations under the License. */ -import java.io.BufferedOutputStream; import java.io.BufferedWriter; -import java.io.FileOutputStream; +import java.io.File; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; @@ -28,10 +27,10 @@ import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -40,14 +39,17 @@ import org.apache.lucene.document.Field; * following format: title <TAB> date <TAB> body. The output of this * task can be consumed by * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocSource} and is intended - * to save the IO overhead of opening a file per document to be indexed.
+ * to save the IO overhead of opening a file per document to be indexed. + *

+ * The format of the output is set according to the output file extension. + * Compression is recommended when the output file is expected to be large. + * See info on file extensions in {@link StreamUtils.Type} + *

* Supports the following parameters: *

    - *
  • line.file.out - the name of the file to write the output to. That + *
  • line.file.out - the name of the file to write the output to. That * parameter is mandatory. NOTE: the file is re-created. - *
  • bzip.compression - whether the output should be bzip-compressed. This is - * recommended when the output file is expected to be large. - *
  • line.fields - which fields should be written in each line. + *
  • line.fields - which fields should be written in each line. * (optional, default: {@link #DEFAULT_FIELDS}). *
  • sufficient.fields - list of field names, separated by comma, which, * if all of them are missing, the document will be skipped. For example, to require @@ -91,30 +93,12 @@ public class WriteLineDocTask extends PerfTask { public WriteLineDocTask(PerfRunData runData) throws Exception { super(runData); Config config = runData.getConfig(); - String fileName = config.get("line.file.out", null); - if (fileName == null) { + String fname = config.get("line.file.out", null); + if (fname == null) { throw new IllegalArgumentException("line.file.out must be set"); } - - OutputStream out = new FileOutputStream(fileName); - boolean doBzipCompression = false; - String doBZCompress = config.get("bzip.compression", null); - if (doBZCompress != null) { - // Property was set, use the value. - doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue(); - } else { - // Property was not set, attempt to detect based on file's extension - doBzipCompression = fileName.endsWith("bz2"); - } - - if (doBzipCompression) { - // Wrap with BOS since BZip2CompressorOutputStream calls out.write(int) - // and does not use the write(byte[]) version. This proved to speed the - // compression process by 70% ! 
- out = new BufferedOutputStream(out, 1 << 16); - out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out); - } - lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16)); + OutputStream out = StreamUtils.outputStream(new File(fname)); + lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE)); docMaker = runData.getDocMaker(); // init fields diff --git a/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StreamUtils.java b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StreamUtils.java new file mode 100644 index 00000000000..b6f8d674d54 --- /dev/null +++ b/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StreamUtils.java @@ -0,0 +1,144 @@ +package org.apache.lucene.benchmark.byTask.utils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorStreamFactory; + +/** + * Stream utilities. + */ +public class StreamUtils { + + /** Buffer size used across the benchmark package */ + public static final int BUFFER_SIZE = 1 << 16; // 64K + + /** File format type */ + public enum Type { + /** BZIP2 is automatically used for .bz2 and .bzip extensions. */ + BZIP2(CompressorStreamFactory.BZIP2), + /** GZIP is automatically used for .gz and .gzip extensions. */ + GZIP(CompressorStreamFactory.GZIP), + /** Plain text is used for anything which is not GZIP or BZIP2. */ + PLAIN(null); + private final String csfType; + Type(String csfType) { + this.csfType = csfType; + } + private InputStream inputStream(InputStream in) throws IOException { + try { + return csfType==null ? in : closableCompressorInputStream(this, in); + } catch (CompressorException e) { + IOException ioe = new IOException(e.getMessage()); + ioe.initCause(e); + throw ioe; } + } + private OutputStream outputStream(OutputStream os) throws IOException { + try { + return csfType==null ? 
os : new CompressorStreamFactory().createCompressorOutputStream(csfType, os); + } catch (CompressorException e) { + IOException ioe = new IOException(e.getMessage()); + ioe.initCause(e); + throw ioe; + } + } + } + + private static final Map extensionToType = new HashMap(); + static { + // these are in lower case, we will lower case at the test as well + extensionToType.put(".bz2", Type.BZIP2); + extensionToType.put(".bzip", Type.BZIP2); + extensionToType.put(".gz", Type.GZIP); + extensionToType.put(".gzip", Type.GZIP); + } + + + /** + * Returns an {@link InputStream} over the requested file. This method + * attempts to identify the appropriate {@link InputStream} instance to return + * based on the file name (e.g., if it ends with .bz2 or .bzip, return a + * 'bzip' {@link InputStream}). + */ + public static InputStream inputStream(File file) throws IOException { + // First, create a FileInputStream, as this will be required by all types. + // Wrap with BufferedInputStream for better performance + InputStream in = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE); + return fileType(file).inputStream(in); + } + + /** Return the type of the file, or {@link Type#PLAIN} if unknown */ + private static Type fileType(File file) { + Type type = null; + String fileName = file.getName(); + int idx = fileName.lastIndexOf('.'); + if (idx != -1) { + type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ENGLISH)); + } + return type==null ? Type.PLAIN : type; + } + + /** + * Wrap the compressor input stream so that calling close will also close + * the underlying stream - workaround for CommonsCompress bug (COMPRESS-127). 
+ */ + private static InputStream closableCompressorInputStream(Type type, final InputStream is) throws CompressorException { + final InputStream delegee = new CompressorStreamFactory().createCompressorInputStream(type.csfType, is); + if (!Type.GZIP.equals(type)) { + return delegee; //compressor bug affects only gzip + } + return new InputStream() { + @Override public int read() throws IOException { return delegee.read(); } + @Override public int read(byte[] b) throws IOException { return delegee.read(b); } + @Override public int available() throws IOException { return delegee.available(); } + @Override public synchronized void mark(int readlimit) { delegee.mark(readlimit); } + @Override public boolean markSupported() { return delegee.markSupported(); } + @Override public int read(byte[] b, int off, int len) throws IOException { return delegee.read(b, off, len); } + @Override public synchronized void reset() throws IOException { delegee.reset(); } + @Override public long skip(long n) throws IOException { return delegee.skip(n); } + @Override + public void close() throws IOException { + delegee.close(); + is.close(); + } + }; + } + + /** + * Returns an {@link OutputStream} over the requested file, identifying + * the appropriate {@link OutputStream} instance similar to {@link #inputStream(File)}. + */ + public static OutputStream outputStream(File file) throws IOException { + // First, create a FileOutputStream, as this will be required by all types. 
+ // Wrap with BufferedOutputStream for better performance + OutputStream os = new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE); + return fileType(file).outputStream(os); + } +} diff --git a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java index 39672deb6db..7cc7dc0da2d 100644 --- a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java +++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java @@ -103,23 +103,19 @@ public class LineDocSourceTest extends BenchmarkTestCase { writer.close(); } - private void doIndexAndSearchTest(File file, boolean setBZCompress, - String bz2CompressVal, Class lineParserClass, String storedField) throws Exception { - doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 1, storedField); // no extra repetitions - doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 2, storedField); // 1 extra repetition - doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 4, storedField); // 3 extra repetitions + private void doIndexAndSearchTest(File file, Class lineParserClass, String storedField) throws Exception { + doIndexAndSearchTestWithRepeats(file, lineParserClass, 1, storedField); // no extra repetitions + doIndexAndSearchTestWithRepeats(file, lineParserClass, 2, storedField); // 1 extra repetition + doIndexAndSearchTestWithRepeats(file, lineParserClass, 4, storedField); // 3 extra repetitions } - private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress, - String bz2CompressVal, Class lineParserClass, int numAdds, String storedField) throws Exception { + private void doIndexAndSearchTestWithRepeats(File file, + Class lineParserClass, int numAdds, String storedField) throws Exception { Properties props = 
new Properties(); // LineDocSource specific settings. props.setProperty("docs.file", file.getAbsolutePath()); - if (setBZCompress) { - props.setProperty("bzip.compression", bz2CompressVal); - } if (lineParserClass != null) { props.setProperty("line.parser", lineParserClass.getName()); } @@ -160,37 +156,31 @@ public class LineDocSourceTest extends BenchmarkTestCase { public void testBZip2() throws Exception { File file = new File(getWorkDir(), "one-line.bz2"); createBZ2LineFile(file,true); - doIndexAndSearchTest(file, true, "true", null, null); + doIndexAndSearchTest(file, null, null); } public void testBZip2NoHeaderLine() throws Exception { File file = new File(getWorkDir(), "one-line.bz2"); createBZ2LineFile(file,false); - doIndexAndSearchTest(file, true, "true", null, null); - } - - public void testBZip2AutoDetect() throws Exception { - File file = new File(getWorkDir(), "one-line.bz2"); - createBZ2LineFile(file,false); - doIndexAndSearchTest(file, false, null, null, null); + doIndexAndSearchTest(file, null, null); } public void testRegularFile() throws Exception { File file = new File(getWorkDir(), "one-line"); createRegularLineFile(file,true); - doIndexAndSearchTest(file, false, null, null, null); + doIndexAndSearchTest(file, null, null); } public void testRegularFileSpecialHeader() throws Exception { File file = new File(getWorkDir(), "one-line"); createRegularLineFile(file,true); - doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null); + doIndexAndSearchTest(file, HeaderLineParser.class, null); } public void testRegularFileNoHeaderLine() throws Exception { File file = new File(getWorkDir(), "one-line"); createRegularLineFile(file,false); - doIndexAndSearchTest(file, false, null, null, null); + doIndexAndSearchTest(file, null, null); } public void testInvalidFormat() throws Exception { @@ -210,7 +200,7 @@ public class LineDocSourceTest extends BenchmarkTestCase { writer.newLine(); writer.close(); try { - doIndexAndSearchTest(file, false, null, 
null, null); + doIndexAndSearchTest(file, null, null); fail("Some exception should have been thrown for: [" + testCases[i] + "]"); } catch (Exception e) { // expected. @@ -222,7 +212,7 @@ public class LineDocSourceTest extends BenchmarkTestCase { public void testWithDocsName() throws Exception { File file = new File(getWorkDir(), "one-line"); createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD); - doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD); + doIndexAndSearchTest(file, null, DocMaker.NAME_FIELD); } /** Use fields names that are not defined in Docmaker and so will go to Properties */ @@ -230,7 +220,7 @@ public class LineDocSourceTest extends BenchmarkTestCase { File file = new File(getWorkDir(), "one-line"); String specialField = "mySpecialField"; createRegularLineFileWithMoreFields(file, specialField); - doIndexAndSearchTest(file, false, null, null, specialField); + doIndexAndSearchTest(file, null, specialField); } } diff --git a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java index 908c069e800..09e055229cd 100644 --- a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java +++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java @@ -31,6 +31,7 @@ import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; +import org.apache.lucene.benchmark.byTask.utils.StreamUtils.Type; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; @@ -135,16 +136,12 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { private static final CompressorStreamFactory csFactory = new 
CompressorStreamFactory(); - private PerfRunData createPerfRunData(File file, boolean setBZCompress, + private PerfRunData createPerfRunData(File file, boolean allowEmptyDocs, - String bz2CompressVal, String docMakerName) throws Exception { Properties props = new Properties(); props.setProperty("doc.maker", docMakerName); props.setProperty("line.file.out", file.getAbsolutePath()); - if (setBZCompress) { - props.setProperty("bzip.compression", bz2CompressVal); - } props.setProperty("directory", "RAMDirectory"); // no accidental FS dir. if (allowEmptyDocs) { props.setProperty("sufficient.fields", ","); @@ -157,11 +154,19 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { return new PerfRunData(config); } - private void doReadTest(File file, boolean bz2File, String expTitle, + private void doReadTest(File file, Type fileType, String expTitle, String expDate, String expBody) throws Exception { InputStream in = new FileInputStream(file); - if (bz2File) { - in = csFactory.createCompressorInputStream("bzip2", in); + switch(fileType) { + case BZIP2: + in = csFactory.createCompressorInputStream(CompressorStreamFactory.BZIP2, in); + break; + case GZIP: + in = csFactory.createCompressorInputStream(CompressorStreamFactory.GZIP, in); + case PLAIN: + break; // nothing to do + default: + assertFalse("Unknown file type!",true); //fail, should not happen } BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8")); try { @@ -192,36 +197,37 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { // Create a document in bz2 format. 
File file = new File(getWorkDir(), "one-line.bz2"); - PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, true, "title", "date", "body"); + doReadTest(file, Type.BZIP2, "title", "date", "body"); } - public void testBZip2AutoDetect() throws Exception { + /* Tests WriteLineDocTask with a gzip format. */ + public void testGZip() throws Exception { - // Create a document in bz2 format. - File file = new File(getWorkDir(), "one-line.bz2"); - PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName()); + // Create a document in gz format. + File file = new File(getWorkDir(), "one-line.gz"); + PerfRunData runData = createPerfRunData(file, false, WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, true, "title", "date", "body"); + doReadTest(file, Type.GZIP, "title", "date", "body"); } public void testRegularFile() throws Exception { // Create a document in regular format. File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, false, "title", "date", "body"); + doReadTest(file, Type.PLAIN, "title", "date", "body"); } public void testCharsReplace() throws Exception { @@ -229,12 +235,12 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { // separator char. However, it didn't replace newline characters, which // resulted in errors in LineDocSource. 
File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, NewLinesDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, false, "title text", "date text", "body text two"); + doReadTest(file, Type.PLAIN, "title text", "date text", "body text two"); } public void testEmptyBody() throws Exception { @@ -242,28 +248,28 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { // had a TITLE element (LUCENE-1755). It should throw away documents if they // don't have BODY nor TITLE File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, NoBodyDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, false, "title", "date", null); + doReadTest(file, Type.PLAIN, "title", "date", null); } public void testEmptyTitle() throws Exception { File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, NoTitleDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); - doReadTest(file, false, "", "date", "body"); + doReadTest(file, Type.PLAIN, "", "date", "body"); } /** Fail by default when there's only date */ public void testJustDate() throws Exception { File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, JustDateDocMaker.class.getName()); WriteLineDocTask wldt = new 
WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); @@ -281,7 +287,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { public void testLegalJustDate() throws Exception { File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, LegalJustDateDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); @@ -299,7 +305,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { public void testEmptyDoc() throws Exception { File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, true, EmptyDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); @@ -317,7 +323,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase { public void testMultiThreaded() throws Exception { File file = new File(getWorkDir(), "one-line"); - PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName()); + PerfRunData runData = createPerfRunData(file, false, ThreadingDocMaker.class.getName()); final WriteLineDocTask wldt = new WriteLineDocTask(runData); Thread[] threads = new Thread[10]; for (int i = 0; i < threads.length; i++) { diff --git a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/ContentSourceTest.java b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/utils/StreamUtilsTest.java similarity index 55% rename from modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/ContentSourceTest.java rename to modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/utils/StreamUtilsTest.java index 6ba647a1721..110812b1fe3 100644 --- 
a/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/ContentSourceTest.java +++ b/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/utils/StreamUtilsTest.java @@ -1,4 +1,4 @@ -package org.apache.lucene.benchmark.byTask.feeds; +package org.apache.lucene.benchmark.byTask.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -30,39 +30,63 @@ import java.io.OutputStreamWriter; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.BenchmarkTestCase; +import org.apache.lucene.benchmark.byTask.utils.StreamUtils; import org.apache.lucene.util._TestUtil; import org.junit.After; import org.junit.Before; import org.junit.Test; -public class ContentSourceTest extends BenchmarkTestCase { +public class StreamUtilsTest extends BenchmarkTestCase { private static final String TEXT = "Some-Text..."; private File testDir; - private CompressorStreamFactory csFactory = new CompressorStreamFactory(); @Test public void testGetInputStreamPlainText() throws Exception { - assertReadText(textFile("txt")); - assertReadText(textFile("TXT")); + assertReadText(rawTextFile("txt")); + assertReadText(rawTextFile("TXT")); } @Test public void testGetInputStreamGzip() throws Exception { - assertReadText(gzipFile("gz")); - assertReadText(gzipFile("gzip")); - assertReadText(gzipFile("GZ")); - assertReadText(gzipFile("GZIP")); + assertReadText(rawGzipFile("gz")); + assertReadText(rawGzipFile("gzip")); + assertReadText(rawGzipFile("GZ")); + assertReadText(rawGzipFile("GZIP")); } @Test public void testGetInputStreamBzip2() throws Exception { - assertReadText(bzip2File("bz2")); - assertReadText(bzip2File("bzip")); - assertReadText(bzip2File("BZ2")); - assertReadText(bzip2File("BZIP")); + assertReadText(rawBzip2File("bz2")); + assertReadText(rawBzip2File("bzip")); + assertReadText(rawBzip2File("BZ2")); + assertReadText(rawBzip2File("BZIP")); + } + + @Test + public void testGetOutputStreamBzip2() 
throws Exception { + assertReadText(autoOutFile("bz2")); + assertReadText(autoOutFile("bzip")); + assertReadText(autoOutFile("BZ2")); + assertReadText(autoOutFile("BZIP")); } - private File textFile(String ext) throws Exception { + @Test + public void testGetOutputStreamGzip() throws Exception { + assertReadText(autoOutFile("gz")); + assertReadText(autoOutFile("gzip")); + assertReadText(autoOutFile("GZ")); + assertReadText(autoOutFile("GZIP")); + } + + @Test + public void testGetOutputStreamPlain() throws Exception { + assertReadText(autoOutFile("txt")); + assertReadText(autoOutFile("text")); + assertReadText(autoOutFile("TXT")); + assertReadText(autoOutFile("TEXT")); + } + + private File rawTextFile(String ext) throws Exception { File f = new File(testDir,"testfile." + ext); BufferedWriter w = new BufferedWriter(new FileWriter(f)); w.write(TEXT); @@ -71,38 +95,36 @@ public class ContentSourceTest extends BenchmarkTestCase { return f; } - private File gzipFile(String ext) throws Exception { + private File rawGzipFile(String ext) throws Exception { File f = new File(testDir,"testfile." + ext); - OutputStream os = csFactory.createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f)); - BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os)); - w.write(TEXT); - w.newLine(); - w.close(); + OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f)); + writeText(os); return f; } - private File bzip2File(String ext) throws Exception { + private File rawBzip2File(String ext) throws Exception { File f = new File(testDir,"testfile." 
+ ext); - OutputStream os = csFactory.createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f)); - BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os)); - w.write(TEXT); - w.newLine(); - w.close(); + OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f)); + writeText(os); return f; } + private File autoOutFile(String ext) throws Exception { + File f = new File(testDir,"testfile." + ext); + OutputStream os = StreamUtils.outputStream(f); + writeText(os); + return f; + } + + private void writeText(OutputStream os) throws IOException { + BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os)); + w.write(TEXT); + w.newLine(); + w.close(); + } + private void assertReadText(File f) throws Exception { - ContentSource src = new ContentSource() { - @Override - public void close() throws IOException { - } - @Override - public DocData getNextDocData(DocData docData) throws NoMoreDataException, - IOException { - return null; - } - }; - InputStream ir = src.getInputStream(f); + InputStream ir = StreamUtils.inputStream(f); InputStreamReader in = new InputStreamReader(ir); BufferedReader r = new BufferedReader(in); String line = r.readLine();