mirror of https://github.com/apache/lucene.git
LUCENE-2977: WriteLineDocTask should write gzip/bzip2/txt according to the extension of specified output file name.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1084929 13f79535-47bb-0310-9956-ffa450edef68
parent 68ebd4392d
commit 8d0c1b62af
@@ -2,6 +2,11 @@ Lucene Benchmark Contrib Change Log

The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.

03/24/2011
LUCENE-2977: WriteLineDocTask now automatically detects how to write -
GZip or BZip2 or Plain-text - according to the output file extension.
Property bzip.compression of WriteLineDocTask was canceled. (Doron Cohen)

03/23/2011
LUCENE-2980: Benchmark's ContentSource no more requires lower case file suffixes
for detecting file type (gzip/bzip2/text). As part of this fix worked around an
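As a quick illustration of the behavior described in the change-log entry above (not part of the commit), the following minimal sketch writes a line file through the StreamUtils helper introduced further down in this diff; the file name is made up, and its ".gz" suffix alone selects GZIP output:

import java.io.File;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

public class ExtensionDrivenWriteExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical output file; the ".gz" extension is what selects GZIP compression.
    File out = new File("enwiki.lines.gz");
    PrintWriter w = new PrintWriter(new OutputStreamWriter(StreamUtils.outputStream(out), "UTF-8"));
    w.println("title\tdate\tbody"); // one line in WriteLineDocTask's title<TAB>date<TAB>body format
    w.close();
  }
}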
@@ -17,19 +17,11 @@ package org.apache.lucene.benchmark.byTask.feeds;
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.utils.Config;

/**
@@ -56,17 +48,6 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
 */
public abstract class ContentSource {

  private static final Map<String,String> extensionToType = new HashMap<String,String>();
  static {
    // these in are lower case, we will lower case at the test as well
    extensionToType.put(".bz2", CompressorStreamFactory.BZIP2);
    extensionToType.put(".bzip", CompressorStreamFactory.BZIP2);
    extensionToType.put(".gz", CompressorStreamFactory.GZIP);
    extensionToType.put(".gzip", CompressorStreamFactory.GZIP);
  }

  protected static final int BUFFER_SIZE = 1 << 16; // 64K

  private long bytesCount;
  private long totalBytesCount;
  private int docsCount;
@@ -78,8 +59,6 @@ public abstract class ContentSource {
  protected boolean verbose;
  protected String encoding;

  private CompressorStreamFactory csFactory = new CompressorStreamFactory();

  /** update count of bytes generated by this source */
  protected final synchronized void addBytes(long numBytes) {
    bytesCount += numBytes;
@@ -114,63 +93,6 @@ public abstract class ContentSource {
    }
  }

  /**
   * Returns an {@link InputStream} over the requested file. This method
   * attempts to identify the appropriate {@link InputStream} instance to return
   * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
   * 'bzip' {@link InputStream}).
   */
  protected InputStream getInputStream(File file) throws IOException {
    // First, create a FileInputStream, as this will be required by all types.
    // Wrap with BufferedInputStream for better performance
    InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);

    String fileName = file.getName();
    int idx = fileName.lastIndexOf('.');
    String type = null;
    if (idx != -1) {
      type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ENGLISH));
    }

    if (type!=null) { // bzip or gzip
      try {
        return closableCompressorInputStream(type,is);
      } catch (CompressorException e) {
        IOException ioe = new IOException(e.getMessage());
        ioe.initCause(e);
        throw ioe;
      }
    }

    return is;
  }

  /**
   * Wrap the compressor input stream so that calling close will also close
   * the underlying stream - workaround for CommonsCompress bug (COMPRESS-127).
   */
  private InputStream closableCompressorInputStream(String type, final InputStream is) throws CompressorException {
    final InputStream delegee = csFactory.createCompressorInputStream(type, is);
    if (!type.equals(CompressorStreamFactory.GZIP)) {
      return delegee; //compressor bug affects only gzip
    }
    return new InputStream() {
      @Override public int read() throws IOException { return delegee.read(); }
      @Override public int read(byte[] b) throws IOException { return delegee.read(b); }
      @Override public int available() throws IOException { return delegee.available(); }
      @Override public synchronized void mark(int readlimit) { delegee.mark(readlimit); }
      @Override public boolean markSupported() { return delegee.markSupported(); }
      @Override public int read(byte[] b, int off, int len) throws IOException { return delegee.read(b, off, len); }
      @Override public synchronized void reset() throws IOException { delegee.reset(); }
      @Override public long skip(long n) throws IOException { return delegee.skip(n); }
      @Override
      public void close() throws IOException {
        delegee.close();
        is.close();
      }
    };
  }

  /**
   * Returns true whether it's time to log a message (depending on verbose and
   * the number of documents generated).
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
@@ -189,7 +190,7 @@ public class EnwikiContentSource extends ContentSource {
          return;
        } else if (localFileIS == is) {
          // If file is not already re-opened then re-open it now
          is = getInputStream(file);
          is = StreamUtils.inputStream(file);
        }
      }
    }
@@ -290,7 +291,7 @@ public class EnwikiContentSource extends ContentSource {
  @Override
  public void resetInputs() throws IOException {
    super.resetInputs();
    is = getInputStream(file);
    is = StreamUtils.inputStream(file);
  }

  @Override
@@ -28,6 +28,7 @@ import java.util.Properties;

import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

/**
 * A {@link ContentSource} reading one line at a time as a
@@ -178,8 +179,8 @@ public class LineDocSource extends ContentSource {
    if (reader != null) {
      reader.close();
    }
    InputStream is = getInputStream(file);
    reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
    InputStream is = StreamUtils.inputStream(file);
    reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE);
    if (skipHeaderLine) {
      reader.readLine(); // skip one line - the header line - already handled that info
    }
@@ -32,6 +32,7 @@ import java.util.Locale;

import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
import org.apache.lucene.util.ThreadInterruptedException;
@@ -194,8 +195,8 @@ public class TrecContentSource extends ContentSource {
        System.out.println("opening: " + f + " length: " + f.length());
      }
      try {
        InputStream inputStream = getInputStream(f); // support either gzip, bzip2, or regular text file, by extension
        reader = new BufferedReader(new InputStreamReader(inputStream, encoding), BUFFER_SIZE);
        InputStream inputStream = StreamUtils.inputStream(f); // support either gzip, bzip2, or regular text file, by extension
        reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE);
        currPathType = TrecDocParser.pathType(f);
        return;
      } catch (Exception e) {
@@ -17,9 +17,8 @@ package org.apache.lucene.benchmark.byTask.tasks;
 * limitations under the License.
 */

import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.File;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
@@ -28,10 +27,10 @@ import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -40,14 +39,17 @@ import org.apache.lucene.document.Field;
 * following format: title <TAB> date <TAB> body. The output of this
 * task can be consumed by
 * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocSource} and is intended
 * to save the IO overhead of opening a file per document to be indexed.<br>
 * to save the IO overhead of opening a file per document to be indexed.
 * <p>
 * The format of the output is set according to the output file extension.
 * Compression is recommended when the output file is expected to be large.
 * See info on file extensions in {@link StreamUtils.Type}
 * <p>
 * Supports the following parameters:
 * <ul>
 * <li><b>line.file.out<b> - the name of the file to write the output to. That
 * <li><b>line.file.out</b> - the name of the file to write the output to. That
 * parameter is mandatory. <b>NOTE:</b> the file is re-created.
 * <li><b>bzip.compression<b> - whether the output should be bzip-compressed. This is
 * recommended when the output file is expected to be large.
 * <li><b>line.fields<b> - which fields should be written in each line.
 * <li><b>line.fields</b> - which fields should be written in each line.
 * (optional, default: {@link #DEFAULT_FIELDS}).
 * <li><b>sufficient.fields</b> - list of field names, separated by comma, which,
 * if all of them are missing, the document will be skipped. For example, to require
@@ -91,30 +93,12 @@ public class WriteLineDocTask extends PerfTask {
  public WriteLineDocTask(PerfRunData runData) throws Exception {
    super(runData);
    Config config = runData.getConfig();
    String fileName = config.get("line.file.out", null);
    if (fileName == null) {
    String fname = config.get("line.file.out", null);
    if (fname == null) {
      throw new IllegalArgumentException("line.file.out must be set");
    }

    OutputStream out = new FileOutputStream(fileName);
    boolean doBzipCompression = false;
    String doBZCompress = config.get("bzip.compression", null);
    if (doBZCompress != null) {
      // Property was set, use the value.
      doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue();
    } else {
      // Property was not set, attempt to detect based on file's extension
      doBzipCompression = fileName.endsWith("bz2");
    }

    if (doBzipCompression) {
      // Wrap with BOS since BZip2CompressorOutputStream calls out.write(int)
      // and does not use the write(byte[]) version. This proved to speed the
      // compression process by 70% !
      out = new BufferedOutputStream(out, 1 << 16);
      out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out);
    }
    lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16));
    OutputStream out = StreamUtils.outputStream(new File(fname));
    lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), StreamUtils.BUFFER_SIZE));
    docMaker = runData.getDocMaker();

    // init fields
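Illustrative only, not taken from the patch: after this change a benchmark configuration selects the output format purely through the extension of line.file.out, so the removed bzip.compression property needs no replacement. The property names below are the real ones read by WriteLineDocTask; the values are made-up examples.

import java.util.Properties;

public class WriteLineDocTaskConfigExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // The ".bz2" suffix alone makes WriteLineDocTask emit BZip2 output.
    props.setProperty("line.file.out", "work/enwiki.lines.bz2"); // hypothetical path
    props.setProperty("doc.maker", "my.benchmark.SomeDocMaker"); // hypothetical doc maker class
    System.out.println(props);
  }
}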
@@ -0,0 +1,144 @@
package org.apache.lucene.benchmark.byTask.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;

/**
 * Stream utilities.
 */
public class StreamUtils {

  /** Buffer size used across the benchmark package */
  public static final int BUFFER_SIZE = 1 << 16; // 64K

  /** File format type */
  public enum Type {
    /** BZIP2 is automatically used for <b>.bz2</b> and <b>.bzip2</b> extensions. */
    BZIP2(CompressorStreamFactory.BZIP2),
    /** GZIP is automatically used for <b>.gz</b> and <b>.gzip</b> extensions. */
    GZIP(CompressorStreamFactory.GZIP),
    /** Plain text is used for anything which is not GZIP or BZIP. */
    PLAIN(null);
    private final String csfType;
    Type(String csfType) {
      this.csfType = csfType;
    }
    private InputStream inputStream(InputStream in) throws IOException {
      try {
        return csfType==null ? in : closableCompressorInputStream(this, in);
      } catch (CompressorException e) {
        IOException ioe = new IOException(e.getMessage());
        ioe.initCause(e);
        throw ioe; }
    }
    private OutputStream outputStream(OutputStream os) throws IOException {
      try {
        return csfType==null ? os : new CompressorStreamFactory().createCompressorOutputStream(csfType, os);
      } catch (CompressorException e) {
        IOException ioe = new IOException(e.getMessage());
        ioe.initCause(e);
        throw ioe;
      }
    }
  }

  private static final Map<String,Type> extensionToType = new HashMap<String,Type>();
  static {
    // these in are lower case, we will lower case at the test as well
    extensionToType.put(".bz2", Type.BZIP2);
    extensionToType.put(".bzip", Type.BZIP2);
    extensionToType.put(".gz", Type.GZIP);
    extensionToType.put(".gzip", Type.GZIP);
  }

  /**
   * Returns an {@link InputStream} over the requested file. This method
   * attempts to identify the appropriate {@link InputStream} instance to return
   * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
   * 'bzip' {@link InputStream}).
   */
  public static InputStream inputStream(File file) throws IOException {
    // First, create a FileInputStream, as this will be required by all types.
    // Wrap with BufferedInputStream for better performance
    InputStream in = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
    return fileType(file).inputStream(in);
  }

  /** Return the type of the file, or null if unknown */
  private static Type fileType(File file) {
    Type type = null;
    String fileName = file.getName();
    int idx = fileName.lastIndexOf('.');
    if (idx != -1) {
      type = extensionToType.get(fileName.substring(idx).toLowerCase(Locale.ENGLISH));
    }
    return type==null ? Type.PLAIN : type;
  }

  /**
   * Wrap the compressor input stream so that calling close will also close
   * the underlying stream - workaround for CommonsCompress bug (COMPRESS-127).
   */
  private static InputStream closableCompressorInputStream(Type type, final InputStream is) throws CompressorException {
    final InputStream delegee = new CompressorStreamFactory().createCompressorInputStream(type.csfType, is);
    if (!Type.GZIP.equals(type)) {
      return delegee; //compressor bug affects only gzip
    }
    return new InputStream() {
      @Override public int read() throws IOException { return delegee.read(); }
      @Override public int read(byte[] b) throws IOException { return delegee.read(b); }
      @Override public int available() throws IOException { return delegee.available(); }
      @Override public synchronized void mark(int readlimit) { delegee.mark(readlimit); }
      @Override public boolean markSupported() { return delegee.markSupported(); }
      @Override public int read(byte[] b, int off, int len) throws IOException { return delegee.read(b, off, len); }
      @Override public synchronized void reset() throws IOException { delegee.reset(); }
      @Override public long skip(long n) throws IOException { return delegee.skip(n); }
      @Override
      public void close() throws IOException {
        delegee.close();
        is.close();
      }
    };
  }

  /**
   * Returns an {@link OutputStream} over the requested file, identifying
   * the appropriate {@link OutputStream} instance similar to {@link #inputStream(File)}.
   */
  public static OutputStream outputStream(File file) throws IOException {
    // First, create a FileInputStream, as this will be required by all types.
    // Wrap with BufferedInputStream for better performance
    OutputStream os = new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE);
    return fileType(file).outputStream(os);
  }
}
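A hedged sketch of how callers consume the new helper (mirroring the LineDocSource and TrecContentSource call sites shown earlier; the file name and charset are example values, not part of the commit):

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

public class StreamUtilsReadExample {
  public static void main(String[] args) throws IOException {
    File file = new File("docs.lines.bz2"); // example name; ".bz2" selects a BZip2 input stream
    // Auto-detected stream type plus the shared buffer size, as in the benchmark sources.
    BufferedReader reader = new BufferedReader(
        new InputStreamReader(StreamUtils.inputStream(file), "UTF-8"), StreamUtils.BUFFER_SIZE);
    System.out.println(reader.readLine());
    reader.close();
  }
}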
@@ -103,23 +103,19 @@ public class LineDocSourceTest extends BenchmarkTestCase {
    writer.close();
  }

  private void doIndexAndSearchTest(File file, boolean setBZCompress,
      String bz2CompressVal, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 1, storedField); // no extra repetitions
    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 2, storedField); // 1 extra repetition
    doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 4, storedField); // 3 extra repetitions
  private void doIndexAndSearchTest(File file, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
    doIndexAndSearchTestWithRepeats(file, lineParserClass, 1, storedField); // no extra repetitions
    doIndexAndSearchTestWithRepeats(file, lineParserClass, 2, storedField); // 1 extra repetition
    doIndexAndSearchTestWithRepeats(file, lineParserClass, 4, storedField); // 3 extra repetitions
  }

  private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress,
      String bz2CompressVal, Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {
  private void doIndexAndSearchTestWithRepeats(File file,
      Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {

    Properties props = new Properties();

    // LineDocSource specific settings.
    props.setProperty("docs.file", file.getAbsolutePath());
    if (setBZCompress) {
      props.setProperty("bzip.compression", bz2CompressVal);
    }
    if (lineParserClass != null) {
      props.setProperty("line.parser", lineParserClass.getName());
    }
@@ -160,37 +156,31 @@ public class LineDocSourceTest extends BenchmarkTestCase {
  public void testBZip2() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file,true);
    doIndexAndSearchTest(file, true, "true", null, null);
    doIndexAndSearchTest(file, null, null);
  }

  public void testBZip2NoHeaderLine() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file,false);
    doIndexAndSearchTest(file, true, "true", null, null);
  }

  public void testBZip2AutoDetect() throws Exception {
    File file = new File(getWorkDir(), "one-line.bz2");
    createBZ2LineFile(file,false);
    doIndexAndSearchTest(file, false, null, null, null);
    doIndexAndSearchTest(file, null, null);
  }

  public void testRegularFile() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file,true);
    doIndexAndSearchTest(file, false, null, null, null);
    doIndexAndSearchTest(file, null, null);
  }

  public void testRegularFileSpecialHeader() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file,true);
    doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null);
    doIndexAndSearchTest(file, HeaderLineParser.class, null);
  }

  public void testRegularFileNoHeaderLine() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFile(file,false);
    doIndexAndSearchTest(file, false, null, null, null);
    doIndexAndSearchTest(file, null, null);
  }

  public void testInvalidFormat() throws Exception {
@@ -210,7 +200,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
      writer.newLine();
      writer.close();
      try {
        doIndexAndSearchTest(file, false, null, null, null);
        doIndexAndSearchTest(file, null, null);
        fail("Some exception should have been thrown for: [" + testCases[i] + "]");
      } catch (Exception e) {
        // expected.
@@ -222,7 +212,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
  public void testWithDocsName() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
    doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD);
    doIndexAndSearchTest(file, null, DocMaker.NAME_FIELD);
  }

  /** Use fields names that are not defined in Docmaker and so will go to Properties */
@@ -230,7 +220,7 @@ public class LineDocSourceTest extends BenchmarkTestCase {
    File file = new File(getWorkDir(), "one-line");
    String specialField = "mySpecialField";
    createRegularLineFileWithMoreFields(file, specialField);
    doIndexAndSearchTest(file, false, null, null, specialField);
    doIndexAndSearchTest(file, null, specialField);
  }

}
@@ -31,6 +31,7 @@ import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils.Type;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@@ -135,16 +136,12 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {

  private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

  private PerfRunData createPerfRunData(File file, boolean setBZCompress,
  private PerfRunData createPerfRunData(File file,
                                        boolean allowEmptyDocs,
                                        String bz2CompressVal,
                                        String docMakerName) throws Exception {
    Properties props = new Properties();
    props.setProperty("doc.maker", docMakerName);
    props.setProperty("line.file.out", file.getAbsolutePath());
    if (setBZCompress) {
      props.setProperty("bzip.compression", bz2CompressVal);
    }
    props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
    if (allowEmptyDocs) {
      props.setProperty("sufficient.fields", ",");
@@ -157,11 +154,19 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    return new PerfRunData(config);
  }

  private void doReadTest(File file, boolean bz2File, String expTitle,
  private void doReadTest(File file, Type fileType, String expTitle,
      String expDate, String expBody) throws Exception {
    InputStream in = new FileInputStream(file);
    if (bz2File) {
      in = csFactory.createCompressorInputStream("bzip2", in);
    switch(fileType) {
      case BZIP2:
        in = csFactory.createCompressorInputStream(CompressorStreamFactory.BZIP2, in);
        break;
      case GZIP:
        in = csFactory.createCompressorInputStream(CompressorStreamFactory.GZIP, in);
      case PLAIN:
        break; // nothing to do
      default:
        assertFalse("Unknown file type!",true); //fail, should not happen
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
    try {
@@ -192,36 +197,37 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {

    // Create a document in bz2 format.
    File file = new File(getWorkDir(), "one-line.bz2");
    PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, WriteLineDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();

    doReadTest(file, true, "title", "date", "body");
    doReadTest(file, Type.BZIP2, "title", "date", "body");
  }

  public void testBZip2AutoDetect() throws Exception {
  /* Tests WriteLineDocTask with a gzip format. */
  public void testGZip() throws Exception {

    // Create a document in bz2 format.
    File file = new File(getWorkDir(), "one-line.bz2");
    PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName());
    // Create a document in gz format.
    File file = new File(getWorkDir(), "one-line.gz");
    PerfRunData runData = createPerfRunData(file, false, WriteLineDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();

    doReadTest(file, true, "title", "date", "body");
    doReadTest(file, Type.GZIP, "title", "date", "body");
  }

  public void testRegularFile() throws Exception {

    // Create a document in regular format.
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, WriteLineDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();

    doReadTest(file, false, "title", "date", "body");
    doReadTest(file, Type.PLAIN, "title", "date", "body");
  }

  public void testCharsReplace() throws Exception {
@@ -229,12 +235,12 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    // separator char. However, it didn't replace newline characters, which
    // resulted in errors in LineDocSource.
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, NewLinesDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();

    doReadTest(file, false, "title text", "date text", "body text two");
    doReadTest(file, Type.PLAIN, "title text", "date text", "body text two");
  }

  public void testEmptyBody() throws Exception {
@@ -242,28 +248,28 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {
    // had a TITLE element (LUCENE-1755). It should throw away documents if they
    // don't have BODY nor TITLE
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, NoBodyDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();

    doReadTest(file, false, "title", "date", null);
    doReadTest(file, Type.PLAIN, "title", "date", null);
  }

  public void testEmptyTitle() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, NoTitleDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();

    doReadTest(file, false, "", "date", "body");
    doReadTest(file, Type.PLAIN, "", "date", "body");
  }

  /** Fail by default when there's only date */
  public void testJustDate() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, JustDateDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@@ -281,7 +287,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {

  public void testLegalJustDate() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, LegalJustDateDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@@ -299,7 +305,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {

  public void testEmptyDoc() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, true, EmptyDocMaker.class.getName());
    WriteLineDocTask wldt = new WriteLineDocTask(runData);
    wldt.doLogic();
    wldt.close();
@@ -317,7 +323,7 @@ public class WriteLineDocTaskTest extends BenchmarkTestCase {

  public void testMultiThreaded() throws Exception {
    File file = new File(getWorkDir(), "one-line");
    PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName());
    PerfRunData runData = createPerfRunData(file, false, ThreadingDocMaker.class.getName());
    final WriteLineDocTask wldt = new WriteLineDocTask(runData);
    Thread[] threads = new Thread[10];
    for (int i = 0; i < threads.length; i++) {
@@ -1,4 +1,4 @@
package org.apache.lucene.benchmark.byTask.feeds;
package org.apache.lucene.benchmark.byTask.utils;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -30,39 +30,63 @@ import java.io.OutputStreamWriter;

import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util._TestUtil;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class ContentSourceTest extends BenchmarkTestCase {
public class StreamUtilsTest extends BenchmarkTestCase {
  private static final String TEXT = "Some-Text...";
  private File testDir;
  private CompressorStreamFactory csFactory = new CompressorStreamFactory();

  @Test
  public void testGetInputStreamPlainText() throws Exception {
    assertReadText(textFile("txt"));
    assertReadText(textFile("TXT"));
    assertReadText(rawTextFile("txt"));
    assertReadText(rawTextFile("TXT"));
  }

  @Test
  public void testGetInputStreamGzip() throws Exception {
    assertReadText(gzipFile("gz"));
    assertReadText(gzipFile("gzip"));
    assertReadText(gzipFile("GZ"));
    assertReadText(gzipFile("GZIP"));
    assertReadText(rawGzipFile("gz"));
    assertReadText(rawGzipFile("gzip"));
    assertReadText(rawGzipFile("GZ"));
    assertReadText(rawGzipFile("GZIP"));
  }

  @Test
  public void testGetInputStreamBzip2() throws Exception {
    assertReadText(bzip2File("bz2"));
    assertReadText(bzip2File("bzip"));
    assertReadText(bzip2File("BZ2"));
    assertReadText(bzip2File("BZIP"));
    assertReadText(rawBzip2File("bz2"));
    assertReadText(rawBzip2File("bzip"));
    assertReadText(rawBzip2File("BZ2"));
    assertReadText(rawBzip2File("BZIP"));
  }

  private File textFile(String ext) throws Exception {
  @Test
  public void testGetOutputStreamBzip2() throws Exception {
    assertReadText(autoOutFile("bz2"));
    assertReadText(autoOutFile("bzip"));
    assertReadText(autoOutFile("BZ2"));
    assertReadText(autoOutFile("BZIP"));
  }

  @Test
  public void testGetOutputStreamGzip() throws Exception {
    assertReadText(autoOutFile("gz"));
    assertReadText(autoOutFile("gzip"));
    assertReadText(autoOutFile("GZ"));
    assertReadText(autoOutFile("GZIP"));
  }

  @Test
  public void testGetOutputStreamPlain() throws Exception {
    assertReadText(autoOutFile("txt"));
    assertReadText(autoOutFile("text"));
    assertReadText(autoOutFile("TXT"));
    assertReadText(autoOutFile("TEXT"));
  }

  private File rawTextFile(String ext) throws Exception {
    File f = new File(testDir,"testfile." + ext);
    BufferedWriter w = new BufferedWriter(new FileWriter(f));
    w.write(TEXT);
@@ -71,38 +95,36 @@ public class ContentSourceTest extends BenchmarkTestCase {
    return f;
  }

  private File gzipFile(String ext) throws Exception {
  private File rawGzipFile(String ext) throws Exception {
    File f = new File(testDir,"testfile." + ext);
    OutputStream os = csFactory.createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f));
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    w.write(TEXT);
    w.newLine();
    w.close();
    OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.GZIP, new FileOutputStream(f));
    writeText(os);
    return f;
  }

  private File bzip2File(String ext) throws Exception {
  private File rawBzip2File(String ext) throws Exception {
    File f = new File(testDir,"testfile." + ext);
    OutputStream os = csFactory.createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f));
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    w.write(TEXT);
    w.newLine();
    w.close();
    OutputStream os = new CompressorStreamFactory().createCompressorOutputStream(CompressorStreamFactory.BZIP2, new FileOutputStream(f));
    writeText(os);
    return f;
  }

  private File autoOutFile(String ext) throws Exception {
    File f = new File(testDir,"testfile." + ext);
    OutputStream os = StreamUtils.outputStream(f);
    writeText(os);
    return f;
  }

  private void writeText(OutputStream os) throws IOException {
    BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
    w.write(TEXT);
    w.newLine();
    w.close();
  }

  private void assertReadText(File f) throws Exception {
    ContentSource src = new ContentSource() {
      @Override
      public void close() throws IOException {
      }
      @Override
      public DocData getNextDocData(DocData docData) throws NoMoreDataException,
          IOException {
        return null;
      }
    };
    InputStream ir = src.getInputStream(f);
    InputStream ir = StreamUtils.inputStream(f);
    InputStreamReader in = new InputStreamReader(ir);
    BufferedReader r = new BufferedReader(in);
    String line = r.readLine();