mirror of https://github.com/apache/lucene.git
LUCENE-1730: Fix TrecContentSource to use ISO-8859-1 when reading the TREC files, unless a different encoding is specified. Additionally, ContentSource now supports a content.source.encoding parameter in the configuration file.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791528 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
705f099238
commit
f780f77366
|
@ -4,6 +4,12 @@ The Benchmark contrib package contains code for benchmarking Lucene in a variety
|
|||
|
||||
$Id:$
|
||||
|
||||
7/6/2009
|
||||
LUCENE-1730: Fix TrecContentSource to use ISO-8859-1 when reading the TREC files,
|
||||
unless a different encoding is specified. Additionally, ContentSource now supports
|
||||
a content.source.encoding parameter in the configuration file.
|
||||
(Shai Erera via Mark Miller)
|
||||
|
||||
6/26/2009
|
||||
LUCENE-1716: Added the following support:
|
||||
doc.tokenized.norms: specifies whether to store norms
|
||||
|
|
|
@ -34,9 +34,9 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
|
|||
/**
|
||||
* Represents content from a specified source, such as TREC, Reuters etc. A
|
||||
* {@link ContentSource} is responsible for creating {@link DocData} objects for
|
||||
* its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track of
|
||||
* various statistics, such as how many documents were generated, size in bytes
|
||||
* etc.
|
||||
* its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track
|
||||
* of various statistics, such as how many documents were generated, size in
|
||||
* bytes etc.
|
||||
* <p>
|
||||
* Supports the following configuration parameters:
|
||||
* <ul>
|
||||
|
@ -44,6 +44,9 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
|
|||
* forever (<b>default=true</b>).
|
||||
* <li><b>content.source.verbose</b> - specifies whether messages should be
|
||||
* output by the content source (<b>default=false</b>).
|
||||
* <li><b>content.source.encoding</b> - specifies which encoding to use when
|
||||
* reading the files of that content source. Certain implementations may define
|
||||
* a default value if this parameter is not specified (<b>default=null</b>).
|
||||
* <li><b>content.source.log.step</b> - specifies for how many documents a
|
||||
* message should be logged. If set to 0 it means no logging should occur.
|
||||
* <b>NOTE:</b> if verbose is set to false, logging should not occur even if
|
||||
|
@ -71,6 +74,7 @@ public abstract class ContentSource {
|
|||
protected boolean forever;
|
||||
protected int logStep;
|
||||
protected boolean verbose;
|
||||
protected String encoding;
|
||||
|
||||
private CompressorStreamFactory csFactory = new CompressorStreamFactory();
|
||||
|
||||
|
@ -196,6 +200,7 @@ public abstract class ContentSource {
|
|||
forever = config.get("content.source.forever", true);
|
||||
logStep = config.get("content.source.log.step", 0);
|
||||
verbose = config.get("content.source.verbose", false);
|
||||
encoding = config.get("content.source.encoding", null);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -39,6 +39,7 @@ import org.apache.lucene.benchmark.byTask.utils.Config;
|
|||
* Config properties:
|
||||
* <ul>
|
||||
* <li>docs.file=&lt;path to the file&gt;
|
||||
* <li>content.source.encoding - defaults to UTF-8.
|
||||
* </ul>
|
||||
*/
|
||||
public class LineDocSource extends ContentSource {
|
||||
|
@ -54,7 +55,7 @@ public class LineDocSource extends ContentSource {
|
|||
reader.close();
|
||||
}
|
||||
InputStream is = getInputStream(file);
|
||||
reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
|
||||
reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
@ -111,6 +112,9 @@ public class LineDocSource extends ContentSource {
|
|||
throw new IllegalArgumentException("docs.file must be set");
|
||||
}
|
||||
file = new File(fileName).getAbsoluteFile();
|
||||
if (encoding == null) {
|
||||
encoding = "UTF-8";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -47,10 +47,11 @@ import org.apache.lucene.benchmark.byTask.utils.StringBufferReader;
|
|||
* (<b>default=trec</b>).
|
||||
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
|
||||
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
|
||||
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
|
||||
* </ul>
|
||||
*/
|
||||
public class TrecContentSource extends ContentSource {
|
||||
// TODO (3.0): change StringBuffer to StringBuffer
|
||||
// TODO (3.0): change StringBuffer to StringBuilder
|
||||
|
||||
private static final class DateFormatInfo {
|
||||
DateFormat[] dfs;
|
||||
|
@ -181,8 +182,8 @@ public class TrecContentSource extends ContentSource {
|
|||
System.out.println("opening: " + f + " length: " + f.length());
|
||||
}
|
||||
try {
|
||||
GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), 1 << 16);
|
||||
reader = new BufferedReader(new InputStreamReader(zis), 1 << 16);
|
||||
GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE);
|
||||
reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE);
|
||||
return;
|
||||
} catch (Exception e) {
|
||||
retries++;
|
||||
|
@ -334,6 +335,9 @@ public class TrecContentSource extends ContentSource {
|
|||
// Should not get here. Throw runtime exception.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
if (encoding == null) {
|
||||
encoding = "ISO-8859-1";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -40,7 +40,7 @@ public class TrecContentSourceTest extends TestCase {
|
|||
this.forever = forever;
|
||||
}
|
||||
|
||||
protected void openNextFile() throws NoMoreDataException, IOException {
|
||||
void openNextFile() throws NoMoreDataException, IOException {
|
||||
if (reader != null) {
|
||||
if (!forever) {
|
||||
throw new NoMoreDataException();
|
||||
|
|
Loading…
Reference in New Issue