Optimize rcmuir's commit to seek using RandomAccessFile and later use input stream (after seeking). Also fail on UTF-8 coding errors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1336942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2012-05-10 23:10:18 +00:00
parent a41f0ca63b
commit 0d0ef08cfc
1 changed files with 47 additions and 24 deletions

View File

@ -24,6 +24,9 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
@ -71,46 +74,66 @@ public class LineFileDocs implements Closeable {
reader = null;
}
}
private long randomSeekPos(Random random, long size) {
if (random == null || size <= 3L)
return 0L;
return (random.nextLong()&Long.MAX_VALUE) % (size/3);
}
private synchronized void open(Random random) throws IOException {
InputStream is = getClass().getResourceAsStream(path);
boolean needSkip = true;
long size = 0L;
if (is == null) {
// if its not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
is = new FileInputStream(path);
}
File file = new File(path);
long size;
if (file.exists()) {
size = file.length();
File file = new File(path);
if (path.endsWith(".gz")) {
// if it is a gzip file, we need to use InputStream and slowly skipTo:
is = new FileInputStream(file);
} else {
// optimized seek using RandomAccessFile:
size = file.length();
final long seekTo = randomSeekPos(random, size);
final FileChannel channel = new RandomAccessFile(path, "r").getChannel();
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
}
channel.position(seekTo);
is = Channels.newInputStream(channel);
needSkip = false;
}
} else {
// if the file comes from Classpath:
size = is.available();
}
if (path.endsWith(".gz")) {
is = new GZIPInputStream(is);
// guestimate:
size *= 2.8;
}
// Override sizes for currently "known" line files:
if (path.equals("europarl.lines.txt.gz")) {
size = 15129506L;
} else if (path.equals("/home/hudson/lucene-data/enwiki.random.lines.txt.gz")) {
size = 3038178822L;
}
// Randomly seek to starting point:
if (random != null && size > 3) {
final long seekTo = (random.nextLong()&Long.MAX_VALUE) % (size/3);
// If we only have an InputStream, we need to seek now,
// but this seek is a scan, so very inefficient!!!
if (needSkip) {
final long skipTo = randomSeekPos(random, size);
if (LuceneTestCase.VERBOSE) {
System.out.println("TEST: LineFileDocs: seek to fp=" + seekTo + " on open");
System.out.println("TEST: LineFileDocs: stream skip to fp=" + skipTo + " on open");
}
is.skip(seekTo);
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
.onMalformedInput(CodingErrorAction.IGNORE)
.onUnmappableCharacter(CodingErrorAction.IGNORE);
reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
reader.readLine();
is.skip(skipTo);
}
int b;
do {
b = is.read();
} while (b >= 0 && b != 13 && b != 10);
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
// read one more line, to make sure we are not inside a Windows linebreak (\r\n):
reader.readLine();
}
public synchronized void reset(Random random) throws IOException {