mirror of https://github.com/apache/lucene.git
Optimize rcmuir's commit to seek using RandomAccessFile and later use input stream (after seeking). Also fail on UTF-8 coding errors
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1336942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a41f0ca63b
commit
0d0ef08cfc
|
@ -24,6 +24,9 @@ import java.io.FileInputStream;
|
|||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.channels.Channels;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
|
@ -71,46 +74,66 @@ public class LineFileDocs implements Closeable {
|
|||
reader = null;
|
||||
}
|
||||
}
|
||||
|
||||
private long randomSeekPos(Random random, long size) {
|
||||
if (random == null || size <= 3L)
|
||||
return 0L;
|
||||
return (random.nextLong()&Long.MAX_VALUE) % (size/3);
|
||||
}
|
||||
|
||||
private synchronized void open(Random random) throws IOException {
|
||||
InputStream is = getClass().getResourceAsStream(path);
|
||||
boolean needSkip = true;
|
||||
long size = 0L;
|
||||
if (is == null) {
|
||||
// if its not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
|
||||
is = new FileInputStream(path);
|
||||
}
|
||||
File file = new File(path);
|
||||
long size;
|
||||
if (file.exists()) {
|
||||
size = file.length();
|
||||
File file = new File(path);
|
||||
if (path.endsWith(".gz")) {
|
||||
// if it is a gzip file, we need to use InputStream and slowly skipTo:
|
||||
is = new FileInputStream(file);
|
||||
} else {
|
||||
// optimized seek using RandomAccessFile:
|
||||
size = file.length();
|
||||
final long seekTo = randomSeekPos(random, size);
|
||||
final FileChannel channel = new RandomAccessFile(path, "r").getChannel();
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("TEST: LineFileDocs: file seek to fp=" + seekTo + " on open");
|
||||
}
|
||||
channel.position(seekTo);
|
||||
is = Channels.newInputStream(channel);
|
||||
needSkip = false;
|
||||
}
|
||||
} else {
|
||||
// if the file comes from Classpath:
|
||||
size = is.available();
|
||||
}
|
||||
|
||||
if (path.endsWith(".gz")) {
|
||||
is = new GZIPInputStream(is);
|
||||
// guestimate:
|
||||
size *= 2.8;
|
||||
}
|
||||
|
||||
// Override sizes for currently "known" line files:
|
||||
if (path.equals("europarl.lines.txt.gz")) {
|
||||
size = 15129506L;
|
||||
} else if (path.equals("/home/hudson/lucene-data/enwiki.random.lines.txt.gz")) {
|
||||
size = 3038178822L;
|
||||
}
|
||||
|
||||
// Randomly seek to starting point:
|
||||
if (random != null && size > 3) {
|
||||
final long seekTo = (random.nextLong()&Long.MAX_VALUE) % (size/3);
|
||||
|
||||
// If we only have an InputStream, we need to seek now,
|
||||
// but this seek is a scan, so very inefficient!!!
|
||||
if (needSkip) {
|
||||
final long skipTo = randomSeekPos(random, size);
|
||||
if (LuceneTestCase.VERBOSE) {
|
||||
System.out.println("TEST: LineFileDocs: seek to fp=" + seekTo + " on open");
|
||||
System.out.println("TEST: LineFileDocs: stream skip to fp=" + skipTo + " on open");
|
||||
}
|
||||
is.skip(seekTo);
|
||||
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.IGNORE)
|
||||
.onUnmappableCharacter(CodingErrorAction.IGNORE);
|
||||
reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
|
||||
reader.readLine();
|
||||
is.skip(skipTo);
|
||||
}
|
||||
|
||||
int b;
|
||||
do {
|
||||
b = is.read();
|
||||
} while (b >= 0 && b != 13 && b != 10);
|
||||
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT)
|
||||
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
||||
reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
|
||||
// read one more line, to make sure we are not inside a Windows linebreak (\r\n):
|
||||
reader.readLine();
|
||||
}
|
||||
|
||||
public synchronized void reset(Random random) throws IOException {
|
||||
|
|
Loading…
Reference in New Issue