diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index daea07f6dd7..8679f689d8d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -748,6 +748,10 @@ Changes in backwards compatibility policy
* LUCENE-3712: Removed unused and untested ReaderUtil#subReader methods.
(Uwe Schindler)
+
+* LUCENE-3672: Deprecate Directory.fileModified and
+ IndexCommit.getTimestamp and .getVersion. (Andrzej Bialecki, Robert
+ Muir, Mike McCandless)
Security fixes
@@ -802,6 +806,9 @@ New Features
* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
markup. (Steve Rowe)
+
+* LUCENE-3725: Added optional packing to FST building; this uses extra
+ RAM during building but results in a smaller FST. (Mike McCandless)
Bug fixes
@@ -845,6 +852,12 @@ Bug fixes
TermAllGroupsCollector or TermAllGroupHeadsCollector if instantiated with a
non default small size. (Martijn van Groningen, yonik)
+* LUCENE-3727: When writing stored fields and term vectors, Lucene
+ checks file sizes to detect a bug in some Sun JREs (LUCENE-1282).
+ However, on some NFS filesystems File.length() could be stale,
+ resulting in false errors like "fdx size mismatch while indexing".
+ These checks now use getFilePointer instead to avoid this.
+ (Jamir Shaikh, Mike McCandless, Robert Muir)
Optimizations
diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index b95a2973178..3c646719681 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -62,6 +62,14 @@ New Features
* LUCENE-3602: Added query time joining under the join module. (Martijn van Groningen, Michael McCandless)
+ * LUCENE-2795: Generified DirectIOLinuxDirectory to work across any
+ unix supporting the O_DIRECT flag when opening a file (tested on
+ Linux and OS X but likely other Unixes will work), and improved it
+ so it can be used for indexing and searching. The directory uses
+ direct IO when doing large merges to avoid unnecessarily evicting
+ cached IO pages due to large merges. (Varun Thacker, Mike
+ McCandless)
+
API Changes
* LUCENE-2606: Changed RegexCapabilities interface to fix thread
@@ -192,6 +200,9 @@ Bug Fixes
* LUCENE-3697: SimpleBoundaryScanner does not work well when highlighting
at the beginning of the text. (Shay Banon via Koji Sekiguchi)
+ * LUCENE-3719: FVH: slow performance on very large queries.
+ (Igor Motov via Koji Sekiguchi)
+
Documentation
* LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly
diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
index e329136647c..c5f8d76de46 100644
--- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
+++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
@@ -17,11 +17,11 @@ package org.apache.lucene.search.vectorhighlight;
*/
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -60,7 +60,7 @@ public class FieldQuery {
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
this.fieldMatch = fieldMatch;
- List See Overview
+ * for more details.
+ *
+ * To use this you must compile
* NativePosixUtil.cpp (exposes Linux-specific APIs through
- * JNI) for your platform.
+ * JNI) for your platform, by running WARNING: this code is very new and quite easily
* could contain horrible bugs. For example, here's one
- * known issue: if you use seek in IndexOutput, and then
+ * known issue: if you use seek in This directory passes Solr and Lucene tests on Linux
+ * and OS X; other Unixes should work but have not been
+ * tested! Use at your own risk.
+ *
* @lucene.experimental
*/
-public class DirectIOLinuxDirectory extends FSDirectory {
+public class NativeUnixDirectory extends FSDirectory {
+ // TODO: this is OS dependent, but likely 512 is the LCD
private final static long ALIGN = 512;
private final static long ALIGN_NOT_MASK = ~(ALIGN-1);
+
+ /** Default buffer size before writing to disk (256 KB);
+ * larger means less IO load but more RAM and direct
+ * buffer storage space consumed during merging. */
- private final int forcedBufferSize;
+ public final static int DEFAULT_MERGE_BUFFER_SIZE = 262144;
+
+ /** Default min expected merge size before direct IO is
+ * used (10 MB): */
+ public final static long DEFAULT_MIN_BYTES_DIRECT = 10*1024*1024;
+
+ private final int mergeBufferSize;
+ private final long minBytesDirect;
+ private final Directory delegate;
/** Create a new NIOFSDirectory for the named location.
*
* @param path the path of the directory
- * @param lockFactory the lock factory to use, or null for the default
- * ({@link NativeFSLockFactory});
- * @param forcedBufferSize if this is 0, just use Lucene's
- * default buffer size; else, force this buffer size.
- * For best performance, force the buffer size to
- * something fairly large (eg 1 MB), but note that this
- * will eat up the JRE's direct buffer storage space
+ * @param mergeBufferSize Size of buffer to use for
+ * merging. See {@link #DEFAULT_MERGE_BUFFER_SIZE}.
+ * @param minBytesDirect Merges, or files to be opened for
+ * reading, smaller than this will
+ * not use direct IO. See {@link
+ * #DEFAULT_MIN_BYTES_DIRECT}
+ * @param delegate fallback Directory for non-merges
* @throws IOException
*/
- public DirectIOLinuxDirectory(File path, LockFactory lockFactory, int forcedBufferSize) throws IOException {
- super(path, lockFactory);
- this.forcedBufferSize = forcedBufferSize;
+ public NativeUnixDirectory(File path, int mergeBufferSize, long minBytesDirect, Directory delegate) throws IOException {
+ super(path, delegate.getLockFactory());
+ // ALIGN is a power of two, so masking with (ALIGN-1) yields mergeBufferSize mod ALIGN;
+ // masking with ALIGN itself would only test a single bit and mis-classify sizes.
+ if ((mergeBufferSize & (ALIGN-1)) != 0) {
+ throw new IllegalArgumentException("mergeBufferSize must be 0 mod " + ALIGN + " (got: " + mergeBufferSize + ")");
+ }
+ this.mergeBufferSize = mergeBufferSize;
+ this.minBytesDirect = minBytesDirect;
+ this.delegate = delegate;
}
+
+ /** Create a new NativeUnixDirectory for the named location.
+ *
+ * @param path the path of the directory
+ * @param delegate fallback Directory for non-merges
+ * @throws IOException
+ */
+ public NativeUnixDirectory(File path, Directory delegate) throws IOException {
+ this(path, DEFAULT_MERGE_BUFFER_SIZE, DEFAULT_MIN_BYTES_DIRECT, delegate);
+ }
@Override
public IndexInput openInput(String name, IOContext context) throws IOException {
ensureOpen();
- return new DirectIOLinuxIndexInput(new File(getDirectory(), name),
- bufferSize(context));
+ if (context.context != Context.MERGE || context.mergeInfo.estimatedMergeBytes < minBytesDirect || fileLength(name) < minBytesDirect) {
+ return delegate.openInput(name, context);
+ } else {
+ return new NativeUnixIndexInput(new File(getDirectory(), name), mergeBufferSize);
+ }
}
@Override
public IndexOutput createOutput(String name, IOContext context) throws IOException {
ensureOpen();
- ensureCanWrite(name);
- return new DirectIOLinuxIndexOutput(new File(getDirectory(), name), bufferSize(context));
- }
-
- private int bufferSize(IOContext context) {
- return forcedBufferSize != 0 ? forcedBufferSize : BufferedIndexInput
- .bufferSize(context);
+ if (context.context != Context.MERGE || context.mergeInfo.estimatedMergeBytes < minBytesDirect) {
+ return delegate.createOutput(name, context);
+ } else {
+ ensureCanWrite(name);
+ return new NativeUnixIndexOutput(new File(getDirectory(), name), mergeBufferSize);
+ }
}
- private final static class DirectIOLinuxIndexOutput extends IndexOutput {
+ private final static class NativeUnixIndexOutput extends IndexOutput {
private final ByteBuffer buffer;
private final FileOutputStream fos;
private final FileChannel channel;
@@ -101,9 +153,9 @@ public class DirectIOLinuxDirectory extends FSDirectory {
private long fileLength;
private boolean isOpen;
- public DirectIOLinuxIndexOutput(File path, int bufferSize) throws IOException {
+ public NativeUnixIndexOutput(File path, int bufferSize) throws IOException {
//this.path = path;
- FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), false);
+ final FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), false);
fos = new FileOutputStream(fd);
//fos = new FileOutputStream(path);
channel = fos.getChannel();
@@ -206,7 +258,7 @@ public class DirectIOLinuxDirectory extends FSDirectory {
@Override
public long length() throws IOException {
- return fileLength;
+ return fileLength + bufferPos;
}
@Override
@@ -233,7 +285,7 @@ public class DirectIOLinuxDirectory extends FSDirectory {
}
}
- private final static class DirectIOLinuxIndexInput extends IndexInput {
+ private final static class NativeUnixIndexInput extends IndexInput {
private final ByteBuffer buffer;
private final FileInputStream fis;
private final FileChannel channel;
@@ -244,10 +296,9 @@ public class DirectIOLinuxDirectory extends FSDirectory {
private long filePos;
private int bufferPos;
- public DirectIOLinuxIndexInput(File path, int bufferSize) throws IOException {
- // TODO make use of IOContext
- super("DirectIOLinuxIndexInput(path=\"" + path.getPath() + "\")");
- FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), true);
+ public NativeUnixIndexInput(File path, int bufferSize) throws IOException {
+ super("NativeUnixIndexInput(path=\"" + path.getPath() + "\")");
+ final FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), true);
fis = new FileInputStream(fd);
channel = fis.getChannel();
this.bufferSize = bufferSize;
@@ -260,7 +311,7 @@ public class DirectIOLinuxDirectory extends FSDirectory {
}
// for clone
- public DirectIOLinuxIndexInput(DirectIOLinuxIndexInput other) throws IOException {
+ public NativeUnixIndexInput(NativeUnixIndexInput other) throws IOException {
super(other.toString());
this.fis = null;
channel = other.channel;
@@ -296,13 +347,17 @@ public class DirectIOLinuxDirectory extends FSDirectory {
public void seek(long pos) throws IOException {
if (pos != getFilePointer()) {
final long alignedPos = pos & ALIGN_NOT_MASK;
- //System.out.println("seek pos=" + pos + " aligned=" + alignedPos + " bufferSize=" + bufferSize + " this=" + this);
filePos = alignedPos-bufferSize;
- refill();
final int delta = (int) (pos - alignedPos);
- buffer.position(delta);
- bufferPos = delta;
+ if (delta != 0) {
+ refill();
+ buffer.position(delta);
+ bufferPos = delta;
+ } else {
+ // force refill on next read
+ bufferPos = bufferSize;
+ }
}
}
@@ -371,7 +426,7 @@ public class DirectIOLinuxDirectory extends FSDirectory {
@Override
public Object clone() {
try {
- return new DirectIOLinuxIndexInput(this);
+ return new NativeUnixIndexInput(this);
} catch (IOException ioe) {
throw new RuntimeException("IOException during clone: " + this, ioe);
}
diff --git a/lucene/contrib/misc/src/java/overview.html b/lucene/contrib/misc/src/java/overview.html
index 7574699964c..a2c668d35ca 100644
--- a/lucene/contrib/misc/src/java/overview.html
+++ b/lucene/contrib/misc/src/java/overview.html
@@ -27,33 +27,29 @@
The misc package has various tools for splitting/merging indices,
changing norms, finding high freq terms, and others.
-
NOTE: This uses C++ sources (accessible via JNI), which you'll
-have to compile on your platform. Further, this is a very
-platform-specific extensions (runs only on Linux, and likely only on
-2.6.x kernels).
+have to compile on your platform.
-DirectIOLinuxDirectory is a Directory implementation that bypasses the
-OS's buffer cache for any IndexInput and IndexOutput opened through it
-(using the linux-specific O_DIRECT flag).
+{@link org.apache.lucene.store.NativeUnixDirectory} is a Directory implementation that bypasses the
+OS's buffer cache (using direct IO) for any IndexInput and IndexOutput
+used during merging of segments larger than a specified size (default
+10 MB). This avoids evicting hot pages that are still in-use for
+searching, keeping search more responsive while large merges run.
-Note that doing so typically results in bad performance loss! You
-should not use this for searching, but rather for indexing (or maybe
-just merging during indexing), to avoid evicting useful pages from the
-buffer cache.
-
-See here
+See this blog post
for details.
Steps to build:
-To use this, you'll likely want to make a custom subclass of
-FSDirectory that only opens direct IndexInput/Output for merging. One
-hackish way to do this is to check if the current thread's name starts
-with "Lucene Merge Thread". Alternatively, you could use this Dir as
-is for all indexing ops, but not for searching.
-
NativePosixUtil.cpp/java also expose access to the posix_madvise,
madvise, posix_fadvise functions, which are somewhat more cross
diff --git a/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
index 141eaf57d8f..9cbc07f6837 100644
--- a/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
+++ b/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java
@@ -398,7 +398,7 @@ public class BlockTreeTermsReader extends FieldsProducer {
final long indexStartFP;
final long rootBlockFP;
final BytesRef rootCode;
- private FST If instead this reader is a near real-time reader
- * (ie, obtained by a call to {@link
- * IndexWriter#getReader}, or by calling {@link #openIfChanged}
- * on a near real-time reader), then this method returns
- * the version of the last commit done by the writer.
- * Note that even as further changes are made with the
- * writer, the version will not changed until a commit is
- * completed. Thus, you should not rely on this method to
- * determine when a near real-time reader should be
- * opened. Use {@link #isCurrent} instead.ant
+ * build-native-unix
, and then putting the resulting
+ * libNativePosixUtil.so
(from
+ * lucene/build/native
) onto your dynamic
+ * linker search path.
*
* IndexOutput
, and then
* write more than one buffer's worth of bytes, then the
- * file will be wrong. Lucene does not do this (only writes
- * small number of bytes after seek).
-
+ * file will be wrong. Lucene does not do this today (only writes
+ * small number of bytes after seek), but that may change.
+ *
+ * DirectIOLinuxDirectory
+
+NativeUnixDirectory
-
NOTE: The algorithm is described at * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698
* - * If your outputs are ByteSequenceOutput then the final FST - * will be minimal, but if you use PositiveIntOutput then - * it's only "near minimal". For example, aa/0, aab/1, bbb/2 - * will produce 6 states when a 5 state fst is also - * possible. - * * The parameterized type T is the output type. See the * subclasses of {@link Outputs}. * @@ -52,7 +46,7 @@ public class BuilderNote that any operation that returns NO_OUTPUT must + * return the same singleton object from {@link + * #getNoOutput}.
+ * * @lucene.experimental */ @@ -56,6 +60,8 @@ public abstract class Outputs