diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index daea07f6dd7..8679f689d8d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -748,6 +748,10 @@ Changes in backwards compatibility policy * LUCENE-3712: Removed unused and untested ReaderUtil#subReader methods. (Uwe Schindler) + +* LUCENE-3672: Deprecate Directory.fileModified and + IndexCommit.getTimestamp and .getVersion. (Andrzej Bialecki, Robert + Muir, Mike McCandless) Security fixes @@ -802,6 +806,9 @@ New Features * LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML markup. (Steve Rowe) + +* LUCENE-3725: Added optional packing to FST building; this uses extra + RAM during building but results in a smaller FST. (Mike McCandless) Bug fixes @@ -845,6 +852,12 @@ Bug fixes TermAllGroupsCollector or TermAllGroupHeadsCollector if instantiated with a non default small size. (Martijn van Groningen, yonik) +* LUCENE-3727: When writing stored fields and term vectors, Lucene + checks file sizes to detect a bug in some Sun JREs (LUCENE-1282), + however, on some NFS filesystems File.length() could be stale, + resulting in false errors like "fdx size mismatch while indexing". + These checks now use getFilePointer instead to avoid this. + (Jamir Shaikh, Mike McCandless, Robert Muir) Optimizations diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index b95a2973178..3c646719681 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -62,6 +62,14 @@ New Features * LUCENE-3602: Added query time joining under the join module. (Martijn van Groningen, Michael McCandless) + * LUCENE-2795: Generified DirectIOLinuxDirectory to work across any + unix supporting the O_DIRECT flag when opening a file (tested on + Linux and OS X but likely other Unixes will work), and improved it + so it can be used for indexing and searching. The directory uses + direct IO when doing large merges to avoid unnecessarily evicting + cached IO pages due to large merges. (Varun Thacker, Mike + McCandless) + API Changes * LUCENE-2606: Changed RegexCapabilities interface to fix thread @@ -192,6 +200,9 @@ Bug Fixes * LUCENE-3697: SimpleBoundaryScanner does not work well when highlighting at the beginning of the text. (Shay Banon via Koji Sekiguchi) + * LUCENE-3719: FVH: slow performance on very large queries. + (Igor Motov via Koji Sekiguchi) + Documentation * LUCENE-3599: Javadocs for DistanceUtils.haversine() were incorrectly diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java index e329136647c..c5f8d76de46 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java @@ -17,11 +17,11 @@ package org.apache.lucene.search.vectorhighlight; */ import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -60,7 +60,7 @@ public class FieldQuery { FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException { this.fieldMatch = fieldMatch; - List flatQueries = new ArrayList(); + Set flatQueries = new LinkedHashSet(); flatten( query, reader, flatQueries ); saveTerms( flatQueries, reader ); Collection expandQueries = expand( flatQueries ); @@ -133,7 +133,7 @@ public class FieldQuery { * => expandQueries={a,"b c","c d","b c d"} */ Collection expand( Collection flatQueries ){ - List expandQueries = new ArrayList(); + Set expandQueries = new LinkedHashSet(); for( Iterator i = flatQueries.iterator(); i.hasNext(); ){ Query query = i.next(); i.remove(); diff --git a/lucene/contrib/misc/build.xml b/lucene/contrib/misc/build.xml index 7613de6acf0..7e7e39d3c70 100644 --- a/lucene/contrib/misc/build.xml +++ b/lucene/contrib/misc/build.xml @@ -40,11 +40,13 @@ + + diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/store/NativePosixUtil.cpp b/lucene/contrib/misc/src/java/org/apache/lucene/store/NativePosixUtil.cpp index fa05142f877..ae8c60aa12e 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/store/NativePosixUtil.cpp +++ b/lucene/contrib/misc/src/java/org/apache/lucene/store/NativePosixUtil.cpp @@ -15,6 +15,16 @@ * the License. */ +#ifdef LINUX + #define DIRECT_FLAG O_DIRECT | O_NOATIME + #define LINUX +#elif __APPLE__ + #define DIRECT_FLAG 0 + #define OSX +#else + #define DIRECT_FLAG O_DIRECT // __unix__ is not used as even Linux falls under it. +#endif + #include #include // posix_fadvise, constants for open #include // strerror @@ -26,6 +36,7 @@ // java -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/java:./build/classes/demo -Dlucene.version=2.9-dev -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.TestDoc +#ifdef LINUX /* * Class: org_apache_lucene_store_NativePosixUtil * Method: posix_fadvise @@ -89,7 +100,7 @@ JNIEXPORT jint JNICALL Java_org_apache_lucene_store_NativePosixUtil_posix_1fadvi return 0; } - +#endif /* * Class: org_apache_lucene_store_NativePosixUtil @@ -107,16 +118,26 @@ JNIEXPORT jobject JNICALL Java_org_apache_lucene_store_NativePosixUtil_open_1dir char *fname; class_ioex = env->FindClass("java/io/IOException"); - if (class_ioex == NULL) return NULL; + if (class_ioex == NULL) { + return NULL; + } class_fdesc = env->FindClass("java/io/FileDescriptor"); - if (class_fdesc == NULL) return NULL; + if (class_fdesc == NULL) { + return NULL; + } fname = (char *) env->GetStringUTFChars(filename, NULL); if (readOnly) { - fd = open(fname, O_RDONLY | O_DIRECT | O_NOATIME); + fd = open(fname, O_RDONLY | DIRECT_FLAG); + #ifdef OSX + fcntl(fd, F_NOCACHE, 1); + #endif } else { - fd = open(fname, O_RDWR | O_CREAT | O_DIRECT | O_NOATIME, 0666); + fd = open(fname, O_RDWR | O_CREAT | DIRECT_FLAG, 0666); + #ifdef OSX + fcntl(fd, F_NOCACHE, 1); + #endif } //printf("open %s -> %d; ro %d\n", fname, fd, readOnly); fflush(stdout); @@ -131,19 +152,22 @@ JNIEXPORT jobject JNICALL Java_org_apache_lucene_store_NativePosixUtil_open_1dir // construct a new FileDescriptor const_fdesc = env->GetMethodID(class_fdesc, "", "()V"); - if (const_fdesc == NULL) return NULL; + if (const_fdesc == NULL) { + return NULL; + } ret = env->NewObject(class_fdesc, const_fdesc); // poke the "fd" field with the file descriptor field_fd = env->GetFieldID(class_fdesc, "fd", "I"); - if (field_fd == NULL) return NULL; + if (field_fd == NULL) { + return NULL; + } env->SetIntField(ret, field_fd, fd); // and return it return ret; } - /* * Class: org_apache_lucene_store_NativePosixUtil * Method: pread diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/store/DirectIOLinuxDirectory.java b/lucene/contrib/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java similarity index 68% rename from lucene/contrib/misc/src/java/org/apache/lucene/store/DirectIOLinuxDirectory.java rename to lucene/contrib/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java index fdaf3d7641e..926a87a0394 100644 --- a/lucene/contrib/misc/src/java/org/apache/lucene/store/DirectIOLinuxDirectory.java +++ b/lucene/contrib/misc/src/java/org/apache/lucene/store/NativeUnixDirectory.java @@ -27,68 +27,120 @@ import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import org.apache.lucene.store.Directory; // javadoc -import org.apache.lucene.store.NativeFSLockFactory; // javadoc +import org.apache.lucene.store.IOContext.Context; + +// TODO +// - newer Linux kernel versions (after 2.6.29) have +// improved MADV_SEQUENTIAL (and hopefully also +// FADV_SEQUENTIAL) interaction with the buffer +// cache; we should explore using that instead of direct +// IO when context is merge /** - * An {@link Directory} implementation that uses the - * Linux-specific O_DIRECT flag to bypass all OS level - * caching. To use this you must compile + * A {@link Directory} implementation for all Unixes that uses + * DIRECT I/O to bypass OS level IO caching during + * merging. For all other cases (searching, writing) we delegate + * to the provided Directory instance. + * + *

See Overview + * for more details. + * + *

To use this you must compile * NativePosixUtil.cpp (exposes Linux-specific APIs through - * JNI) for your platform. + * JNI) for your platform, by running ant + * build-native-unix, and then putting the resulting + * libNativePosixUtil.so (from + * lucene/build/native) onto your dynamic + * linker search path. * *

WARNING: this code is very new and quite easily * could contain horrible bugs. For example, here's one - * known issue: if you use seek in IndexOutput, and then + * known issue: if you use seek in IndexOutput, and then * write more than one buffer's worth of bytes, then the - * file will be wrong. Lucene does not do this (only writes - * small number of bytes after seek). - + * file will be wrong. Lucene does not do this today (only writes + * small number of bytes after seek), but that may change. + * + *

This directory passes Solr and Lucene tests on Linux + * and OS X; other Unixes should work but have not been + * tested! Use at your own risk. + * * @lucene.experimental */ -public class DirectIOLinuxDirectory extends FSDirectory { +public class NativeUnixDirectory extends FSDirectory { + // TODO: this is OS dependent, but likely 512 is the LCD private final static long ALIGN = 512; private final static long ALIGN_NOT_MASK = ~(ALIGN-1); + + /** Default buffer size before writing to disk (256 MB); + * larger means less IO load but more RAM and direct + * buffer storage space consumed during merging. */ - private final int forcedBufferSize; + public final static int DEFAULT_MERGE_BUFFER_SIZE = 262144; + + /** Default min expected merge size before direct IO is + * used (10 MB): */ + public final static long DEFAULT_MIN_BYTES_DIRECT = 10*1024*1024; + + private final int mergeBufferSize; + private final long minBytesDirect; + private final Directory delegate; /** Create a new NIOFSDirectory for the named location. * * @param path the path of the directory - * @param lockFactory the lock factory to use, or null for the default - * ({@link NativeFSLockFactory}); - * @param forcedBufferSize if this is 0, just use Lucene's - * default buffer size; else, force this buffer size. - * For best performance, force the buffer size to - * something fairly large (eg 1 MB), but note that this - * will eat up the JRE's direct buffer storage space + * @param mergeBufferSize Size of buffer to use for + * merging. See {@link #DEFAULT_MERGE_BUFFER_SIZE}. + * @param minBytesDirect Merges, or files to be opened for + * reading, smaller than this will + * not use direct IO. See {@link + * #DEFAULT_MIN_BYTES_DIRECT} + * @param delegate fallback Directory for non-merges * @throws IOException */ - public DirectIOLinuxDirectory(File path, LockFactory lockFactory, int forcedBufferSize) throws IOException { - super(path, lockFactory); - this.forcedBufferSize = forcedBufferSize; + public NativeUnixDirectory(File path, int mergeBufferSize, long minBytesDirect, Directory delegate) throws IOException { + super(path, delegate.getLockFactory()); + if ((mergeBufferSize & ALIGN) != 0) { + throw new IllegalArgumentException("mergeBufferSize must be 0 mod " + ALIGN + " (got: " + mergeBufferSize + ")"); + } + this.mergeBufferSize = mergeBufferSize; + this.minBytesDirect = minBytesDirect; + this.delegate = delegate; } + + /** Create a new NIOFSDirectory for the named location. + * + * @param path the path of the directory + * @param delegate fallback Directory for non-merges + * @throws IOException + */ + public NativeUnixDirectory(File path, Directory delegate) throws IOException { + this(path, DEFAULT_MERGE_BUFFER_SIZE, DEFAULT_MIN_BYTES_DIRECT, delegate); + } @Override public IndexInput openInput(String name, IOContext context) throws IOException { ensureOpen(); - return new DirectIOLinuxIndexInput(new File(getDirectory(), name), - bufferSize(context)); + if (context.context != Context.MERGE || context.mergeInfo.estimatedMergeBytes < minBytesDirect || fileLength(name) < minBytesDirect) { + return delegate.openInput(name, context); + } else { + return new NativeUnixIndexInput(new File(getDirectory(), name), mergeBufferSize); + } } @Override public IndexOutput createOutput(String name, IOContext context) throws IOException { ensureOpen(); - ensureCanWrite(name); - return new DirectIOLinuxIndexOutput(new File(getDirectory(), name), bufferSize(context)); - } - - private int bufferSize(IOContext context) { - return forcedBufferSize != 0 ? forcedBufferSize : BufferedIndexInput - .bufferSize(context); + if (context.context != Context.MERGE || context.mergeInfo.estimatedMergeBytes < minBytesDirect) { + return delegate.createOutput(name, context); + } else { + ensureCanWrite(name); + return new NativeUnixIndexOutput(new File(getDirectory(), name), mergeBufferSize); + } } - private final static class DirectIOLinuxIndexOutput extends IndexOutput { + private final static class NativeUnixIndexOutput extends IndexOutput { private final ByteBuffer buffer; private final FileOutputStream fos; private final FileChannel channel; @@ -101,9 +153,9 @@ public class DirectIOLinuxDirectory extends FSDirectory { private long fileLength; private boolean isOpen; - public DirectIOLinuxIndexOutput(File path, int bufferSize) throws IOException { + public NativeUnixIndexOutput(File path, int bufferSize) throws IOException { //this.path = path; - FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), false); + final FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), false); fos = new FileOutputStream(fd); //fos = new FileOutputStream(path); channel = fos.getChannel(); @@ -206,7 +258,7 @@ public class DirectIOLinuxDirectory extends FSDirectory { @Override public long length() throws IOException { - return fileLength; + return fileLength + bufferPos; } @Override @@ -233,7 +285,7 @@ public class DirectIOLinuxDirectory extends FSDirectory { } } - private final static class DirectIOLinuxIndexInput extends IndexInput { + private final static class NativeUnixIndexInput extends IndexInput { private final ByteBuffer buffer; private final FileInputStream fis; private final FileChannel channel; @@ -244,10 +296,9 @@ public class DirectIOLinuxDirectory extends FSDirectory { private long filePos; private int bufferPos; - public DirectIOLinuxIndexInput(File path, int bufferSize) throws IOException { - // TODO make use of IOContext - super("DirectIOLinuxIndexInput(path=\"" + path.getPath() + "\")"); - FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), true); + public NativeUnixIndexInput(File path, int bufferSize) throws IOException { + super("NativeUnixIndexInput(path=\"" + path.getPath() + "\")"); + final FileDescriptor fd = NativePosixUtil.open_direct(path.toString(), true); fis = new FileInputStream(fd); channel = fis.getChannel(); this.bufferSize = bufferSize; @@ -260,7 +311,7 @@ public class DirectIOLinuxDirectory extends FSDirectory { } // for clone - public DirectIOLinuxIndexInput(DirectIOLinuxIndexInput other) throws IOException { + public NativeUnixIndexInput(NativeUnixIndexInput other) throws IOException { super(other.toString()); this.fis = null; channel = other.channel; @@ -296,13 +347,17 @@ public class DirectIOLinuxDirectory extends FSDirectory { public void seek(long pos) throws IOException { if (pos != getFilePointer()) { final long alignedPos = pos & ALIGN_NOT_MASK; - //System.out.println("seek pos=" + pos + " aligned=" + alignedPos + " bufferSize=" + bufferSize + " this=" + this); filePos = alignedPos-bufferSize; - refill(); final int delta = (int) (pos - alignedPos); - buffer.position(delta); - bufferPos = delta; + if (delta != 0) { + refill(); + buffer.position(delta); + bufferPos = delta; + } else { + // force refill on next read + bufferPos = bufferSize; + } } } @@ -371,7 +426,7 @@ public class DirectIOLinuxDirectory extends FSDirectory { @Override public Object clone() { try { - return new DirectIOLinuxIndexInput(this); + return new NativeUnixIndexInput(this); } catch (IOException ioe) { throw new RuntimeException("IOException during clone: " + this, ioe); } diff --git a/lucene/contrib/misc/src/java/overview.html b/lucene/contrib/misc/src/java/overview.html index 7574699964c..a2c668d35ca 100644 --- a/lucene/contrib/misc/src/java/overview.html +++ b/lucene/contrib/misc/src/java/overview.html @@ -27,33 +27,29 @@ The misc package has various tools for splitting/merging indices, changing norms, finding high freq terms, and others. -

DirectIOLinuxDirectory

+ +

NativeUnixDirectory

NOTE: This uses C++ sources (accessible via JNI), which you'll -have to compile on your platform. Further, this is a very -platform-specific extensions (runs only on Linux, and likely only on -2.6.x kernels). +have to compile on your platform.

-DirectIOLinuxDirectory is a Directory implementation that bypasses the -OS's buffer cache for any IndexInput and IndexOutput opened through it -(using the linux-specific O_DIRECT flag). +{@link org.apache.lucene.store.NativeUnixDirectory} is a Directory implementation that bypasses the +OS's buffer cache (using direct IO) for any IndexInput and IndexOutput +used during merging of segments larger than a specified size (default +10 MB). This avoids evicting hot pages that are still in-use for +searching, keeping search more responsive while large merges run.

-Note that doing so typically results in bad performance loss! You -should not use this for searching, but rather for indexing (or maybe -just merging during indexing), to avoid evicting useful pages from the -buffer cache. - -See here +See this blog post for details. Steps to build:

  • cd lucene/contrib/misc/ -
  • To compile NativePosixUtil.cpp -> libNativePosixUtil.so on Linux run ant build-native-unix. +
  • To compile NativePosixUtil.cpp -> libNativePosixUtil.so, run ant build-native-unix.
  • libNativePosixUtil.so will be located in the lucene/build/native/ folder @@ -62,13 +58,6 @@ Steps to build:
  • ant jar to compile the java source and put that JAR on your CLASSPATH
-

-To use this, you'll likely want to make a custom subclass of -FSDirectory that only opens direct IndexInput/Output for merging. One -hackish way to do this is to check if the current thread's name starts -with "Lucene Merge Thread". Alternatively, you could use this Dir as -is for all indexing ops, but not for searching. -

NativePosixUtil.cpp/java also expose access to the posix_madvise, madvise, posix_fadvise functions, which are somewhat more cross diff --git a/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java index 141eaf57d8f..9cbc07f6837 100644 --- a/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java +++ b/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java @@ -398,7 +398,7 @@ public class BlockTreeTermsReader extends FieldsProducer { final long indexStartFP; final long rootBlockFP; final BytesRef rootCode; - private FST index; + private final FST index; //private boolean DEBUG; @@ -433,6 +433,8 @@ public class BlockTreeTermsReader extends FieldsProducer { w.close(); } */ + } else { + index = null; } } @@ -495,6 +497,8 @@ public class BlockTreeTermsReader extends FieldsProducer { private final BytesRef term = new BytesRef(); + private final FST.BytesReader fstReader; + // TODO: can we share this with the frame in STE? private final class Frame { final int ord; @@ -755,6 +759,12 @@ public class BlockTreeTermsReader extends FieldsProducer { arcs[arcIdx] = new FST.Arc(); } + if (index == null) { + fstReader = null; + } else { + fstReader = index.getBytesReader(0); + } + // TODO: if the automaton is "smallish" we really // should use the terms index to seek at least to // the initial term and likely to subsequent terms @@ -842,7 +852,7 @@ public class BlockTreeTermsReader extends FieldsProducer { // TODO: we could be more efficient for the next() // case by using current arc as starting point, // passed to findTargetArc - arc = index.findTargetArc(target, arc, getArc(1+idx)); + arc = index.findTargetArc(target, arc, getArc(1+idx), fstReader); assert arc != null; output = fstOutputs.add(output, arc.output); idx++; @@ -1186,6 +1196,7 @@ public class BlockTreeTermsReader extends FieldsProducer { private boolean eof; final BytesRef term = new BytesRef(); + private final FST.BytesReader fstReader; @SuppressWarnings("unchecked") private FST.Arc[] arcs = new FST.Arc[1]; @@ -1196,6 +1207,12 @@ public class BlockTreeTermsReader extends FieldsProducer { // Used to hold seek by TermState, or cached seek staticFrame = new Frame(-1); + if (index == null) { + fstReader = null; + } else { + fstReader = index.getBytesReader(0); + } + // Init w/ root block; don't use index since it may // not (and need not) have been loaded for(int arcIdx=0;arcIdx nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); if (nextArc == null) { @@ -1838,7 +1855,7 @@ public class BlockTreeTermsReader extends FieldsProducer { final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; - final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto)); + final FST.Arc nextArc = index.findTargetArc(targetLabel, arc, getArc(1+targetUpto), fstReader); if (nextArc == null) { diff --git a/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java index 0b9870df057..b0f4624b137 100644 --- a/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java @@ -288,7 +288,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder indexBuilder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, - outputs, null); + outputs, null, false); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} @@ -831,7 +831,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { 0, 0, true, true, Integer.MAX_VALUE, noOutputs, - new FindBlocks()); + new FindBlocks(), false); postingsWriter.setField(fieldInfo); } diff --git a/lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java b/lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java index 5ad16a02967..efa27ce6d1b 100644 --- a/lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java +++ b/lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java @@ -229,7 +229,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { ////System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in - fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer)); + fstBuilder.add(new IntsRef(), termsFilePointer); startTermsFilePointer = termsFilePointer; } @@ -260,7 +260,7 @@ public class VariableGapTermsIndexWriter extends TermsIndexWriterBase { final int lengthSave = text.length; text.length = indexedTermPrefixLength(lastTerm, text); try { - fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer)); + fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); } finally { text.length = lengthSave; } diff --git a/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java index dabf65f0383..af4a826b531 100644 --- a/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java @@ -521,9 +521,10 @@ class SimpleTextFieldsReader extends FieldsProducer { private void loadTerms() throws IOException { PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false); final Builder>> b; - b = new Builder>>(FST.INPUT_TYPE.BYTE1, - new PairOutputs>(posIntOutputs, - new PairOutputs(posIntOutputs, posIntOutputs))); + final PairOutputs outputsInner = new PairOutputs(posIntOutputs, posIntOutputs); + final PairOutputs> outputs = new PairOutputs>(posIntOutputs, + outputsInner); + b = new Builder>>(FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRef lastTerm = new BytesRef(10); @@ -536,9 +537,9 @@ class SimpleTextFieldsReader extends FieldsProducer { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { - b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair>(lastDocsStart, - new PairOutputs.Pair((long) docFreq, - posIntOutputs.get(totalTermFreq)))); + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), + outputs.newPair(lastDocsStart, + outputsInner.newPair((long) docFreq, totalTermFreq))); sumTotalTermFreq += totalTermFreq; } break; @@ -553,9 +554,8 @@ class SimpleTextFieldsReader extends FieldsProducer { totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { - b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair>(lastDocsStart, - new PairOutputs.Pair((long) docFreq, - posIntOutputs.get(totalTermFreq)))); + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), outputs.newPair(lastDocsStart, + outputsInner.newPair((long) docFreq, totalTermFreq))); } lastDocsStart = in.getFilePointer(); final int len = scratch.length - TERM.length; diff --git a/lucene/src/java/org/apache/lucene/index/DirectoryReader.java b/lucene/src/java/org/apache/lucene/index/DirectoryReader.java index a39e9039dd0..3a733532a0f 100644 --- a/lucene/src/java/org/apache/lucene/index/DirectoryReader.java +++ b/lucene/src/java/org/apache/lucene/index/DirectoryReader.java @@ -425,7 +425,6 @@ final class DirectoryReader extends BaseMultiReader { Collection files; Directory dir; long generation; - long version; final Map userData; private final int segmentCount; @@ -434,7 +433,6 @@ final class DirectoryReader extends BaseMultiReader { this.dir = dir; userData = infos.getUserData(); files = Collections.unmodifiableCollection(infos.files(dir, true)); - version = infos.getVersion(); generation = infos.getGeneration(); segmentCount = infos.size(); } @@ -464,11 +462,6 @@ final class DirectoryReader extends BaseMultiReader { return dir; } - @Override - public long getVersion() { - return version; - } - @Override public long getGeneration() { return generation; diff --git a/lucene/src/java/org/apache/lucene/index/IndexCommit.java b/lucene/src/java/org/apache/lucene/index/IndexCommit.java index 79de850390b..752abb24cdc 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexCommit.java +++ b/lucene/src/java/org/apache/lucene/index/IndexCommit.java @@ -83,39 +83,31 @@ public abstract class IndexCommit implements Comparable { public boolean equals(Object other) { if (other instanceof IndexCommit) { IndexCommit otherCommit = (IndexCommit) other; - return otherCommit.getDirectory().equals(getDirectory()) && otherCommit.getVersion() == getVersion(); - } else + return otherCommit.getDirectory().equals(getDirectory()) && otherCommit.getGeneration() == getGeneration(); + } else { return false; + } } @Override public int hashCode() { - return (int) (getDirectory().hashCode() + getVersion()); + return getDirectory().hashCode() + Long.valueOf(getGeneration()).hashCode(); } - /** Returns the version for this IndexCommit. This is the - * same value that {@link IndexReader#getVersion} would - * return if it were opened on this commit. */ - public abstract long getVersion(); - /** Returns the generation (the _N in segments_N) for this * IndexCommit */ public abstract long getGeneration(); - /** Convenience method that returns the last modified time - * of the segments_N file corresponding to this index - * commit, equivalent to - * getDirectory().fileModified(getSegmentsFileName()). */ - public long getTimestamp() throws IOException { - return getDirectory().fileModified(getSegmentsFileName()); - } - /** Returns userData, previously passed to {@link * IndexWriter#commit(Map)} for this commit. Map is * String -> String. */ public abstract Map getUserData() throws IOException; public int compareTo(IndexCommit commit) { + if (getDirectory() != commit.getDirectory()) { + throw new UnsupportedOperationException("cannot compare IndexCommits from different Directory instances"); + } + long gen = getGeneration(); long comgen = commit.getGeneration(); if (gen < comgen) { @@ -126,5 +118,4 @@ public abstract class IndexCommit implements Comparable { return 0; } } - } diff --git a/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java b/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java index 05069de2d8f..ef8c2ae1678 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java +++ b/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java @@ -655,7 +655,6 @@ final class IndexFileDeleter { boolean deleted; Directory directory; Collection commitsToDelete; - long version; long generation; final Map userData; private final int segmentCount; @@ -665,7 +664,6 @@ final class IndexFileDeleter { this.commitsToDelete = commitsToDelete; userData = segmentInfos.getUserData(); segmentsFileName = segmentInfos.getCurrentSegmentFileName(); - version = segmentInfos.getVersion(); generation = segmentInfos.getGeneration(); files = Collections.unmodifiableCollection(segmentInfos.files(directory, true)); segmentCount = segmentInfos.size(); @@ -696,11 +694,6 @@ final class IndexFileDeleter { return directory; } - @Override - public long getVersion() { - return version; - } - @Override public long getGeneration() { return generation; diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index ae4037d22b2..5e83092cd9f 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -468,36 +468,6 @@ public abstract class IndexReader implements Closeable { throw new UnsupportedOperationException("This reader does not support this method."); } - /** - * Returns the time the index in the named directory was last modified. - * Do not use this to check whether the reader is still up-to-date, use - * {@link #isCurrent()} instead. - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - public static long lastModified(final Directory directory2) throws CorruptIndexException, IOException { - return ((Long) new SegmentInfos.FindSegmentsFile(directory2) { - @Override - public Object doBody(String segmentFileName) throws IOException { - return Long.valueOf(directory2.fileModified(segmentFileName)); - } - }.run()).longValue(); - } - - /** - * Reads version number from segments files. The version number is - * initialized with a timestamp and then increased by one for each change of - * the index. - * - * @param directory where the index resides. - * @return version number. - * @throws CorruptIndexException if the index is corrupt - * @throws IOException if there is a low-level IO error - */ - public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException { - return SegmentInfos.readCurrentVersion(directory); - } - /** * Reads commitUserData, previously passed to {@link * IndexWriter#commit(Map)}, from current index @@ -525,18 +495,7 @@ public abstract class IndexReader implements Closeable { * a reader based on a Directory), then this method * returns the version recorded in the commit that the * reader opened. This version is advanced every time - * {@link IndexWriter#commit} is called.

- * - *

If instead this reader is a near real-time reader - * (ie, obtained by a call to {@link - * IndexWriter#getReader}, or by calling {@link #openIfChanged} - * on a near real-time reader), then this method returns - * the version of the last commit done by the writer. - * Note that even as further changes are made with the - * writer, the version will not changed until a commit is - * completed. Thus, you should not rely on this method to - * determine when a near real-time reader should be - * opened. Use {@link #isCurrent} instead.

+ * a change is made with {@link IndexWriter}.

* * @throws UnsupportedOperationException unless overridden in subclass */ diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java index 4855f5d6e68..9b7b70c12b4 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java @@ -94,16 +94,15 @@ public final class SegmentInfos implements Cloneable, Iterable { * Whenever you add a new format, make it 1 smaller (negative version logic)! */ public static final int FORMAT_SEGMENTS_GEN_CURRENT = -2; - public int counter = 0; // used to name new segments + public int counter; // used to name new segments /** - * counts how often the index has been changed by adding or deleting docs. - * starting with the current time in milliseconds forces to create unique version numbers. + * counts how often the index has been changed */ - public long version = System.currentTimeMillis(); + public long version; - private long generation = 0; // generation of the "segments_N" for the next commit - private long lastGeneration = 0; // generation of the "segments_N" file we last successfully read + private long generation; // generation of the "segments_N" for the next commit + private long lastGeneration; // generation of the "segments_N" file we last successfully read // or wrote; this is normally the same as generation except if // there was an IOException that had interrupted a commit diff --git a/lucene/src/java/org/apache/lucene/index/SnapshotDeletionPolicy.java b/lucene/src/java/org/apache/lucene/index/SnapshotDeletionPolicy.java index 23137ba3bc3..1bf50cc90b6 100644 --- a/lucene/src/java/org/apache/lucene/index/SnapshotDeletionPolicy.java +++ b/lucene/src/java/org/apache/lucene/index/SnapshotDeletionPolicy.java @@ -125,11 +125,6 @@ public class SnapshotDeletionPolicy implements IndexDeletionPolicy { return cp.getUserData(); } - @Override - public long getVersion() { - return cp.getVersion(); - } - @Override public boolean isDeleted() { return cp.isDeleted(); diff --git a/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java b/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java index c0304b58bb6..260c2fa38bc 100644 --- a/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/CompoundFileDirectory.java @@ -249,14 +249,6 @@ public final class CompoundFileDirectory extends Directory { return entries.containsKey(IndexFileNames.stripSegmentName(name)); } - - /** Returns the time the compound file was last modified. */ - @Override - public long fileModified(String name) throws IOException { - ensureOpen(); - return directory.fileModified(fileName); - } - /** Not implemented * @throws UnsupportedOperationException */ @Override diff --git a/lucene/src/java/org/apache/lucene/store/Directory.java b/lucene/src/java/org/apache/lucene/store/Directory.java index 8dd98264709..350b0044335 100644 --- a/lucene/src/java/org/apache/lucene/store/Directory.java +++ b/lucene/src/java/org/apache/lucene/store/Directory.java @@ -62,10 +62,6 @@ public abstract class Directory implements Closeable { public abstract boolean fileExists(String name) throws IOException; - /** Returns the time the named file was last modified. */ - public abstract long fileModified(String name) - throws IOException; - /** Removes an existing file in the directory. */ public abstract void deleteFile(String name) throws IOException; diff --git a/lucene/src/java/org/apache/lucene/store/FSDirectory.java b/lucene/src/java/org/apache/lucene/store/FSDirectory.java index 5bd5e7e2291..c9944263a54 100644 --- a/lucene/src/java/org/apache/lucene/store/FSDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/FSDirectory.java @@ -250,14 +250,6 @@ public abstract class FSDirectory extends Directory { return file.exists(); } - /** Returns the time the named file was last modified. */ - @Override - public long fileModified(String name) { - ensureOpen(); - File file = new File(directory, name); - return file.lastModified(); - } - /** Returns the time the named file was last modified. */ public static long fileModified(File directory, String name) { File file = new File(directory, name); diff --git a/lucene/src/java/org/apache/lucene/store/FileSwitchDirectory.java b/lucene/src/java/org/apache/lucene/store/FileSwitchDirectory.java index 2abf48c28af..201d0ce2ff9 100644 --- a/lucene/src/java/org/apache/lucene/store/FileSwitchDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/FileSwitchDirectory.java @@ -137,11 +137,6 @@ public class FileSwitchDirectory extends Directory { return getDirectory(name).fileExists(name); } - @Override - public long fileModified(String name) throws IOException { - return getDirectory(name).fileModified(name); - } - @Override public void deleteFile(String name) throws IOException { getDirectory(name).deleteFile(name); diff --git a/lucene/src/java/org/apache/lucene/store/NRTCachingDirectory.java b/lucene/src/java/org/apache/lucene/store/NRTCachingDirectory.java index cfc1e0b1d15..337c0f84fdd 100644 --- a/lucene/src/java/org/apache/lucene/store/NRTCachingDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/NRTCachingDirectory.java @@ -152,15 +152,6 @@ public class NRTCachingDirectory extends Directory { return cache.fileExists(name) || delegate.fileExists(name); } - @Override - public synchronized long fileModified(String name) throws IOException { - if (cache.fileExists(name)) { - return cache.fileModified(name); - } else { - return delegate.fileModified(name); - } - } - @Override public synchronized void deleteFile(String name) throws IOException { if (VERBOSE) { diff --git a/lucene/src/java/org/apache/lucene/store/RAMDirectory.java b/lucene/src/java/org/apache/lucene/store/RAMDirectory.java index 81b815ea763..2c9ddc5b5ec 100644 --- a/lucene/src/java/org/apache/lucene/store/RAMDirectory.java +++ b/lucene/src/java/org/apache/lucene/store/RAMDirectory.java @@ -98,19 +98,6 @@ public class RAMDirectory extends Directory { return fileMap.containsKey(name); } - /** Returns the time the named file was last modified. - * @throws IOException if the file does not exist - */ - @Override - public final long fileModified(String name) throws IOException { - ensureOpen(); - RAMFile file = fileMap.get(name); - if (file == null) { - throw new FileNotFoundException(name); - } - return file.getLastModified(); - } - /** Returns the length in bytes of a file in the directory. * @throws IOException if the file does not exist */ diff --git a/lucene/src/java/org/apache/lucene/store/RAMFile.java b/lucene/src/java/org/apache/lucene/store/RAMFile.java index bd946cd7843..88f6c83d64f 100644 --- a/lucene/src/java/org/apache/lucene/store/RAMFile.java +++ b/lucene/src/java/org/apache/lucene/store/RAMFile.java @@ -26,8 +26,6 @@ public class RAMFile { RAMDirectory directory; protected long sizeInBytes; - private long lastModified = System.currentTimeMillis(); - // File used as buffer, in no RAMDirectory public RAMFile() {} @@ -44,15 +42,6 @@ public class RAMFile { this.length = length; } - // For non-stream access from thread that might be concurrent with writing - public synchronized long getLastModified() { - return lastModified; - } - - protected synchronized void setLastModified(long lastModified) { - this.lastModified = lastModified; - } - protected final byte[] addBuffer(int size) { byte[] buffer = newBuffer(size); synchronized(this) { diff --git a/lucene/src/java/org/apache/lucene/store/RAMOutputStream.java b/lucene/src/java/org/apache/lucene/store/RAMOutputStream.java index 62a3848c408..66a44b61f9d 100644 --- a/lucene/src/java/org/apache/lucene/store/RAMOutputStream.java +++ b/lucene/src/java/org/apache/lucene/store/RAMOutputStream.java @@ -167,7 +167,6 @@ public class RAMOutputStream extends IndexOutput { @Override public void flush() throws IOException { - file.setLastModified(System.currentTimeMillis()); setFileLength(); } diff --git a/lucene/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/src/java/org/apache/lucene/util/FixedBitSet.java index ca23078e775..103da990320 100644 --- a/lucene/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/src/java/org/apache/lucene/util/FixedBitSet.java @@ -95,7 +95,7 @@ public final class FixedBitSet extends DocIdSet implements Bits { } public boolean get(int index) { - assert index >= 0 && index < numBits; + assert index >= 0 && index < numBits: "index=" + index; int i = index >> 6; // div 64 // signed shift will keep a negative index and force an // array-index-out-of-bounds-exception, removing the need for an explicit check. diff --git a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java index 3abc40884e4..e81a6077a0e 100644 --- a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -588,7 +588,7 @@ public final class UnicodeUtil { out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f)); offset += 2; } else { - assert b < 0xf8; + assert b < 0xf8: "b=" + b; int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f); offset += 3; if (ch < UNI_MAX_BMP) { diff --git a/lucene/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/fst/Builder.java index 34e9f5baa66..fbda31bb99f 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Builder.java @@ -17,15 +17,15 @@ package org.apache.lucene.util.fst; * limitations under the License. */ -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc - import java.io.IOException; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc + /** - * Builds a compact FST (maps an IntsRef term to an arbitrary + * Builds a minimal FST (maps an IntsRef term to an arbitrary * output) from pre-sorted terms with outputs (the FST * becomes an FSA if you use NoOutputs). The FST is written * on-the-fly into a compact serialized format byte array, which can @@ -35,12 +35,6 @@ import java.io.IOException; *

NOTE: The algorithm is described at * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

* - * If your outputs are ByteSequenceOutput then the final FST - * will be minimal, but if you use PositiveIntOutput then - * it's only "near minimal". For example, aa/0, aab/1, bbb/2 - * will produce 6 states when a 5 state fst is also - * possible. - * * The parameterized type T is the output type. See the * subclasses of {@link Outputs}. * @@ -52,7 +46,7 @@ public class Builder { private final FST fst; private final T NO_OUTPUT; - // private static final boolean DEBUG = false; + // private static final boolean DEBUG = true; // simplistic pruning: we prune node (and all following // nodes) if less than this number of terms go through it: @@ -84,11 +78,12 @@ public class Builder { /** * Instantiates an FST/FSA builder without any pruning. A shortcut - * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, FreezeTail)} with + * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean, + * boolean, int, Outputs, FreezeTail, boolean)} with * pruning options turned off. */ public Builder(FST.INPUT_TYPE inputType, Outputs outputs) { - this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); + this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, false); } /** @@ -127,16 +122,21 @@ public class Builder { * @param outputs The output type for each input sequence. Applies only if building an FST. For * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * singleton output object. + * + * @param willPackFST Pass true if you will pack the FST before saving. This + * causes the FST to create additional data structures internally to facilitate packing, but + * it means the resulting FST cannot be saved: it must + * first be packed using {@link FST#pack(int, int)}}. */ public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs outputs, - FreezeTail freezeTail) { + FreezeTail freezeTail, boolean willPackFST) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.freezeTail = freezeTail; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; - fst = new FST(inputType, outputs); + fst = new FST(inputType, outputs, willPackFST); if (doShareSuffix) { dedupHash = new NodeHash(fst); } else { @@ -170,23 +170,23 @@ public class Builder { fst.setAllowArrayArcs(b); } - private CompiledNode compileNode(UnCompiledNode n, int tailLength) throws IOException { - final int address; - if (dedupHash != null && (doShareNonSingletonNodes || n.numArcs <= 1) && tailLength <= shareMaxTailLength) { - if (n.numArcs == 0) { - address = fst.addNode(n); + private CompiledNode compileNode(UnCompiledNode nodeIn, int tailLength) throws IOException { + final int node; + if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { + if (nodeIn.numArcs == 0) { + node = fst.addNode(nodeIn); } else { - address = dedupHash.add(n); + node = dedupHash.add(nodeIn); } } else { - address = fst.addNode(n); + node = fst.addNode(nodeIn); } - assert address != -2; + assert node != -2; - n.clear(); + nodeIn.clear(); final CompiledNode fn = new CompiledNode(); - fn.address = address; + fn.node = node; return fn; } @@ -319,6 +319,11 @@ public class Builder { } */ + // De-dup NO_OUTPUT since it must be a singleton: + if (output.equals(NO_OUTPUT)) { + output = NO_OUTPUT; + } + assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input; assert validOutput(output); @@ -443,7 +448,7 @@ public class Builder { } } //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); - fst.finish(compileNode(root, lastInput.length).address); + fst.finish(compileNode(root, lastInput.length).node); return fst; } @@ -480,7 +485,7 @@ public class Builder { } static final class CompiledNode implements Node { - int address; + int node; public boolean isCompiled() { return true; } @@ -560,7 +565,7 @@ public class Builder { final Arc arc = arcs[numArcs-1]; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; arc.target = target; - //assert target.address != -2; + //assert target.node != -2; arc.nextFinalOutput = nextFinalOutput; arc.isFinal = isFinal; } diff --git a/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene/src/java/org/apache/lucene/util/fst/FST.java index 109eac83756..4e5dcb6d7b3 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FST.java @@ -25,6 +25,8 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -33,12 +35,23 @@ import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.fst.Builder.UnCompiledNode; +// TODO: break this into WritableFST and ReadOnlyFST.. then +// we can have subclasses of ReadOnlyFST to handle the +// different byte[] level encodings (packed or +// not)... and things like nodeCount, arcCount are read only + // TODO: if FST is pure prefix trie we can do a more compact // job, ie, once we are at a 'suffix only', just store the // completion labels as a string not as a series of arcs. +// TODO: maybe make an explicit thread state that holds +// reusable stuff eg BytesReader, a scratch arc + // NOTE: while the FST is able to represent a non-final // dead-end state (NON_FINAL_END_NODE=0), the layers above // (FSTEnum, Util) have problems with this!! @@ -52,13 +65,15 @@ import org.apache.lucene.util.fst.Builder.UnCompiledNode; * * @lucene.experimental */ -public class FST { +public final class FST { public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; public final INPUT_TYPE inputType; private final static int BIT_FINAL_ARC = 1 << 0; private final static int BIT_LAST_ARC = 1 << 1; - private final static int BIT_TARGET_NEXT = 1 << 2; + final static int BIT_TARGET_NEXT = 1 << 2; + + // TODO: we can free up a bit if we can nuke this: private final static int BIT_STOP_NODE = 1 << 3; private final static int BIT_ARC_HAS_OUTPUT = 1 << 4; private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; @@ -66,7 +81,12 @@ public class FST { // Arcs are stored as fixed-size (per entry) array, so // that we can find an arc using binary search. We do // this when number of arcs is > NUM_ARCS_ARRAY: - private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; + + // If set, the target node is delta coded vs current + // position: + private final static int BIT_TARGET_DELTA = 1 << 6; + + private final static byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT; /** * @see #shouldExpand(UnCompiledNode) @@ -95,7 +115,10 @@ public class FST { /** Write BYTE2 labels as 2-byte short, not vInt. */ private final static int VERSION_SHORT_BYTE2_LABELS = 2; - private final static int VERSION_CURRENT = VERSION_SHORT_BYTE2_LABELS; + /** Added optional packed format. */ + private final static int VERSION_PACKED = 3; + + private final static int VERSION_CURRENT = VERSION_PACKED; // Never serialized; just used to represent the virtual // final node w/ no arcs: @@ -126,6 +149,9 @@ public class FST { public int arcCount; public int arcWithOutputCount; + private final boolean packed; + private final int[] nodeRefToAddress; + // If arc has this label then that arc is final/accepted public static final int END_LABEL = -1; @@ -137,10 +163,17 @@ public class FST { public int label; public T output; + // From node (ord or address); currently only used when + // building an FST w/ willPackFST=true: + int node; + + // To node (ord or address): public int target; byte flags; public T nextFinalOutput; + + // address (into the byte[]), or ord/address if label == END_LABEL int nextArc; // This is non-zero if current arcs are fixed array: @@ -151,19 +184,18 @@ public class FST { /** Returns this */ public Arc copyFrom(Arc other) { + node = other.node; label = other.label; target = other.target; flags = other.flags; output = other.output; nextFinalOutput = other.nextFinalOutput; nextArc = other.nextArc; - if (other.bytesPerArc != 0) { - bytesPerArc = other.bytesPerArc; + bytesPerArc = other.bytesPerArc; + if (bytesPerArc != 0) { posArcsStart = other.posArcsStart; arcIdx = other.arcIdx; numArcs = other.numArcs; - } else { - bytesPerArc = 0; } return this; } @@ -179,40 +211,91 @@ public class FST { public boolean isFinal() { return flag(BIT_FINAL_ARC); } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(); + b.append("node=" + node); + b.append(" target=" + target); + b.append(" label=" + label); + if (flag(BIT_LAST_ARC)) { + b.append(" last"); + } + if (flag(BIT_FINAL_ARC)) { + b.append(" final"); + } + if (flag(BIT_TARGET_NEXT)) { + b.append(" targetNext"); + } + if (flag(BIT_ARC_HAS_OUTPUT)) { + b.append(" hasOutput"); + } + if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + b.append(" hasFinalOutput"); + } + if (bytesPerArc != 0) { + b.append(" arcArray(idx=" + arcIdx + " of " + numArcs + ")"); + } + return b.toString(); + } }; - static boolean flag(int flags, int bit) { + private final static boolean flag(int flags, int bit) { return (flags & bit) != 0; } private final BytesWriter writer; - // make a new empty FST, for building - public FST(INPUT_TYPE inputType, Outputs outputs) { + // TODO: we can save RAM here by using growable packed + // ints...: + private int[] nodeAddress; + + // TODO: we could be smarter here, and prune periodically + // as we go; high in-count nodes will "usually" become + // clear early on: + private int[] inCounts; + + // make a new empty FST, for building; Builder invokes + // this ctor + FST(INPUT_TYPE inputType, Outputs outputs, boolean willPackFST) { this.inputType = inputType; this.outputs = outputs; bytes = new byte[128]; NO_OUTPUT = outputs.getNoOutput(); + if (willPackFST) { + nodeAddress = new int[8]; + inCounts = new int[8]; + } else { + nodeAddress = null; + inCounts = null; + } writer = new BytesWriter(); emptyOutput = null; + packed = false; + nodeRefToAddress = null; } - // create an existing FST + /** Load a previously saved FST. */ public FST(DataInput in, Outputs outputs) throws IOException { this.outputs = outputs; writer = null; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): - CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_SHORT_BYTE2_LABELS, VERSION_SHORT_BYTE2_LABELS); + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_PACKED); + packed = in.readByte() == 1; if (in.readByte() == 1) { // accepts empty string int numBytes = in.readVInt(); // messy bytes = new byte[numBytes]; in.readBytes(bytes, 0, numBytes); - emptyOutput = outputs.read(getBytesReader(numBytes-1)); + if (packed) { + emptyOutput = outputs.read(getBytesReader(0)); + } else { + emptyOutput = outputs.read(getBytesReader(numBytes-1)); + } } else { emptyOutput = null; } @@ -230,6 +313,15 @@ public class FST { default: throw new IllegalStateException("invalid input type " + t); } + if (packed) { + final int nodeRefCount = in.readVInt(); + nodeRefToAddress = new int[nodeRefCount]; + for(int idx=0;idx { /** Returns bytes used to represent the FST */ public int sizeInBytes() { - return bytes.length; + int size = bytes.length; + if (packed) { + size += nodeRefToAddress.length * RamUsageEstimator.NUM_BYTES_INT; + } else if (nodeAddress != null) { + size += nodeAddress.length * RamUsageEstimator.NUM_BYTES_INT; + size += inCounts.length * RamUsageEstimator.NUM_BYTES_INT; + } + return size; } void finish(int startNode) throws IOException { @@ -266,15 +365,25 @@ public class FST { cacheRootArcs(); } + private int getNodeAddress(int node) { + if (nodeAddress != null) { + // Deref + return nodeAddress[node]; + } else { + // Straight + return node; + } + } + // Caches first 128 labels @SuppressWarnings("unchecked") private void cacheRootArcs() throws IOException { - cachedRootArcs = (FST.Arc[]) new FST.Arc[0x80]; - final FST.Arc arc = new FST.Arc(); + cachedRootArcs = (Arc[]) new Arc[0x80]; + final Arc arc = new Arc(); getFirstArc(arc); final BytesReader in = getBytesReader(0); if (targetHasArcs(arc)) { - readFirstRealArc(arc.target, arc, in); + readFirstRealTargetArc(arc.target, arc, in); while(true) { assert arc.label != END_LABEL; if (arc.label < cachedRootArcs.length) { @@ -307,14 +416,16 @@ public class FST { outputs.write(emptyOutput, writer); emptyOutputBytes = new byte[writer.posWrite-posSave]; - // reverse - final int stopAt = (writer.posWrite - posSave)/2; - int upto = 0; - while(upto < stopAt) { - final byte b = bytes[posSave + upto]; - bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; - bytes[writer.posWrite-upto-1] = b; - upto++; + if (!packed) { + // reverse + final int stopAt = (writer.posWrite - posSave)/2; + int upto = 0; + while(upto < stopAt) { + final byte b = bytes[posSave + upto]; + bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; + bytes[writer.posWrite-upto-1] = b; + upto++; + } } System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave); writer.posWrite = posSave; @@ -324,7 +435,15 @@ public class FST { if (startNode == -1) { throw new IllegalStateException("call finish first"); } + if (nodeAddress != null) { + throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed"); + } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + if (packed) { + out.writeByte((byte) 1); + } else { + out.writeByte((byte) 0); + } // TODO: really we should encode this as an arc, arriving // to the root node, instead of special casing here: if (emptyOutput != null) { @@ -343,6 +462,13 @@ public class FST { t = 2; } out.writeByte(t); + if (packed) { + assert nodeRefToAddress != null; + out.writeVInt(nodeRefToAddress.length); + for(int idx=0;idx { // returns true if the node at this address has any // outgoing arcs - public boolean targetHasArcs(Arc arc) { + public static boolean targetHasArcs(Arc arc) { return arc.target > 0; } // serializes new node by appending its bytes to the end // of the current byte[] - int addNode(Builder.UnCompiledNode node) throws IOException { - //System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs); - if (node.numArcs == 0) { - if (node.isFinal) { + int addNode(Builder.UnCompiledNode nodeIn) throws IOException { + //System.out.println("FST.addNode pos=" + writer.posWrite + " numArcs=" + nodeIn.numArcs); + if (nodeIn.numArcs == 0) { + if (nodeIn.isFinal) { return FINAL_END_NODE; } else { return NON_FINAL_END_NODE; @@ -437,15 +563,15 @@ public class FST { int startAddress = writer.posWrite; //System.out.println(" startAddr=" + startAddress); - final boolean doFixedArray = shouldExpand(node); + final boolean doFixedArray = shouldExpand(nodeIn); final int fixedArrayStart; if (doFixedArray) { - if (bytesPerArc.length < node.numArcs) { - bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)]; + if (bytesPerArc.length < nodeIn.numArcs) { + bytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } // write a "false" first arc: - writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); - writer.writeVInt(node.numArcs); + writer.writeByte(ARCS_AS_FIXED_ARRAY); + writer.writeVInt(nodeIn.numArcs); // placeholder -- we'll come back and write the number // of bytes per arc (int) here: // TODO: we could make this a vInt instead @@ -456,15 +582,14 @@ public class FST { fixedArrayStart = 0; } - nodeCount++; - arcCount += node.numArcs; + arcCount += nodeIn.numArcs; - final int lastArc = node.numArcs-1; + final int lastArc = nodeIn.numArcs-1; int lastArcStart = writer.posWrite; int maxBytesPerArc = 0; - for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + for(int arcIdx=0;arcIdx arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; @@ -472,7 +597,10 @@ public class FST { flags += BIT_LAST_ARC; } - if (lastFrozenNode == target.address && !doFixedArray) { + if (lastFrozenNode == target.node && !doFixedArray) { + // TODO: for better perf (but more RAM used) we + // could avoid this except when arc is "near" the + // last arc: flags += BIT_TARGET_NEXT; } @@ -485,10 +613,12 @@ public class FST { assert arc.nextFinalOutput == NO_OUTPUT; } - boolean targetHasArcs = target.address > 0; + boolean targetHasArcs = target.node > 0; if (!targetHasArcs) { flags += BIT_STOP_NODE; + } else if (inCounts != null) { + inCounts[target.node]++; } if (arc.output != NO_OUTPUT) { @@ -498,19 +628,23 @@ public class FST { writer.writeByte((byte) flags); writeLabel(arc.label); - //System.out.println(" write arc: label=" + arc.label + " flags=" + flags); + // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + writer.posWrite + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, writer); + //System.out.println(" write output"); arcWithOutputCount++; } + if (arc.nextFinalOutput != NO_OUTPUT) { + //System.out.println(" write final output"); outputs.write(arc.nextFinalOutput, writer); } - if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) { - assert target.address > 0; - writer.writeInt(target.address); + if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { + assert target.node > 0; + //System.out.println(" write target"); + writer.writeInt(target.node); } // just write the arcs "like normal" on first pass, @@ -530,10 +664,11 @@ public class FST { // such cases if (doFixedArray) { + //System.out.println(" doFixedArray"); assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed // byte size - final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc; + final int sizeNeeded = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; bytes = ArrayUtil.grow(bytes, sizeNeeded); // TODO: we could make this a vInt instead bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); @@ -543,9 +678,9 @@ public class FST { // expand the arcs in place, backwards int srcPos = writer.posWrite; - int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc; + int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc; writer.posWrite = destPos; - for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) { + for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) { //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); destPos -= maxBytesPerArc; srcPos -= bytesPerArc[arcIdx]; @@ -559,7 +694,7 @@ public class FST { // reverse bytes in-place; we do this so that the // "BIT_TARGET_NEXT" opto can work, ie, it reads the // node just before the current one - final int endAddress = lastFrozenNode = writer.posWrite - 1; + final int endAddress = writer.posWrite - 1; int left = startAddress; int right = endAddress; @@ -568,13 +703,31 @@ public class FST { bytes[left++] = bytes[right]; bytes[right--] = b; } + //System.out.println(" endAddress=" + endAddress); - return endAddress; + nodeCount++; + final int node; + if (nodeAddress != null) { + // Nodes are addressed by 1+ord: + if (nodeCount == nodeAddress.length) { + nodeAddress = ArrayUtil.grow(nodeAddress); + inCounts = ArrayUtil.grow(inCounts); + } + nodeAddress[nodeCount] = endAddress; + // System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress); + node = nodeCount; + } else { + node = endAddress; + } + lastFrozenNode = node; + + return node; } /** Fills virtual 'start' arc, ie, an empty incoming arc to * the FST's start node */ public Arc getFirstArc(Arc arc) { + if (emptyOutput != null) { arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = emptyOutput; @@ -585,7 +738,7 @@ public class FST { arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the - // empty string, then startNode is 0, and then readFirstTargetArc + // empty string, then startNode is 0 arc.target = startNode; return arc; } @@ -602,20 +755,27 @@ public class FST { //System.out.println(" end node"); assert follow.isFinal(); arc.label = END_LABEL; + arc.target = FINAL_END_NODE; arc.output = follow.nextFinalOutput; arc.flags = BIT_LAST_ARC; return arc; } else { - final BytesReader in = getBytesReader(follow.target); - arc.flags = in.readByte(); - if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + final BytesReader in = getBytesReader(getNodeAddress(follow.target)); + arc.node = follow.target; + final byte b = in.readByte(); + if (b == ARCS_AS_FIXED_ARRAY) { // array: jump straight to end arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readInt(); + if (packed) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); arc.posArcsStart = in.pos; arc.arcIdx = arc.numArcs - 2; } else { + arc.flags = b; // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); @@ -631,11 +791,17 @@ public class FST { if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { } else { - in.pos -= 4; + if (packed) { + in.readVInt(); + } else { + in.skip(4); + } } arc.flags = in.readByte(); } - arc.nextArc = in.pos+1; + // Undo the byte flags we read: + in.skip(-1); + arc.nextArc = in.pos; } readNextRealArc(arc, in); assert arc.isLast(); @@ -657,35 +823,48 @@ public class FST { // Insert "fake" final first arc: arc.label = END_LABEL; arc.output = follow.nextFinalOutput; + arc.flags = BIT_FINAL_ARC; if (follow.target <= 0) { - arc.flags = BIT_LAST_ARC | BIT_FINAL_ARC; + arc.flags |= BIT_LAST_ARC; } else { - arc.flags = BIT_FINAL_ARC; + arc.node = follow.target; + // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; } + arc.target = FINAL_END_NODE; //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); return arc; } else { - return readFirstRealArc(follow.target, arc, getBytesReader(0)); + return readFirstRealTargetArc(follow.target, arc, getBytesReader(0)); } } - public Arc readFirstRealArc(int address, Arc arc, final BytesReader in) throws IOException { + public Arc readFirstRealTargetArc(int node, Arc arc, final BytesReader in) throws IOException { + final int address = getNodeAddress(node); in.pos = address; - arc.flags = in.readByte(); + //System.out.println(" readFirstRealTargtArc address=" + //+ address); + //System.out.println(" flags=" + arc.flags); + arc.node = node; - if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + if (in.readByte() == ARCS_AS_FIXED_ARRAY) { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); - arc.bytesPerArc = in.readInt(); + if (packed) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } arc.arcIdx = -1; arc.nextArc = arc.posArcsStart = in.pos; //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { + //arc.flags = b; arc.nextArc = address; arc.bytesPerArc = 0; } + return readNextRealArc(arc, in); } @@ -699,9 +878,8 @@ public class FST { if (!targetHasArcs(follow)) { return false; } else { - final BytesReader in = getBytesReader(follow.target); - final byte b = in.readByte(); - return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + final BytesReader in = getBytesReader(getNodeAddress(follow.target)); + return in.readByte() == ARCS_AS_FIXED_ARRAY; } } @@ -710,10 +888,9 @@ public class FST { if (arc.label == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc <= 0) { - // This arc went to virtual final node, ie has no outgoing arcs - return null; + throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); } - return readFirstRealArc(arc.nextArc, arc, getBytesReader(0)); + return readFirstRealTargetArc(arc.nextArc, arc, getBytesReader(0)); } else { return readNextRealArc(arc, getBytesReader(0)); } @@ -727,19 +904,24 @@ public class FST { final BytesReader in; if (arc.label == END_LABEL) { //System.out.println(" nextArc fake " + arc.nextArc); - in = getBytesReader(arc.nextArc); - byte flags = bytes[in.pos]; - if (flag(flags, BIT_ARCS_AS_FIXED_ARRAY)) { + in = getBytesReader(getNodeAddress(arc.nextArc)); + final byte b = bytes[in.pos]; + if (b == ARCS_AS_FIXED_ARRAY) { //System.out.println(" nextArc fake array"); - in.pos--; + in.skip(1); in.readVInt(); - in.readInt(); + if (packed) { + in.readVInt(); + } else { + in.readInt(); + } } } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are at fixed entries - in = getBytesReader(arc.posArcsStart - (1+arc.arcIdx)*arc.bytesPerArc); + in = getBytesReader(arc.posArcsStart); + in.skip((1+arc.arcIdx)*arc.bytesPerArc); } else { // arcs are packed //System.out.println(" nextArc real packed"); @@ -754,12 +936,16 @@ public class FST { /** Never returns null, but you should never call this if * arc.isLast() is true. */ public Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { + + // TODO: can't assert this because we call from readFirstArc + // assert !flag(arc.flags, BIT_LAST_ARC); + // this is a continuing arc in a fixed array if (arc.bytesPerArc != 0) { // arcs are at fixed entries arc.arcIdx++; assert arc.arcIdx < arc.numArcs; - in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc; + in.skip(arc.posArcsStart, arc.arcIdx*arc.bytesPerArc); } else { // arcs are packed in.pos = arc.nextArc; @@ -788,45 +974,61 @@ public class FST { arc.nextArc = in.pos; } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.pos; - if (!arc.flag(BIT_LAST_ARC)) { - if (arc.bytesPerArc == 0) { - // must scan - seekToNextNode(in); - } else { - in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs; + // TODO: would be nice to make this lazy -- maybe + // caller doesn't need the target and is scanning arcs... + if (nodeAddress == null) { + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc == 0) { + // must scan + seekToNextNode(in); + } else { + in.skip(arc.posArcsStart, arc.bytesPerArc * arc.numArcs); + } } + arc.target = in.pos; + } else { + arc.target = arc.node - 1; + assert arc.target > 0; } - arc.target = in.pos; } else { - arc.target = in.readInt(); + if (packed) { + final int pos = in.pos; + final int code = in.readVInt(); + if (arc.flag(BIT_TARGET_DELTA)) { + // Address is delta-coded from current address: + arc.target = pos + code; + //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); + } else if (code < nodeRefToAddress.length) { + // Deref + arc.target = nodeRefToAddress[code]; + //System.out.println(" deref code=" + code + " target=" + arc.target); + } else { + // Absolute + arc.target = code; + //System.out.println(" abs code=" + code + " derefLen=" + nodeRefToAddress.length); + } + } else { + arc.target = in.readInt(); + } arc.nextArc = in.pos; } - return arc; } /** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. */ - public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc) throws IOException { + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc, BytesReader in) throws IOException { assert cachedRootArcs != null; - // Short-circuit if this arc is in the root arc cache: - if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) { - final Arc result = cachedRootArcs[labelToMatch]; - if (result == null) { - return result; - } else { - arc.copyFrom(result); - return arc; - } - } - + if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target <= 0) { arc.flags = BIT_LAST_ARC; } else { arc.flags = 0; + // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; + arc.node = follow.target; } arc.output = follow.nextFinalOutput; arc.label = END_LABEL; @@ -836,35 +1038,49 @@ public class FST { } } + // Short-circuit if this arc is in the root arc cache: + if (follow.target == startNode && labelToMatch < cachedRootArcs.length) { + final Arc result = cachedRootArcs[labelToMatch]; + if (result == null) { + return result; + } else { + arc.copyFrom(result); + return arc; + } + } + if (!targetHasArcs(follow)) { return null; } - // TODO: maybe make an explicit thread state that holds - // reusable stuff eg BytesReader: - final BytesReader in = getBytesReader(follow.target); + in.pos = getNodeAddress(follow.target); + + arc.node = follow.target; // System.out.println("fta label=" + (char) labelToMatch); - if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { + if (in.readByte() == ARCS_AS_FIXED_ARRAY) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); - //System.out.println(" bs " + arc.numArcs); - arc.bytesPerArc = in.readInt(); + if (packed) { + arc.bytesPerArc = in.readVInt(); + } else { + arc.bytesPerArc = in.readInt(); + } arc.posArcsStart = in.pos; int low = 0; int high = arc.numArcs-1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; - in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + in.skip(arc.posArcsStart, arc.bytesPerArc*mid + 1); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; - if (cmp < 0) + if (cmp < 0) { low = mid + 1; - else if (cmp > 0) + } else if (cmp > 0) { high = mid - 1; - else { + } else { arc.arcIdx = mid-1; //System.out.println(" found!"); return readNextRealArc(arc, in); @@ -875,7 +1091,8 @@ public class FST { } // Linear scan - readFirstTargetArc(follow, arc); + readFirstRealTargetArc(follow.target, arc, in); + while(true) { //System.out.println(" non-bs cycle"); // TODO: we should fix this code to not have to create @@ -889,7 +1106,7 @@ public class FST { } else if (arc.isLast()) { return null; } else { - readNextArc(arc); + readNextRealArc(arc, in); } } } @@ -910,7 +1127,11 @@ public class FST { } if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { - in.readInt(); + if (packed) { + in.readVInt(); + } else { + in.readInt(); + } } if (flag(flags, BIT_LAST_ARC)) { @@ -969,6 +1190,7 @@ public class FST { @Override public void writeByte(byte b) { + assert posWrite <= bytes.length; if (bytes.length == posWrite) { bytes = ArrayUtil.grow(bytes); } @@ -976,6 +1198,13 @@ public class FST { bytes[posWrite++] = b; } + public void setPosWrite(int posWrite) { + this.posWrite = posWrite; + if (bytes.length < posWrite) { + bytes = ArrayUtil.grow(bytes, posWrite); + } + } + @Override public void writeBytes(byte[] b, int offset, int length) { final int size = posWrite + length; @@ -987,15 +1216,24 @@ public class FST { public final BytesReader getBytesReader(int pos) { // TODO: maybe re-use via ThreadLocal? - return new BytesReader(bytes, pos); + if (packed) { + return new ForwardBytesReader(bytes, pos); + } else { + return new ReverseBytesReader(bytes, pos); + } } /** Expert */ - public final static class BytesReader extends DataInput { - final byte[] bytes; + public static abstract class BytesReader extends DataInput { int pos; + abstract void skip(int byteCount); + abstract void skip(int base, int byteCount); + } - public BytesReader(byte[] bytes, int pos) { + final static class ReverseBytesReader extends BytesReader { + final byte[] bytes; + + public ReverseBytesReader(byte[] bytes, int pos) { this.bytes = bytes; this.pos = pos; } @@ -1011,5 +1249,549 @@ public class FST { b[offset+i] = bytes[pos--]; } } + + public void skip(int count) { + pos -= count; + } + + public void skip(int base, int count) { + pos = base - count; + } + } + + // TODO: can we use just ByteArrayDataInput...? need to + // add a .skipBytes to DataInput.. hmm and .setPosition + final static class ForwardBytesReader extends BytesReader { + final byte[] bytes; + + public ForwardBytesReader(byte[] bytes, int pos) { + this.bytes = bytes; + this.pos = pos; + } + + @Override + public byte readByte() { + return bytes[pos++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + System.arraycopy(bytes, pos, b, offset, len); + pos += len; + } + + public void skip(int count) { + pos += count; + } + + public void skip(int base, int count) { + pos = base + count; + } + } + + private static class ArcAndState { + final Arc arc; + final IntsRef chain; + + public ArcAndState(Arc arc, IntsRef chain) { + this.arc = arc; + this.chain = chain; + } + } + + /* + public void countSingleChains() throws IOException { + // TODO: must assert this FST was built with + // "willRewrite" + + final List> queue = new ArrayList>(); + + // TODO: use bitset to not revisit nodes already + // visited + + FixedBitSet seen = new FixedBitSet(1+nodeCount); + int saved = 0; + + queue.add(new ArcAndState(getFirstArc(new Arc()), new IntsRef())); + Arc scratchArc = new Arc(); + while(queue.size() > 0) { + //System.out.println("cycle size=" + queue.size()); + //for(ArcAndState ent : queue) { + // System.out.println(" " + Util.toBytesRef(ent.chain, new BytesRef())); + // } + final ArcAndState arcAndState = queue.get(queue.size()-1); + seen.set(arcAndState.arc.node); + final BytesRef br = Util.toBytesRef(arcAndState.chain, new BytesRef()); + if (br.length > 0 && br.bytes[br.length-1] == -1) { + br.length--; + } + //System.out.println(" top node=" + arcAndState.arc.target + " chain=" + br.utf8ToString()); + if (targetHasArcs(arcAndState.arc) && !seen.get(arcAndState.arc.target)) { + // push + readFirstTargetArc(arcAndState.arc, scratchArc); + //System.out.println(" push label=" + (char) scratchArc.label); + //System.out.println(" tonode=" + scratchArc.target + " last?=" + scratchArc.isLast()); + + final IntsRef chain = IntsRef.deepCopyOf(arcAndState.chain); + chain.grow(1+chain.length); + // TODO + //assert scratchArc.label != END_LABEL; + chain.ints[chain.length] = scratchArc.label; + chain.length++; + + if (scratchArc.isLast()) { + if (scratchArc.target != -1 && inCounts[scratchArc.target] == 1) { + //System.out.println(" append"); + } else { + if (arcAndState.chain.length > 1) { + saved += chain.length-2; + try { + System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef()).utf8ToString()); + } catch (AssertionError ae) { + System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef())); + } + } + chain.length = 0; + } + } else { + //System.out.println(" reset"); + if (arcAndState.chain.length > 1) { + saved += arcAndState.chain.length-2; + try { + System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); + } catch (AssertionError ae) { + System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef())); + } + } + if (scratchArc.target != -1 && inCounts[scratchArc.target] != 1) { + chain.length = 0; + } else { + chain.ints[0] = scratchArc.label; + chain.length = 1; + } + } + // TODO: instead of new Arc() we can re-use from + // a by-depth array + queue.add(new ArcAndState(new Arc().copyFrom(scratchArc), chain)); + } else if (!arcAndState.arc.isLast()) { + // next + readNextArc(arcAndState.arc); + //System.out.println(" next label=" + (char) arcAndState.arc.label + " len=" + arcAndState.chain.length); + if (arcAndState.chain.length != 0) { + arcAndState.chain.ints[arcAndState.chain.length-1] = arcAndState.arc.label; + } + } else { + if (arcAndState.chain.length > 1) { + saved += arcAndState.chain.length-2; + System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); + } + // pop + //System.out.println(" pop"); + queue.remove(queue.size()-1); + while(queue.size() > 0 && queue.get(queue.size()-1).arc.isLast()) { + queue.remove(queue.size()-1); + } + if (queue.size() > 0) { + final ArcAndState arcAndState2 = queue.get(queue.size()-1); + readNextArc(arcAndState2.arc); + //System.out.println(" read next=" + (char) arcAndState2.arc.label + " queue=" + queue.size()); + assert arcAndState2.arc.label != END_LABEL; + if (arcAndState2.chain.length != 0) { + arcAndState2.chain.ints[arcAndState2.chain.length-1] = arcAndState2.arc.label; + } + } + } + } + + System.out.println("TOT saved " + saved); + } + */ + + // Creates a packed FST + private FST(INPUT_TYPE inputType, int[] nodeRefToAddress, Outputs outputs) { + packed = true; + this.inputType = inputType; + bytes = new byte[128]; + this.nodeRefToAddress = nodeRefToAddress; + this.outputs = outputs; + NO_OUTPUT = outputs.getNoOutput(); + writer = new BytesWriter(); + } + + /** Expert: creates an FST by packing this one. This + * process requires substantial additional RAM (currently + * ~8 bytes per node), but then should produce a smaller FST. */ + public FST pack(int minInCountDeref, int maxDerefNodes) throws IOException { + + // TODO: other things to try + // - renumber the nodes to get more next / better locality? + // - allow multiple input labels on an arc, so + // singular chain of inputs can take one arc (on + // wikipedia terms this could save another ~6%) + // - in the ord case, the output '1' is presumably + // very common (after NO_OUTPUT)... maybe use a bit + // for it..? + // - use spare bits in flags.... for top few labels / + // outputs / targets + + if (nodeAddress == null) { + throw new IllegalArgumentException("this FST was not built with willPackFST=true"); + } + + Arc arc = new Arc(); + + final BytesReader r = getBytesReader(0); + + final int topN = Math.min(maxDerefNodes, inCounts.length); + + // Find top nodes with highest number of incoming arcs: + NodeQueue q = new NodeQueue(topN); + + // TODO: we could use more RAM efficient selection algo here... + NodeAndInCount bottom = null; + for(int node=0;node= minInCountDeref) { + if (bottom == null) { + q.add(new NodeAndInCount(node, inCounts[node])); + if (q.size() == topN) { + bottom = q.top(); + } + } else if (inCounts[node] > bottom.count) { + q.insertWithOverflow(new NodeAndInCount(node, inCounts[node])); + } + } + } + + // Free up RAM: + inCounts = null; + + final Map topNodeMap = new HashMap(); + for(int downTo=q.size()-1;downTo>=0;downTo--) { + NodeAndInCount n = q.pop(); + topNodeMap.put(n.node, downTo); + //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); + } + + // TODO: we can use packed ints: + // +1 because node ords start at 1 (0 is reserved as + // stop node): + final int[] nodeRefToAddressIn = new int[topNodeMap.size()]; + + final FST fst = new FST(inputType, nodeRefToAddressIn, outputs); + + final BytesWriter writer = fst.writer; + + final int[] newNodeAddress = new int[1+nodeCount]; + + // Fill initial coarse guess: + for(int node=1;node<=nodeCount;node++) { + newNodeAddress[node] = 1 + bytes.length - nodeAddress[node]; + } + + int absCount; + int deltaCount; + int topCount; + int nextCount; + + // Iterate until we converge: + while(true) { + + //System.out.println("\nITER"); + boolean changed = false; + + // for assert: + boolean negDelta = false; + + writer.posWrite = 0; + // Skip 0 byte since 0 is reserved target: + writer.writeByte((byte) 0); + + fst.arcWithOutputCount = 0; + fst.nodeCount = 0; + fst.arcCount = 0; + + absCount = deltaCount = topCount = nextCount = 0; + + int changedCount = 0; + + int addressError = 0; + + //int totWasted = 0; + + // Since we re-reverse the bytes, we now write the + // nodes backwards, so that BIT_TARGET_NEXT is + // unchanged: + for(int node=nodeCount;node>=1;node--) { + fst.nodeCount++; + final int address = writer.posWrite; + //System.out.println(" node: " + node + " address=" + address); + if (address != newNodeAddress[node]) { + addressError = address - newNodeAddress[node]; + //System.out.println(" change: " + (address - newNodeAddress[node])); + changed = true; + newNodeAddress[node] = address; + changedCount++; + } + + int nodeArcCount = 0; + int bytesPerArc = 0; + + boolean retry = false; + + // for assert: + boolean anyNegDelta = false; + + // Retry loop: possibly iterate more than once, if + // this is an array'd node and bytesPerArc changes: + writeNode: + while(true) { // retry writing this node + + readFirstRealTargetArc(node, arc, r); + + final boolean useArcArray = arc.bytesPerArc != 0; + if (useArcArray) { + // Write false first arc: + if (bytesPerArc == 0) { + bytesPerArc = arc.bytesPerArc; + } + writer.writeByte(ARCS_AS_FIXED_ARRAY); + writer.writeVInt(arc.numArcs); + writer.writeVInt(bytesPerArc); + //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); + } + + int maxBytesPerArc = 0; + //int wasted = 0; + while(true) { // iterate over all arcs for this node + + //System.out.println(" arc label=" + arc.label + " target=" + arc.target + " pos=" + writer.posWrite); + final int arcStartPos = writer.posWrite; + nodeArcCount++; + + byte flags = 0; + + if (arc.isLast()) { + flags += BIT_LAST_ARC; + } + /* + if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) { + flags += BIT_TARGET_NEXT; + } + */ + if (!useArcArray && node != 1 && arc.target == node-1) { + flags += BIT_TARGET_NEXT; + if (!retry) { + nextCount++; + } + } + if (arc.isFinal()) { + flags += BIT_FINAL_ARC; + if (arc.nextFinalOutput != NO_OUTPUT) { + flags += BIT_ARC_HAS_FINAL_OUTPUT; + } + } else { + assert arc.nextFinalOutput == NO_OUTPUT; + } + if (!targetHasArcs(arc)) { + flags += BIT_STOP_NODE; + } + + if (arc.output != NO_OUTPUT) { + flags += BIT_ARC_HAS_OUTPUT; + } + + final Integer ptr; + final int absPtr; + final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; + if (doWriteTarget) { + + ptr = topNodeMap.get(arc.target); + if (ptr != null) { + absPtr = ptr; + } else { + absPtr = topNodeMap.size() + newNodeAddress[arc.target] + addressError; + } + + int delta = newNodeAddress[arc.target] + addressError - writer.posWrite - 2; + if (delta < 0) { + //System.out.println("neg: " + delta); + anyNegDelta = true; + delta = 0; + } + + if (delta < absPtr) { + flags |= BIT_TARGET_DELTA; + } + } else { + ptr = null; + absPtr = 0; + } + + writer.writeByte(flags); + fst.writeLabel(arc.label); + + if (arc.output != NO_OUTPUT) { + outputs.write(arc.output, writer); + if (!retry) { + fst.arcWithOutputCount++; + } + } + if (arc.nextFinalOutput != NO_OUTPUT) { + outputs.write(arc.nextFinalOutput, writer); + } + + if (doWriteTarget) { + + int delta = newNodeAddress[arc.target] + addressError - writer.posWrite; + if (delta < 0) { + anyNegDelta = true; + //System.out.println("neg: " + delta); + delta = 0; + } + + if (flag(flags, BIT_TARGET_DELTA)) { + //System.out.println(" delta"); + writer.writeVInt(delta); + if (!retry) { + deltaCount++; + } + } else { + /* + if (ptr != null) { + System.out.println(" deref"); + } else { + System.out.println(" abs"); + } + */ + writer.writeVInt(absPtr); + if (!retry) { + if (absPtr >= topNodeMap.size()) { + absCount++; + } else { + topCount++; + } + } + } + } + + if (useArcArray) { + final int arcBytes = writer.posWrite - arcStartPos; + //System.out.println(" " + arcBytes + " bytes"); + maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); + // NOTE: this may in fact go "backwards", if + // somehow (rarely, possibly never) we use + // more bytesPerArc in this rewrite than the + // incoming FST did... but in this case we + // will retry (below) so it's OK to ovewrite + // bytes: + //wasted += bytesPerArc - arcBytes; + writer.setPosWrite(arcStartPos + bytesPerArc); + } + + if (arc.isLast()) { + break; + } + + readNextRealArc(arc, r); + } + + if (useArcArray) { + if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) { + // converged + //System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted); + //totWasted += wasted; + break; + } + } else { + break; + } + + //System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc); + + // Retry: + bytesPerArc = maxBytesPerArc; + writer.posWrite = address; + nodeArcCount = 0; + retry = true; + anyNegDelta = false; + } + negDelta |= anyNegDelta; + + fst.arcCount += nodeArcCount; + } + + if (!changed) { + // We don't renumber the nodes (just reverse their + // order) so nodes should only point forward to + // other nodes because we only produce acyclic FSTs + // w/ nodes only pointing "forwards": + assert !negDelta; + //System.out.println("TOT wasted=" + totWasted); + // Converged! + break; + } + //System.out.println(" " + changedCount + " of " + fst.nodeCount + " changed; retry"); + } + + for(Map.Entry ent : topNodeMap.entrySet()) { + nodeRefToAddressIn[ent.getValue()] = newNodeAddress[ent.getKey()]; + } + + fst.startNode = newNodeAddress[startNode]; + //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); + + if (emptyOutput != null) { + fst.setEmptyOutput(emptyOutput); + } + + assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount=" + nodeCount; + assert fst.arcCount == arcCount; + assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount=" + fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount; + + final byte[] finalBytes = new byte[writer.posWrite]; + //System.out.println("resize " + fst.bytes.length + " down to " + writer.posWrite); + System.arraycopy(fst.bytes, 0, finalBytes, 0, writer.posWrite); + fst.bytes = finalBytes; + fst.cacheRootArcs(); + + //final int size = fst.sizeInBytes(); + //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount); + + return fst; + } + + private static class NodeAndInCount implements Comparable { + final int node; + final int count; + + public NodeAndInCount(int node, int count) { + this.node = node; + this.count = count; + } + + @Override + public int compareTo(NodeAndInCount other) { + if (count > other.count) { + return 1; + } else if (count < other.count) { + return -1; + } else { + // Tie-break: smaller node compares as greater than + return other.node - node; + } + } + } + + private static class NodeQueue extends PriorityQueue { + public NodeQueue(int topN) { + super(topN, false); + } + + @Override + public boolean lessThan(NodeAndInCount a, NodeAndInCount b) { + final int cmp = a.compareTo(b); + assert cmp != 0; + return cmp < 0; + } } } diff --git a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java index 221e6b2eaf1..6abe25b7978 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java +++ b/lucene/src/java/org/apache/lucene/util/fst/FSTEnum.java @@ -151,7 +151,8 @@ abstract class FSTEnum { boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + in.pos = arc.posArcsStart; + in.skip(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); @@ -275,7 +276,7 @@ abstract class FSTEnum { // Now scan forward, matching the new suffix of the target while(true) { - //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast()); + //System.out.println(" cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char) arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast() + " bba=" + arc.bytesPerArc); if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) { // Arcs are fixed array -- use binary search to find @@ -289,15 +290,16 @@ abstract class FSTEnum { boolean found = false; while (low <= high) { mid = (low + high) >>> 1; - in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + in.pos = arc.posArcsStart; + in.skip(arc.bytesPerArc*mid+1); final int midLabel = fst.readLabel(in); final int cmp = midLabel - targetLabel; //System.out.println(" cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp); - if (cmp < 0) + if (cmp < 0) { low = mid + 1; - else if (cmp > 0) + } else if (cmp > 0) { high = mid - 1; - else { + } else { found = true; break; } @@ -430,9 +432,11 @@ abstract class FSTEnum { FST.Arc arc = getArc(upto-1); int targetLabel = getTargetLabel(); + final FST.BytesReader fstReader = fst.getBytesReader(0); + while(true) { //System.out.println(" cycle target=" + (targetLabel == -1 ? "-1" : (char) targetLabel)); - final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto)); + final FST.Arc nextArc = fst.findTargetArc(targetLabel, arc, getArc(upto), fstReader); if (nextArc == null) { // short circuit //upto--; diff --git a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java index da973358442..47e669d7d10 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java +++ b/lucene/src/java/org/apache/lucene/util/fst/NodeHash.java @@ -35,7 +35,7 @@ final class NodeHash { } private boolean nodesEqual(Builder.UnCompiledNode node, int address, FST.BytesReader in) throws IOException { - fst.readFirstRealArc(address, scratchArc, in); + fst.readFirstRealTargetArc(address, scratchArc, in); if (scratchArc.bytesPerArc != 0 && node.numArcs != scratchArc.numArcs) { return false; } @@ -43,7 +43,7 @@ final class NodeHash { final Builder.Arc arc = node.arcs[arcUpto]; if (arc.label != scratchArc.label || !arc.output.equals(scratchArc.output) || - ((Builder.CompiledNode) arc.target).address != scratchArc.target || + ((Builder.CompiledNode) arc.target).node != scratchArc.target || !arc.nextFinalOutput.equals(scratchArc.nextFinalOutput) || arc.isFinal != scratchArc.isFinal()) { return false; @@ -71,9 +71,9 @@ final class NodeHash { // TODO: maybe if number of arcs is high we can safely subsample? for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; - //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).address + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); + //System.out.println(" label=" + arc.label + " target=" + ((Builder.CompiledNode) arc.target).node + " h=" + h + " output=" + fst.outputs.outputToString(arc.output) + " isFinal?=" + arc.isFinal); h = PRIME * h + arc.label; - h = PRIME * h + ((Builder.CompiledNode) arc.target).address; + h = PRIME * h + ((Builder.CompiledNode) arc.target).node; h = PRIME * h + arc.output.hashCode(); h = PRIME * h + arc.nextFinalOutput.hashCode(); if (arc.isFinal) { @@ -88,9 +88,9 @@ final class NodeHash { private int hash(int node) throws IOException { final int PRIME = 31; final FST.BytesReader in = fst.getBytesReader(0); - //System.out.println("hash frozen"); + //System.out.println("hash frozen node=" + node); int h = 0; - fst.readFirstRealArc(node, scratchArc, in); + fst.readFirstRealTargetArc(node, scratchArc, in); while(true) { //System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal()); h = PRIME * h + scratchArc.label; @@ -109,26 +109,26 @@ final class NodeHash { return h & Integer.MAX_VALUE; } - public int add(Builder.UnCompiledNode node) throws IOException { + public int add(Builder.UnCompiledNode nodeIn) throws IOException { // System.out.println("hash: add count=" + count + " vs " + table.length); final FST.BytesReader in = fst.getBytesReader(0); - final int h = hash(node); + final int h = hash(nodeIn); int pos = h & mask; int c = 0; while(true) { final int v = table[pos]; if (v == 0) { // freeze & add - final int address = fst.addNode(node); - //System.out.println(" now freeze addr=" + address); - assert hash(address) == h : "frozenHash=" + hash(address) + " vs h=" + h; + final int node = fst.addNode(nodeIn); + //System.out.println(" now freeze node=" + node); + assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h; count++; - table[pos] = address; + table[pos] = node; if (table.length < 2*count) { rehash(); } - return address; - } else if (nodesEqual(node, v, in)) { + return node; + } else if (nodesEqual(nodeIn, v, in)) { // same node is already here return v; } diff --git a/lucene/src/java/org/apache/lucene/util/fst/Outputs.java b/lucene/src/java/org/apache/lucene/util/fst/Outputs.java index e0cabaf9e99..f0ce86dbe1a 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Outputs.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Outputs.java @@ -26,6 +26,10 @@ import org.apache.lucene.store.DataOutput; * Represents the outputs for an FST, providing the basic * algebra needed for the FST. * + *

Note that any operation that returns NO_OUTPUT must + * return the same singleton object from {@link + * #getNoOutput}.

+ * * @lucene.experimental */ @@ -56,6 +60,8 @@ public abstract class Outputs { public abstract String outputToString(T output); + // TODO: maybe make valid(T output) public...? for asserts + public T merge(T first, T second) { throw new UnsupportedOperationException(); } diff --git a/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java b/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java index 4cfd7f67915..d2b3504b03b 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/fst/PairOutputs.java @@ -38,7 +38,8 @@ public class PairOutputs extends Outputs> { public final A output1; public final B output2; - public Pair(A output1, B output2) { + // use newPair + private Pair(A output1, B output2) { this.output1 = output1; this.output2 = output2; } @@ -66,35 +67,79 @@ public class PairOutputs extends Outputs> { this.outputs2 = outputs2; NO_OUTPUT = new Pair(outputs1.getNoOutput(), outputs2.getNoOutput()); } - - public Pair get(A output1, B output2) { - if (output1 == outputs1.getNoOutput() && output2 == outputs2.getNoOutput()) { + + /** Create a new Pair */ + public Pair newPair(A a, B b) { + if (a.equals(outputs1.getNoOutput())) { + a = outputs1.getNoOutput(); + } + if (b.equals(outputs2.getNoOutput())) { + b = outputs2.getNoOutput(); + } + + if (a == outputs1.getNoOutput() && b == outputs2.getNoOutput()) { return NO_OUTPUT; } else { - return new Pair(output1, output2); + final Pair p = new Pair(a, b); + assert valid(p); + return p; } } - + + // for assert + private boolean valid(Pair pair) { + final boolean noOutput1 = pair.output1.equals(outputs1.getNoOutput()); + final boolean noOutput2 = pair.output2.equals(outputs2.getNoOutput()); + + if (noOutput1 && pair.output1 != outputs1.getNoOutput()) { + System.out.println("invalid0"); + return false; + } + + if (noOutput2 && pair.output2 != outputs2.getNoOutput()) { + System.out.println("invalid1"); + return false; + } + + if (noOutput1 && noOutput2) { + if (pair != NO_OUTPUT) { + System.out.println("invalid2"); + return false; + } else { + return true; + } + } else { + return true; + } + } + @Override public Pair common(Pair pair1, Pair pair2) { - return get(outputs1.common(pair1.output1, pair2.output1), - outputs2.common(pair1.output2, pair2.output2)); + assert valid(pair1); + assert valid(pair2); + return newPair(outputs1.common(pair1.output1, pair2.output1), + outputs2.common(pair1.output2, pair2.output2)); } @Override public Pair subtract(Pair output, Pair inc) { - return get(outputs1.subtract(output.output1, inc.output1), - outputs2.subtract(output.output2, inc.output2)); + assert valid(output); + assert valid(inc); + return newPair(outputs1.subtract(output.output1, inc.output1), + outputs2.subtract(output.output2, inc.output2)); } @Override public Pair add(Pair prefix, Pair output) { - return get(outputs1.add(prefix.output1, output.output1), - outputs2.add(prefix.output2, output.output2)); + assert valid(prefix); + assert valid(output); + return newPair(outputs1.add(prefix.output1, output.output1), + outputs2.add(prefix.output2, output.output2)); } @Override public void write(Pair output, DataOutput writer) throws IOException { + assert valid(output); outputs1.write(output.output1, writer); outputs2.write(output.output2, writer); } @@ -103,7 +148,7 @@ public class PairOutputs extends Outputs> { public Pair read(DataInput in) throws IOException { A output1 = outputs1.read(in); B output2 = outputs2.read(in); - return get(output1, output2); + return newPair(output1, output2); } @Override @@ -113,6 +158,12 @@ public class PairOutputs extends Outputs> { @Override public String outputToString(Pair output) { + assert valid(output); return ""; } + + @Override + public String toString() { + return "PairOutputs<" + outputs1 + "," + outputs2 + ">"; + } } diff --git a/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java b/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java index c1051152518..31df0666123 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/fst/PositiveIntOutputs.java @@ -25,10 +25,7 @@ import org.apache.lucene.store.DataOutput; /** * Output is a long, for each input term. NOTE: the * resulting FST is not guaranteed to be minimal! See - * {@link Builder}. You must use {@link #get} to obtain the - * output for a given long value -- do not use autoboxing - * nor create your own Long instance (the value 0 - * must map to the {@link #getNoOutput} singleton). + * {@link Builder}. * * @lucene.experimental */ @@ -50,14 +47,6 @@ public final class PositiveIntOutputs extends Outputs { return doShare ? singletonShare : singletonNoShare; } - public Long get(long v) { - if (v == 0) { - return NO_OUTPUT; - } else { - return Long.valueOf(v); - } - } - @Override public Long common(Long output1, Long output2) { assert valid(output1); diff --git a/lucene/src/java/org/apache/lucene/util/fst/Util.java b/lucene/src/java/org/apache/lucene/util/fst/Util.java index b5e4e2d6fea..cf8f3dec453 100644 --- a/lucene/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/src/java/org/apache/lucene/util/fst/Util.java @@ -37,23 +37,21 @@ public final class Util { // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + final FST.BytesReader fstReader = fst.getBytesReader(0); + // Accumulate output as we go - final T NO_OUTPUT = fst.outputs.getNoOutput(); - T output = NO_OUTPUT; + T output = fst.outputs.getNoOutput(); for(int i=0;i T get(FST fst, BytesRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE1; + final FST.BytesReader fstReader = fst.getBytesReader(0); + // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); // Accumulate output as we go - final T NO_OUTPUT = fst.outputs.getNoOutput(); - T output = NO_OUTPUT; + T output = fst.outputs.getNoOutput(); for(int i=0;i prevArc = null; @@ -238,6 +234,7 @@ public final class Util { // A queue of transitions to consider when processing the next level. final List> nextLevelQueue = new ArrayList>(); nextLevelQueue.add(startArc); + //System.out.println("toDot: startArc: " + startArc); // A list of states on the same level (for ranking). final List sameLevelStates = new ArrayList(); @@ -289,8 +286,11 @@ public final class Util { int level = 0; + final FST.BytesReader r = fst.getBytesReader(0); + while (!nextLevelQueue.isEmpty()) { // we could double buffer here, but it doesn't matter probably. + //System.out.println("next level=" + level); thisLevelQueue.addAll(nextLevelQueue); nextLevelQueue.clear(); @@ -298,19 +298,19 @@ public final class Util { out.write("\n // Transitions and states at level: " + level + "\n"); while (!thisLevelQueue.isEmpty()) { final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); + //System.out.println(" pop: " + arc); if (fst.targetHasArcs(arc)) { - // scan all arcs + // scan all target arcs + //System.out.println(" readFirstTarget..."); final int node = arc.target; - fst.readFirstTargetArc(arc, arc); - if (arc.label == FST.END_LABEL) { - // Skip it -- prior recursion took this into account already - assert !arc.isLast(); - fst.readNextArc(arc); - } + fst.readFirstRealTargetArc(arc.target, arc, r); + + //System.out.println(" firstTarget: " + arc); while (true) { + //System.out.println(" cycle arc=" + arc); // Emit the unseen state and add it to the queue for the next level. if (arc.target >= 0 && !seen.get(arc.target)) { @@ -329,7 +329,7 @@ public final class Util { if (fst.isExpandedTarget(arc)) { stateColor = expandedNodeColor; } else { - stateColor = null; + stateColor = null; } final String finalOutput; @@ -339,7 +339,9 @@ public final class Util { finalOutput = ""; } - emitDotState(out, Integer.toString(arc.target), arc.isFinal() ? finalStateShape : stateShape, stateColor, finalOutput); + emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, finalOutput); + // To see the node address, use this instead: + //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); sameLevelStates.add(arc.target); @@ -362,14 +364,22 @@ public final class Util { outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]"; } + final String arcColor; + if (arc.flag(FST.BIT_TARGET_NEXT)) { + arcColor = "red"; + } else { + arcColor = "black"; + } + assert arc.label != FST.END_LABEL; - out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"]\n"); + out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n"); // Break the loop if we're on the last arc of this state. if (arc.isLast()) { + //System.out.println(" break"); break; } - fst.readNextArc(arc); + fst.readNextRealArc(arc, r); } } } diff --git a/lucene/src/test-framework/java/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/src/test-framework/java/org/apache/lucene/store/MockDirectoryWrapper.java index 205ef5b4a95..cff2f9640ec 100644 --- a/lucene/src/test-framework/java/org/apache/lucene/store/MockDirectoryWrapper.java +++ b/lucene/src/test-framework/java/org/apache/lucene/store/MockDirectoryWrapper.java @@ -702,12 +702,6 @@ public class MockDirectoryWrapper extends Directory { return delegate.fileExists(name); } - @Override - public synchronized long fileModified(String name) throws IOException { - maybeYield(); - return delegate.fileModified(name); - } - @Override public synchronized long fileLength(String name) throws IOException { maybeYield(); diff --git a/lucene/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java b/lucene/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java index 396ebc75a6f..c95f446dcb6 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java +++ b/lucene/src/test/org/apache/lucene/index/TestCrashCausesCorruptIndex.java @@ -224,14 +224,6 @@ public class TestCrashCausesCorruptIndex extends LuceneTestCase { return realDirectory.fileLength(name); } - /** - * {@inheritDoc} - */ - @Override - public long fileModified(String name) throws IOException { - return realDirectory.fileModified(name); - } - /** * {@inheritDoc} */ diff --git a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java index 76e150322d5..a936f36c296 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -18,10 +18,12 @@ package org.apache.lucene.index; */ import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; -import java.util.Collection; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -32,7 +34,6 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -47,20 +48,12 @@ public class TestDeletionPolicy extends LuceneTestCase { final IndexCommit firstCommit = commits.get(0); long last = SegmentInfos.generationFromSegmentsFileName(firstCommit.getSegmentsFileName()); assertEquals(last, firstCommit.getGeneration()); - long lastVersion = firstCommit.getVersion(); - long lastTimestamp = firstCommit.getTimestamp(); for(int i=1;i last); - assertTrue("SegmentInfos versions are out-of-order", nowVersion > lastVersion); - assertTrue("SegmentInfos timestamps are out-of-order: now=" + nowTimestamp + " vs last=" + lastTimestamp, nowTimestamp >= lastTimestamp); assertEquals(now, commit.getGeneration()); last = now; - lastVersion = nowVersion; - lastTimestamp = nowTimestamp; } } @@ -158,6 +151,10 @@ public class TestDeletionPolicy extends LuceneTestCase { } } + static long getCommitTime(IndexCommit commit) throws IOException { + return Long.parseLong(commit.getUserData().get("commitTime")); + } + /* * Delete a commit only when it has been obsoleted by N * seconds. @@ -184,10 +181,10 @@ public class TestDeletionPolicy extends LuceneTestCase { IndexCommit lastCommit = commits.get(commits.size()-1); // Any commit older than expireTime should be deleted: - double expireTime = dir.fileModified(lastCommit.getSegmentsFileName())/1000.0 - expirationTimeSeconds; + double expireTime = getCommitTime(lastCommit)/1000.0 - expirationTimeSeconds; for (final IndexCommit commit : commits) { - double modTime = dir.fileModified(commit.getSegmentsFileName())/1000.0; + double modTime = getCommitTime(commit)/1000.0; if (commit != lastCommit && modTime < expireTime) { commit.delete(); numDelete += 1; @@ -213,6 +210,9 @@ public class TestDeletionPolicy extends LuceneTestCase { ((LogMergePolicy) mp).setUseCompoundFile(true); } IndexWriter writer = new IndexWriter(dir, conf); + Map commitData = new HashMap(); + commitData.put("commitTime", String.valueOf(System.currentTimeMillis())); + writer.commit(commitData); writer.close(); final int ITER = 9; @@ -233,6 +233,9 @@ public class TestDeletionPolicy extends LuceneTestCase { for(int j=0;j<17;j++) { addDoc(writer); } + commitData = new HashMap(); + commitData.put("commitTime", String.valueOf(System.currentTimeMillis())); + writer.commit(commitData); writer.close(); if (i < ITER-1) { @@ -269,7 +272,9 @@ public class TestDeletionPolicy extends LuceneTestCase { // if we are on a filesystem that seems to have only // 1 second resolution, allow +1 second in commit // age tolerance: - long modTime = dir.fileModified(fileName); + SegmentInfos sis = new SegmentInfos(); + sis.read(dir, fileName); + long modTime = Long.parseLong(sis.getUserData().get("commitTime")); oneSecondResolution &= (modTime % 1000) == 0; final long leeway = (long) ((SECONDS + (oneSecondResolution ? 1.0:0.0))*1000); diff --git a/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java index 3bb525f53d9..3430c0399f5 100644 --- a/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -126,10 +126,6 @@ public class TestFieldsReader extends LuceneTestCase { return fsDir.fileExists(name); } @Override - public long fileModified(String name) throws IOException { - return fsDir.fileModified(name); - } - @Override public void deleteFile(String name) throws IOException { fsDir.deleteFile(name); } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexCommit.java b/lucene/src/test/org/apache/lucene/index/TestIndexCommit.java index d56d62d1bd2..a877b71f22b 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexCommit.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexCommit.java @@ -34,12 +34,10 @@ public class TestIndexCommit extends LuceneTestCase { IndexCommit ic1 = new IndexCommit() { @Override public String getSegmentsFileName() { return "a"; } - @Override public long getVersion() { return 12; } @Override public Directory getDirectory() { return dir; } @Override public Collection getFileNames() throws IOException { return null; } @Override public void delete() {} @Override public long getGeneration() { return 0; } - @Override public long getTimestamp() throws IOException { return 1;} @Override public Map getUserData() throws IOException { return null; } @Override public boolean isDeleted() { return false; } @Override public int getSegmentCount() { return 2; } @@ -47,12 +45,10 @@ public class TestIndexCommit extends LuceneTestCase { IndexCommit ic2 = new IndexCommit() { @Override public String getSegmentsFileName() { return "b"; } - @Override public long getVersion() { return 12; } @Override public Directory getDirectory() { return dir; } @Override public Collection getFileNames() throws IOException { return null; } @Override public void delete() {} @Override public long getGeneration() { return 0; } - @Override public long getTimestamp() throws IOException { return 1;} @Override public Map getUserData() throws IOException { return null; } @Override public boolean isDeleted() { return false; } @Override public int getSegmentCount() { return 2; } diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java index 7ce6e7dd769..2082b790dcb 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java @@ -381,60 +381,6 @@ public class TestIndexReader extends LuceneTestCase { _TestUtil.rmDir(dirFile); } - public void testLastModified() throws Exception { - for(int i=0;i<2;i++) { - final Directory dir = newDirectory(); - assertFalse(IndexReader.indexExists(dir)); - IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.CREATE)); - addDocumentWithFields(writer); - assertTrue(IndexWriter.isLocked(dir)); // writer open, so dir is locked - writer.close(); - assertTrue(IndexReader.indexExists(dir)); - IndexReader reader = IndexReader.open(dir); - assertFalse(IndexWriter.isLocked(dir)); // reader only, no lock - long version = IndexReader.lastModified(dir); - if (i == 1) { - long version2 = IndexReader.lastModified(dir); - assertEquals(version, version2); - } - reader.close(); - // modify index and check version has been - // incremented: - Thread.sleep(1000); - - writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.CREATE)); - addDocumentWithFields(writer); - writer.close(); - reader = IndexReader.open(dir); - assertTrue("old lastModified is " + version + "; new lastModified is " + IndexReader.lastModified(dir), version <= IndexReader.lastModified(dir)); - reader.close(); - dir.close(); - } - } - - public void testVersion() throws IOException { - Directory dir = newDirectory(); - assertFalse(IndexReader.indexExists(dir)); - IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))); - addDocumentWithFields(writer); - assertTrue(IndexWriter.isLocked(dir)); // writer open, so dir is locked - writer.close(); - assertTrue(IndexReader.indexExists(dir)); - IndexReader reader = IndexReader.open(dir); - assertFalse(IndexWriter.isLocked(dir)); // reader only, no lock - long version = IndexReader.getCurrentVersion(dir); - reader.close(); - // modify index and check version has been - // incremented: - writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(OpenMode.CREATE)); - addDocumentWithFields(writer); - writer.close(); - reader = IndexReader.open(dir); - assertTrue("old version is " + version + "; new version is " + IndexReader.getCurrentVersion(dir), version < IndexReader.getCurrentVersion(dir)); - reader.close(); - dir.close(); - } - public void testOpenReaderAfterDelete() throws IOException { File dirFile = _TestUtil.getTempDir("deletetest"); Directory dir = newFSDirectory(dirFile); diff --git a/lucene/src/test/org/apache/lucene/store/TestBufferedIndexInput.java b/lucene/src/test/org/apache/lucene/store/TestBufferedIndexInput.java index 2622e9332e9..7b9ba7e3fd2 100755 --- a/lucene/src/test/org/apache/lucene/store/TestBufferedIndexInput.java +++ b/lucene/src/test/org/apache/lucene/store/TestBufferedIndexInput.java @@ -348,12 +348,6 @@ public class TestBufferedIndexInput extends LuceneTestCase { dir.deleteFile(name); } @Override - public long fileModified(String name) - throws IOException - { - return dir.fileModified(name); - } - @Override public boolean fileExists(String name) throws IOException { diff --git a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java index 4ea6e8a43e1..3082c61c3d4 100644 --- a/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -89,11 +89,11 @@ public class TestFSTs extends LuceneTestCase { return br; } - private static IntsRef toIntsRef(String s, int inputMode) { + static IntsRef toIntsRef(String s, int inputMode) { return toIntsRef(s, inputMode, new IntsRef(10)); } - private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { + static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { if (inputMode == 0) { // utf8 return toIntsRef(new BytesRef(s), ir); @@ -103,7 +103,7 @@ public class TestFSTs extends LuceneTestCase { } } - private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { + static IntsRef toIntsRefUTF32(String s, IntsRef ir) { final int charLength = s.length(); int charIdx = 0; int intIdx = 0; @@ -120,7 +120,7 @@ public class TestFSTs extends LuceneTestCase { return ir; } - private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { + static IntsRef toIntsRef(BytesRef br, IntsRef ir) { if (br.length > ir.ints.length) { ir.grow(br.length); } @@ -172,7 +172,7 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final List> pairs = new ArrayList>(terms2.length); for(int idx=0;idx(terms2[idx], outputs.get(idx))); + pairs.add(new FSTTester.InputOutput(terms2[idx], (long) idx)); } final FST fst = new FSTTester(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false); assertNotNull(fst); @@ -230,7 +230,7 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final List> pairs = new ArrayList>(terms.length); for(int idx=0;idx(terms[idx], outputs.get(idx))); + pairs.add(new FSTTester.InputOutput(terms[idx], (long) idx)); } new FSTTester(random, dir, inputMode, pairs, outputs, true).doTest(); } @@ -244,7 +244,7 @@ public class TestFSTs extends LuceneTestCase { for(int idx=0;idx(terms[idx], outputs.get(value))); + pairs.add(new FSTTester.InputOutput(terms[idx], value)); } new FSTTester(random, dir, inputMode, pairs, outputs, doShare).doTest(); } @@ -254,7 +254,7 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); final List> pairs = new ArrayList>(terms.length); for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); + pairs.add(new FSTTester.InputOutput(terms[idx], random.nextLong() & Long.MAX_VALUE)); } new FSTTester(random, dir, inputMode, pairs, outputs, false).doTest(); } @@ -270,8 +270,7 @@ public class TestFSTs extends LuceneTestCase { final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000); lastOutput = value; pairs.add(new FSTTester.InputOutput>(terms[idx], - outputs.get(o1.get(idx), - o2.get(value)))); + outputs.newPair((long) idx, value))); } new FSTTester>(random, dir, inputMode, pairs, outputs, false).doTest(); } @@ -393,6 +392,7 @@ public class TestFSTs extends LuceneTestCase { final FST.Arc arc = fst.getFirstArc(new FST.Arc()); final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; + final FST.BytesReader fstReader = fst.getBytesReader(0); for(int i=0;i<=term.length;i++) { final int label; @@ -401,8 +401,9 @@ public class TestFSTs extends LuceneTestCase { } else { label = term.ints[term.offset+i]; } - //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); - if (fst.findTargetArc(label, arc, arc) == null) { + // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); + if (fst.findTargetArc(label, arc, arc, fstReader) == null) { + // System.out.println(" not found"); if (prefixLength != null) { prefixLength[0] = i; return output; @@ -462,16 +463,19 @@ public class TestFSTs extends LuceneTestCase { FST doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { if (VERBOSE) { - System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); + System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2); } + final boolean willRewrite = random.nextBoolean(); + final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, prune1, prune2, prune1==0 && prune2==0, allowRandomSuffixSharing ? random.nextBoolean() : true, allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, outputs, - null); + null, + willRewrite); for(InputOutput pair : pairs) { if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { @@ -486,7 +490,7 @@ public class TestFSTs extends LuceneTestCase { } FST fst = builder.finish(); - if (random.nextBoolean() && fst != null) { + if (random.nextBoolean() && fst != null && !willRewrite) { TestFSTs t = new TestFSTs(); IOContext context = t.newIOContext(random); IndexOutput out = dir.createOutput("fst.bin", context); @@ -522,6 +526,21 @@ public class TestFSTs extends LuceneTestCase { verifyPruned(inputMode, fst, prune1, prune2); } + if (willRewrite && fst != null) { + if (VERBOSE) { + System.out.println("TEST: now rewrite"); + } + final FST packed = fst.pack(_TestUtil.nextInt(random, 1, 10), _TestUtil.nextInt(random, 0, 10000000)); + if (VERBOSE) { + System.out.println("TEST: now verify packed FST"); + } + if (prune1 == 0 && prune2 == 0) { + verifyUnPruned(inputMode, packed); + } else { + verifyPruned(inputMode, packed, prune1, prune2); + } + } + return fst; } @@ -638,7 +657,7 @@ public class TestFSTs extends LuceneTestCase { num = atLeast(100); for(int iter=0;iter fst = builder.finish(); - //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount()); - // NOTE: we produce 7 nodes today - assertEquals(6, fst.getNodeCount()); - // NOTE: we produce 8 arcs today - assertEquals(7, fst.getNodeCount()); - //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); - //Util.toDot(fst, w, false, false); - //w.close(); - } - */ - - // NOTE: this test shows a case where our current builder - // fails to produce minimal FST: - /* - public void test4() throws Exception { - final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - IntsRef scratchIntsRef = new IntsRef(); - builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput()); - builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1")); - builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11")); - final FST fst = builder.finish(); - //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount()); - // NOTE: we produce 7 nodes today - assertEquals(6, fst.getNodeCount()); - // NOTE: we produce 8 arcs today - assertEquals(7, fst.getNodeCount()); - //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); - //Util.toDot(fst, w, false, false); - //w.close(); - } - */ - // Build FST for all unique terms in the test line docs // file, up until a time limit public void testRealTerms() throws Exception { @@ -1126,7 +1109,10 @@ public class TestFSTs extends LuceneTestCase { IndexReader r = IndexReader.open(writer, true); writer.close(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + + final boolean doRewrite = random.nextBoolean(); + + Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, doRewrite); boolean storeOrd = random.nextBoolean(); if (VERBOSE) { @@ -1162,59 +1148,69 @@ public class TestFSTs extends LuceneTestCase { } else { output = termsEnum.docFreq(); } - builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output)); + builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output); ord++; if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { System.out.println(ord + " terms..."); } } - final FST fst = builder.finish(); + FST fst = builder.finish(); if (VERBOSE) { System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); } if (ord > 0) { - // Now confirm BytesRefFSTEnum and TermsEnum act the - // same: - final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); - int num = atLeast(1000); - for(int iter=0;iter fstEnum = new BytesRefFSTEnum(fst); + int num = atLeast(1000); + for(int iter=0;iter nextResult = fstEnum.next(); + if (nextResult != null) { + System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); + fail(); + } + break; } - BytesRefFSTEnum.InputOutput nextResult = fstEnum.next(); - if (nextResult != null) { - System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output)); - fail(); - } - break; } } } @@ -1248,14 +1244,17 @@ public class TestFSTs extends LuceneTestCase { private int inputMode; private final Outputs outputs; private final Builder builder; + private final boolean doPack; - public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs) { + public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs, boolean doPack, boolean noArcArrays) { this.dirOut = dirOut; this.wordsFileIn = wordsFileIn; this.inputMode = inputMode; this.outputs = outputs; - - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null); + this.doPack = doPack; + + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, null, doPack); + builder.setAllowArrayArcs(!noArcArrays); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -1287,14 +1286,15 @@ public class TestFSTs extends LuceneTestCase { } assert builder.getTermCount() == ord; - final FST fst = builder.finish(); + FST fst = builder.finish(); if (fst == null) { System.out.println("FST was fully pruned!"); System.exit(0); } - if (dirOut == null) + if (dirOut == null) { return; + } System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); if (fst.getNodeCount() < 100) { @@ -1304,11 +1304,16 @@ public class TestFSTs extends LuceneTestCase { System.out.println("Wrote FST to out.dot"); } + if (doPack) { + System.out.println("Pack..."); + fst = fst.pack(4, 100000000); + System.out.println("New size " + fst.sizeInBytes() + " bytes"); + } + Directory dir = FSDirectory.open(new File(dirOut)); IndexOutput out = dir.createOutput("fst.bin", IOContext.DEFAULT); fst.save(out); out.close(); - System.out.println("Saved FST to fst.bin."); if (!verify) { @@ -1317,45 +1322,50 @@ public class TestFSTs extends LuceneTestCase { System.out.println("\nNow verify..."); - is.close(); - is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); - - ord = 0; - tStart = System.currentTimeMillis(); while(true) { - String w = is.readLine(); - if (w == null) { - break; - } - toIntsRef(w, inputMode, intsRef); - T expected = getOutput(intsRef, ord); - T actual = Util.get(fst, intsRef); - if (actual == null) { - throw new RuntimeException("unexpected null output on input=" + w); - } - if (!actual.equals(expected)) { - throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); + is.close(); + is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + + ord = 0; + tStart = System.currentTimeMillis(); + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + T expected = getOutput(intsRef, ord); + T actual = Util.get(fst, intsRef); + if (actual == null) { + throw new RuntimeException("unexpected null output on input=" + w); + } + if (!actual.equals(expected)) { + throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); + } + + ord++; + if (ord % 500000 == 0) { + System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); + } + if (ord >= limit) { + break; + } } - ord++; - if (ord % 500000 == 0) { - System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); - } - if (ord >= limit) { - break; - } + double totSec = ((System.currentTimeMillis() - tStart)/1000.0); + System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); + + // NOTE: comment out to profile lookup... + break; } - double totSec = ((System.currentTimeMillis() - tStart)/1000.0); - System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); - } finally { is.close(); } } } - // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out + // java -cp build/classes/test:build/classes/test-framework:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out public static void main(String[] args) throws IOException { int prune = 0; int limit = Integer.MAX_VALUE; @@ -1363,7 +1373,8 @@ public class TestFSTs extends LuceneTestCase { boolean storeOrds = false; boolean storeDocFreqs = false; boolean verify = true; - + boolean doPack = false; + boolean noArcArrays = false; String wordsFileIn = null; String dirOut = null; @@ -1381,10 +1392,14 @@ public class TestFSTs extends LuceneTestCase { inputMode = 1; } else if (args[idx].equals("-docFreq")) { storeDocFreqs = true; + } else if (args[idx].equals("-noArcArrays")) { + noArcArrays = true; } else if (args[idx].equals("-ords")) { storeOrds = true; } else if (args[idx].equals("-noverify")) { verify = false; + } else if (args[idx].equals("-pack")) { + doPack = true; } else if (args[idx].startsWith("-")) { System.err.println("Unrecognized option: " + args[idx]); System.exit(-1); @@ -1413,44 +1428,44 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); final PairOutputs outputs = new PairOutputs(o1, o2); - new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { Random rand; @Override public PairOutputs.Pair getOutput(IntsRef input, int ord) { if (ord == 0) { rand = new Random(17); } - return new PairOutputs.Pair(o1.get(ord), - o2.get(_TestUtil.nextInt(rand, 1, 5000))); + return outputs.newPair((long) ord, + (long) _TestUtil.nextInt(rand, 1, 5000)); } }.run(limit, verify); } else if (storeOrds) { // Store only ords final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { @Override public Long getOutput(IntsRef input, int ord) { - return outputs.get(ord); + return (long) ord; } }.run(limit, verify); } else if (storeDocFreqs) { // Store only docFreq final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { Random rand; @Override public Long getOutput(IntsRef input, int ord) { if (ord == 0) { rand = new Random(17); } - return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); + return (long) _TestUtil.nextInt(rand, 1, 5000); } }.run(limit, verify); } else { // Store nothing final NoOutputs outputs = NoOutputs.getSingleton(); final Object NO_OUTPUT = outputs.getNoOutput(); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { @Override public Object getOutput(IntsRef input, int ord) { return NO_OUTPUT; @@ -1468,6 +1483,46 @@ public class TestFSTs extends LuceneTestCase { assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); } + /* + public void testTrivial() throws Exception { + + // Get outputs -- passing true means FST will share + // (delta code) the outputs. This should result in + // smaller FST if the outputs grow monotonically. But + // if numbers are "random", false should give smaller + // final size: + final NoOutputs outputs = NoOutputs.getSingleton(); + + String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; + + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, + 0, 0, + true, + true, + Integer.MAX_VALUE, + outputs, + null, + true); + Arrays.sort(strings); + final IntsRef scratch = new IntsRef(); + for(String s : strings) { + builder.add(Util.toIntsRef(new BytesRef(s), scratch), outputs.getNoOutput()); + } + final FST fst = builder.finish(); + System.out.println("DOT before rewrite"); + Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot")); + Util.toDot(fst, w, false, false); + w.close(); + + final FST rewrite = new FST(fst, 1, 100); + + System.out.println("DOT after rewrite"); + w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot")); + Util.toDot(rewrite, w, false, false); + w.close(); + } + */ + public void testSimple() throws Exception { // Get outputs -- passing true means FST will share @@ -1484,9 +1539,9 @@ public class TestFSTs extends LuceneTestCase { final BytesRef b = new BytesRef("b"); final BytesRef c = new BytesRef("c"); - builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17)); - builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42)); - builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L)); + builder.add(Util.toIntsRef(a, new IntsRef()), 17L); + builder.add(Util.toIntsRef(b, new IntsRef()), 42L); + builder.add(Util.toIntsRef(c, new IntsRef()), 13824324872317238L); final FST fst = builder.finish(); @@ -1795,11 +1850,11 @@ public class TestFSTs extends LuceneTestCase { public void testFinalOutputOnEndState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null); - builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17)); - builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10)); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null, random.nextBoolean()); + builder.add(Util.toUTF32("stat", new IntsRef()), 17L); + builder.add(Util.toUTF32("station", new IntsRef()), 10L); final FST fst = builder.finish(); - //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); StringWriter w = new StringWriter(); Util.toDot(fst, w, false, false); w.close(); @@ -1809,8 +1864,8 @@ public class TestFSTs extends LuceneTestCase { public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); + final boolean willRewrite = random.nextBoolean(); + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null, willRewrite); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); final FST fst = builder.finish(); @@ -1819,17 +1874,23 @@ public class TestFSTs extends LuceneTestCase { Util.toDot(fst, w, false, false); w.close(); //System.out.println(w.toString()); - assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1); + final String expected; + if (willRewrite) { + expected = "4 -> 3 [label=\"t\" style=\"bold\""; + } else { + expected = "8 -> 6 [label=\"t\" style=\"bold\""; + } + assertTrue(w.toString().indexOf(expected) != -1); } // Make sure raw FST can differentiate between final vs // non-final end nodes - public void testNonFinalStopNodes() throws Exception { + public void testNonFinalStopNode() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final Long nothing = outputs.getNoOutput(); final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs, false); final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); @@ -1839,8 +1900,8 @@ public class TestFSTs extends LuceneTestCase { node.isFinal = true; rootNode.addArc('a', node); final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.address = fst.addNode(node); - rootNode.arcs[0].nextFinalOutput = outputs.get(17); + frozen.node = fst.addNode(node); + rootNode.arcs[0].nextFinalOutput = 17L; rootNode.arcs[0].isFinal = true; rootNode.arcs[0].output = nothing; rootNode.arcs[0].target = frozen; @@ -1851,13 +1912,18 @@ public class TestFSTs extends LuceneTestCase { final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); rootNode.addArc('b', node); final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.address = fst.addNode(node); + frozen.node = fst.addNode(node); rootNode.arcs[1].nextFinalOutput = nothing; - rootNode.arcs[1].output = outputs.get(42); + rootNode.arcs[1].output = 42L; rootNode.arcs[1].target = frozen; } fst.finish(fst.addNode(rootNode)); + + StringWriter w = new StringWriter(); + //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); + Util.toDot(fst, w, false, false); + w.close(); checkStopNodes(fst, outputs); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 97cf3a64c41..30bad751566 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -226,6 +226,9 @@ public final class SynonymFilter extends TokenFilter { private final FST fst; + private final FST.BytesReader fstReader; + + private final BytesRef scratchBytes = new BytesRef(); private final CharsRef scratchChars = new CharsRef(); @@ -241,7 +244,7 @@ public final class SynonymFilter extends TokenFilter { this.synonyms = synonyms; this.ignoreCase = ignoreCase; this.fst = synonyms.fst; - + this.fstReader = fst.getBytesReader(0); if (fst == null) { throw new IllegalArgumentException("fst must be non-null"); } @@ -366,7 +369,7 @@ public final class SynonymFilter extends TokenFilter { int bufUpto = 0; while(bufUpto < bufferLen) { final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); - if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) { + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { //System.out.println(" stop"); break byToken; } @@ -388,7 +391,7 @@ public final class SynonymFilter extends TokenFilter { // See if the FST wants to continue matching (ie, needs to // see the next input token): - if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) { + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) { // No further rules can match here; we're done // searching for matching rules starting at the // current input position. diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java index 0ee9786ffbe..553724199ff 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoFST.java @@ -47,16 +47,17 @@ public final class TokenInfoFST { FST.Arc firstArc = new FST.Arc(); fst.getFirstArc(firstArc); FST.Arc arc = new FST.Arc(); + final FST.BytesReader fstReader = fst.getBytesReader(0); // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs) for (int i = 0; i < rootCache.length; i++) { - if (fst.findTargetArc(0x3040 + i, firstArc, arc) != null) { + if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) { rootCache[i] = new FST.Arc().copyFrom(arc); } } return rootCache; } - public FST.Arc findTargetArc(int ch, FST.Arc follow, FST.Arc arc, boolean useCache) throws IOException { + public FST.Arc findTargetArc(int ch, FST.Arc follow, FST.Arc arc, boolean useCache, FST.BytesReader fstReader) throws IOException { if (useCache && ch >= 0x3040 && ch <= cacheCeiling) { assert ch != FST.END_LABEL; final Arc result = rootCache[ch - 0x3040]; @@ -67,13 +68,17 @@ public final class TokenInfoFST { return arc; } } else { - return fst.findTargetArc(ch, follow, arc); + return fst.findTargetArc(ch, follow, arc, fstReader); } } public Arc getFirstArc(FST.Arc arc) { return fst.getFirstArc(arc); } + + public FST.BytesReader getBytesReader(int pos) { + return fst.getBytesReader(pos); + } /** @lucene.internal for testing only */ FST getInternalFST() { diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java index cd7dd2f9dc8..27e54a4ddde 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java @@ -113,7 +113,7 @@ public final class UserDictionary implements Dictionary { for (int i = 0; i < token.length(); i++) { scratch.ints[i] = (int) token.charAt(i); } - fstBuilder.add(scratch, fstOutput.get(ord)); + fstBuilder.add(scratch, ord); segmentations.add(wordIdAndLength); ord++; } @@ -134,6 +134,8 @@ public final class UserDictionary implements Dictionary { TreeMap result = new TreeMap(); // index, [length, length...] boolean found = false; // true if we found any results + final FST.BytesReader fstReader = fst.getBytesReader(0); + FST.Arc arc = new FST.Arc(); int end = off + len; for (int startOffset = off; startOffset < end; startOffset++) { @@ -142,7 +144,7 @@ public final class UserDictionary implements Dictionary { int remaining = end - startOffset; for (int i = 0; i < remaining; i++) { int ch = chars[startOffset+i]; - if (fst.findTargetArc(ch, arc, arc, i == 0) == null) { + if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { break; // continue to next position } output += arc.output.intValue(); diff --git a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java index 8de876ee77a..e44235182eb 100644 --- a/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java +++ b/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java @@ -35,7 +35,7 @@ import org.apache.lucene.util.fst.FST; public class Viterbi { private final TokenInfoFST fst; - + private final TokenInfoDictionary dictionary; private final UnknownDictionary unkDictionary; @@ -214,6 +214,8 @@ public class Viterbi { ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN); addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr); + final FST.BytesReader fstReader = fst.getBytesReader(0); + // Process user dictionary; if (useUserDictionary) { processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr); @@ -238,7 +240,7 @@ public class Viterbi { for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) { int ch = text[suffixStart + endIndex - 1]; - if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) { + if (fst.findTargetArc(ch, arc, arc, endIndex == 1, fstReader) == null) { break; // continue to next position } output += arc.output.intValue(); diff --git a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat index dbb49c4f1ea..8fd2138c066 100644 Binary files a/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat and b/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat differ diff --git a/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java b/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java index 609c8348379..ef1bf3dc81c 100644 --- a/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java +++ b/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java @@ -131,7 +131,7 @@ public class TokenInfoDictionaryBuilder { System.out.println(" encode..."); PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton(true); - Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, fstOutput); + Builder fstBuilder = new Builder(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, null, true); IntsRef scratch = new IntsRef(); long ord = -1; // first ord will be 0 String lastValue = null; @@ -155,13 +155,14 @@ public class TokenInfoDictionaryBuilder { for (int i = 0; i < token.length(); i++) { scratch.ints[i] = (int) token.charAt(i); } - fstBuilder.add(scratch, fstOutput.get(ord)); + fstBuilder.add(scratch, ord); } dictionary.addMapping((int)ord, offset); offset = next; } - FST fst = fstBuilder.finish(); + final FST fst = fstBuilder.finish().pack(2, 100000); + System.out.print(" " + fst.getNodeCount() + " nodes, " + fst.getArcCount() + " arcs, " + fst.sizeInBytes() + " bytes... "); dictionary.setFST(fst); System.out.println(" done"); diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java index 4263fbeec13..23a8df2de49 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java @@ -329,8 +329,11 @@ public class FSTCompletion { private boolean descendWithPrefix(Arc arc, BytesRef utf8) throws IOException { final int max = utf8.offset + utf8.length; + // Cannot save as instance var since multiple threads + // can use FSTCompletion at once... + final FST.BytesReader fstReader = automaton.getBytesReader(0); for (int i = utf8.offset; i < max; i++) { - if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc) == null) { + if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) { // No matching prefixes, return an empty result. return false; } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index 914fcea472e..01f2b34df38 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -234,7 +234,7 @@ public class FSTCompletionBuilder { final Object empty = outputs.getNoOutput(); final Builder builder = new Builder( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, - shareMaxTailLength, outputs, null); + shareMaxTailLength, outputs, null, false); BytesRef scratch = new BytesRef(); final IntsRef scratchIntsRef = new IntsRef(); diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java index 4a252e78d9b..9264b14e384 100644 --- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java +++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java @@ -176,20 +176,24 @@ public class Overseer implements NodeStateChangeListener, ShardLeaderListener { return false; } /** - * Try to assign core to the cluster + * Try to assign core to the cluster. * @throws KeeperException * @throws InterruptedException */ private CloudState updateState(CloudState state, String nodeName, CoreState coreState) throws KeeperException, InterruptedException { String collection = coreState.getCollectionName(); String zkCoreNodeName = coreState.getCoreNodeName(); - - String shardId; - if (coreState.getProperties().get(ZkStateReader.SHARD_ID_PROP) == null) { - shardId = AssignShard.assignShard(collection, state); - } else { - shardId = coreState.getProperties().get(ZkStateReader.SHARD_ID_PROP); - } + + // use the provided non null shardId + String shardId = coreState.getProperties().get(ZkStateReader.SHARD_ID_PROP); + if(shardId==null) { + //use shardId from CloudState + shardId = getAssignedId(state, nodeName, coreState); + } + if(shardId==null) { + //request new shardId + shardId = AssignShard.assignShard(collection, state); + } Map props = new HashMap(); for (Entry entry : coreState.getProperties().entrySet()) { @@ -209,6 +213,23 @@ public class Overseer implements NodeStateChangeListener, ShardLeaderListener { CloudState newCloudState = updateSlice(state, collection, slice); return newCloudState; } + + /* + * Return an already assigned id or null if not assigned + */ + private String getAssignedId(final CloudState state, final String nodeName, + final CoreState coreState) { + final String key = coreState.getProperties().get(ZkStateReader.NODE_NAME_PROP) + "_" + coreState.getProperties().get(ZkStateReader.CORE_NAME_PROP); + Map slices = state.getSlices(coreState.getCollectionName()); + if (slices != null) { + for (Slice slice : slices.values()) { + if (slice.getShards().get(key) != null) { + return slice.getName(); + } + } + } + return null; + } private CloudState updateSlice(CloudState state, String collection, Slice slice) { @@ -480,6 +501,7 @@ public class Overseer implements NodeStateChangeListener, ShardLeaderListener { Set downNodes = complement(oldLiveNodes, liveNodes); for(String node: downNodes) { NodeStateWatcher watcher = nodeStateWatches.remove(node); + log.debug("Removed NodeStateWatcher for node:" + node); } } @@ -491,6 +513,7 @@ public class Overseer implements NodeStateChangeListener, ShardLeaderListener { if (!nodeStateWatches.containsKey(nodeName)) { zkCmdExecutor.ensureExists(path, zkClient); nodeStateWatches.put(nodeName, new NodeStateWatcher(zkClient, nodeName, path, this)); + log.debug("Added NodeStateWatcher for node " + nodeName); } else { log.debug("watch already added"); } diff --git a/solr/core/src/java/org/apache/solr/core/IndexDeletionPolicyWrapper.java b/solr/core/src/java/org/apache/solr/core/IndexDeletionPolicyWrapper.java index d11129277d4..6b33e9cf672 100644 --- a/solr/core/src/java/org/apache/solr/core/IndexDeletionPolicyWrapper.java +++ b/solr/core/src/java/org/apache/solr/core/IndexDeletionPolicyWrapper.java @@ -19,6 +19,7 @@ package org.apache.solr.core; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexDeletionPolicy; import org.apache.lucene.store.Directory; +import org.apache.solr.update.SolrIndexWriter; import java.io.IOException; import java.util.*; @@ -65,13 +66,13 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { /** * Set the duration for which commit point is to be reserved by the deletion policy. * - * @param indexVersion version of the commit point to be reserved + * @param indexGen gen of the commit point to be reserved * @param reserveTime time in milliseconds for which the commit point is to be reserved */ - public void setReserveDuration(Long indexVersion, long reserveTime) { + public void setReserveDuration(Long indexGen, long reserveTime) { long timeToSet = System.currentTimeMillis() + reserveTime; for(;;) { - Long previousTime = reserves.put(indexVersion, timeToSet); + Long previousTime = reserves.put(indexGen, timeToSet); // this is the common success case: the older time didn't exist, or // came before the new time. @@ -103,19 +104,19 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { /** Permanently prevent this commit point from being deleted. * A counter is used to allow a commit point to be correctly saved and released * multiple times. */ - public synchronized void saveCommitPoint(Long indexCommitVersion) { - AtomicInteger reserveCount = savedCommits.get(indexCommitVersion); + public synchronized void saveCommitPoint(Long indexCommitGen) { + AtomicInteger reserveCount = savedCommits.get(indexCommitGen); if (reserveCount == null) reserveCount = new AtomicInteger(); reserveCount.incrementAndGet(); - savedCommits.put(indexCommitVersion, reserveCount); + savedCommits.put(indexCommitGen, reserveCount); } /** Release a previously saved commit point */ - public synchronized void releaseCommitPoint(Long indexCommitVersion) { - AtomicInteger reserveCount = savedCommits.get(indexCommitVersion); + public synchronized void releaseCommitPoint(Long indexCommitGen) { + AtomicInteger reserveCount = savedCommits.get(indexCommitGen); if (reserveCount == null) return;// this should not happen if (reserveCount.decrementAndGet() <= 0) { - savedCommits.remove(indexCommitVersion); + savedCommits.remove(indexCommitGen); } } @@ -165,10 +166,10 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { @Override public void delete() { - Long version = delegate.getVersion(); - Long reserve = reserves.get(version); + Long gen = delegate.getGeneration(); + Long reserve = reserves.get(gen); if (reserve != null && System.currentTimeMillis() < reserve) return; - if(savedCommits.containsKey(version)) return; + if(savedCommits.containsKey(gen)) return; delegate.delete(); } @@ -187,11 +188,6 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { return delegate.hashCode(); } - @Override - public long getVersion() { - return delegate.getVersion(); - } - @Override public long getGeneration() { return delegate.getGeneration(); @@ -202,11 +198,6 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { return delegate.isDeleted(); } - @Override - public long getTimestamp() throws IOException { - return delegate.getTimestamp(); - } - @Override public Map getUserData() throws IOException { return delegate.getUserData(); @@ -214,11 +205,11 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { } /** - * @param version the version of the commit point + * @param gen the gen of the commit point * @return a commit point corresponding to the given version */ - public IndexCommit getCommitPoint(Long version) { - return solrVersionVsCommits.get(version); + public IndexCommit getCommitPoint(Long gen) { + return solrVersionVsCommits.get(gen); } /** @@ -236,10 +227,20 @@ public class IndexDeletionPolicyWrapper implements IndexDeletionPolicy { Map map = new ConcurrentHashMap(); for (IndexCommitWrapper wrapper : list) { if (!wrapper.isDeleted()) - map.put(wrapper.getVersion(), wrapper.delegate); + map.put(wrapper.delegate.getGeneration(), wrapper.delegate); } solrVersionVsCommits = map; latestCommit = ((list.get(list.size() - 1)).delegate); } + + public static long getCommitTimestamp(IndexCommit commit) throws IOException { + final Map commitData = commit.getUserData(); + String commitTime = commitData.get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); + if (commitTime != null) { + return Long.parseLong(commitTime); + } else { + return 0; + } + } } diff --git a/solr/core/src/java/org/apache/solr/core/SolrDeletionPolicy.java b/solr/core/src/java/org/apache/solr/core/SolrDeletionPolicy.java index 9f119b0f5c6..d8035e48718 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrDeletionPolicy.java +++ b/solr/core/src/java/org/apache/solr/core/SolrDeletionPolicy.java @@ -87,7 +87,6 @@ public class SolrDeletionPolicy implements IndexDeletionPolicy, NamedListInitial } sb.append(",segFN=").append(commit.getSegmentsFileName()); - sb.append(",version=").append(commit.getVersion()); sb.append(",generation=").append(commit.getGeneration()); sb.append(",filenames=").append(commit.getFileNames()); } catch (Exception e) { @@ -133,7 +132,7 @@ public class SolrDeletionPolicy implements IndexDeletionPolicy, NamedListInitial synchronized (this) { long maxCommitAgeTimeStamp = -1L; IndexCommit newest = commits.get(commits.size() - 1); - log.info("newest commit = " + newest.getVersion()); + log.info("newest commit = " + newest.getGeneration()); int singleSegKept = (newest.getSegmentCount() == 1) ? 1 : 0; int totalKept = 1; @@ -149,7 +148,7 @@ public class SolrDeletionPolicy implements IndexDeletionPolicy, NamedListInitial DateMathParser dmp = new DateMathParser(DateField.UTC, Locale.US); maxCommitAgeTimeStamp = dmp.parseMath(maxCommitAge).getTime(); } - if (commit.getTimestamp() < maxCommitAgeTimeStamp) { + if (IndexDeletionPolicyWrapper.getCommitTimestamp(commit) < maxCommitAgeTimeStamp) { commit.delete(); continue; } @@ -191,8 +190,6 @@ public class SolrDeletionPolicy implements IndexDeletionPolicy, NamedListInitial sb.append('/'); sb.append(commit.getGeneration()); - sb.append('_'); - sb.append(commit.getVersion()); return sb.toString(); } diff --git a/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java b/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java index bae9c78232c..9557f035c23 100644 --- a/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/ReplicationHandler.java @@ -60,6 +60,7 @@ import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.BinaryQueryResponseWriter; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.update.SolrIndexWriter; import org.apache.solr.util.NumberUtils; import org.apache.solr.util.RefCounted; import org.apache.solr.util.plugin.SolrCoreAware; @@ -139,8 +140,8 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw // in a catastrophic failure, but will result in the client getting an empty file list for // the CMD_GET_FILE_LIST command. // - core.getDeletionPolicy().setReserveDuration(commitPoint.getVersion(), reserveCommitDuration); - rsp.add(CMD_INDEX_VERSION, commitPoint.getVersion()); + core.getDeletionPolicy().setReserveDuration(commitPoint.getGeneration(), reserveCommitDuration); + rsp.add(CMD_INDEX_VERSION, core.getDeletionPolicy().getCommitTimestamp(commitPoint)); rsp.add(GENERATION, commitPoint.getGeneration()); } else { // This happens when replication is not configured to happen after startup and no commit/optimize @@ -219,7 +220,7 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw for (IndexCommit c : commits.values()) { try { NamedList nl = new NamedList(); - nl.add("indexVersion", c.getVersion()); + nl.add("indexVersion", core.getDeletionPolicy().getCommitTimestamp(c)); nl.add(GENERATION, c.getGeneration()); nl.add(CMD_GET_FILE_LIST, c.getFileNames()); l.add(nl); @@ -332,21 +333,21 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw @SuppressWarnings("unchecked") private void getFileList(SolrParams solrParams, SolrQueryResponse rsp) { - String v = solrParams.get(CMD_INDEX_VERSION); + String v = solrParams.get(GENERATION); if (v == null) { - rsp.add("status", "no indexversion specified"); + rsp.add("status", "no index generation specified"); return; } - long version = Long.parseLong(v); - IndexCommit commit = core.getDeletionPolicy().getCommitPoint(version); + long gen = Long.parseLong(v); + IndexCommit commit = core.getDeletionPolicy().getCommitPoint(gen); //System.out.println("ask for files for gen:" + commit.getGeneration() + core.getCoreDescriptor().getCoreContainer().getZkController().getNodeName()); if (commit == null) { - rsp.add("status", "invalid indexversion"); + rsp.add("status", "invalid index generation"); return; } // reserve the indexcommit for sometime - core.getDeletionPolicy().setReserveDuration(version, reserveCommitDuration); + core.getDeletionPolicy().setReserveDuration(gen, reserveCommitDuration); List> result = new ArrayList>(); try { //get all the files in the commit @@ -359,10 +360,10 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw result.add(fileMeta); } } catch (IOException e) { - rsp.add("status", "unable to get file names for given indexversion"); + rsp.add("status", "unable to get file names for given index generation"); rsp.add("exception", e); - LOG.warn("Unable to get file names for indexCommit version: " - + version, e); + LOG.warn("Unable to get file names for indexCommit generation: " + + gen, e); } rsp.add(CMD_GET_FILE_LIST, result); if (confFileNameAlias.size() < 1) @@ -487,8 +488,13 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw long version[] = new long[2]; RefCounted searcher = core.getSearcher(); try { - version[0] = searcher.get().getIndexReader().getIndexCommit().getVersion(); - version[1] = searcher.get().getIndexReader().getIndexCommit().getGeneration(); + final IndexCommit commit = searcher.get().getIndexReader().getIndexCommit(); + final Map commitData = commit.getUserData(); + String commitTime = commitData.get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); + if (commitTime != null) { + version[0] = Long.parseLong(commitTime); + } + version[1] = commit.getGeneration(); } catch (IOException e) { LOG.warn("Unable to get index version : ", e); } finally { @@ -574,7 +580,6 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw } if (isMaster && commit != null) { - master.add("replicatableIndexVersion", commit.getVersion()); master.add("replicatableGeneration", commit.getGeneration()); } @@ -846,7 +851,7 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw Collection commits = IndexReader.listCommits(reader.directory()); for (IndexCommit ic : commits) { if(ic.getSegmentCount() == 1){ - if(indexCommitPoint == null || indexCommitPoint.getVersion() < ic.getVersion()) indexCommitPoint = ic; + if(indexCommitPoint == null || indexCommitPoint.getGeneration() < ic.getGeneration()) indexCommitPoint = ic; } } } else{ @@ -857,7 +862,7 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw // always saves the last commit point (and the last optimized commit point, if needed) /*** if(indexCommitPoint != null){ - core.getDeletionPolicy().saveCommitPoint(indexCommitPoint.getVersion()); + core.getDeletionPolicy().saveCommitPoint(indexCommitPoint.getGeneration()); } ***/ } @@ -958,10 +963,10 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw // always saves the last commit point (and the last optimized commit point, if needed) /*** if (indexCommitPoint != null) { - core.getDeletionPolicy().saveCommitPoint(indexCommitPoint.getVersion()); + core.getDeletionPolicy().saveCommitPoint(indexCommitPoint.getGeneration()); } if(oldCommitPoint != null){ - core.getDeletionPolicy().releaseCommitPoint(oldCommitPoint.getVersion()); + core.getDeletionPolicy().releaseCommitPoint(oldCommitPoint.getGeneration()); } ***/ } @@ -989,7 +994,7 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw private FastOutputStream fos; - private Long indexVersion; + private Long indexGen; private IndexDeletionPolicyWrapper delPolicy; public FileStream(SolrParams solrParams) { @@ -1004,8 +1009,8 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw String sLen = params.get(LEN); String compress = params.get(COMPRESSION); String sChecksum = params.get(CHECKSUM); - String sindexVersion = params.get(CMD_INDEX_VERSION); - if (sindexVersion != null) indexVersion = Long.parseLong(sindexVersion); + String sGen = params.get(GENERATION); + if (sGen != null) indexGen = Long.parseLong(sGen); if (Boolean.parseBoolean(compress)) { fos = new FastOutputStream(new DeflaterOutputStream(out)); } else { @@ -1063,9 +1068,9 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw } fos.write(buf, 0, (int) bytesRead); fos.flush(); - if (indexVersion != null && (packetsWritten % 5 == 0)) { + if (indexGen != null && (packetsWritten % 5 == 0)) { //after every 5 packets reserve the commitpoint for some time - delPolicy.setReserveDuration(indexVersion, reserveCommitDuration); + delPolicy.setReserveDuration(indexGen, reserveCommitDuration); } packetsWritten++; } diff --git a/solr/core/src/java/org/apache/solr/handler/SnapPuller.java b/solr/core/src/java/org/apache/solr/handler/SnapPuller.java index 9c900fddeaa..9cef5096b9f 100644 --- a/solr/core/src/java/org/apache/solr/handler/SnapPuller.java +++ b/solr/core/src/java/org/apache/solr/handler/SnapPuller.java @@ -28,6 +28,7 @@ import org.apache.solr.common.util.FileUtils; import org.apache.solr.common.util.JavaBinCodec; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; +import org.apache.solr.core.IndexDeletionPolicyWrapper; import static org.apache.solr.handler.ReplicationHandler.*; import org.apache.solr.request.LocalSolrQueryRequest; @@ -210,10 +211,10 @@ public class SnapPuller { /** * Fetches the list of files in a given index commit point */ - void fetchFileList(long version) throws IOException { + void fetchFileList(long gen) throws IOException { PostMethod post = new PostMethod(masterUrl); post.addParameter(COMMAND, CMD_GET_FILE_LIST); - post.addParameter(CMD_INDEX_VERSION, String.valueOf(version)); + post.addParameter(GENERATION, String.valueOf(gen)); post.addParameter("wt", "javabin"); @SuppressWarnings("unchecked") @@ -225,7 +226,7 @@ public class SnapPuller { filesToDownload = Collections.synchronizedList(f); else { filesToDownload = Collections.emptyList(); - LOG.error("No files to download for indexversion: "+ version); + LOG.error("No files to download for index generation: "+ gen); } f = nl.get(CONF_FILES); @@ -274,7 +275,7 @@ public class SnapPuller { } if (latestVersion == 0L) { - if (force && commit.getVersion() != 0) { + if (force && commit.getGeneration() != 0) { // since we won't get the files for an empty index, // we just clear ours and commit core.getUpdateHandler().getSolrCoreState().getIndexWriter(core).deleteAll(); @@ -288,17 +289,17 @@ public class SnapPuller { return true; } - if (!force && commit.getVersion() == latestVersion && commit.getGeneration() == latestGeneration) { + if (!force && IndexDeletionPolicyWrapper.getCommitTimestamp(commit) == latestVersion) { //master and slave are already in sync just return LOG.info("Slave in sync with master."); successfulInstall = true; return true; } - LOG.info("Master's version: " + latestVersion + ", generation: " + latestGeneration); - LOG.info("Slave's version: " + commit.getVersion() + ", generation: " + commit.getGeneration()); + LOG.info("Master's generation: " + latestGeneration); + LOG.info("Slave's generation: " + commit.getGeneration()); LOG.info("Starting replication process"); // get the list of files first - fetchFileList(latestVersion); + fetchFileList(latestGeneration); // this can happen if the commit point is deleted before we fetch the file list. if(filesToDownload.isEmpty()) return false; LOG.info("Number of files in latest index in master: " + filesToDownload.size()); @@ -309,7 +310,7 @@ public class SnapPuller { filesDownloaded = Collections.synchronizedList(new ArrayList>()); // if the generateion of master is older than that of the slave , it means they are not compatible to be copied // then a new index direcory to be created and all the files need to be copied - boolean isFullCopyNeeded = commit.getVersion() >= latestVersion || force; + boolean isFullCopyNeeded = IndexDeletionPolicyWrapper.getCommitTimestamp(commit) >= latestVersion || force; File tmpIndexDir = createTempindexDir(core); if (isIndexStale()) isFullCopyNeeded = true; @@ -318,11 +319,11 @@ public class SnapPuller { File indexDir = null ; try { indexDir = new File(core.getIndexDir()); - downloadIndexFiles(isFullCopyNeeded, tmpIndexDir, latestVersion); + downloadIndexFiles(isFullCopyNeeded, tmpIndexDir, latestGeneration); LOG.info("Total time taken for download : " + ((System.currentTimeMillis() - replicationStartTime) / 1000) + " secs"); Collection> modifiedConfFiles = getModifiedConfFiles(confFilesToDownload); if (!modifiedConfFiles.isEmpty()) { - downloadConfFiles(confFilesToDownload, latestVersion); + downloadConfFiles(confFilesToDownload, latestGeneration); if (isFullCopyNeeded) { successfulInstall = modifyIndexProps(tmpIndexDir.getName()); deleteTmpIdxDir = false; @@ -530,7 +531,7 @@ public class SnapPuller { }.start(); } - private void downloadConfFiles(List> confFilesToDownload, long latestVersion) throws Exception { + private void downloadConfFiles(List> confFilesToDownload, long latestGeneration) throws Exception { LOG.info("Starting download of configuration files from master: " + confFilesToDownload); confFilesDownloaded = Collections.synchronizedList(new ArrayList>()); File tmpconfDir = new File(solrCore.getResourceLoader().getConfigDir(), "conf." + getDateAsStr(new Date())); @@ -542,7 +543,7 @@ public class SnapPuller { } for (Map file : confFilesToDownload) { String saveAs = (String) (file.get(ALIAS) == null ? file.get(NAME) : file.get(ALIAS)); - fileFetcher = new FileFetcher(tmpconfDir, file, saveAs, true, latestVersion); + fileFetcher = new FileFetcher(tmpconfDir, file, saveAs, true, latestGeneration); currentFile = file; fileFetcher.fetchFile(); confFilesDownloaded.add(new HashMap(file)); @@ -561,13 +562,13 @@ public class SnapPuller { * * @param downloadCompleteIndex is it a fresh index copy * @param tmpIdxDir the directory to which files need to be downloadeed to - * @param latestVersion the version number + * @param latestGeneration the version number */ - private void downloadIndexFiles(boolean downloadCompleteIndex, File tmpIdxDir, long latestVersion) throws Exception { + private void downloadIndexFiles(boolean downloadCompleteIndex, File tmpIdxDir, long latestGeneration) throws Exception { for (Map file : filesToDownload) { File localIndexFile = new File(solrCore.getIndexDir(), (String) file.get(NAME)); if (!localIndexFile.exists() || downloadCompleteIndex) { - fileFetcher = new FileFetcher(tmpIdxDir, file, (String) file.get(NAME), false, latestVersion); + fileFetcher = new FileFetcher(tmpIdxDir, file, (String) file.get(NAME), false, latestGeneration); currentFile = file; fileFetcher.fetchFile(); filesDownloaded.add(new HashMap(file)); @@ -578,7 +579,7 @@ public class SnapPuller { } /** - * All the files which are common between master and slave must have same timestamp and size else we assume they are + * All the files which are common between master and slave must have same size else we assume they are * not compatible (stale). * * @return true if the index stale and we need to download a fresh copy, false otherwise. @@ -896,10 +897,10 @@ public class SnapPuller { private boolean aborted = false; - private Long indexVersion; + private Long indexGen; FileFetcher(File dir, Map fileDetails, String saveAs, - boolean isConf, long latestVersion) throws IOException { + boolean isConf, long latestGen) throws IOException { this.copy2Dir = dir; this.fileName = (String) fileDetails.get(NAME); this.size = (Long) fileDetails.get(SIZE); @@ -908,7 +909,7 @@ public class SnapPuller { if(fileDetails.get(LAST_MODIFIED) != null){ lastmodified = (Long)fileDetails.get(LAST_MODIFIED); } - indexVersion = latestVersion; + indexGen = latestGen; this.file = new File(copy2Dir, saveAs); @@ -1077,7 +1078,7 @@ public class SnapPuller { //the method is command=filecontent post.addParameter(COMMAND, CMD_GET_FILE); //add the version to download. This is used to reserve the download - post.addParameter(CMD_INDEX_VERSION, indexVersion.toString()); + post.addParameter(GENERATION, indexGen.toString()); if (isConf) { //set cf instead of file for config file post.addParameter(CONF_FILE_SHORT, fileName); diff --git a/solr/core/src/java/org/apache/solr/handler/SnapShooter.java b/solr/core/src/java/org/apache/solr/handler/SnapShooter.java index 84cf74ae067..bbfd63fb4a9 100644 --- a/solr/core/src/java/org/apache/solr/handler/SnapShooter.java +++ b/solr/core/src/java/org/apache/solr/handler/SnapShooter.java @@ -70,7 +70,7 @@ public class SnapShooter { } void createSnapAsync(final IndexCommit indexCommit, final int numberToKeep, final ReplicationHandler replicationHandler) { - replicationHandler.core.getDeletionPolicy().saveCommitPoint(indexCommit.getVersion()); + replicationHandler.core.getDeletionPolicy().saveCommitPoint(indexCommit.getGeneration()); new Thread() { @Override @@ -112,7 +112,7 @@ public class SnapShooter { LOG.error("Exception while creating snapshot", e); details.add("snapShootException", e.getMessage()); } finally { - replicationHandler.core.getDeletionPolicy().releaseCommitPoint(indexCommit.getVersion()); + replicationHandler.core.getDeletionPolicy().releaseCommitPoint(indexCommit.getGeneration()); replicationHandler.snapShootDetails = details; if (lock != null) { try { diff --git a/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java index 44c8e8fadfd..99148fc55fe 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java @@ -48,6 +48,7 @@ import org.apache.solr.handler.RequestHandlerBase; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.FieldType; +import org.apache.solr.update.SolrIndexWriter; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.SolrIndexSearcher; @@ -562,7 +563,10 @@ public class LukeRequestHandler extends RequestHandlerBase indexInfo.add("current", reader.isCurrent() ); indexInfo.add("hasDeletions", reader.hasDeletions() ); indexInfo.add("directory", dir ); - indexInfo.add("lastModified", new Date(IndexReader.lastModified(dir)) ); + String s = reader.getIndexCommit().getUserData().get(SolrIndexWriter.COMMIT_TIME_MSEC_KEY); + if (s != null) { + indexInfo.add("lastModified", new Date(Long.parseLong(s))); + } return indexInfo; } //////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/core/src/java/org/apache/solr/servlet/cache/HttpCacheHeaderUtil.java b/solr/core/src/java/org/apache/solr/servlet/cache/HttpCacheHeaderUtil.java index ce1d55680be..ece1e6b8502 100644 --- a/solr/core/src/java/org/apache/solr/servlet/cache/HttpCacheHeaderUtil.java +++ b/solr/core/src/java/org/apache/solr/servlet/cache/HttpCacheHeaderUtil.java @@ -29,6 +29,7 @@ import javax.servlet.http.HttpServletResponse; import org.apache.lucene.index.IndexReader; +import org.apache.solr.core.IndexDeletionPolicyWrapper; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrConfig; import org.apache.solr.core.SolrConfig.HttpCachingConfig.LastModFrom; @@ -157,7 +158,7 @@ public final class HttpCacheHeaderUtil { // assume default, change if needed (getOpenTime() should be fast) lastMod = LastModFrom.DIRLASTMOD == lastModFrom - ? IndexReader.lastModified(searcher.getIndexReader().directory()) + ? IndexDeletionPolicyWrapper.getCommitTimestamp(searcher.getIndexReader().getIndexCommit()) : searcher.getOpenTime(); } catch (IOException e) { // we're pretty freaking screwed if this happens diff --git a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java index 6818eecb4d4..bbebe1672da 100644 --- a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java +++ b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java @@ -22,6 +22,8 @@ package org.apache.solr.update; import java.io.IOException; import java.net.URL; +import java.util.HashMap; +import java.util.Map; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicLong; @@ -387,7 +389,9 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState } // SolrCore.verbose("writer.commit() start writer=",writer); - writer.commit(); + final Map commitData = new HashMap(); + commitData.put(SolrIndexWriter.COMMIT_TIME_MSEC_KEY, String.valueOf(System.currentTimeMillis())); + writer.commit(commitData); // SolrCore.verbose("writer.commit() end"); numDocsPending.set(0); callPostCommitCallbacks(); diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java index c1aa1e5ac95..bd561dcc18b 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java @@ -50,6 +50,11 @@ public class SolrIndexWriter extends IndexWriter { public static final AtomicLong numOpens = new AtomicLong(); public static final AtomicLong numCloses = new AtomicLong(); + + /** Stored into each Lucene commit to record the + * System.currentTimeMillis() when commit was called. */ + public static final String COMMIT_TIME_MSEC_KEY = "commitTimeMSec"; + String name; private DirectoryFactory directoryFactory; diff --git a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java index ac6488394e0..34f54679fc9 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java +++ b/solr/core/src/test/org/apache/solr/cloud/ChaosMonkey.java @@ -19,7 +19,6 @@ package org.apache.solr.cloud; import java.net.BindException; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Random; @@ -297,13 +296,7 @@ public class ChaosMonkey { JettySolrRunner jetty; if (chance <= 5 && aggressivelyKillLeaders) { // if killLeader, really aggressively go after leaders - Collection leaders = shardToLeaderJetty.values(); - List leadersList = new ArrayList(leaders.size()); - - leadersList.addAll(leaders); - - int index = random.nextInt(leadersList.size()); - jetty = leadersList.get(index).jetty; + jetty = shardToLeaderJetty.get(slice).jetty; } else { // get random shard List jetties = shardToJetty.get(slice); diff --git a/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java b/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java index c909facbc3d..b20d779e4ab 100644 --- a/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/FullSolrCloudTest.java @@ -59,6 +59,7 @@ import org.junit.Ignore; * what we test now - the default update chain * */ +@Ignore public class FullSolrCloudTest extends AbstractDistributedZkTestCase { private static final String SHARD2 = "shard2"; diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java index c4826fb2cc4..cd99aebc4fa 100644 --- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java @@ -40,6 +40,7 @@ import org.apache.solr.core.CoreDescriptor; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException.Code; +import org.apache.zookeeper.data.Stat; import org.junit.BeforeClass; import org.junit.Test; @@ -49,6 +50,54 @@ public class OverseerTest extends SolrTestCaseJ4 { private static final boolean DEBUG = false; + private static class MockZKController{ + + private final SolrZkClient zkClient; + private final String nodeName; + + public MockZKController(String zkAddress, String nodeName) throws InterruptedException, TimeoutException, IOException, KeeperException { + this.nodeName = nodeName; + zkClient = new SolrZkClient(zkAddress, TIMEOUT); + Overseer.createClientNodes(zkClient, nodeName); + + // live node + final String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + "node1"; + zkClient.makePath(nodePath, CreateMode.EPHEMERAL, true); + } + + private void deleteNode(final String path) { + try { + Stat stat = zkClient.exists(path, null, false); + zkClient.delete(path, stat.getVersion(), false); + } catch (KeeperException e) { + fail("Unexpected KeeperException!" + e); + } catch (InterruptedException e) { + fail("Unexpected InterruptedException!" + e); + } + } + + public void close(){ + try { + deleteNode(ZkStateReader.LIVE_NODES_ZKNODE + "/" + "node1"); + zkClient.close(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + public void publishState(String coreName, String stateName) throws KeeperException, InterruptedException{ + HashMap coreProps = new HashMap(); + coreProps.put(ZkStateReader.STATE_PROP, stateName); + coreProps.put(ZkStateReader.NODE_NAME_PROP, nodeName); + coreProps.put(ZkStateReader.CORE_NAME_PROP, coreName); + CoreState state = new CoreState(coreName, "collection1", coreProps); + final String statePath = Overseer.STATES_NODE + "/" + nodeName; + zkClient.setData(statePath, ZkStateReader.toJSON(new CoreState[] {state}), true); + } + + } + @BeforeClass public static void beforeClass() throws Exception { initCore(); @@ -438,11 +487,11 @@ public class OverseerTest extends SolrTestCaseJ4 { SolrZkClient controllerClient = null; SolrZkClient overseerClient = null; ZkStateReader reader = null; + MockZKController mockController = null; try { server.run(); controllerClient = new SolrZkClient(server.getZkAddress(), TIMEOUT); - AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost()); AbstractZkTestCase.makeSolrZkNode(server.getZkHost()); controllerClient.makePath(ZkStateReader.LIVE_NODES_ZKNODE, true); @@ -450,45 +499,35 @@ public class OverseerTest extends SolrTestCaseJ4 { reader = new ZkStateReader(controllerClient); reader.createClusterStateWatchersAndUpdate(); - Overseer.createClientNodes(controllerClient, "node1"); + mockController = new MockZKController(server.getZkAddress(), "node1"); + overseerClient = electNewOverseer(server.getZkAddress()); - - // live node - final String nodePath = ZkStateReader.LIVE_NODES_ZKNODE + "/" + "node1"; - controllerClient.makePath(nodePath, CreateMode.EPHEMERAL, true); - - HashMap coreProps = new HashMap(); - coreProps.put(ZkStateReader.STATE_PROP, ZkStateReader.RECOVERING); - coreProps.put(ZkStateReader.NODE_NAME_PROP, "node1"); - CoreState state = new CoreState("core1", "collection1", coreProps); - - final String statePath = Overseer.STATES_NODE + "/node1"; - // publish node state (recovering) - controllerClient.setData(statePath, ZkStateReader.toJSON(new CoreState[] {state}), true); + + mockController.publishState("core1", ZkStateReader.RECOVERING); // wait overseer assignment waitForSliceCount(reader, "collection1", 1); verifyStatus(reader, ZkStateReader.RECOVERING); - // publish node state (active) - coreProps.put(ZkStateReader.STATE_PROP, ZkStateReader.ACTIVE); - coreProps.put(ZkStateReader.SHARD_ID_PROP, "shard1"); - state = new CoreState("core1", "collection1", coreProps); - controllerClient.setData(statePath, - ZkStateReader.toJSON(new CoreState[] {state}), true); + int version = getCloudStateVersion(controllerClient); + + mockController.publishState("core1", ZkStateReader.ACTIVE); + + while(version == getCloudStateVersion(controllerClient)); verifyStatus(reader, ZkStateReader.ACTIVE); + version = getCloudStateVersion(controllerClient); overseerClient.close(); - - coreProps.put(ZkStateReader.STATE_PROP, ZkStateReader.RECOVERING); - state = new CoreState("core1", "collection1", coreProps); - - controllerClient.setData(statePath, - ZkStateReader.toJSON(new CoreState[] {state}), true); + Thread.sleep(1000); //wait for overseer to get killed - overseerClient = electNewOverseer(server.getZkAddress()); + mockController.publishState("core1", ZkStateReader.RECOVERING); + version = getCloudStateVersion(controllerClient); + overseerClient = electNewOverseer(server.getZkAddress()); + + while(version == getCloudStateVersion(controllerClient)); + verifyStatus(reader, ZkStateReader.RECOVERING); assertEquals("Live nodes count does not match", 1, reader.getCloudState() @@ -497,6 +536,10 @@ public class OverseerTest extends SolrTestCaseJ4 { .getSlice("collection1", "shard1").getShards().size()); } finally { + if (mockController != null) { + mockController.close(); + } + if (overseerClient != null) { overseerClient.close(); } @@ -509,6 +552,80 @@ public class OverseerTest extends SolrTestCaseJ4 { server.shutdown(); } } + + @Test + public void testDoubleAssignment() throws Exception { + String zkDir = dataDir.getAbsolutePath() + File.separator + + "zookeeper/server1/data"; + + System.setProperty(ZkStateReader.NUM_SHARDS_PROP, "2"); + + ZkTestServer server = new ZkTestServer(zkDir); + + SolrZkClient controllerClient = null; + SolrZkClient overseerClient = null; + ZkStateReader reader = null; + MockZKController mockController = null; + + try { + server.run(); + controllerClient = new SolrZkClient(server.getZkAddress(), TIMEOUT); + + AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost()); + AbstractZkTestCase.makeSolrZkNode(server.getZkHost()); + controllerClient.makePath(ZkStateReader.LIVE_NODES_ZKNODE, true); + + reader = new ZkStateReader(controllerClient); + reader.createClusterStateWatchersAndUpdate(); + + mockController = new MockZKController(server.getZkAddress(), "node1"); + + overseerClient = electNewOverseer(server.getZkAddress()); + + mockController.publishState("core1", ZkStateReader.RECOVERING); + + // wait overseer assignment + waitForSliceCount(reader, "collection1", 1); + + verifyStatus(reader, ZkStateReader.RECOVERING); + + mockController.close(); + + int version = getCloudStateVersion(controllerClient); + + mockController = new MockZKController(server.getZkAddress(), "node1"); + mockController.publishState("core1", ZkStateReader.RECOVERING); + + while (version == getCloudStateVersion(controllerClient)); + + reader.updateCloudState(true); + CloudState state = reader.getCloudState(); + assertEquals("more than 1 shard id was assigned to same core", 1, state.getSlices("collection1").size()); + + } finally { + System.clearProperty(ZkStateReader.NUM_SHARDS_PROP); + if (overseerClient != null) { + overseerClient.close(); + } + if (mockController != null) { + mockController.close(); + } + + if (controllerClient != null) { + controllerClient.close(); + } + if (reader != null) { + reader.close(); + } + server.shutdown(); + } + } + + private int getCloudStateVersion(SolrZkClient controllerClient) + throws KeeperException, InterruptedException { + return controllerClient.exists(ZkStateReader.CLUSTER_STATE, null, false).getVersion(); + } + private SolrZkClient electNewOverseer(String address) throws InterruptedException, TimeoutException, IOException, KeeperException { diff --git a/solr/core/src/test/org/apache/solr/core/TestSolrDeletionPolicy1.java b/solr/core/src/test/org/apache/solr/core/TestSolrDeletionPolicy1.java index 5da62965942..1242c3fb568 100644 --- a/solr/core/src/test/org/apache/solr/core/TestSolrDeletionPolicy1.java +++ b/solr/core/src/test/org/apache/solr/core/TestSolrDeletionPolicy1.java @@ -92,10 +92,10 @@ public class TestSolrDeletionPolicy1 extends SolrTestCaseJ4 { addDocs(); Map commits = delPolicy.getCommits(); IndexCommit latest = delPolicy.getLatestCommit(); - for (Long version : commits.keySet()) { - if (commits.get(version) == latest) + for (Long gen : commits.keySet()) { + if (commits.get(gen) == latest) continue; - assertEquals(1, commits.get(version).getSegmentCount()); + assertEquals(1, commits.get(gen).getSegmentCount()); } } @@ -126,7 +126,7 @@ public class TestSolrDeletionPolicy1 extends SolrTestCaseJ4 { ); commits = delPolicy.getCommits(); - assertTrue(!commits.containsKey(ic.getVersion())); + assertTrue(!commits.containsKey(ic.getGeneration())); } } diff --git a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java index c514c0d993f..089a1f84308 100755 --- a/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java +++ b/solr/test-framework/src/java/org/apache/solr/SolrTestCaseJ4.java @@ -132,7 +132,7 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase { // wait a bit in case any ending threads have anything to release int retries = 0; while (endNumOpens - numOpens != endNumCloses - numCloses) { - if (retries++ > 60) { + if (retries++ > 120) { break; } try {