From b85dc82b4f32ef2765d62282363af62d2996cfab Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Fri, 20 Jul 2012 14:33:43 +0000
Subject: [PATCH] LUCENE-4227: add DirectPostingsFormat, to hold all postings in simple uncompressed arrays

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1363803 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/MIGRATE.txt | 5 +
 .../codecs/memory/DirectPostingsFormat.java | 2216 +++++++++++++++++
 .../org.apache.lucene.codecs.PostingsFormat | 1 +
 .../apache/lucene/index/Test2BPostings.java | 2 +-
 .../org/apache/lucene/index/Test2BTerms.java | 2 +-
 .../lucene/index/TestIndexWriterReader.java | 2 +-
 .../lucene/index/TestLazyProxSkipping.java | 5 +-
 .../apache/lucene/index/TestLongPostings.java | 2 +-
 .../apache/lucene/index/TestNRTThreads.java | 2 +-
 .../org/apache/lucene/index/TestNorms.java | 2 +-
 .../apache/lucene/index/TestTermsEnum.java | 2 +-
 .../apache/lucene/search/TestNRTManager.java | 2 +-
 .../lucene/search/TestSearchWithThreads.java | 2 +-
 .../lucene/search/TestSearcherManager.java | 2 +-
 .../lucene/search/TestShardSearching.java | 2 +-
 .../org/apache/lucene/util/fst/TestFSTs.java | 2 +-
 .../org/apache/lucene/index/RandomCodec.java | 3 +
 17 files changed, 2240 insertions(+), 14 deletions(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java

diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt
index faedeaeefc8..ffb506c5c3a 100644
--- a/lucene/MIGRATE.txt
+++ b/lucene/MIGRATE.txt
@@ -629,3 +629,8 @@ you can now do this:
   instance exposing the inverted index of the one document. From
   Fields you can enumerate all fields, terms, positions, offsets.
 
+* LUCENE-4227: If you were previously using Instantiated index, you
+  may want to use DirectPostingsFormat after upgrading: it stores all
+  postings in simple arrays (byte[] for terms, int[] for docs, freqs,
+  positions, offsets). Note that this only covers postings, whereas
+  Instantiated covered all other parts of the index as well.

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
new file mode 100644
index 00000000000..89a805a7b46
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java
@@ -0,0 +1,2216 @@
+package org.apache.lucene.codecs.memory;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import java.io.IOException; +import java.util.Comparator; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; // javadocs +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FieldsEnum; +import org.apache.lucene.index.OrdTermState; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.RAMOutputStream; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.Transition; + +// TODO: +// - build depth-N prefix hash? +// - or: longer dense skip lists than just next byte? + +/** Wraps {@link Lucene40PostingsFormat} format for on-disk + * storage, but then at read time loads and stores all + * terms & postings directly in RAM as byte[], int[]. + * + *

WARNING: This is
+ * exceptionally RAM intensive: it makes no effort to
+ * compress the postings data, storing terms as separate
+ * byte[] and postings as separate int[], but as a result it
+ * gives a substantial increase in search performance.
+ *
+ *

This postings format supports {@link TermsEnum#ord}
+ * and {@link TermsEnum#seekExact(long)}.
+ *
+ *

Because this holds all term bytes as a single + * byte[], you cannot have more than 2.1GB worth of term + * bytes in a single segment. + * + * @lucene.experimental */ + +public class DirectPostingsFormat extends PostingsFormat { + + private final int minSkipCount; + private final int lowFreqCutoff; + + private final static int DEFAULT_MIN_SKIP_COUNT = 8; + private final static int DEFAULT_LOW_FREQ_CUTOFF = 32; + + //private static final boolean DEBUG = true; + + // TODO: allow passing/wrapping arbitrary postings format? + + public DirectPostingsFormat() { + this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF); + } + + /** minSkipCount is how many terms in a row must have the + * same prefix before we put a skip pointer down. Terms + * with docFreq <= lowFreqCutoff will use a single int[] + * to hold all docs, freqs, position and offsets; terms + * with higher docFreq will use separate arrays. */ + public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) { + super("Direct"); + this.minSkipCount = minSkipCount; + this.lowFreqCutoff = lowFreqCutoff; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + return PostingsFormat.forName("Lucene40").fieldsConsumer(state); + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + FieldsProducer postings = PostingsFormat.forName("Lucene40").fieldsProducer(state); + if (state.context.context != IOContext.Context.MERGE) { + FieldsProducer loadedPostings; + try { + loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff); + } finally { + postings.close(); + } + return loadedPostings; + } else { + // Don't load postings for merge: + return postings; + } + } + + private static final class DirectFields extends FieldsProducer { + private final Map fields = new TreeMap(); + + public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) throws IOException { + FieldsEnum fieldsEnum = fields.iterator(); + String field; + while ((field = fieldsEnum.next()) != null) { + this.fields.put(field, new DirectField(state, field, fieldsEnum.terms(), minSkipCount, lowFreqCutoff)); + } + } + + @Override + public FieldsEnum iterator() { + + final Iterator> iter = fields.entrySet().iterator(); + + return new FieldsEnum() { + Map.Entry current; + + @Override + public String next() { + if (iter.hasNext()) { + current = iter.next(); + return current.getKey(); + } else { + return null; + } + } + + @Override + public Terms terms() { + return current.getValue(); + } + }; + } + + @Override + public Terms terms(String field) { + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + @Override + public long getUniqueTermCount() { + long numTerms = 0; + for(DirectField field : fields.values()) { + numTerms += field.terms.length; + } + return numTerms; + } + + @Override + public void close() { + } + } + + private final static class DirectField extends Terms { + + private static abstract class TermAndSkip { + public int[] skips; + } + + private static final class LowFreqTerm extends TermAndSkip { + public final int[] postings; + public final byte[] payloads; + public final int docFreq; + public final int totalTermFreq; + + public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFreq) { + this.postings = postings; + this.payloads = payloads; + this.docFreq = docFreq; + this.totalTermFreq = totalTermFreq; + } + } + + // TODO: maybe specialize into 
prx/no-prx/no-frq cases? + private static final class HighFreqTerm extends TermAndSkip { + public final long totalTermFreq; + public final int[] docIDs; + public final int[] freqs; + public final int[][] positions; + public final byte[][][] payloads; + + public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads, long totalTermFreq) { + this.docIDs = docIDs; + this.freqs = freqs; + this.positions = positions; + this.payloads = payloads; + this.totalTermFreq = totalTermFreq; + } + } + + private final byte[] termBytes; + private final int[] termOffsets; + + private final int[] skips; + private final int[] skipOffsets; + + private final TermAndSkip[] terms; + private final boolean hasFreq; + private final boolean hasPos; + private final boolean hasOffsets; + private final boolean hasPayloads; + private final long sumTotalTermFreq; + private final int docCount; + private final long sumDocFreq; + private int skipCount; + + // TODO: maybe make a separate builder? These are only + // used during load: + private int count; + private int[] sameCounts = new int[10]; + private final int minSkipCount; + + private final static class IntArrayWriter { + private int[] ints = new int[10]; + private int upto; + + public void add(int value) { + if (ints.length == upto) { + ints = ArrayUtil.grow(ints); + } + ints[upto++] = value; + } + + public int[] get() { + final int[] arr = new int[upto]; + System.arraycopy(ints, 0, arr, 0, upto); + upto = 0; + return arr; + } + } + + public DirectField(SegmentReadState state, String field, Terms termsIn, int minSkipCount, int lowFreqCutoff) throws IOException { + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + + sumTotalTermFreq = termsIn.getSumTotalTermFreq(); + sumDocFreq = termsIn.getSumDocFreq(); + docCount = termsIn.getDocCount(); + + final int numTerms = (int) termsIn.size(); + if (numTerms == -1) { + throw new IllegalArgumentException("codec does not provide Terms.size()"); + } + terms = new TermAndSkip[numTerms]; + termOffsets = new int[1+numTerms]; + + byte[] termBytes = new byte[1024]; + + this.minSkipCount = minSkipCount; + + hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_ONLY) > 0; + hasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0; + hasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0; + hasPayloads = fieldInfo.hasPayloads(); + + BytesRef term; + DocsEnum docsEnum = null; + DocsAndPositionsEnum docsAndPositionsEnum = null; + final TermsEnum termsEnum = termsIn.iterator(null); + int termOffset = 0; + + final IntArrayWriter scratch = new IntArrayWriter(); + + // Used for payloads, if any: + final RAMOutputStream ros = new RAMOutputStream(); + + // if (DEBUG) { + // System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads); + // } + + while ((term = termsEnum.next()) != null) { + final int docFreq = termsEnum.docFreq(); + final long totalTermFreq = termsEnum.totalTermFreq(); + + // if (DEBUG) { + // System.out.println(" term=" + term.utf8ToString()); + // } + + termOffsets[count] = termOffset; + + if (termBytes.length < (termOffset + term.length)) { + termBytes = ArrayUtil.grow(termBytes, termOffset + term.length); + } + System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length); + termOffset += term.length; + termOffsets[count+1] = termOffset; + + if (hasPos) { + 
docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum, hasOffsets); + } else { + docsEnum = termsEnum.docs(null, docsEnum, hasFreq); + } + + final TermAndSkip ent; + + final DocsEnum docsEnum2; + if (hasPos) { + docsEnum2 = docsAndPositionsEnum; + } else { + docsEnum2 = docsEnum; + } + + int docID; + + if (docFreq <= lowFreqCutoff) { + + ros.reset(); + + // Pack postings for low-freq terms into a single int[]: + while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) { + scratch.add(docID); + if (hasFreq) { + final int freq = docsEnum2.freq(); + scratch.add(freq); + if (hasPos) { + for(int pos=0;pos 0) { + final int lastTermLength = termOffsets[termOrd] - termOffsets[termOrd-1]; + final int limit = Math.min(termLength, lastTermLength); + + int lastTermOffset = termOffsets[termOrd-1]; + int termOffset = termOffsets[termOrd]; + + int i = 0; + for(;i= minSkipCount) { + // Go back and add a skip pointer: + saveSkip(termOrd, sameCounts[i]); + } + sameCounts[i] = 1; + } + break; + } + } + + for(;i= minSkipCount) { + // Go back and add a skip pointer: + saveSkip(termOrd, sameCounts[i]); + } + sameCounts[i] = 0; + } + for(int j=limit;j= minSkipCount) { + // Go back and add a skip pointer: + saveSkip(count, sameCounts[i]); + } + } + + // Reverse the skip pointers so they are "nested": + for(int termID=0;termID 1) { + for(int pos=0;pos getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + private final class DirectTermsEnum extends TermsEnum { + + private final BytesRef scratch = new BytesRef(); + private int termOrd; + + boolean canReuse(TermAndSkip[] other) { + return DirectField.this.terms == other; + } + + private BytesRef setTerm() { + scratch.bytes = termBytes; + scratch.offset = termOffsets[termOrd]; + scratch.length = termOffsets[termOrd+1] - termOffsets[termOrd]; + return scratch; + } + + public void reset() { + termOrd = -1; + } + + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public BytesRef next() { + termOrd++; + if (termOrd < terms.length) { + return setTerm(); + } else { + return null; + } + } + + @Override + public TermState termState() { + OrdTermState state = new OrdTermState(); + state.ord = termOrd; + return state; + } + + // If non-negative, exact match; else, -ord-1, where ord + // is where you would insert the term. + private int findTerm(BytesRef term) { + + // Just do binary search: should be (constant factor) + // faster than using the skip list: + int low = 0; + int high = terms.length-1; + + while (low <= high) { + int mid = (low + high) >>> 1; + int cmp = compare(mid, term); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + return mid; // key found + } + } + + return -(low + 1); // key not found. 
+ } + + @Override + public SeekStatus seekCeil(BytesRef term, boolean useCache) { + // TODO: we should use the skip pointers; should be + // faster than bin search; we should also hold + // & reuse current state so seeking forwards is + // faster + final int ord = findTerm(term); + // if (DEBUG) { + // System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord); + // } + if (ord >= 0) { + termOrd = ord; + setTerm(); + return SeekStatus.FOUND; + } else if (ord == -terms.length-1) { + return SeekStatus.END; + } else { + termOrd = -ord - 1; + setTerm(); + return SeekStatus.NOT_FOUND; + } + } + + @Override + public boolean seekExact(BytesRef term, boolean useCache) { + // TODO: we should use the skip pointers; should be + // faster than bin search; we should also hold + // & reuse current state so seeking forwards is + // faster + final int ord = findTerm(term); + if (ord >= 0) { + termOrd = ord; + setTerm(); + return true; + } else { + return false; + } + } + + @Override + public void seekExact(long ord) { + termOrd = (int) ord; + setTerm(); + } + + @Override + public void seekExact(BytesRef term, TermState state) throws IOException { + termOrd = (int) ((OrdTermState) state).ord; + setTerm(); + assert term.equals(scratch); + } + + @Override + public BytesRef term() { + return scratch; + } + + @Override + public long ord() { + return termOrd; + } + + @Override + public int docFreq() { + if (terms[termOrd] instanceof LowFreqTerm) { + return ((LowFreqTerm) terms[termOrd]).docFreq; + } else { + return ((HighFreqTerm) terms[termOrd]).docIDs.length; + } + } + + @Override + public long totalTermFreq() { + if (terms[termOrd] instanceof LowFreqTerm) { + return ((LowFreqTerm) terms[termOrd]).totalTermFreq; + } else { + return ((HighFreqTerm) terms[termOrd]).totalTermFreq; + } + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) { + if (needsFreqs && !hasFreq) { + return null; + } + + // TODO: implement reuse, something like Pulsing: + // it's hairy! 
+ + if (terms[termOrd] instanceof LowFreqTerm) { + final int[] postings = ((LowFreqTerm) terms[termOrd]).postings; + if (hasFreq) { + if (hasPos) { + int posLen; + if (hasOffsets) { + posLen = 3; + } else { + posLen = 1; + } + if (hasPayloads) { + posLen++; + } + LowFreqDocsEnum docsEnum; + if (reuse instanceof LowFreqDocsEnum) { + docsEnum = (LowFreqDocsEnum) reuse; + if (!docsEnum.canReuse(liveDocs, posLen)) { + docsEnum = new LowFreqDocsEnum(liveDocs, posLen); + } + } else { + docsEnum = new LowFreqDocsEnum(liveDocs, posLen); + } + + return docsEnum.reset(postings); + } else { + LowFreqDocsEnumNoPos docsEnum; + if (reuse instanceof LowFreqDocsEnumNoPos) { + docsEnum = (LowFreqDocsEnumNoPos) reuse; + if (!docsEnum.canReuse(liveDocs)) { + docsEnum = new LowFreqDocsEnumNoPos(liveDocs); + } + } else { + docsEnum = new LowFreqDocsEnumNoPos(liveDocs); + } + + return docsEnum.reset(postings); + } + } else { + LowFreqDocsEnumNoTF docsEnum; + if (reuse instanceof LowFreqDocsEnumNoTF) { + docsEnum = (LowFreqDocsEnumNoTF) reuse; + if (!docsEnum.canReuse(liveDocs)) { + docsEnum = new LowFreqDocsEnumNoTF(liveDocs); + } + } else { + docsEnum = new LowFreqDocsEnumNoTF(liveDocs); + } + + return docsEnum.reset(postings); + } + } else { + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; + + HighFreqDocsEnum docsEnum; + if (reuse instanceof HighFreqDocsEnum) { + docsEnum = (HighFreqDocsEnum) reuse; + if (!docsEnum.canReuse(liveDocs)) { + docsEnum = new HighFreqDocsEnum(liveDocs); + } + } else { + docsEnum = new HighFreqDocsEnum(liveDocs); + } + + //System.out.println(" DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs"); + return docsEnum.reset(term.docIDs, term.freqs); + } + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) { + if (!hasPos) { + return null; + } + if (needsOffsets && !hasOffsets) { + return null; + } + + // TODO: implement reuse, something like Pulsing: + // it's hairy! 
+ + if (terms[termOrd] instanceof LowFreqTerm) { + final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]); + final int[] postings = term.postings; + final byte[] payloads = term.payloads; + return new LowFreqDocsAndPositionsEnum(liveDocs, hasOffsets, hasPayloads).reset(postings, payloads); + } else { + final HighFreqTerm term = (HighFreqTerm) terms[termOrd]; + return new HighFreqDocsAndPositionsEnum(liveDocs, hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads); + } + } + } + + private final class DirectIntersectTermsEnum extends TermsEnum { + private final RunAutomaton runAutomaton; + private final CompiledAutomaton compiledAutomaton; + private int termOrd; + private final BytesRef scratch = new BytesRef(); + + private final class State { + int changeOrd; + int state; + Transition[] transitions; + int transitionUpto; + int transitionMax; + int transitionMin; + } + + private State[] states; + private int stateUpto; + + public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) { + runAutomaton = compiled.runAutomaton; + compiledAutomaton = compiled; + termOrd = -1; + states = new State[1]; + states[0] = new State(); + states[0].changeOrd = terms.length; + states[0].state = runAutomaton.getInitialState(); + states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state]; + states[0].transitionUpto = -1; + states[0].transitionMax = -1; + + //System.out.println("IE.init startTerm=" + startTerm); + + if (startTerm != null) { + int skipUpto = 0; + if (startTerm.length == 0) { + if (terms.length > 0 && termOffsets[1] == 0) { + termOrd = 0; + } + } else { + termOrd++; + + nextLabel: + for(int i=0;i states[i].transitionMax) { + states[i].transitionUpto++; + assert states[i].transitionUpto < states[i].transitions.length; + states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin(); + states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax(); + assert states[i].transitionMin >= 0; + assert states[i].transitionMin <= 255; + assert states[i].transitionMax >= 0; + assert states[i].transitionMax <= 255; + } + + // Skip forwards until we find a term matching + // the label at this position: + while (termOrd < terms.length) { + final int skipOffset = skipOffsets[termOrd]; + final int numSkips = skipOffsets[termOrd+1] - skipOffset; + final int termOffset = termOffsets[termOrd]; + final int termLength = termOffsets[1+termOrd] - termOffset; + + // if (DEBUG) { + // System.out.println(" check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i); + // } + + if (termOrd == states[stateUpto].changeOrd) { + // if (DEBUG) { + // System.out.println(" end push return"); + // } + stateUpto--; + termOrd--; + return; + } + + if (termLength == i) { + termOrd++; + skipUpto = 0; + // if (DEBUG) { + // System.out.println(" term too short; next term"); + // } + } else if (label < (termBytes[termOffset+i] & 0xFF)) { + termOrd--; + // if (DEBUG) { + // System.out.println(" no match; already beyond; return termOrd=" + termOrd); + // } + stateUpto -= skipUpto; + assert stateUpto >= 0; + return; + } else if (label == (termBytes[termOffset+i] & 0xFF)) { + // if (DEBUG) { + // System.out.println(" label[" + i + "] matches"); + // } + if (skipUpto < numSkips) { + grow(); + + final int nextState = runAutomaton.step(states[stateUpto].state, label); + + // Automaton is required to accept startTerm: + assert nextState != -1; + + stateUpto++; 
+ states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; + states[stateUpto].state = nextState; + states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; + states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionMax = -1; + //System.out.println(" push " + states[stateUpto].transitions.length + " trans"); + + // if (DEBUG) { + // System.out.println(" push skip; changeOrd=" + states[stateUpto].changeOrd); + // } + + // Match next label at this same term: + continue nextLabel; + } else { + // if (DEBUG) { + // System.out.println(" linear scan"); + // } + // Index exhausted: just scan now (the + // number of scans required will be less + // than the minSkipCount): + final int startTermOrd = termOrd; + while (termOrd < terms.length && compare(termOrd, startTerm) <= 0) { + assert termOrd == startTermOrd || skipOffsets[termOrd] == skipOffsets[termOrd+1]; + termOrd++; + } + assert termOrd - startTermOrd < minSkipCount; + termOrd--; + stateUpto -= skipUpto; + // if (DEBUG) { + // System.out.println(" end termOrd=" + termOrd); + // } + return; + } + } else { + if (skipUpto < numSkips) { + termOrd = skips[skipOffset + skipUpto]; + // if (DEBUG) { + // System.out.println(" no match; skip to termOrd=" + termOrd); + // } + } else { + // if (DEBUG) { + // System.out.println(" no match; next term"); + // } + termOrd++; + } + skipUpto = 0; + } + } + + // startTerm is >= last term so enum will not + // return any terms: + termOrd--; + // if (DEBUG) { + // System.out.println(" beyond end; no terms will match"); + // } + return; + } + } + + final int termOffset = termOffsets[termOrd]; + final int termLen = termOffsets[1+termOrd] - termOffset; + + if (termOrd >= 0 && !startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) { + stateUpto -= skipUpto; + termOrd--; + } + // if (DEBUG) { + // System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto); + // } + } + } + + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + private void grow() { + if (states.length == 1+stateUpto) { + final State[] newStates = new State[states.length+1]; + System.arraycopy(states, 0, newStates, 0, states.length); + newStates[states.length] = new State(); + states = newStates; + } + } + + @Override + public BytesRef next() { + // if (DEBUG) { + // System.out.println("\nIE.next"); + // } + + termOrd++; + int skipUpto = 0; + + if (termOrd == 0 && termOffsets[1] == 0) { + // Special-case empty string: + assert stateUpto == 0; + // if (DEBUG) { + // System.out.println(" visit empty string"); + // } + if (runAutomaton.isAccept(states[0].state)) { + scratch.bytes = termBytes; + scratch.offset = 0; + scratch.length = 0; + return scratch; + } + termOrd++; + } + + nextTerm: + + while (true) { + // if (DEBUG) { + // System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + " skipUpto=" + skipUpto); + // } + if (termOrd == terms.length) { + // if (DEBUG) { + // System.out.println(" return END"); + // } + return null; + } + + final State state = states[stateUpto]; + if (termOrd == state.changeOrd) { + // Pop: + // if (DEBUG) { + // System.out.println(" pop stateUpto=" + stateUpto); + // } + stateUpto--; + /* + if (DEBUG) { + try { + //System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)).utf8ToString()); + System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length))); + } catch 
(ArrayIndexOutOfBoundsException aioobe) { + System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length))); + } + } + */ + + continue; + } + + final int termOffset = termOffsets[termOrd]; + final int termLength = termOffsets[termOrd+1] - termOffset; + final int skipOffset = skipOffsets[termOrd]; + final int numSkips = skipOffsets[termOrd+1] - skipOffset; + + // if (DEBUG) { + // System.out.println(" term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips)); + // } + + assert termOrd < state.changeOrd; + + assert stateUpto <= termLength: "term.length=" + termLength + "; stateUpto=" + stateUpto; + final int label = termBytes[termOffset+stateUpto] & 0xFF; + + while (label > state.transitionMax) { + //System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length); + state.transitionUpto++; + if (state.transitionUpto == state.transitions.length) { + // We've exhausted transitions leaving this + // state; force pop+next/skip now: + //System.out.println("forcepop: stateUpto=" + stateUpto); + if (stateUpto == 0) { + termOrd = terms.length; + return null; + } else { + assert state.changeOrd > termOrd; + // if (DEBUG) { + // System.out.println(" jumpend " + (state.changeOrd - termOrd)); + // } + //System.out.println(" jump to termOrd=" + states[stateUpto].changeOrd + " vs " + termOrd); + termOrd = states[stateUpto].changeOrd; + skipUpto = 0; + stateUpto--; + } + continue nextTerm; + } + assert state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length; + state.transitionMin = state.transitions[state.transitionUpto].getMin(); + state.transitionMax = state.transitions[state.transitionUpto].getMax(); + assert state.transitionMin >= 0; + assert state.transitionMin <= 255; + assert state.transitionMax >= 0; + assert state.transitionMax <= 255; + } + + /* + if (DEBUG) { + System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + (char) label + "(" + label + ") term=" + new BytesRef(terms[termOrd].term).utf8ToString() + " trans " + + (char) state.transitionMin + "(" + state.transitionMin + ")" + "-" + (char) state.transitionMax + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips))); + System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + Integer.toHexString(label) + "(" + label + ") term=" + new BytesRef(termBytes, termOffset, termLength) + " trans " + + Integer.toHexString(state.transitionMin) + "(" + state.transitionMin + ")" + "-" + Integer.toHexString(state.transitionMax) + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? 
"null" : Arrays.toString(skips))); + } + */ + + final int targetLabel = state.transitionMin; + + if ((termBytes[termOffset+stateUpto] & 0xFF) < targetLabel) { + // if (DEBUG) { + // System.out.println(" do bin search"); + // } + //int startTermOrd = termOrd; + int low = termOrd+1; + int high = state.changeOrd-1; + while (true) { + if (low > high) { + // Label not found + termOrd = low; + // if (DEBUG) { + // System.out.println(" advanced by " + (termOrd - startTermOrd)); + // } + //System.out.println(" jump " + (termOrd - startTermOrd)); + skipUpto = 0; + continue nextTerm; + } + int mid = (low + high) >>> 1; + int cmp = (termBytes[termOffsets[mid] + stateUpto] & 0xFF) - targetLabel; + // if (DEBUG) { + // System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low] + stateUpto] & 0xFF) + " ord=" + mid); + // } + if (cmp < 0) { + low = mid+1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Label found; walk backwards to first + // occurrence: + while (mid > termOrd && (termBytes[termOffsets[mid-1] + stateUpto] & 0xFF) == targetLabel) { + mid--; + } + termOrd = mid; + // if (DEBUG) { + // System.out.println(" advanced by " + (termOrd - startTermOrd)); + // } + //System.out.println(" jump " + (termOrd - startTermOrd)); + skipUpto = 0; + continue nextTerm; + } + } + } + + int nextState = runAutomaton.step(states[stateUpto].state, label); + + if (nextState == -1) { + // Skip + // if (DEBUG) { + // System.out.println(" automaton doesn't accept; skip"); + // } + if (skipUpto < numSkips) { + // if (DEBUG) { + // System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd)); + // } + termOrd = skips[skipOffset+skipUpto]; + } else { + termOrd++; + } + skipUpto = 0; + } else if (skipUpto < numSkips) { + // Push: + // if (DEBUG) { + // System.out.println(" push"); + // } + /* + if (DEBUG) { + try { + //System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1).utf8ToString()); + System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1)); + } catch (ArrayIndexOutOfBoundsException aioobe) { + System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1)); + } + } + */ + + grow(); + stateUpto++; + states[stateUpto].state = nextState; + states[stateUpto].changeOrd = skips[skipOffset + skipUpto++]; + states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState]; + states[stateUpto].transitionUpto = -1; + states[stateUpto].transitionMax = -1; + + if (stateUpto == termLength) { + // if (DEBUG) { + // System.out.println(" term ends after push"); + // } + if (runAutomaton.isAccept(nextState)) { + // if (DEBUG) { + // System.out.println(" automaton accepts: return"); + // } + scratch.bytes = termBytes; + scratch.offset = termOffsets[termOrd]; + scratch.length = termOffsets[1+termOrd] - scratch.offset; + // if (DEBUG) { + // System.out.println(" ret " + scratch.utf8ToString()); + // } + return scratch; + } else { + // if (DEBUG) { + // System.out.println(" automaton rejects: nextTerm"); + // } + termOrd++; + skipUpto = 0; + } + } + } else { + // Run the non-indexed tail of this term: + + // TODO: add assert that we don't inc too many times + + if (compiledAutomaton.commonSuffixRef != null) { + //System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString()); + assert compiledAutomaton.commonSuffixRef.offset == 0; + if (termLength < compiledAutomaton.commonSuffixRef.length) { + termOrd++; + skipUpto = 0; + continue nextTerm; + } + int offset = termOffset + termLength - 
compiledAutomaton.commonSuffixRef.length; + for(int suffix=0;suffix 0; + return postings[upto]; + } + } else { + while (upto < postings.length) { + freq = postings[upto+1]; + assert freq > 0; + if (liveDocs.get(postings[upto])) { + return postings[upto]; + } + upto += 2 + freq*posMult; + } + } + return NO_MORE_DOCS; + } + + @Override + public int docID() { + // TODO: store docID member? + if (upto < 0) { + return -1; + } else if (upto < postings.length) { + return postings[upto]; + } else { + return NO_MORE_DOCS; + } + } + + @Override + public int freq() { + // TODO: can I do postings[upto+1]? + return freq; + } + + @Override + public int advance(int target) { + // Linear scan, but this is low-freq term so it won't + // be costly: + while(nextDoc() < target) { + } + return docID(); + } + } + + private final static class LowFreqDocsAndPositionsEnum extends DocsAndPositionsEnum { + private int[] postings; + private final Bits liveDocs; + private final int posMult; + private final boolean hasOffsets; + private final boolean hasPayloads; + private final BytesRef payload = new BytesRef(); + private int upto; + private int docID; + private int freq; + private int skipPositions; + private int startOffset; + private int endOffset; + private int payloadOffset; + private int payloadLength; + + public LowFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets, boolean hasPayloads) { + this.liveDocs = liveDocs; + this.hasOffsets = hasOffsets; + this.hasPayloads = hasPayloads; + if (hasOffsets) { + if (hasPayloads) { + posMult = 4; + } else { + posMult = 3; + } + } else if (hasPayloads) { + posMult = 2; + } else { + posMult = 1; + } + } + + public DocsAndPositionsEnum reset(int[] postings, byte[] payloadBytes) { + this.postings = postings; + upto = 0; + skipPositions = 0; + startOffset = -1; + endOffset = -1; + docID = -1; + payloadLength = 0; + payload.bytes = payloadBytes; + return this; + } + + @Override + public int nextDoc() { + if (hasPayloads) { + for(int i=0;i 0; + skipPositions--; + final int pos = postings[upto++]; + if (hasOffsets) { + startOffset = postings[upto++]; + endOffset = postings[upto++]; + } + if (hasPayloads) { + payloadLength = postings[upto++]; + payload.offset = payloadOffset; + payloadOffset += payloadLength; + } + return pos; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public int advance(int target) { + // Linear scan, but this is low-freq term so it won't + // be costly: + while (nextDoc() < target) { + } + return docID; + } + + @Override + public boolean hasPayload() { + return payloadLength > 0; + } + + @Override + public BytesRef getPayload() { + if (payloadLength > 0) { + payload.length = payloadLength; + payloadLength = 0; + return payload; + } else { + return null; + } + } + } + + // Docs + freqs: + public final static class HighFreqDocsEnum extends DocsEnum { + private int[] docIDs; + private int[] freqs; + private final Bits liveDocs; + private int upto; + private int docID = -1; + + public HighFreqDocsEnum(Bits liveDocs) { + this.liveDocs = liveDocs; + } + + public boolean canReuse(Bits liveDocs) { + return liveDocs == this.liveDocs; + } + + public int[] getDocIDs() { + return docIDs; + } + + public int[] getFreqs() { + return freqs; + } + + public DocsEnum reset(int[] docIDs, int[] freqs) { + this.docIDs = docIDs; + this.freqs = freqs; + upto = -1; + return this; + } + + @Override + public int nextDoc() { + upto++; + if (liveDocs == null) { + try { + 
return docID = docIDs[upto]; + } catch (ArrayIndexOutOfBoundsException e) { + } + } else { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + return docID = docIDs[upto]; + } + upto++; + } + } + return docID = NO_MORE_DOCS; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int freq() { + return freqs[upto]; + } + + @Override + public int advance(int target) { + /* + upto++; + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } + final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target); + if (index < 0) { + upto = -index - 1; + } else { + upto = index; + } + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + return NO_MORE_DOCS; + } else { + return docID = docIDs[upto]; + } + */ + + //System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length); + // if (DEBUG) { + // System.out.println("advance target=" + target + " len=" + docIDs.length); + // } + upto++; + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } + + // First "grow" outwards, since most advances are to + // nearby docs: + int inc = 10; + int nextUpto = upto+10; + int low; + int high; + while (true) { + //System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc); + if (nextUpto >= docIDs.length) { + low = nextUpto-inc; + high = docIDs.length-1; + break; + } + //System.out.println(" docID=" + docIDs[nextUpto]); + + if (target <= docIDs[nextUpto]) { + low = nextUpto-inc; + high = nextUpto; + break; + } + inc *= 2; + nextUpto += inc; + } + + // Now do normal binary search + //System.out.println(" after fwd: low=" + low + " high=" + high); + + while (true) { + + if (low > high) { + // Not exactly found + //System.out.println(" break: no match"); + upto = low; + break; + } + + int mid = (low + high) >>> 1; + int cmp = docIDs[mid] - target; + //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Found target + upto = mid; + //System.out.println(" break: match"); + break; + } + } + + //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? NO_MORE_DOCS : docIDs[upto])); + + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + //System.out.println(" return END"); + return docID = NO_MORE_DOCS; + } else { + //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto); + return docID = docIDs[upto]; + } + } + } + + // TODO: specialize offsets and not + public final static class HighFreqDocsAndPositionsEnum extends DocsAndPositionsEnum { + private int[] docIDs; + private int[] freqs; + private int[][] positions; + private byte[][][] payloads; + private final Bits liveDocs; + private final boolean hasOffsets; + private final int posJump; + private int upto; + private int docID = -1; + private int posUpto; + private boolean gotPayload; + private int[] curPositions; + + public HighFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets) { + this.liveDocs = liveDocs; + this.hasOffsets = hasOffsets; + posJump = hasOffsets ? 
3 : 1; + } + + public int[] getDocIDs() { + return docIDs; + } + + public int[][] getPositions() { + return positions; + } + + public int getPosJump() { + return posJump; + } + + public Bits getLiveDocs() { + return liveDocs; + } + + public DocsAndPositionsEnum reset(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads) { + this.docIDs = docIDs; + this.freqs = freqs; + this.positions = positions; + this.payloads = payloads; + upto = -1; + return this; + } + + @Override + public int nextDoc() { + upto++; + if (liveDocs == null) { + if (upto < docIDs.length) { + posUpto = -posJump; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + } else { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + posUpto = -posJump; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + upto++; + } + } + + return docID = NO_MORE_DOCS; + } + + @Override + public int freq() { + return freqs[upto]; + } + + @Override + public int docID() { + return docID; + } + + @Override + public int nextPosition() { + posUpto += posJump; + gotPayload = false; + return curPositions[posUpto]; + } + + @Override + public int startOffset() { + if (hasOffsets) { + return curPositions[posUpto+1]; + } else { + return -1; + } + } + + @Override + public int endOffset() { + if (hasOffsets) { + return curPositions[posUpto+2]; + } else { + return -1; + } + } + + @Override + public int advance(int target) { + + /* + upto++; + if (upto == docIDs.length) { + return NO_MORE_DOCS; + } + final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target); + if (index < 0) { + upto = -index - 1; + } else { + upto = index; + } + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + posUpto = hasOffsets ? -3 : -1; + if (upto == docIDs.length) { + return NO_MORE_DOCS; + } else { + return docID(); + } + */ + + //System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length); + // if (DEBUG) { + // System.out.println("advance target=" + target + " len=" + docIDs.length); + // } + upto++; + if (upto == docIDs.length) { + return docID = NO_MORE_DOCS; + } + + // First "grow" outwards, since most advances are to + // nearby docs: + int inc = 10; + int nextUpto = upto+10; + int low; + int high; + while (true) { + //System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc); + if (nextUpto >= docIDs.length) { + low = nextUpto-inc; + high = docIDs.length-1; + break; + } + //System.out.println(" docID=" + docIDs[nextUpto]); + + if (target <= docIDs[nextUpto]) { + low = nextUpto-inc; + high = nextUpto; + break; + } + inc *= 2; + nextUpto += inc; + } + + // Now do normal binary search + //System.out.println(" after fwd: low=" + low + " high=" + high); + + while (true) { + + if (low > high) { + // Not exactly found + //System.out.println(" break: no match"); + upto = low; + break; + } + + int mid = (low + high) >>> 1; + int cmp = docIDs[mid] - target; + //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]); + + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + // Found target + upto = mid; + //System.out.println(" break: match"); + break; + } + } + + //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? 
NO_MORE_DOCS : docIDs[upto])); + + if (liveDocs != null) { + while (upto < docIDs.length) { + if (liveDocs.get(docIDs[upto])) { + break; + } + upto++; + } + } + if (upto == docIDs.length) { + //System.out.println(" return END"); + return docID = NO_MORE_DOCS; + } else { + //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto); + posUpto = -posJump; + curPositions = positions[upto]; + return docID = docIDs[upto]; + } + } + + @Override + public boolean hasPayload() { + return !gotPayload && payloads != null && payloads[upto][posUpto/(hasOffsets ? 3 : 1)] != null; + } + + private final BytesRef payload = new BytesRef(); + + @Override + public BytesRef getPayload() { + final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)]; + payload.bytes = payloadBytes; + payload.length = payloadBytes.length; + payload.offset = 0; + gotPayload = true; + return payload; + } + } +} diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 4d847e135a7..aaec1cef51d 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -17,3 +17,4 @@ org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat +org.apache.lucene.codecs.memory.DirectPostingsFormat diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java b/lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java index ed3e83a0a7d..cdb89b8bb7f 100644 --- a/lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java +++ b/lucene/core/src/test/org/apache/lucene/index/Test2BPostings.java @@ -34,7 +34,7 @@ import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; * Test indexes ~82M docs with 26 terms each, so you get > Integer.MAX_VALUE terms/docs pairs * @lucene.experimental */ -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class Test2BPostings extends LuceneTestCase { @Nightly diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java b/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java index b4f282e20a2..21468468cea 100644 --- a/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java +++ b/lucene/core/src/test/org/apache/lucene/index/Test2BTerms.java @@ -41,7 +41,7 @@ import java.util.Random; // // java -server -Xmx8g -d64 -cp .:lib/junit-4.10.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms // -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class Test2BTerms extends LuceneTestCase { private final static int TOKEN_LEN = 10; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java index 914fccf4195..15c93da83b3 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterReader.java @@ -976,7 +976,7 @@ public class TestIndexWriterReader extends LuceneTestCase { // Don't 
proceed if picked Codec is in the list of illegal ones. final String format = _TestUtil.getPostingsFormat("f"); assumeFalse("Format: " + format + " does not support ReaderTermsIndexDivisor!", - (format.equals("SimpleText") || format.equals("Memory"))); + (format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct"))); Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, conf); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java b/lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java index e2a48eb12e0..63b70ba21f4 100755 --- a/lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestLazyProxSkipping.java @@ -132,8 +132,9 @@ public class TestLazyProxSkipping extends LuceneTestCase { public void testLazySkipping() throws IOException { final String fieldFormat = _TestUtil.getPostingsFormat(this.field); - assumeFalse("This test cannot run with Memory codec", fieldFormat.equals("Memory")); - assumeFalse("This test cannot run with SimpleText codec", fieldFormat.equals("SimpleText")); + assumeFalse("This test cannot run with Memory postings format", fieldFormat.equals("Memory")); + assumeFalse("This test cannot run with Direct postings format", fieldFormat.equals("Direct")); + assumeFalse("This test cannot run with SimpleText postings format", fieldFormat.equals("SimpleText")); // test whether only the minimum amount of seeks() // are performed diff --git a/lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java b/lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java index bfc02e9f39b..6a1c883a9bc 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestLongPostings.java @@ -37,7 +37,7 @@ import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util._TestUtil; -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestLongPostings extends LuceneTestCase { // Produces a realistic unicode random string that diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java b/lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java index 48e644c6c51..9490bfe8325 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestNRTThreads.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; // - mix in forceMerge, addIndexes // - randomoly mix in non-congruent docs -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestNRTThreads extends ThreadedIndexingAndSearchingTestCase { @Override diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java index f7447aa44e4..e734930e7a1 100755 --- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java @@ -40,7 +40,7 @@ import org.apache.lucene.util._TestUtil; * Test that norms info is preserved during index life - including * separate norms, addDocument, addIndexes, forceMerge. 
*/ -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) @Slow public class TestNorms extends LuceneTestCase { final String byteTestField = "normsTestByte"; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java index 76de26632da..f122d68fef1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnum.java @@ -37,7 +37,7 @@ import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.RegExp; -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestTermsEnum extends LuceneTestCase { public void test() throws Exception { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java b/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java index f88cf35dfd4..0ad6b9c2a72 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java @@ -41,7 +41,7 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.ThreadInterruptedException; -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestNRTManager extends ThreadedIndexingAndSearchingTestCase { private final ThreadLocal lastGens = new ThreadLocal(); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java b/lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java index 33157228841..cd5ac0383eb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSearchWithThreads.java @@ -29,7 +29,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.LuceneTestCase; -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestSearchWithThreads extends LuceneTestCase { int NUM_DOCS; final int NUM_SEARCH_THREADS = 5; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java b/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java index d700d963d03..3e2ac5f9dcb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSearcherManager.java @@ -43,7 +43,7 @@ import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util._TestUtil; -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestSearcherManager extends ThreadedIndexingAndSearchingTestCase { boolean warmCalled; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java b/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java index 172b5ead96a..b72b9d52813 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestShardSearching.java @@ -41,7 +41,7 @@ import org.apache.lucene.util._TestUtil; // - test pulling docs in 2nd round trip... 
// - filter too -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) public class TestShardSearching extends ShardSearchingTestBase { private static class PreviousSearchState { diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 9bc2f980fe5..cb0c815ef93 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -67,7 +67,7 @@ import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.packed.PackedInts; -@SuppressCodecs({ "SimpleText", "Memory" }) +@SuppressCodecs({ "SimpleText", "Memory", "Direct" }) @Slow public class TestFSTs extends LuceneTestCase { diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index d9973fd12a4..08334ca7da8 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -32,6 +32,7 @@ import org.apache.lucene.codecs.asserting.AssertingPostingsFormat; import org.apache.lucene.codecs.lucene40.Lucene40Codec; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds; +import org.apache.lucene.codecs.memory.DirectPostingsFormat; import org.apache.lucene.codecs.memory.MemoryPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockFixedIntBlockPostingsFormat; import org.apache.lucene.codecs.mockintblock.MockVariableIntBlockPostingsFormat; @@ -87,9 +88,11 @@ public class RandomCodec extends Lucene40Codec { // block via CL: int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100); + int lowFreqCutoff = _TestUtil.nextInt(random, 2, 100); add(avoidCodecs, new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock), + new DirectPostingsFormat(minItemsPerBlock, lowFreqCutoff), new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), // add pulsing again with (usually) different parameters new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
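
A minimal usage sketch, not from this patch itself, assuming the Lucene 4.0 trunk APIs it builds against: Lucene40Codec resolves a PostingsFormat per field (the same hook RandomCodec overrides above), so an application could route a single field to the new "Direct" format as below. The class name and the "body" field are placeholders; the no-arg constructor uses the defaults defined in the patch (minSkipCount=8, lowFreqCutoff=32).

    import org.apache.lucene.analysis.standard.StandardAnalyzer; // from the analysis module
    import org.apache.lucene.codecs.PostingsFormat;
    import org.apache.lucene.codecs.lucene40.Lucene40Codec;
    import org.apache.lucene.codecs.memory.DirectPostingsFormat;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.lucene.util.Version;

    public class DirectPostingsExample { // hypothetical example class
      public static void main(String[] args) throws Exception {
        // Defaults from this patch: minSkipCount=8, lowFreqCutoff=32:
        final PostingsFormat direct = new DirectPostingsFormat();

        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40,
            new StandardAnalyzer(Version.LUCENE_40));

        // Lucene40Codec chooses a PostingsFormat per field (the same hook
        // RandomCodec overrides above); route the example field "body" to
        // Direct and keep the default format for everything else:
        iwc.setCodec(new Lucene40Codec() {
          @Override
          public PostingsFormat getPostingsFormatForField(String field) {
            return "body".equals(field) ? direct : super.getPostingsFormatForField(field);
          }
        });

        Directory dir = new RAMDirectory();
        IndexWriter w = new IndexWriter(dir, iwc);
        // ... add documents; on disk the postings are ordinary Lucene40, but a
        // non-merge reader decodes the "body" terms + postings into RAM arrays ...
        w.close();
        dir.close();
      }
    }

Note that fieldsProducer() skips the RAM-loading path when state.context.context is IOContext.Context.MERGE, so merges stream the on-disk Lucene40 postings directly instead of paying the decode into arrays.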