Merge branch 'apache-https-master' into jira/solr-8593

Kevin Risden 2016-12-15 15:34:57 -06:00
commit 6c0cafedac
147 changed files with 6829 additions and 1551 deletions

.gitignore
View File

@@ -25,6 +25,7 @@ parent.iml
**/pom.xml
/nbproject
/nb-build
.pydevproject
/solr/package

View File

@@ -497,7 +497,6 @@ def versionToTuple(version, name):
versionTuple = versionTuple[:-2] + ('100',)
elif versionTuple[-1].lower()[:2] == 'rc':
versionTuple = versionTuple[:-2] + (versionTuple[-1][2:],)
print('%s: %s' % (version, versionTuple))
return versionTuple

View File

@@ -48,6 +48,10 @@ Optimizations
* LUCENE-7519: Add optimized APIs to compute browse-only top level
facets (Mike McCandless)
* LUCENE-7589: Numeric doc values now have the ability to encode blocks of
values using different numbers of bits per value if this proves to save
storage. (Adrien Grand)
Other
* LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize)
@@ -67,6 +71,9 @@ New features
* LUCENE-7466: Added AxiomaticSimilarity. (Peilin Yang via Tommaso Teofili)
* LUCENE-7590: Added DocValuesStatsCollector to compute statistics on DocValues
fields. (Shai Erera)
Bug Fixes
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the
@@ -81,6 +88,23 @@ Bug Fixes
* LUCENE-7536: ASCIIFoldingFilterFactory used to return an illegal multi-term
component when preserveOriginal was set to true. (Adrien Grand)
* LUCENE-7576: Fix Terms.intersect in the default codec to detect when
the incoming automaton is a special case and throw a clearer
exception than NullPointerException (Tom Mortimer via Mike McCandless)
* LUCENE-6989: Fix Exception handling in MMapDirectory's unmap hack
support code to work with Java 9's new InaccessibleObjectException
that does not extend ReflectiveOperationException in Java 9.
(Uwe Schindler)
* LUCENE-7581: Lucene now prevents updating a doc values field that is used
in the index sort, since this would lead to corruption. (Jim
Ferenczi via Mike McCandless)
* LUCENE-7570: IndexWriter may deadlock if a commit is running while
there are too many merges running and one of the merges hits a
tragic exception (Joey Echeverria via Mike McCandless)
Improvements
* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
@@ -117,11 +141,29 @@ Improvements
control how text is analyzed and converted into a query (Matt Weber
via Mike McCandless)
* LUCENE-7575: UnifiedHighlighter can now highlight fields with queries that don't
necessarily refer to that field (AKA requireFieldMatch==false). Disabled by default.
See UH get/setFieldMatcher. (Jim Ferenczi via David Smiley)
* LUCENE-7592: If the segments file is truncated, we now throw
CorruptIndexException instead of the more confusing EOFException
(Mike Drob via Mike McCandless)
Optimizations
* LUCENE-7568: Optimize merging when index sorting is used but the
index is already sorted (Jim Ferenczi via Mike McCandless)
* LUCENE-7563: The BKD in-memory index for dimensional points now uses
a compressed format, using substantially less RAM in some cases
(Adrien Grand, Mike McCandless)
* LUCENE-7583: BKD writing now buffers each leaf block in heap before
writing to disk, giving a small speedup in points-heavy use cases.
(Mike McCandless)
* LUCENE-7572: Doc values queries now cache their hash code. (Adrien Grand)
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file

View File

@@ -94,7 +94,8 @@ public class DatasetSplitter {
}
}
if (classValues == null) {
throw new IllegalStateException("field \"" + classFieldName + "\" must have sorted (set) doc values");
// approximate with no. of terms
noOfClasses += leave.reader().terms(classFieldName).size();
}
noOfClasses += valueCount;
}

View File

@@ -659,6 +659,9 @@ public final class DirectPostingsFormat extends PostingsFormat {
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new DirectIntersectTermsEnum(compiled, startTerm);
}

View File

@@ -270,6 +270,9 @@ public class FSTOrdTermsReader extends FieldsProducer {
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new IntersectTermsEnum(compiled, startTerm);
}

View File

@@ -250,6 +250,9 @@ public class FSTTermsReader extends FieldsProducer {
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new IntersectTermsEnum(compiled, startTerm);
}

View File

@@ -16,13 +16,17 @@
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.codecs.simpletext.SimpleTextUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDReader;
@@ -30,15 +34,105 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_C
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;
class SimpleTextBKDReader extends BKDReader {
/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */
public SimpleTextBKDReader(IndexInput datIn, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
final class SimpleTextBKDReader extends PointValues implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
final private byte[] splitPackedValues;
final long[] leafBlockFPs;
final private int leafNodeOffset;
final int numDims;
final int bytesPerDim;
final int bytesPerIndexEntry;
final IndexInput in;
final int maxPointsInLeafNode;
final byte[] minPackedValue;
final byte[] maxPackedValue;
final long pointCount;
final int docCount;
final int version;
protected final int packedBytesLength;
public SimpleTextBKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
super(datIn, numDims, maxPointsInLeafNode, bytesPerDim, leafBlockFPs, splitPackedValues, minPackedValue, maxPackedValue, pointCount, docCount);
this.in = in;
this.numDims = numDims;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.bytesPerDim = bytesPerDim;
// no version check here because callers of this API (SimpleText) have no back compat:
bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
packedBytesLength = numDims * bytesPerDim;
this.leafNodeOffset = leafBlockFPs.length;
this.leafBlockFPs = leafBlockFPs;
this.splitPackedValues = splitPackedValues;
this.minPackedValue = minPackedValue;
this.maxPackedValue = maxPackedValue;
this.pointCount = pointCount;
this.docCount = docCount;
this.version = SimpleTextBKDWriter.VERSION_CURRENT;
assert minPackedValue.length == packedBytesLength;
assert maxPackedValue.length == packedBytesLength;
}
@Override
protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
/** Used to track all state for a single call to {@link #intersect}. */
public static final class IntersectState {
final IndexInput in;
final int[] scratchDocIDs;
final byte[] scratchPackedValue;
final int[] commonPrefixLengths;
final IntersectVisitor visitor;
public IntersectState(IndexInput in, int numDims,
int packedBytesLength,
int maxPointsInLeafNode,
IntersectVisitor visitor) {
this.in = in;
this.visitor = visitor;
this.commonPrefixLengths = new int[numDims];
this.scratchDocIDs = new int[maxPointsInLeafNode];
this.scratchPackedValue = new byte[packedBytesLength];
}
}
public void intersect(IntersectVisitor visitor) throws IOException {
intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
}
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
private void addAll(IntersectState state, int nodeID) throws IOException {
//System.out.println("R: addAll nodeID=" + nodeID);
if (nodeID >= leafNodeOffset) {
//System.out.println("ADDALL");
visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
// TODO: we can assert that the first value here in fact matches what the index claimed?
} else {
addAll(state, 2*nodeID);
addAll(state, 2*nodeID+1);
}
}
/** Create a new {@link IntersectState} */
public IntersectState getIntersectState(IntersectVisitor visitor) {
return new IntersectState(in.clone(), numDims,
packedBytesLength,
maxPointsInLeafNode,
visitor);
}
/** Visits all docIDs and packed values in a single leaf block */
public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
int leafID = nodeID - leafNodeOffset;
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
}
void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
in.seek(blockFP);
readLine(in, scratch);
@@ -50,8 +144,7 @@ class SimpleTextBKDReader extends BKDReader {
}
}
@Override
protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
in.seek(blockFP);
readLine(in, scratch);
@@ -63,8 +156,7 @@ class SimpleTextBKDReader extends BKDReader {
return count;
}
@Override
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
visitor.grow(count);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == packedBytesLength;
@@ -79,6 +171,175 @@ class SimpleTextBKDReader extends BKDReader {
}
}
private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException {
// the byte at `compressedByteOffset` is compressed using run-length compression,
// other suffix bytes are stored verbatim
final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
commonPrefixLengths[compressedDim]++;
int i;
for (i = 0; i < count; ) {
scratchPackedValue[compressedByteOffset] = in.readByte();
final int runLen = Byte.toUnsignedInt(in.readByte());
for (int j = 0; j < runLen; ++j) {
for(int dim=0;dim<numDims;dim++) {
int prefix = commonPrefixLengths[dim];
in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
}
visitor.visit(docIDs[i+j], scratchPackedValue);
}
i += runLen;
}
if (i != count) {
throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
}
}
private int readCompressedDim(IndexInput in) throws IOException {
int compressedDim = in.readByte();
if (compressedDim < -1 || compressedDim >= numDims) {
throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
}
return compressedDim;
}
private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
for(int dim=0;dim<numDims;dim++) {
int prefix = in.readVInt();
commonPrefixLengths[dim] = prefix;
if (prefix > 0) {
in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
}
//System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
}
}
private void intersect(IntersectState state,
int nodeID,
byte[] cellMinPacked, byte[] cellMaxPacked)
throws IOException {
/*
System.out.println("\nR: intersect nodeID=" + nodeID);
for(int dim=0;dim<numDims;dim++) {
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
}
*/
Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
return;
} else if (r == Relation.CELL_INSIDE_QUERY) {
// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
addAll(state, nodeID);
return;
} else {
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
}
if (nodeID >= leafNodeOffset) {
// TODO: we can assert that the first value here in fact matches what the index claimed?
int leafID = nodeID - leafNodeOffset;
// In the unbalanced case it's possible the left most node only has one child:
if (leafID < leafBlockFPs.length) {
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
}
} else {
// Non-leaf node: recurse on the split left and right nodes
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numDims == 1) {
splitDim = 0;
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numDims;
// TODO: can we alloc & reuse this up front?
byte[] splitPackedValue = new byte[packedBytesLength];
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
intersect(state,
2*nodeID,
cellMinPacked, splitPackedValue);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
intersect(state,
2*nodeID+1,
splitPackedValue, cellMaxPacked);
}
}
/** Copies the split value for this node into the provided byte array */
public void copySplitValue(int nodeID, byte[] splitPackedValue) {
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numDims == 1) {
splitDim = 0;
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numDims;
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
}
@Override
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(splitPackedValues) +
RamUsageEstimator.sizeOf(leafBlockFPs);
}
@Override
public byte[] getMinPackedValue() {
return minPackedValue.clone();
}
@Override
public byte[] getMaxPackedValue() {
return maxPackedValue.clone();
}
@Override
public int getNumDimensions() {
return numDims;
}
@Override
public int getBytesPerDimension() {
return bytesPerDim;
}
@Override
public long size() {
return pointCount;
}
@Override
public int getDocCount() {
return docCount;
}
public boolean isLeafNode(int nodeID) {
return nodeID >= leafNodeOffset;
}
private int parseInt(BytesRefBuilder scratch, BytesRef prefix) {
assert startsWith(scratch, prefix);
return Integer.parseInt(stripPrefix(scratch, prefix));

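A note on the tree numbering used by the reader above: nodes form an implicit full binary tree in which the root is node 1, the children of node n are 2*n and 2*n+1, and any nodeID >= leafNodeOffset is a leaf whose on-disk block is found at leafBlockFPs[nodeID - leafNodeOffset]. The toy program below is only an illustration of that mapping (it assumes a perfectly balanced tree with a power-of-two leaf count); it is not code from this commit.

// Toy illustration (not Lucene code) of the implicit binary-heap numbering used by
// SimpleTextBKDReader.intersect()/addAll() above.
public class ImplicitBkdTreeDemo {
  public static void main(String[] args) {
    int numLeaves = 4;                  // corresponds to leafBlockFPs.length
    int leafNodeOffset = numLeaves;     // same convention as the reader above
    walk(1, leafNodeOffset);            // start at the root, node 1
  }

  static void walk(int nodeID, int leafNodeOffset) {
    if (nodeID >= leafNodeOffset) {
      // Leaf node: its block file pointer lives at leafBlockFPs[nodeID - leafNodeOffset]
      System.out.println("node " + nodeID + " -> leaf block " + (nodeID - leafNodeOffset));
    } else {
      // Inner node: recurse to both children, exactly like intersect() does
      System.out.println("node " + nodeID + " -> children " + (2 * nodeID) + ", " + (2 * nodeID + 1));
      walk(2 * nodeID, leafNodeOffset);
      walk(2 * nodeID + 1, leafNodeOffset);
    }
  }
}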
View File

@@ -36,7 +36,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDReader;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_FP;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BYTES_PER_DIM;
@@ -58,7 +57,7 @@ class SimpleTextPointsReader extends PointsReader {
private final IndexInput dataIn;
final SegmentReadState readState;
final Map<String,BKDReader> readers = new HashMap<>();
final Map<String,SimpleTextBKDReader> readers = new HashMap<>();
final BytesRefBuilder scratch = new BytesRefBuilder();
public SimpleTextPointsReader(SegmentReadState readState) throws IOException {
@@ -98,7 +97,7 @@ class SimpleTextPointsReader extends PointsReader {
this.readState = readState;
}
private BKDReader initReader(long fp) throws IOException {
private SimpleTextBKDReader initReader(long fp) throws IOException {
// NOTE: matches what writeIndex does in SimpleTextPointsWriter
dataIn.seek(fp);
readLine(dataIn);

View File

@@ -20,7 +20,6 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.function.IntFunction;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
@@ -33,29 +32,28 @@ import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.bkd.BKDWriter;
class SimpleTextPointsWriter extends PointsWriter {
final static BytesRef NUM_DIMS = new BytesRef("num dims ");
final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
final static BytesRef INDEX_COUNT = new BytesRef("index count ");
final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
final static BytesRef FIELD_COUNT = new BytesRef("field count ");
final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
final static BytesRef FIELD_FP = new BytesRef(" field fp ");
final static BytesRef MIN_VALUE = new BytesRef("min value ");
final static BytesRef MAX_VALUE = new BytesRef("max value ");
final static BytesRef POINT_COUNT = new BytesRef("point count ");
final static BytesRef DOC_COUNT = new BytesRef("doc count ");
final static BytesRef END = new BytesRef("END");
public final static BytesRef NUM_DIMS = new BytesRef("num dims ");
public final static BytesRef BYTES_PER_DIM = new BytesRef("bytes per dim ");
public final static BytesRef MAX_LEAF_POINTS = new BytesRef("max leaf points ");
public final static BytesRef INDEX_COUNT = new BytesRef("index count ");
public final static BytesRef BLOCK_COUNT = new BytesRef("block count ");
public final static BytesRef BLOCK_DOC_ID = new BytesRef(" doc ");
public final static BytesRef BLOCK_FP = new BytesRef(" block fp ");
public final static BytesRef BLOCK_VALUE = new BytesRef(" block value ");
public final static BytesRef SPLIT_COUNT = new BytesRef("split count ");
public final static BytesRef SPLIT_DIM = new BytesRef(" split dim ");
public final static BytesRef SPLIT_VALUE = new BytesRef(" split value ");
public final static BytesRef FIELD_COUNT = new BytesRef("field count ");
public final static BytesRef FIELD_FP_NAME = new BytesRef(" field fp name ");
public final static BytesRef FIELD_FP = new BytesRef(" field fp ");
public final static BytesRef MIN_VALUE = new BytesRef("min value ");
public final static BytesRef MAX_VALUE = new BytesRef("max value ");
public final static BytesRef POINT_COUNT = new BytesRef("point count ");
public final static BytesRef DOC_COUNT = new BytesRef("doc count ");
public final static BytesRef END = new BytesRef("END");
private IndexOutput dataOut;
final BytesRefBuilder scratch = new BytesRefBuilder();
@@ -74,106 +72,16 @@ class SimpleTextPointsWriter extends PointsWriter {
PointValues values = reader.getValues(fieldInfo.name);
boolean singleValuePerDoc = values.size() == values.getDocCount();
// We use the normal BKDWriter, but subclass to customize how it writes the index and blocks to disk:
try (BKDWriter writer = new BKDWriter(writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointNumBytes(),
BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
values.size(),
singleValuePerDoc) {
@Override
protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
write(out, NUM_DIMS);
writeInt(out, numDims);
newline(out);
write(out, BYTES_PER_DIM);
writeInt(out, bytesPerDim);
newline(out);
write(out, MAX_LEAF_POINTS);
writeInt(out, maxPointsInLeafNode);
newline(out);
write(out, INDEX_COUNT);
writeInt(out, leafBlockFPs.length);
newline(out);
write(out, MIN_VALUE);
BytesRef br = new BytesRef(minPackedValue, 0, minPackedValue.length);
write(out, br.toString());
newline(out);
write(out, MAX_VALUE);
br = new BytesRef(maxPackedValue, 0, maxPackedValue.length);
write(out, br.toString());
newline(out);
write(out, POINT_COUNT);
writeLong(out, pointCount);
newline(out);
write(out, DOC_COUNT);
writeInt(out, docsSeen.cardinality());
newline(out);
for(int i=0;i<leafBlockFPs.length;i++) {
write(out, BLOCK_FP);
writeLong(out, leafBlockFPs[i]);
newline(out);
}
assert (splitPackedValues.length % (1 + fieldInfo.getPointNumBytes())) == 0;
int count = splitPackedValues.length / (1 + fieldInfo.getPointNumBytes());
assert count == leafBlockFPs.length;
write(out, SPLIT_COUNT);
writeInt(out, count);
newline(out);
for(int i=0;i<count;i++) {
write(out, SPLIT_DIM);
writeInt(out, splitPackedValues[i * (1 + fieldInfo.getPointNumBytes())] & 0xff);
newline(out);
write(out, SPLIT_VALUE);
br = new BytesRef(splitPackedValues, 1+(i * (1+fieldInfo.getPointNumBytes())), fieldInfo.getPointNumBytes());
write(out, br.toString());
newline(out);
}
}
@Override
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
write(out, BLOCK_COUNT);
writeInt(out, count);
newline(out);
for(int i=0;i<count;i++) {
write(out, BLOCK_DOC_ID);
writeInt(out, docIDs[start+i]);
newline(out);
}
}
@Override
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixLengths, byte[] packedValue) {
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
}
@Override
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
for (int i = 0; i < count; ++i) {
BytesRef packedValue = packedValues.apply(i);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
write(out, BLOCK_VALUE);
write(out, packedValue.toString());
newline(out);
}
}
}) {
// We use our own fork of the BKDWriter to customize how it writes the index and blocks to disk:
try (SimpleTextBKDWriter writer = new SimpleTextBKDWriter(writeState.segmentInfo.maxDoc(),
writeState.directory,
writeState.segmentInfo.name,
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointNumBytes(),
SimpleTextBKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
SimpleTextBKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
values.size(),
singleValuePerDoc)) {
values.intersect(new IntersectVisitor() {
@Override
@@ -198,26 +106,6 @@ class SimpleTextPointsWriter extends PointsWriter {
}
}
private void write(IndexOutput out, String s) throws IOException {
SimpleTextUtil.write(out, s, scratch);
}
private void writeInt(IndexOutput out, int x) throws IOException {
SimpleTextUtil.write(out, Integer.toString(x), scratch);
}
private void writeLong(IndexOutput out, long x) throws IOException {
SimpleTextUtil.write(out, Long.toString(x), scratch);
}
private void write(IndexOutput out, BytesRef b) throws IOException {
SimpleTextUtil.write(out, b);
}
private void newline(IndexOutput out) throws IOException {
SimpleTextUtil.writeNewline(out);
}
@Override
public void finish() throws IOException {
SimpleTextUtil.write(dataOut, END);
@@ -250,4 +138,24 @@ class SimpleTextPointsWriter extends PointsWriter {
}
}
}
private void write(IndexOutput out, String s) throws IOException {
SimpleTextUtil.write(out, s, scratch);
}
private void writeInt(IndexOutput out, int x) throws IOException {
SimpleTextUtil.write(out, Integer.toString(x), scratch);
}
private void writeLong(IndexOutput out, long x) throws IOException {
SimpleTextUtil.write(out, Long.toString(x), scratch);
}
private void write(IndexOutput out, BytesRef b) throws IOException {
SimpleTextUtil.write(out, b);
}
private void newline(IndexOutput out) throws IOException {
SimpleTextUtil.writeNewline(out);
}
}

View File

@@ -182,6 +182,9 @@ public final class FieldReader extends Terms implements Accountable {
//System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton);
// TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
// can we optimize knowing that...?
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm, compiled.sinkState);
}

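The same type check now guards Terms.intersect in DirectPostingsFormat, FSTOrdTermsReader, FSTTermsReader and this FieldReader. Below is a minimal caller-side sketch of the path the new exception message points at, using only the public Terms and CompiledAutomaton APIs; it is an illustrative helper, not part of the commit.

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.automaton.CompiledAutomaton;

// Illustrative helper: route non-NORMAL automata through CompiledAutomaton.getTermsEnum
// instead of Terms.intersect, which now rejects them with IllegalArgumentException.
final class IntersectHelper {
  private IntersectHelper() {}

  static TermsEnum intersect(Terms terms, CompiledAutomaton compiled) throws IOException {
    if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      return terms.intersect(compiled, null);  // codec-optimized intersect path
    }
    // ALL / NONE / SINGLE cases are handled by the automaton itself
    return compiled.getTermsEnum(terms);
  }
}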
View File

@@ -33,6 +33,7 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
@@ -157,7 +158,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
}
this.numStoredFields[numBufferedDocs] = numStoredFieldsInDoc;
numStoredFieldsInDoc = 0;
endOffsets[numBufferedDocs] = bufferedDocs.length;
endOffsets[numBufferedDocs] = bufferedDocs.getPosition();
++numBufferedDocs;
if (triggerFlush()) {
flush();
@@ -210,7 +211,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
}
private boolean triggerFlush() {
return bufferedDocs.length >= chunkSize || // chunks of at least chunkSize bytes
return bufferedDocs.getPosition() >= chunkSize || // chunks of at least chunkSize bytes
numBufferedDocs >= maxDocsPerChunk;
}
@@ -223,23 +224,23 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
lengths[i] = endOffsets[i] - endOffsets[i - 1];
assert lengths[i] >= 0;
}
final boolean sliced = bufferedDocs.length >= 2 * chunkSize;
final boolean sliced = bufferedDocs.getPosition() >= 2 * chunkSize;
writeHeader(docBase, numBufferedDocs, numStoredFields, lengths, sliced);
// compress stored fields to fieldsStream
if (sliced) {
// big chunk, slice it
for (int compressed = 0; compressed < bufferedDocs.length; compressed += chunkSize) {
compressor.compress(bufferedDocs.bytes, compressed, Math.min(chunkSize, bufferedDocs.length - compressed), fieldsStream);
for (int compressed = 0; compressed < bufferedDocs.getPosition(); compressed += chunkSize) {
compressor.compress(bufferedDocs.getBytes(), compressed, Math.min(chunkSize, bufferedDocs.getPosition() - compressed), fieldsStream);
}
} else {
compressor.compress(bufferedDocs.bytes, 0, bufferedDocs.length, fieldsStream);
compressor.compress(bufferedDocs.getBytes(), 0, bufferedDocs.getPosition(), fieldsStream);
}
// reset
docBase += numBufferedDocs;
numBufferedDocs = 0;
bufferedDocs.length = 0;
bufferedDocs.reset();
numChunks++;
}
@@ -459,7 +460,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
flush();
numDirtyChunks++; // incomplete: we had to force this flush
} else {
assert bufferedDocs.length == 0;
assert bufferedDocs.getPosition() == 0;
}
if (docBase != numDocs) {
throw new RuntimeException("Wrote " + docBase + " docs, finish called with numDocs=" + numDocs);
@@ -468,7 +469,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
fieldsStream.writeVLong(numChunks);
fieldsStream.writeVLong(numDirtyChunks);
CodecUtil.writeFooter(fieldsStream);
assert bufferedDocs.length == 0;
assert bufferedDocs.getPosition() == 0;
}
// bulk merge is scary: its caused corruption bugs in the past.

View File

@@ -37,6 +37,7 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
@@ -269,8 +270,8 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
@Override
public void finishDocument() throws IOException {
// append the payload bytes of the doc after its terms
termSuffixes.writeBytes(payloadBytes.bytes, payloadBytes.length);
payloadBytes.length = 0;
termSuffixes.writeBytes(payloadBytes.getBytes(), payloadBytes.getPosition());
payloadBytes.reset();
++numDocs;
if (triggerFlush()) {
flush();
@@ -316,7 +317,7 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
}
private boolean triggerFlush() {
return termSuffixes.length >= chunkSize
return termSuffixes.getPosition() >= chunkSize
|| pendingDocs.size() >= MAX_DOCUMENTS_PER_CHUNK;
}
@@ -355,14 +356,14 @@ public final class CompressingTermVectorsWriter extends TermVectorsWriter {
flushPayloadLengths();
// compress terms and payloads and write them to the output
compressor.compress(termSuffixes.bytes, 0, termSuffixes.length, vectorsStream);
compressor.compress(termSuffixes.getBytes(), 0, termSuffixes.getPosition(), vectorsStream);
}
// reset
pendingDocs.clear();
curDoc = null;
curField = null;
termSuffixes.length = 0;
termSuffixes.reset();
numChunks++;
}

View File

@@ -28,7 +28,8 @@ import org.apache.lucene.index.SegmentWriteState;
/**
* Lucene 6.0 point format, which encodes dimensional values in a block KD-tree structure
* for fast shape intersection filtering. See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
* for fast 1D range and N dimensional shape intersection filtering.
* See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
*
* <p>This data structure is written as a series of blocks on disk, with an in-memory perfectly balanced
* binary tree of split values referencing those blocks at the leaves.
@@ -50,10 +51,13 @@ import org.apache.lucene.index.SegmentWriteState;
* <li> maxPointsInLeafNode (vInt)
* <li> bytesPerDim (vInt)
* <li> count (vInt)
* <li> byte[bytesPerDim]<sup>count</sup> (packed <code>byte[]</code> all split values)
* <li> delta-blockFP (vLong)<sup>count</sup> (delta-coded file pointers to the on-disk leaf blocks))
* <li> packed index (byte[])
* </ul>
*
* <p>The packed index uses hierarchical delta and prefix coding to compactly encode the file pointer for
* all leaf blocks, once the tree is traversed, as well as the split dimension and split value for each
* inner node of the tree.
*
* <p>After all fields blocks + index data are written, {@link CodecUtil#writeFooter} writes the checksum.
*
* <p>The <code>.dii</code> file records the file pointer in the <code>.dim</code> file where each field's

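For intuition about the delta-coded leaf block file pointers described in the javadoc above, here is a small standalone sketch of turning stored gaps back into absolute file offsets. It is an illustration only (the names and sample values are made up), not the codec's actual reader:

// Illustration (not Lucene code): decoding delta-coded leaf block file pointers.
public class DeltaBlockFPs {
  /** The first entry is an absolute pointer; each following entry stores the gap to the previous one. */
  static long[] decode(long[] deltas) {
    long[] fps = new long[deltas.length];
    long prev = 0;
    for (int i = 0; i < deltas.length; i++) {
      prev += deltas[i];
      fps[i] = prev;
    }
    return fps;
  }

  public static void main(String[] args) {
    // e.g. leaf blocks at offsets 16, 1040 and 2064 are stored as gaps 16, 1024, 1024
    long[] fps = decode(new long[] {16, 1024, 1024});
    System.out.println(java.util.Arrays.toString(fps)); // [16, 1040, 2064]
  }
}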
View File

@@ -16,7 +16,7 @@
*/
/**
* Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene62}
* for an overview of the index format.
* Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene70}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;

View File

@@ -17,8 +17,8 @@
/**
* Components from the Lucene 6.2 index format
* See {@link org.apache.lucene.codecs.lucene62} for an overview
* of the index format.
* See {@link org.apache.lucene.codecs.lucene70} for an overview
* of the current index format.
*/
package org.apache.lucene.codecs.lucene62;

View File

@@ -18,6 +18,8 @@ package org.apache.lucene.codecs.lucene70;
import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.NUMERIC_BLOCK_SHIFT;
import static org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE;
import java.io.Closeable; // javadocs
import java.io.IOException;
@@ -42,6 +44,7 @@ import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
@@ -112,12 +115,46 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
});
}
private static class MinMaxTracker {
long min, max, numValues, spaceInBits;
MinMaxTracker() {
reset();
spaceInBits = 0;
}
private void reset() {
min = Long.MAX_VALUE;
max = Long.MIN_VALUE;
numValues = 0;
}
/** Accumulate a new value. */
void update(long v) {
min = Math.min(min, v);
max = Math.max(max, v);
++numValues;
}
/** Update the required space. */
void finish() {
if (max > min) {
spaceInBits += DirectWriter.unsignedBitsRequired(max - min) * numValues;
}
}
/** Update space usage and get ready for accumulating values for the next block. */
void nextBlock() {
finish();
reset();
}
}
private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
int numDocsWithValue = 0;
long numValues = 0;
long min = Long.MAX_VALUE;
long max = Long.MIN_VALUE;
MinMaxTracker minMax = new MinMaxTracker();
MinMaxTracker blockMinMax = new MinMaxTracker();
long gcd = 0;
Set<Long> uniqueValues = new HashSet<>();
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
@@ -130,26 +167,35 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
// wrong results. Since these extreme values are unlikely, we just discard
// GCD computation for them
gcd = 1;
} else if (numValues != 0) { // minValue needs to be set first
gcd = MathUtil.gcd(gcd, v - min);
} else if (minMax.numValues != 0) { // minValue needs to be set first
gcd = MathUtil.gcd(gcd, v - minMax.min);
}
}
min = Math.min(min, v);
max = Math.max(max, v);
minMax.update(v);
blockMinMax.update(v);
if (blockMinMax.numValues == NUMERIC_BLOCK_SIZE) {
blockMinMax.nextBlock();
}
if (uniqueValues != null
&& uniqueValues.add(v)
&& uniqueValues.size() > 256) {
uniqueValues = null;
}
numValues++;
}
numDocsWithValue++;
}
minMax.finish();
blockMinMax.finish();
final long numValues = minMax.numValues;
long min = minMax.min;
final long max = minMax.max;
assert blockMinMax.spaceInBits <= minMax.spaceInBits;
if (numDocsWithValue == 0) {
meta.writeLong(-2);
meta.writeLong(0L);
@@ -166,6 +212,7 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
meta.writeLong(numValues);
final int numBitsPerValue;
boolean doBlocks = false;
Map<Long, Integer> encode = null;
if (min >= max) {
numBitsPerValue = 0;
@@ -189,12 +236,19 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
gcd = 1;
} else {
uniqueValues = null;
numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
if (gcd == 1 && min > 0
&& DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) {
min = 0;
// we do blocks if that appears to save 10+% storage
doBlocks = minMax.spaceInBits > 0 && (double) blockMinMax.spaceInBits / minMax.spaceInBits <= 0.9;
if (doBlocks) {
numBitsPerValue = 0xFF;
meta.writeInt(-2 - NUMERIC_BLOCK_SHIFT);
} else {
numBitsPerValue = DirectWriter.unsignedBitsRequired((max - min) / gcd);
if (gcd == 1 && min > 0
&& DirectWriter.unsignedBitsRequired(max) == DirectWriter.unsignedBitsRequired(max - min)) {
min = 0;
}
meta.writeInt(-1);
}
meta.writeInt(-1);
}
}
@@ -203,26 +257,79 @@ final class Lucene70DocValuesConsumer extends DocValuesConsumer implements Close
meta.writeLong(gcd);
long startOffset = data.getFilePointer();
meta.writeLong(startOffset);
if (numBitsPerValue != 0) {
values = valuesProducer.getSortedNumeric(field);
DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
long v = values.nextValue();
if (encode == null) {
writer.add((v - min) / gcd);
} else {
writer.add(encode.get(v));
}
}
}
writer.finish();
if (doBlocks) {
writeValuesMultipleBlocks(valuesProducer.getSortedNumeric(field), gcd);
} else if (numBitsPerValue != 0) {
writeValuesSingleBlock(valuesProducer.getSortedNumeric(field), numValues, numBitsPerValue, min, gcd, encode);
}
meta.writeLong(data.getFilePointer() - startOffset);
return new long[] {numDocsWithValue, numValues};
}
private void writeValuesSingleBlock(SortedNumericDocValues values, long numValues, int numBitsPerValue,
long min, long gcd, Map<Long, Integer> encode) throws IOException {
DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
long v = values.nextValue();
if (encode == null) {
writer.add((v - min) / gcd);
} else {
writer.add(encode.get(v));
}
}
}
writer.finish();
}
private void writeValuesMultipleBlocks(SortedNumericDocValues values, long gcd) throws IOException {
final long[] buffer = new long[NUMERIC_BLOCK_SIZE];
final GrowableByteArrayDataOutput encodeBuffer = new GrowableByteArrayDataOutput(NUMERIC_BLOCK_SIZE);
int upTo = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
for (int i = 0, count = values.docValueCount(); i < count; ++i) {
buffer[upTo++] = values.nextValue();
if (upTo == NUMERIC_BLOCK_SIZE) {
writeBlock(buffer, NUMERIC_BLOCK_SIZE, gcd, encodeBuffer);
upTo = 0;
}
}
}
if (upTo > 0) {
writeBlock(buffer, upTo, gcd, encodeBuffer);
}
}
private void writeBlock(long[] values, int length, long gcd, GrowableByteArrayDataOutput buffer) throws IOException {
assert length > 0;
long min = values[0];
long max = values[0];
for (int i = 1; i < length; ++i) {
final long v = values[i];
assert Math.floorMod(values[i] - min, gcd) == 0;
min = Math.min(min, v);
max = Math.max(max, v);
}
if (min == max) {
data.writeByte((byte) 0);
data.writeLong(min);
} else {
final int bitsPerValue = DirectWriter.unsignedBitsRequired(max - min);
buffer.reset();
assert buffer.getPosition() == 0;
final DirectWriter w = DirectWriter.getInstance(buffer, length, bitsPerValue);
for (int i = 0; i < length; ++i) {
w.add((values[i] - min) / gcd);
}
w.finish();
data.writeByte((byte) bitsPerValue);
data.writeLong(min);
data.writeInt(buffer.getPosition());
data.writeBytes(buffer.getBytes(), buffer.getPosition());
}
}
@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);

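The writeValuesMultipleBlocks/writeBlock code above encodes each block of NUMERIC_BLOCK_SIZE values with its own bits-per-value, and blocks are only chosen when MinMaxTracker's spaceInBits estimate shows at least a 10% saving. The back-of-the-envelope demo below, with made-up data and a simplified bits-required helper (not code from this commit), shows why per-block encoding can win when value ranges differ between blocks:

// Rough demo (not Lucene code): one global bits-per-value vs. per-block bits-per-value.
public class BlockBitsDemo {
  static final int BLOCK_SIZE = 16384; // 1 << NUMERIC_BLOCK_SHIFT in the format above

  static int bitsRequired(long maxDelta) {
    return maxDelta == 0 ? 0 : 64 - Long.numberOfLeadingZeros(maxDelta);
  }

  public static void main(String[] args) {
    long[] values = new long[2 * BLOCK_SIZE];
    for (int i = 0; i < BLOCK_SIZE; i++) values[i] = i % 16;                    // small values
    for (int i = BLOCK_SIZE; i < values.length; i++) values[i] = 1_000_000 + i; // large values

    long globalMin = Long.MAX_VALUE, globalMax = Long.MIN_VALUE;
    long blockBits = 0;
    for (int start = 0; start < values.length; start += BLOCK_SIZE) {
      long min = Long.MAX_VALUE, max = Long.MIN_VALUE;
      for (int i = start; i < start + BLOCK_SIZE; i++) {
        min = Math.min(min, values[i]);
        max = Math.max(max, values[i]);
        globalMin = Math.min(globalMin, values[i]);
        globalMax = Math.max(globalMax, values[i]);
      }
      // roughly what MinMaxTracker.finish() accumulates into spaceInBits
      blockBits += (long) bitsRequired(max - min) * BLOCK_SIZE;
    }
    long singleBits = (long) bitsRequired(globalMax - globalMin) * values.length;
    System.out.println("single range: " + singleBits + " bits, per-block: " + blockBits + " bits");
    // blocks are used when they save at least 10%, i.e. blockBits / singleBits <= 0.9
    System.out.println("use blocks? " + ((double) blockBits / singleBits <= 0.9));
  }
}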
View File

@@ -146,10 +146,11 @@ public final class Lucene70DocValuesFormat extends DocValuesFormat {
static final byte SORTED_SET = 3;
static final byte SORTED_NUMERIC = 4;
// addressing uses 16k blocks
static final int MONOTONIC_BLOCK_SIZE = 16384;
static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
static final int NUMERIC_BLOCK_SHIFT = 14;
static final int NUMERIC_BLOCK_SIZE = 1 << NUMERIC_BLOCK_SHIFT;
static final int TERMS_DICT_BLOCK_SHIFT = 4;
static final int TERMS_DICT_BLOCK_SIZE = 1 << TERMS_DICT_BLOCK_SHIFT;
static final int TERMS_DICT_BLOCK_MASK = TERMS_DICT_BLOCK_SIZE - 1;

View File

@@ -144,7 +144,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
entry.docsWithFieldLength = meta.readLong();
entry.numValues = meta.readLong();
int tableSize = meta.readInt();
if (tableSize < -1 || tableSize > 256) {
if (tableSize > 256) {
throw new CorruptIndexException("invalid table size: " + tableSize, meta);
}
if (tableSize >= 0) {
@@ -154,6 +154,11 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
entry.table[i] = meta.readLong();
}
}
if (tableSize < -1) {
entry.blockShift = -2 - tableSize;
} else {
entry.blockShift = -1;
}
entry.bitsPerValue = meta.readByte();
entry.minValue = meta.readLong();
entry.gcd = meta.readLong();
@@ -260,6 +265,7 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
private static class NumericEntry {
long[] table;
int blockShift;
byte bitsPerValue;
long docsWithFieldOffset;
long docsWithFieldLength;
@@ -429,24 +435,62 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
};
} else {
final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
if (entry.blockShift >= 0) {
// dense but split into blocks of different bits per value
final int shift = entry.blockShift;
final long mul = entry.gcd;
final int mask = (1 << shift) - 1;
return new DenseNumericDocValues(maxDoc) {
int block = -1;
long delta;
long offset;
long blockEndOffset;
LongValues values;
@Override
public long longValue() throws IOException {
return table[(int) values.get(doc)];
final int block = doc >>> shift;
if (this.block != block) {
int bitsPerValue;
do {
offset = blockEndOffset;
bitsPerValue = slice.readByte(offset++);
delta = slice.readLong(offset);
offset += Long.BYTES;
if (bitsPerValue == 0) {
blockEndOffset = offset;
} else {
final int length = slice.readInt(offset);
offset += Integer.BYTES;
blockEndOffset = offset + length;
}
this.block ++;
} while (this.block != block);
values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
}
return mul * values.get(doc & mask) + delta;
}
};
} else {
final long mul = entry.gcd;
final long delta = entry.minValue;
return new DenseNumericDocValues(maxDoc) {
@Override
public long longValue() throws IOException {
return mul * values.get(doc) + delta;
}
};
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
return new DenseNumericDocValues(maxDoc) {
@Override
public long longValue() throws IOException {
return table[(int) values.get(doc)];
}
};
} else {
final long mul = entry.gcd;
final long delta = entry.minValue;
return new DenseNumericDocValues(maxDoc) {
@Override
public long longValue() throws IOException {
return mul * values.get(doc) + delta;
}
};
}
}
}
} else {
@@ -461,24 +505,63 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
};
} else {
final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
if (entry.blockShift >= 0) {
// sparse and split into blocks of different bits per value
final int shift = entry.blockShift;
final long mul = entry.gcd;
final int mask = (1 << shift) - 1;
return new SparseNumericDocValues(disi) {
int block = -1;
long delta;
long offset;
long blockEndOffset;
LongValues values;
@Override
public long longValue() throws IOException {
return table[(int) values.get(disi.index())];
final int index = disi.index();
final int block = index >>> shift;
if (this.block != block) {
int bitsPerValue;
do {
offset = blockEndOffset;
bitsPerValue = slice.readByte(offset++);
delta = slice.readLong(offset);
offset += Long.BYTES;
if (bitsPerValue == 0) {
blockEndOffset = offset;
} else {
final int length = slice.readInt(offset);
offset += Integer.BYTES;
blockEndOffset = offset + length;
}
this.block ++;
} while (this.block != block);
values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
}
return mul * values.get(index & mask) + delta;
}
};
} else {
final long mul = entry.gcd;
final long delta = entry.minValue;
return new SparseNumericDocValues(disi) {
@Override
public long longValue() throws IOException {
return mul * values.get(disi.index()) + delta;
}
};
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
return new SparseNumericDocValues(disi) {
@Override
public long longValue() throws IOException {
return table[(int) values.get(disi.index())];
}
};
} else {
final long mul = entry.gcd;
final long delta = entry.minValue;
return new SparseNumericDocValues(disi) {
@Override
public long longValue() throws IOException {
return mul * values.get(disi.index()) + delta;
}
};
}
}
}
}
@@ -494,34 +577,75 @@ final class Lucene70DocValuesProducer extends DocValuesProducer implements Close
};
} else {
final RandomAccessInput slice = data.randomAccessSlice(entry.valuesOffset, entry.valuesLength);
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
if (entry.blockShift >= 0) {
final int shift = entry.blockShift;
final long mul = entry.gcd;
final long mask = (1L << shift) - 1;
return new LongValues() {
@Override
long block = -1;
long delta;
long offset;
long blockEndOffset;
LongValues values;
public long get(long index) {
return table[(int) values.get(index)];
}
};
} else if (entry.gcd != 1) {
final long gcd = entry.gcd;
final long minValue = entry.minValue;
return new LongValues() {
@Override
public long get(long index) {
return values.get(index) * gcd + minValue;
}
};
} else if (entry.minValue != 0) {
final long minValue = entry.minValue;
return new LongValues() {
@Override
public long get(long index) {
return values.get(index) + minValue;
final long block = index >>> shift;
if (this.block != block) {
assert block > this.block : "Reading backwards is illegal: " + this.block + " < " + block;
int bitsPerValue;
do {
offset = blockEndOffset;
try {
bitsPerValue = slice.readByte(offset++);
delta = slice.readLong(offset);
offset += Long.BYTES;
if (bitsPerValue == 0) {
blockEndOffset = offset;
} else {
final int length = slice.readInt(offset);
offset += Integer.BYTES;
blockEndOffset = offset + length;
}
} catch (IOException e) {
throw new RuntimeException(e);
}
this.block ++;
} while (this.block != block);
values = bitsPerValue == 0 ? LongValues.ZEROES : DirectReader.getInstance(slice, bitsPerValue, offset);
}
return mul * values.get(index & mask) + delta;
}
};
} else {
return values;
final LongValues values = DirectReader.getInstance(slice, entry.bitsPerValue);
if (entry.table != null) {
final long[] table = entry.table;
return new LongValues() {
@Override
public long get(long index) {
return table[(int) values.get(index)];
}
};
} else if (entry.gcd != 1) {
final long gcd = entry.gcd;
final long minValue = entry.minValue;
return new LongValues() {
@Override
public long get(long index) {
return values.get(index) * gcd + minValue;
}
};
} else if (entry.minValue != 0) {
final long minValue = entry.minValue;
return new LongValues() {
@Override
public long get(long index) {
return values.get(index) + minValue;
}
};
} else {
return values;
}
}
}
}

View File

@@ -185,6 +185,12 @@
* {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
* An optional file indicating which documents are live.
* </li>
* <li>
* {@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}.
* Optional pair of files, recording dimensionally indexed fields, to enable fast
* numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
* and geographic shape intersection (2D, 3D).
* </li>
* </ul>
* <p>Details on each of these are provided in their linked pages.</p>
* </div>
@@ -300,7 +306,12 @@
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}</td>
* <td>.liv</td>
* <td>Info about what files are live</td>
* <td>Info about what documents are live</td>
* </tr>
* <tr>
* <td>{@link org.apache.lucene.codecs.lucene60.Lucene60PointsFormat Point values}</td>
* <td>.dii, .dim</td>
* <td>Holds indexed points, if any</td>
* </tr>
* </table>
* </div>
@@ -374,6 +385,8 @@
* that is suitable for faceting/sorting/analytics.
* <li>In version 5.4, DocValues have been improved to store more information on disk:
* addresses for binary fields and ord indexes for multi-valued fields.
* <li>In version 6.0, Points were added, for multi-dimensional range/distance search.
* <li>In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
* <li>In version 7.0, DocValues have been improved to better support sparse doc values
* thanks to an iterator API.
* </li>

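As background for the point files listed above, here is a minimal, self-contained sketch of indexing a one-dimensional IntPoint and running a range query against it. It uses the standard public Lucene point APIs (lucene-core plus analyzers-common on the classpath) and is an illustration only, not code from this commit:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class PointRangeDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
      for (int price : new int[] {5, 42, 300}) {
        Document doc = new Document();
        doc.add(new IntPoint("price", price));    // indexed into the .dim/.dii point files
        doc.add(new StoredField("price", price)); // stored copy for retrieval
        writer.addDocument(doc);
      }
    }
    try (IndexReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      TopDocs hits = searcher.search(IntPoint.newRangeQuery("price", 10, 100), 10);
      System.out.println("docs with 10 <= price <= 100: " + hits.totalHits); // expect 1
    }
  }
}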
View File

@@ -1801,161 +1801,32 @@ public final class CheckIndex implements Closeable {
}
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.getPointDimensionCount() > 0) {
FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc());
status.totalValueFields++;
int dimCount = fieldInfo.getPointDimensionCount();
int bytesPerDim = fieldInfo.getPointNumBytes();
int packedBytesCount = dimCount * bytesPerDim;
byte[] lastMinPackedValue = new byte[packedBytesCount];
byte[] lastMaxPackedValue = new byte[packedBytesCount];
BytesRef scratch = new BytesRef();
scratch.length = bytesPerDim;
byte[] lastPackedValue = new byte[packedBytesCount];
long[] pointCountSeen = new long[1];
PointValues values = pointsReader.getValues(fieldInfo.name);
if (values == null) {
continue;
}
byte[] globalMinPackedValue = values.getMinPackedValue();
status.totalValueFields++;
long size = values.size();
int docCount = values.getDocCount();
if (docCount > size) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points and inconsistent docCount=" + docCount);
VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values);
values.intersect(visitor);
if (visitor.getPointCountSeen() != size) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + visitor.getPointCountSeen());
}
if (docCount > reader.maxDoc()) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but that's greater than maxDoc=" + reader.maxDoc());
if (visitor.getDocCountSeen() != docCount) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + visitor.getDocCountSeen());
}
if (globalMinPackedValue == null) {
if (size != 0) {
throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
}
} else if (globalMinPackedValue.length != packedBytesCount) {
throw new RuntimeException("getMinPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
}
byte[] globalMaxPackedValue = values.getMaxPackedValue();
if (globalMaxPackedValue == null) {
if (size != 0) {
throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldInfo.name + "\" yet size=" + size);
}
} else if (globalMaxPackedValue.length != packedBytesCount) {
throw new RuntimeException("getMaxPackedValue for field \"" + fieldInfo.name + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
}
values.intersect(new PointValues.IntersectVisitor() {
private int lastDocID = -1;
@Override
public void visit(int docID) {
throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
checkPackedValue("packed value", packedValue, docID);
pointCountSeen[0]++;
docsSeen.set(docID);
for(int dim=0;dim<dimCount;dim++) {
int offset = bytesPerDim * dim;
// Compare to last cell:
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMinPackedValue, offset) < 0) {
// This doc's point, in this dimension, is lower than the minimum value of the last cell checked:
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
}
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMaxPackedValue, offset) > 0) {
// This doc's point, in this dimension, is greater than the maximum value of the last cell checked:
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
}
}
// In the 1D case, PointValues must make a single in-order sweep through all values, and tie-break by
// increasing docID:
if (dimCount == 1) {
int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0);
if (cmp > 0) {
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldInfo.name + "\", for docID=" + docID + " is out-of-order vs the previous document's value " + Arrays.toString(lastPackedValue));
} else if (cmp == 0) {
if (docID < lastDocID) {
throw new RuntimeException("packed points value is the same, but docID=" + docID + " is out of order vs previous docID=" + lastDocID + ", field=\"" + fieldInfo.name + "\"");
}
}
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
lastDocID = docID;
}
status.totalValuePoints++;
}
@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
checkPackedValue("min packed value", minPackedValue, -1);
System.arraycopy(minPackedValue, 0, lastMinPackedValue, 0, packedBytesCount);
checkPackedValue("max packed value", maxPackedValue, -1);
System.arraycopy(maxPackedValue, 0, lastMaxPackedValue, 0, packedBytesCount);
for(int dim=0;dim<dimCount;dim++) {
int offset = bytesPerDim * dim;
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) > 0) {
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
" is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
}
// Make sure this cell is not outside of the global min/max:
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) {
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
}
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) {
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
}
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) {
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
}
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) {
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldInfo.name + "\"");
}
}
// We always pretend the query shape is so complex that it crosses every cell, so
// that packedValue is passed for every document
return PointValues.Relation.CELL_CROSSES_QUERY;
}
private void checkPackedValue(String desc, byte[] packedValue, int docID) {
if (packedValue == null) {
throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldInfo.name + "\"");
}
if (packedValue.length != packedBytesCount) {
throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldInfo.name + "\"");
}
}
});
if (pointCountSeen[0] != size) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have size=" + size + " points, but in fact has " + pointCountSeen[0]);
}
if (docsSeen.cardinality() != docCount) {
throw new RuntimeException("point values for field \"" + fieldInfo.name + "\" claims to have docCount=" + docCount + " but in fact has " + docsSeen.cardinality());
}
status.totalValuePoints += visitor.getPointCountSeen();
}
}
}
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d points] [took %.3f sec]", status.totalValueFields, status.totalValuePoints, nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
@ -1972,6 +1843,167 @@ public final class CheckIndex implements Closeable {
return status;
}
/** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries.
*
* @lucene.internal */
public static class VerifyPointsVisitor implements PointValues.IntersectVisitor {
private long pointCountSeen;
private int lastDocID = -1;
private final int maxDoc;
private final FixedBitSet docsSeen;
private final byte[] lastMinPackedValue;
private final byte[] lastMaxPackedValue;
private final byte[] lastPackedValue;
private final byte[] globalMinPackedValue;
private final byte[] globalMaxPackedValue;
private final int packedBytesCount;
private final int numDims;
private final int bytesPerDim;
private final String fieldName;
/** Sole constructor */
public VerifyPointsVisitor(String fieldName, int maxDoc, PointValues values) throws IOException {
this.maxDoc = maxDoc;
this.fieldName = fieldName;
numDims = values.getNumDimensions();
bytesPerDim = values.getBytesPerDimension();
packedBytesCount = numDims * bytesPerDim;
globalMinPackedValue = values.getMinPackedValue();
globalMaxPackedValue = values.getMaxPackedValue();
docsSeen = new FixedBitSet(maxDoc);
lastMinPackedValue = new byte[packedBytesCount];
lastMaxPackedValue = new byte[packedBytesCount];
lastPackedValue = new byte[packedBytesCount];
if (values.getDocCount() > values.size()) {
throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have size=" + values.size() + " points and inconsistent docCount=" + values.getDocCount());
}
if (values.getDocCount() > maxDoc) {
throw new RuntimeException("point values for field \"" + fieldName + "\" claims to have docCount=" + values.getDocCount() + " but that's greater than maxDoc=" + maxDoc);
}
if (globalMinPackedValue == null) {
if (values.size() != 0) {
throw new RuntimeException("getMinPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
}
} else if (globalMinPackedValue.length != packedBytesCount) {
throw new RuntimeException("getMinPackedValue for field \"" + fieldName + "\" return length=" + globalMinPackedValue.length + " array, but should be " + packedBytesCount);
}
if (globalMaxPackedValue == null) {
if (values.size() != 0) {
throw new RuntimeException("getMaxPackedValue is null points for field \"" + fieldName + "\" yet size=" + values.size());
}
} else if (globalMaxPackedValue.length != packedBytesCount) {
throw new RuntimeException("getMaxPackedValue for field \"" + fieldName + "\" return length=" + globalMaxPackedValue.length + " array, but should be " + packedBytesCount);
}
}
/** Returns total number of points in this BKD tree */
public long getPointCountSeen() {
return pointCountSeen;
}
/** Returns total number of unique docIDs in this BKD tree */
public long getDocCountSeen() {
return docsSeen.cardinality();
}
@Override
public void visit(int docID) {
throw new RuntimeException("codec called IntersectVisitor.visit without a packed value for docID=" + docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
checkPackedValue("packed value", packedValue, docID);
pointCountSeen++;
docsSeen.set(docID);
for(int dim=0;dim<numDims;dim++) {
int offset = bytesPerDim * dim;
// Compare to last cell:
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMinPackedValue, offset) < 0) {
// This doc's point, in this dimension, is lower than the minimum value of the last cell checked:
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
}
if (StringHelper.compare(bytesPerDim, packedValue, offset, lastMaxPackedValue, offset) > 0) {
// This doc's point, in this dimension, is greater than the maximum value of the last cell checked:
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", docID=" + docID + " is out-of-bounds of the last cell min=" + Arrays.toString(lastMinPackedValue) + " max=" + Arrays.toString(lastMaxPackedValue) + " dim=" + dim);
}
}
// In the 1D case, PointValues must make a single in-order sweep through all values, and tie-break by
// increasing docID:
if (numDims == 1) {
int cmp = StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0);
if (cmp > 0) {
throw new RuntimeException("packed points value " + Arrays.toString(packedValue) + " for field=\"" + fieldName + "\", for docID=" + docID + " is out-of-order vs the previous document's value " + Arrays.toString(lastPackedValue));
} else if (cmp == 0) {
if (docID < lastDocID) {
throw new RuntimeException("packed points value is the same, but docID=" + docID + " is out of order vs previous docID=" + lastDocID + ", field=\"" + fieldName + "\"");
}
}
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
lastDocID = docID;
}
}
@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
checkPackedValue("min packed value", minPackedValue, -1);
System.arraycopy(minPackedValue, 0, lastMinPackedValue, 0, packedBytesCount);
checkPackedValue("max packed value", maxPackedValue, -1);
System.arraycopy(maxPackedValue, 0, lastMaxPackedValue, 0, packedBytesCount);
for(int dim=0;dim<numDims;dim++) {
int offset = bytesPerDim * dim;
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, maxPackedValue, offset) > 0) {
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
" is out-of-bounds of the cell's maxPackedValue " + Arrays.toString(maxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
}
// Make sure this cell is not outside of the global min/max:
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMinPackedValue, offset) < 0) {
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
}
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMinPackedValue, offset) < 0) {
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
" is out-of-bounds of the global minimum " + Arrays.toString(globalMinPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
}
if (StringHelper.compare(bytesPerDim, minPackedValue, offset, globalMaxPackedValue, offset) > 0) {
throw new RuntimeException("packed points cell minPackedValue " + Arrays.toString(minPackedValue) +
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
}
if (StringHelper.compare(bytesPerDim, maxPackedValue, offset, globalMaxPackedValue, offset) > 0) {
throw new RuntimeException("packed points cell maxPackedValue " + Arrays.toString(maxPackedValue) +
" is out-of-bounds of the global maximum " + Arrays.toString(globalMaxPackedValue) + " dim=" + dim + " field=\"" + fieldName + "\"");
}
}
// We always pretend the query shape is so complex that it crosses every cell, so
// that packedValue is passed for every document
return PointValues.Relation.CELL_CROSSES_QUERY;
}
private void checkPackedValue(String desc, byte[] packedValue, int docID) {
if (packedValue == null) {
throw new RuntimeException(desc + " is null for docID=" + docID + " field=\"" + fieldName + "\"");
}
if (packedValue.length != packedBytesCount) {
throw new RuntimeException(desc + " has incorrect length=" + packedValue.length + " vs expected=" + packedBytesCount + " for docID=" + docID + " field=\"" + fieldName + "\"");
}
}
}
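A minimal usage sketch of the new visitor (the field name and the variables values/maxDoc are illustrative and assumed to be in scope; imports omitted): the visitor is handed to the per-field PointValues.intersect and the totals are read back afterwards, exactly as the CheckIndex code above does.
// Sketch: "values" is the per-field PointValues (e.g. obtained from the segment's PointsReader).
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("myPointField", maxDoc, values);
values.intersect(visitor);
long pointCount = visitor.getPointCountSeen();  // total point values visited
long docCount = visitor.getDocCountSeen();      // distinct docIDs that carried a value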
/**
* Test stored fields.
* @lucene.experimental
@ -133,19 +133,24 @@ import org.apache.lucene.util.Version;
<a name="deletionPolicy"></a>
<p>Expert: <code>IndexWriter</code> allows an optional
{@link IndexDeletionPolicy} implementation to be
specified. You can use this to control when prior commits
are deleted from the index. The default policy is {@link
KeepOnlyLastCommitDeletionPolicy} which removes all prior
commits as soon as a new commit is done (this matches
behavior before 2.2). Creating your own policy can allow
you to explicitly keep previous "point in time" commits
alive in the index for some time, to allow readers to
refresh to the new commit without having the old commit
deleted out from under them. This is necessary on
filesystems like NFS that do not support "delete on last
close" semantics, which Lucene's "point in time" search
normally relies on. </p>
{@link IndexDeletionPolicy} implementation to be specified. You
can use this to control when prior commits are deleted from
the index. The default policy is {@link KeepOnlyLastCommitDeletionPolicy}
which removes all prior commits as soon as a new commit is
done. Creating your own policy can allow you to explicitly
keep previous "point in time" commits alive in the index for
some time, either because this is useful for your application,
or to give readers enough time to refresh to the new commit
without having the old commit deleted out from under them.
The latter is necessary when multiple computers take turns opening
their own {@code IndexWriter} and {@code IndexReader}s
against a single shared index mounted via remote filesystems
like NFS which do not support "delete on last close" semantics.
A single computer accessing an index via NFS is fine with the
default deletion policy since NFS clients emulate "delete on
last close" locally. That said, accessing an index via NFS
will likely result in poor performance compared to a local IO
device. </p>
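As a concrete sketch of the above (not part of this change; "dir" is assumed to be an already-opened Directory, and SnapshotDeletionPolicy is just one example of a policy that keeps older commits alive; imports omitted):
// Keep prior "point in time" commits alive until their snapshots are released:
IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
iwc.setIndexDeletionPolicy(new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()));
IndexWriter writer = new IndexWriter(dir, iwc);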
<a name="mergePolicy"></a> <p>Expert:
<code>IndexWriter</code> allows you to separately change
@ -1614,6 +1619,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
if (!globalFieldNumberMap.contains(field, DocValuesType.NUMERIC)) {
throw new IllegalArgumentException("can only update existing numeric-docvalues fields!");
}
if (config.getIndexSortFields().contains(field)) {
throw new IllegalArgumentException("cannot update docvalues field involved in the index sort, field=" + field + ", sort=" + config.getIndexSort());
}
try {
long seqNo = docWriter.updateDocValues(new NumericDocValuesUpdate(term, field, value));
if (seqNo < 0) {
@ -1708,6 +1716,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
if (!globalFieldNumberMap.contains(f.name(), dvType)) {
throw new IllegalArgumentException("can only update existing docvalues fields! field=" + f.name() + ", type=" + dvType);
}
if (config.getIndexSortFields().contains(f.name())) {
throw new IllegalArgumentException("cannot update docvalues field involved in the index sort, field=" + f.name() + ", sort=" + config.getIndexSort());
}
switch (dvType) {
case NUMERIC:
dvUpdates[i] = new NumericDocValuesUpdate(term, f.name(), (Long) f.numericValue());
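A sketch of the behavior the new checks above enforce (field names, the analyzer and the already-opened Directory dir are illustrative; imports omitted): once "timestamp" participates in the index sort, updating its doc values is rejected.
IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
iwc.setIndexSort(new Sort(new SortField("timestamp", SortField.Type.LONG)));
try (IndexWriter w = new IndexWriter(dir, iwc)) {
  Document doc = new Document();
  doc.add(new StringField("id", "1", Field.Store.NO));
  doc.add(new NumericDocValuesField("timestamp", 1L));
  w.addDocument(doc);
  // rejected because "timestamp" is involved in the index sort:
  w.updateNumericDocValues(new Term("id", "1"), "timestamp", 42L);  // throws IllegalArgumentException
}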
@ -2941,11 +2952,16 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
@Override
public final long prepareCommit() throws IOException {
ensureOpen();
pendingSeqNo = prepareCommitInternal(config.getMergePolicy());
boolean[] doMaybeMerge = new boolean[1];
pendingSeqNo = prepareCommitInternal(doMaybeMerge);
// we must do this outside of the commitLock else we can deadlock:
if (doMaybeMerge[0]) {
maybeMerge(config.getMergePolicy(), MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
return pendingSeqNo;
}
private long prepareCommitInternal(MergePolicy mergePolicy) throws IOException {
private long prepareCommitInternal(boolean[] doMaybeMerge) throws IOException {
startCommitTime = System.nanoTime();
synchronized(commitLock) {
ensureOpen(false);
@ -3052,7 +3068,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
boolean success = false;
try {
if (anySegmentsFlushed) {
maybeMerge(mergePolicy, MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
doMaybeMerge[0] = true;
}
startCommit(toCommit);
success = true;
@ -3173,6 +3189,10 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
infoStream.message("IW", "commit: start");
}
boolean[] doMaybeMerge = new boolean[1];
long seqNo;
synchronized(commitLock) {
ensureOpen(false);
@ -3180,13 +3200,11 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
infoStream.message("IW", "commit: enter lock");
}
long seqNo;
if (pendingCommit == null) {
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "commit: now prepare");
}
seqNo = prepareCommitInternal(mergePolicy);
seqNo = prepareCommitInternal(doMaybeMerge);
} else {
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "commit: already prepared");
@ -3195,9 +3213,14 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
finishCommit();
return seqNo;
}
// we must do this outside of the commitLock else we can deadlock:
if (doMaybeMerge[0]) {
maybeMerge(mergePolicy, MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
return seqNo;
}
private final void finishCommit() throws IOException {
@ -18,7 +18,9 @@ package org.apache.lucene.index;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -474,6 +476,7 @@ public final class IndexWriterConfig extends LiveIndexWriterConfig {
}
}
this.indexSort = sort;
this.indexSortFields = Arrays.stream(sort.getSort()).map(SortField::getField).collect(Collectors.toSet());
return this;
}
@ -242,7 +242,7 @@ public abstract class LeafReader extends IndexReader {
/** Returns {@link NumericDocValues} for this field, or
* null if no numeric doc values were indexed for
* this field. The returned instance should only be
* used by a single thread. This will never return null. */
* used by a single thread. */
public abstract NumericDocValues getNumericDocValues(String field) throws IOException;
/** Returns {@link BinaryDocValues} for this field, or
@ -17,6 +17,9 @@
package org.apache.lucene.index;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
@ -98,6 +101,9 @@ public class LiveIndexWriterConfig {
/** The sort order to use to write merged segments. */
protected Sort indexSort = null;
/** The field names involved in the index sort */
protected Set<String> indexSortFields = Collections.emptySet();
// used by IndexWriterConfig
LiveIndexWriterConfig(Analyzer analyzer) {
this.analyzer = analyzer;
@ -457,6 +463,13 @@ public class LiveIndexWriterConfig {
return indexSort;
}
/**
* Returns the field names involved in the index sort
*/
public Set<String> getIndexSortFields() {
return indexSortFields;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@ -28,7 +28,9 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/**
* Prefix codes term instances (prefixes are shared)
* Prefix codes term instances (prefixes are shared). This is expected to be
* faster to build than a FST and might also be more compact if there are no
* common suffixes.
* @lucene.internal
*/
public class PrefixCodedTerms implements Accountable {
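For context, a brief usage sketch (this class is @lucene.internal, so it is mainly of interest to Lucene's own query and index code; imports omitted). Terms must be added in sorted (field, term) order:
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
builder.add(new Term("body", "apple"));
builder.add(new Term("body", "banana"));
PrefixCodedTerms terms = builder.finish();
PrefixCodedTerms.TermIterator it = terms.iterator();
for (BytesRef term = it.next(); term != null; term = it.next()) {
  System.out.println(it.field() + ":" + term.utf8ToString());  // field and text of the current term
}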
@ -56,6 +56,7 @@ final class SegmentCoreReaders {
final TermVectorsReader termVectorsReaderOrig;
final PointsReader pointsReader;
final Directory cfsReader;
final String segment;
/**
* fieldinfos for this core: means gen=-1.
* this is the exact fieldinfos these codec components saw at write.
@ -98,6 +99,8 @@ final class SegmentCoreReaders {
cfsDir = dir;
}
segment = si.info.name;
coreFieldInfos = codec.fieldInfosFormat().read(cfsDir, si.info, "", context);
final SegmentReadState segmentReadState = new SegmentReadState(cfsDir, si.info, coreFieldInfos, context);
@ -192,4 +195,9 @@ final class SegmentCoreReaders {
void removeCoreClosedListener(CoreClosedListener listener) {
coreClosedListeners.remove(listener);
}
@Override
public String toString() {
return "SegmentCoreReader(" + segment + ")";
}
}
@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.io.EOFException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
@ -277,7 +278,11 @@ public final class SegmentInfos implements Cloneable, Iterable<SegmentCommitInfo
long generation = generationFromSegmentsFileName(segmentFileName);
//System.out.println(Thread.currentThread() + ": SegmentInfos.readCommit " + segmentFileName);
try (ChecksumIndexInput input = directory.openChecksumInput(segmentFileName, IOContext.READ)) {
return readCommit(directory, input, generation);
try {
return readCommit(directory, input, generation);
} catch (EOFException e) {
throw new CorruptIndexException("Unexpected end of file while reading index.", input, e);
}
}
}
@ -49,8 +49,12 @@ public abstract class Terms {
* provided <code>startTerm</code> must be accepted by
* the automaton.
*
* <p><b>NOTE</b>: the returned TermsEnum cannot
* seek</p>.
* <p>This is an expert low-level API and will only work
* for {@code NORMAL} compiled automata. To handle any
* compiled automata you should use
* {@link CompiledAutomaton#getTermsEnum} instead.
*
* <p><b>NOTE</b>: the returned TermsEnum cannot seek</p>.
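A sketch of the recommended path for arbitrary automata (the Terms instance named terms is assumed to come from a reader; imports omitted):
Automaton a = new RegExp("foo.*").toAutomaton();
CompiledAutomaton compiled = new CompiledAutomaton(a);
TermsEnum te = compiled.getTermsEnum(terms);  // handles any automaton type, unlike Terms.intersect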
*
* <p><b>NOTE</b>: the terms dictionary is free to
* return arbitrary terms as long as the resulted visited
@ -14,8 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
package org.apache.lucene.store;
import java.io.IOException;
@ -25,6 +25,7 @@ import org.apache.lucene.util.UnicodeUtil;
/**
* A {@link DataOutput} that can be used to build a byte[].
*
* @lucene.internal
*/
public final class GrowableByteArrayDataOutput extends DataOutput {
@ -33,12 +34,13 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
/** The bytes */
public byte[] bytes;
private byte[] bytes;
/** The length */
public int length;
private int length;
// scratch for utf8 encoding of small strings
byte[] scratchBytes = new byte[16];
private byte[] scratchBytes;
/** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
public GrowableByteArrayDataOutput(int cp) {
@ -57,7 +59,9 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
@Override
public void writeBytes(byte[] b, int off, int len) {
final int newLength = length + len;
bytes = ArrayUtil.grow(bytes, newLength);
if (newLength > bytes.length) {
bytes = ArrayUtil.grow(bytes, newLength);
}
System.arraycopy(b, off, bytes, length, len);
length = newLength;
}
@ -68,7 +72,11 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
// string is small enough that we don't need to save memory by falling back to double-pass approach
// this is just an optimized writeString() that re-uses scratchBytes.
scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
if (scratchBytes == null) {
scratchBytes = new byte[ArrayUtil.oversize(maxLen, Character.BYTES)];
} else {
scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
}
int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
writeVInt(len);
writeBytes(scratchBytes, len);
@ -80,4 +88,16 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
}
}
public byte[] getBytes() {
return bytes;
}
public int getPosition() {
return length;
}
public void reset() {
length = 0;
}
}
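A sketch of the buffer-then-flush pattern these accessors enable, which is how BKDWriter now writes leaf blocks (out is assumed to be an IndexOutput; imports omitted):
GrowableByteArrayDataOutput buffer = new GrowableByteArrayDataOutput(32 * 1024);
buffer.writeVInt(42);
buffer.writeString("some leaf data");
out.writeBytes(buffer.getBytes(), 0, buffer.getPosition());  // flush the buffered block in one call
buffer.reset();  // reuse the same buffer for the next block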
@ -385,13 +385,13 @@ public class MMapDirectory extends FSDirectory {
}
}
};
} catch (ReflectiveOperationException e) {
return "Unmapping is not supported on this platform, because internal Java APIs are not compatible to this Lucene version: " + e;
} catch (SecurityException e) {
return "Unmapping is not supported, because not all required permissions are given to the Lucene JAR file: " + e +
" [Please grant at least the following permissions: RuntimePermission(\"accessClassInPackage.sun.misc\"), " +
"RuntimePermission(\"accessClassInPackage.jdk.internal.ref\"), and " +
"ReflectPermission(\"suppressAccessChecks\")]";
} catch (ReflectiveOperationException | RuntimeException e) {
return "Unmapping is not supported on this platform, because internal Java APIs are not compatible to this Lucene version: " + e;
}
}
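The outcome of this probe is surfaced to callers through MMapDirectory's UNMAP_SUPPORTED and UNMAP_NOT_SUPPORTED_REASON constants; a small sketch, assuming those constants as exposed by MMapDirectory:
if (MMapDirectory.UNMAP_SUPPORTED == false) {
  // the String built above ends up here when the unmap hack is unavailable:
  System.err.println("mmap unmapping disabled: " + MMapDirectory.UNMAP_NOT_SUPPORTED_REASON);
}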
@ -30,6 +30,15 @@ public abstract class LongValues {
};
public static final LongValues ZEROES = new LongValues() {
@Override
public long get(long index) {
return 0;
}
};
/** Get value at <code>index</code>. */
public abstract long get(long index);
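A tiny sketch of the new constant alongside the pre-existing IDENTITY mapping:
LongValues zeroes = LongValues.ZEROES;      // every index maps to 0
LongValues identity = LongValues.IDENTITY;  // pre-existing: every index maps to itself
assert zeroes.get(123) == 0 && identity.get(123) == 123;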
@ -17,14 +17,15 @@
package org.apache.lucene.util.bkd;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
@ -32,14 +33,12 @@ import org.apache.lucene.util.StringHelper;
*
* @lucene.experimental */
public class BKDReader extends PointValues implements Accountable {
public final class BKDReader extends PointValues implements Accountable {
// Packed array of byte[] holding all split values in the full binary tree:
final private byte[] splitPackedValues;
final long[] leafBlockFPs;
final private int leafNodeOffset;
final int leafNodeOffset;
final int numDims;
final int bytesPerDim;
final int bytesPerIndexEntry;
final int numLeaves;
final IndexInput in;
final int maxPointsInLeafNode;
final byte[] minPackedValue;
@ -49,6 +48,14 @@ public class BKDReader extends PointValues implements Accountable {
final int version;
protected final int packedBytesLength;
// Used for 6.4.0+ index format:
final byte[] packedIndex;
// Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index:
final private byte[] splitPackedValues;
final int bytesPerIndexEntry;
final long[] leafBlockFPs;
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
public BKDReader(IndexInput in) throws IOException {
version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
@ -59,7 +66,7 @@ public class BKDReader extends PointValues implements Accountable {
packedBytesLength = numDims * bytesPerDim;
// Read index:
int numLeaves = in.readVInt();
numLeaves = in.readVInt();
assert numLeaves > 0;
leafNodeOffset = numLeaves;
@ -78,203 +85,377 @@ public class BKDReader extends PointValues implements Accountable {
pointCount = in.readVLong();
docCount = in.readVInt();
splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
// TODO: don't write split packed values[0]!
in.readBytes(splitPackedValues, 0, splitPackedValues.length);
// Read the file pointers to the start of each leaf block:
long[] leafBlockFPs = new long[numLeaves];
long lastFP = 0;
for(int i=0;i<numLeaves;i++) {
long delta = in.readVLong();
leafBlockFPs[i] = lastFP + delta;
lastFP += delta;
}
// Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
// if it was created by BKDWriter.merge). In this case the leaf nodes may straddle the two bottom
// levels of the binary tree:
if (numDims == 1 && numLeaves > 1) {
//System.out.println("BKDR: numLeaves=" + numLeaves);
int levelCount = 2;
while (true) {
//System.out.println(" cycle levelCount=" + levelCount);
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
int lastLevel = 2*(numLeaves - levelCount);
assert lastLevel >= 0;
/*
System.out.println("BKDR: lastLevel=" + lastLevel + " vs " + levelCount);
System.out.println("FPs before:");
for(int i=0;i<leafBlockFPs.length;i++) {
System.out.println(" " + i + " " + leafBlockFPs[i]);
}
*/
if (lastLevel != 0) {
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
// at read-time, so that we can still delta code them on disk at write:
//System.out.println("BKDR: now rotate index");
long[] newLeafBlockFPs = new long[numLeaves];
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
leafBlockFPs = newLeafBlockFPs;
}
/*
System.out.println("FPs:");
for(int i=0;i<leafBlockFPs.length;i++) {
System.out.println(" " + i + " " + leafBlockFPs[i]);
}
*/
break;
}
levelCount *= 2;
}
}
this.leafBlockFPs = leafBlockFPs;
this.in = in;
}
/** Called by consumers that have their own on-disk format for the index (e.g. SimpleText) */
protected BKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
this.in = in;
this.numDims = numDims;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.bytesPerDim = bytesPerDim;
// no version check here because callers of this API (SimpleText) have no back compat:
bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
packedBytesLength = numDims * bytesPerDim;
this.leafNodeOffset = leafBlockFPs.length;
this.leafBlockFPs = leafBlockFPs;
this.splitPackedValues = splitPackedValues;
this.minPackedValue = minPackedValue;
this.maxPackedValue = maxPackedValue;
this.pointCount = pointCount;
this.docCount = docCount;
this.version = BKDWriter.VERSION_CURRENT;
assert minPackedValue.length == packedBytesLength;
assert maxPackedValue.length == packedBytesLength;
}
private static class VerifyVisitor implements IntersectVisitor {
byte[] cellMinPacked;
byte[] cellMaxPacked;
byte[] lastPackedValue;
final int numDims;
final int bytesPerDim;
final int maxDoc;
public VerifyVisitor(int numDims, int bytesPerDim, int maxDoc) {
this.numDims = numDims;
this.bytesPerDim = bytesPerDim;
this.maxDoc = maxDoc;
}
@Override
public void visit(int docID) {
throw new UnsupportedOperationException();
}
@Override
public void visit(int docID, byte[] packedValue) {
if (docID < 0 || docID >= maxDoc) {
throw new RuntimeException("docID=" + docID + " is out of bounds of 0.." + maxDoc);
}
for(int dim=0;dim<numDims;dim++) {
if (StringHelper.compare(bytesPerDim, cellMinPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) > 0) {
throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is less than this leaf block's minimum=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim));
}
if (StringHelper.compare(bytesPerDim, cellMaxPacked, dim*bytesPerDim, packedValue, dim*bytesPerDim) < 0) {
throw new RuntimeException("value=" + new BytesRef(packedValue, dim*bytesPerDim, bytesPerDim) + " for docID=" + docID + " dim=" + dim + " is greater than this leaf block's maximum=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
}
}
if (numDims == 1) {
// With only 1D, all values should always be in sorted order
if (lastPackedValue == null) {
lastPackedValue = Arrays.copyOf(packedValue, packedValue.length);
} else if (StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, 0) > 0) {
throw new RuntimeException("value=" + new BytesRef(packedValue) + " for docID=" + docID + " dim=0" + " sorts before last value=" + new BytesRef(lastPackedValue));
} else {
System.arraycopy(packedValue, 0, lastPackedValue, 0, bytesPerDim);
}
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
throw new UnsupportedOperationException();
}
}
/** Only used for debugging, to make sure all values in each leaf block fall within the range expected by the index */
// TODO: maybe we can get this into CheckIndex?
public void verify(int maxDoc) throws IOException {
//System.out.println("BKDR.verify this=" + this);
// Visits every doc in every leaf block and confirms that
// their values agree with the index:
byte[] rootMinPacked = new byte[packedBytesLength];
byte[] rootMaxPacked = new byte[packedBytesLength];
Arrays.fill(rootMaxPacked, (byte) 0xff);
verify(getIntersectState(new VerifyVisitor(numDims, bytesPerDim, maxDoc)), 1, rootMinPacked, rootMaxPacked);
}
private void verify(IntersectState state, int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
if (nodeID >= leafNodeOffset) {
int leafID = nodeID - leafNodeOffset;
// In the unbalanced case it's possible the left most node only has one child:
if (leafID < leafBlockFPs.length) {
//System.out.println("CHECK nodeID=" + nodeID + " leaf=" + (nodeID-leafNodeOffset) + " offset=" + leafNodeOffset + " fp=" + leafBlockFPs[leafID]);
//System.out.println("BKDR.verify leafID=" + leafID + " nodeID=" + nodeID + " fp=" + leafBlockFPs[leafID] + " min=" + new BytesRef(cellMinPacked) + " max=" + new BytesRef(cellMaxPacked));
// Leaf node: check that all values are in fact in bounds:
VerifyVisitor visitor = (VerifyVisitor) state.visitor;
visitor.cellMinPacked = cellMinPacked;
visitor.cellMaxPacked = cellMaxPacked;
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
} else {
//System.out.println("BKDR.verify skip leafID=" + leafID);
}
if (version >= BKDWriter.VERSION_PACKED_INDEX) {
int numBytes = in.readVInt();
packedIndex = new byte[numBytes];
in.readBytes(packedIndex, 0, numBytes);
leafBlockFPs = null;
splitPackedValues = null;
} else {
// Non-leaf node:
// legacy un-packed index
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numDims == 1) {
splitDim = 0;
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
// skip over wastefully encoded 0 splitDim:
assert splitPackedValues[address] == 0;
address++;
splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
in.readBytes(splitPackedValues, 0, splitPackedValues.length);
// Read the file pointers to the start of each leaf block:
long[] leafBlockFPs = new long[numLeaves];
long lastFP = 0;
for(int i=0;i<numLeaves;i++) {
long delta = in.readVLong();
leafBlockFPs[i] = lastFP + delta;
lastFP += delta;
}
// Possibly rotate the leaf block FPs, if the index is not a fully balanced binary tree (only happens
// if it was created by BKDWriter.merge or OneDimWriter). In this case the leaf nodes may straddle the two bottom
// levels of the binary tree:
if (numDims == 1 && numLeaves > 1) {
int levelCount = 2;
while (true) {
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
int lastLevel = 2*(numLeaves - levelCount);
assert lastLevel >= 0;
if (lastLevel != 0) {
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
// at read-time, so that we can still delta code them on disk at write:
long[] newLeafBlockFPs = new long[numLeaves];
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
leafBlockFPs = newLeafBlockFPs;
}
break;
}
levelCount *= 2;
}
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numDims;
byte[] splitPackedValue = new byte[packedBytesLength];
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
verify(state,
2*nodeID,
cellMinPacked, splitPackedValue);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
verify(state,
2*nodeID+1,
splitPackedValue, cellMaxPacked);
this.leafBlockFPs = leafBlockFPs;
packedIndex = null;
}
this.in = in;
}
long getMinLeafBlockFP() {
if (packedIndex != null) {
return new ByteArrayDataInput(packedIndex).readVLong();
} else {
long minFP = Long.MAX_VALUE;
for(long fp : leafBlockFPs) {
minFP = Math.min(minFP, fp);
}
return minFP;
}
}
/** Used to walk the in-heap index
*
* @lucene.internal */
public abstract class IndexTree implements Cloneable {
protected int nodeID;
// level is 1-based so that we can do level-1 w/o checking each time:
protected int level;
protected int splitDim;
protected final byte[][] splitPackedValueStack;
protected IndexTree() {
int treeDepth = getTreeDepth();
splitPackedValueStack = new byte[treeDepth+1][];
nodeID = 1;
level = 1;
splitPackedValueStack[level] = new byte[packedBytesLength];
}
public void pushLeft() {
nodeID *= 2;
level++;
if (splitPackedValueStack[level] == null) {
splitPackedValueStack[level] = new byte[packedBytesLength];
}
}
/** Clone, but you are not allowed to pop up past the point where the clone happened. */
public abstract IndexTree clone();
public void pushRight() {
nodeID = nodeID * 2 + 1;
level++;
if (splitPackedValueStack[level] == null) {
splitPackedValueStack[level] = new byte[packedBytesLength];
}
}
public void pop() {
nodeID /= 2;
level--;
splitDim = -1;
//System.out.println(" pop nodeID=" + nodeID);
}
public boolean isLeafNode() {
return nodeID >= leafNodeOffset;
}
public boolean nodeExists() {
return nodeID - leafNodeOffset < leafNodeOffset;
}
public int getNodeID() {
return nodeID;
}
public byte[] getSplitPackedValue() {
assert isLeafNode() == false;
assert splitPackedValueStack[level] != null: "level=" + level;
return splitPackedValueStack[level];
}
/** Only valid after pushLeft or pushRight, not pop! */
public int getSplitDim() {
assert isLeafNode() == false;
return splitDim;
}
/** Only valid after pushLeft or pushRight, not pop! */
public abstract BytesRef getSplitDimValue();
/** Only valid after pushLeft or pushRight, not pop! */
public abstract long getLeafBlockFP();
}
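A minimal sketch of the intended access pattern (it mirrors addAll/intersect further down; the method name walk is illustrative): pushLeft/pushRight and pop must be paired, depth-first.
void walk(BKDReader.IndexTree index) {
  if (index.isLeafNode()) {
    if (index.nodeExists()) {  // in unbalanced trees the left-most leaf slot may be empty
      System.out.println("leaf nodeID=" + index.getNodeID() + " fp=" + index.getLeafBlockFP());
    }
  } else {
    index.pushLeft();
    walk(index);
    index.pop();
    index.pushRight();
    walk(index);
    index.pop();
  }
}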
/** Reads the original simple yet heap-heavy index format */
private final class LegacyIndexTree extends IndexTree {
private long leafBlockFP;
private final byte[] splitDimValue = new byte[bytesPerDim];
private final BytesRef scratch = new BytesRef();
public LegacyIndexTree() {
setNodeData();
scratch.bytes = splitDimValue;
scratch.length = bytesPerDim;
}
@Override
public LegacyIndexTree clone() {
LegacyIndexTree index = new LegacyIndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFP = leafBlockFP;
index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone();
return index;
}
@Override
public void pushLeft() {
super.pushLeft();
setNodeData();
}
@Override
public void pushRight() {
super.pushRight();
setNodeData();
}
private void setNodeData() {
if (isLeafNode()) {
leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset];
splitDim = -1;
} else {
leafBlockFP = -1;
int address = nodeID * bytesPerIndexEntry;
if (numDims == 1) {
splitDim = 0;
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
// skip over wastefully encoded 0 splitDim:
assert splitPackedValues[address] == 0;
address++;
}
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim);
}
}
@Override
public long getLeafBlockFP() {
assert isLeafNode();
return leafBlockFP;
}
@Override
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
return scratch;
}
@Override
public void pop() {
super.pop();
leafBlockFP = -1;
}
}
/** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This
* format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing
* downwards one child at a time. */
private final class PackedIndexTree extends IndexTree {
// used to read the packed byte[]
private final ByteArrayDataInput in;
// holds the minimum (left most) leaf block file pointer for each level we've recursed to:
private final long[] leafBlockFPStack;
// holds the address, in the packed byte[] index, of the left-node of each level:
private final int[] leftNodePositions;
// holds the address, in the packed byte[] index, of the right-node of each level:
private final int[] rightNodePositions;
// holds the splitDim for each level:
private final int[] splitDims;
// true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
// 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we
// split on this dimension, we next pushed to the left sub-tree:
private final boolean[] negativeDeltas;
// holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses:
private final byte[][] splitValuesStack;
// scratch value to return from getPackedValue:
private final BytesRef scratch;
public PackedIndexTree() {
int treeDepth = getTreeDepth();
leafBlockFPStack = new long[treeDepth+1];
leftNodePositions = new int[treeDepth+1];
rightNodePositions = new int[treeDepth+1];
splitValuesStack = new byte[treeDepth+1][];
splitDims = new int[treeDepth+1];
negativeDeltas = new boolean[numDims*(treeDepth+1)];
in = new ByteArrayDataInput(packedIndex);
splitValuesStack[0] = new byte[packedBytesLength];
readNodeData(false);
scratch = new BytesRef();
scratch.length = bytesPerDim;
}
@Override
public PackedIndexTree clone() {
PackedIndexTree index = new PackedIndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFPStack[level] = leafBlockFPStack[level];
index.leftNodePositions[level] = leftNodePositions[level];
index.rightNodePositions[level] = rightNodePositions[level];
index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
index.splitDims[level] = splitDims[level];
return index;
}
@Override
public void pushLeft() {
int nodePosition = leftNodePositions[level];
super.pushLeft();
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = true;
in.setPosition(nodePosition);
readNodeData(true);
}
@Override
public void pushRight() {
int nodePosition = rightNodePositions[level];
super.pushRight();
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = false;
in.setPosition(nodePosition);
readNodeData(false);
}
@Override
public void pop() {
super.pop();
splitDim = splitDims[level];
}
@Override
public long getLeafBlockFP() {
assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf";
return leafBlockFPStack[level];
}
@Override
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
scratch.bytes = splitValuesStack[level];
scratch.offset = splitDim * bytesPerDim;
return scratch;
}
private void readNodeData(boolean isLeft) {
leafBlockFPStack[level] = leafBlockFPStack[level-1];
// read leaf block FP delta
if (isLeft == false) {
leafBlockFPStack[level] += in.readVLong();
}
if (isLeafNode()) {
splitDim = -1;
} else {
// read split dim, prefix, firstDiffByteDelta encoded as int:
int code = in.readVInt();
splitDim = code % numDims;
splitDims[level] = splitDim;
code /= numDims;
int prefix = code % (1+bytesPerDim);
int suffix = bytesPerDim - prefix;
if (splitValuesStack[level] == null) {
splitValuesStack[level] = new byte[packedBytesLength];
}
System.arraycopy(splitValuesStack[level-1], 0, splitValuesStack[level], 0, packedBytesLength);
if (suffix > 0) {
int firstDiffByteDelta = code / (1+bytesPerDim);
if (negativeDeltas[level*numDims + splitDim]) {
firstDiffByteDelta = -firstDiffByteDelta;
}
int oldByte = splitValuesStack[level][splitDim*bytesPerDim+prefix] & 0xFF;
splitValuesStack[level][splitDim*bytesPerDim+prefix] = (byte) (oldByte + firstDiffByteDelta);
in.readBytes(splitValuesStack[level], splitDim*bytesPerDim+prefix+1, suffix-1);
} else {
// our split value is == last split value in this dim, which can happen when there are many duplicate values
}
int leftNumBytes;
if (nodeID * 2 < leafNodeOffset) {
leftNumBytes = in.readVInt();
} else {
leftNumBytes = 0;
}
leftNodePositions[level] = in.getPosition();
rightNodePositions[level] = leftNodePositions[level] + leftNumBytes;
}
}
}
private int getTreeDepth() {
// First +1 because all the non-leaf nodes make another power
// of 2; e.g. to have a fully balanced tree with 4 leaves you
// need a depth=3 tree:
// Second +1 because MathUtil.log computes floor of the logarithm; e.g.
// with 5 leaves you need a depth=4 tree:
return MathUtil.log(numLeaves, 2) + 2;
}
/** Used to track all state for a single call to {@link #intersect}. */
@ -285,57 +466,73 @@ public class BKDReader extends PointValues implements Accountable {
final int[] commonPrefixLengths;
final IntersectVisitor visitor;
public final IndexTree index;
public IntersectState(IndexInput in, int numDims,
int packedBytesLength,
int maxPointsInLeafNode,
IntersectVisitor visitor) {
IntersectVisitor visitor,
IndexTree indexVisitor) {
this.in = in;
this.visitor = visitor;
this.commonPrefixLengths = new int[numDims];
this.scratchDocIDs = new int[maxPointsInLeafNode];
this.scratchPackedValue = new byte[packedBytesLength];
this.index = indexVisitor;
}
}
public void intersect(IntersectVisitor visitor) throws IOException {
intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
intersect(getIntersectState(visitor), minPackedValue, maxPackedValue);
}
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
private void addAll(IntersectState state, int nodeID) throws IOException {
private void addAll(IntersectState state) throws IOException {
//System.out.println("R: addAll nodeID=" + nodeID);
if (nodeID >= leafNodeOffset) {
if (state.index.isLeafNode()) {
//System.out.println("ADDALL");
visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
if (state.index.nodeExists()) {
visitDocIDs(state.in, state.index.getLeafBlockFP(), state.visitor);
}
// TODO: we can assert that the first value here in fact matches what the index claimed?
} else {
addAll(state, 2*nodeID);
addAll(state, 2*nodeID+1);
state.index.pushLeft();
addAll(state);
state.index.pop();
state.index.pushRight();
addAll(state);
state.index.pop();
}
}
/** Create a new {@link IntersectState} */
public IntersectState getIntersectState(IntersectVisitor visitor) {
IndexTree index;
if (packedIndex != null) {
index = new PackedIndexTree();
} else {
index = new LegacyIndexTree();
}
return new IntersectState(in.clone(), numDims,
packedBytesLength,
maxPointsInLeafNode,
visitor);
visitor,
index);
}
/** Visits all docIDs and packed values in a single leaf block */
public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
int leafID = nodeID - leafNodeOffset;
public void visitLeafBlockValues(IndexTree index, IntersectState state) throws IOException {
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
}
protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
// Leaf node
in.seek(blockFP);
@ -350,7 +547,7 @@ public class BKDReader extends PointValues implements Accountable {
}
}
protected int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
in.seek(blockFP);
// How many points are stored in this leaf cell:
@ -365,7 +562,7 @@ public class BKDReader extends PointValues implements Accountable {
return count;
}
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
visitor.grow(count);
readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
@ -434,13 +631,10 @@ public class BKDReader extends PointValues implements Accountable {
}
}
private void intersect(IntersectState state,
int nodeID,
byte[] cellMinPacked, byte[] cellMaxPacked)
throws IOException {
private void intersect(IntersectState state, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
/*
System.out.println("\nR: intersect nodeID=" + nodeID);
System.out.println("\nR: intersect nodeID=" + state.index.getNodeID());
for(int dim=0;dim<numDims;dim++) {
System.out.println(" dim=" + dim + "\n cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
}
@ -450,24 +644,18 @@ public class BKDReader extends PointValues implements Accountable {
if (r == Relation.CELL_OUTSIDE_QUERY) {
// This cell is fully outside of the query shape: stop recursing
return;
} else if (r == Relation.CELL_INSIDE_QUERY) {
// This cell is fully inside of the query shape: recursively add all points in this cell without filtering
addAll(state, nodeID);
return;
} else {
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
}
if (nodeID >= leafNodeOffset) {
addAll(state);
// The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering:
} else if (state.index.isLeafNode()) {
// TODO: we can assert that the first value here in fact matches what the index claimed?
int leafID = nodeID - leafNodeOffset;
// In the unbalanced case it's possible the left most node only has one child:
if (leafID < leafBlockFPs.length) {
if (state.index.nodeExists()) {
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);
int count = readDocIDs(state.in, state.index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
@ -476,65 +664,45 @@ public class BKDReader extends PointValues implements Accountable {
} else {
// Non-leaf node: recurse on the split left and right nodes
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numDims == 1) {
splitDim = 0;
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
// skip over wastefully encoded 0 splitDim:
assert splitPackedValues[address] == 0;
address++;
}
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
int splitDim = state.index.getSplitDim();
assert splitDim >= 0: "splitDim=" + splitDim;
assert splitDim < numDims;
// TODO: can we alloc & reuse this up front?
byte[] splitPackedValue = state.index.getSplitPackedValue();
BytesRef splitDimValue = state.index.getSplitDimValue();
assert splitDimValue.length == bytesPerDim;
//System.out.println(" splitDimValue=" + splitDimValue + " splitDim=" + splitDim);
byte[] splitPackedValue = new byte[packedBytesLength];
// make sure cellMin <= splitValue <= cellMax:
assert StringHelper.compare(bytesPerDim, cellMinPacked, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset) <= 0: "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
assert StringHelper.compare(bytesPerDim, cellMaxPacked, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset) >= 0: "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
// Recurse on left sub-tree:
System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
intersect(state,
2*nodeID,
cellMinPacked, splitPackedValue);
System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
state.index.pushLeft();
intersect(state, cellMinPacked, splitPackedValue);
state.index.pop();
// Restore the split dim value since it may have been overwritten while recursing:
System.arraycopy(splitPackedValue, splitDim*bytesPerDim, splitDimValue.bytes, splitDimValue.offset, bytesPerDim);
// Recurse on right sub-tree:
System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
intersect(state,
2*nodeID+1,
splitPackedValue, cellMaxPacked);
System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
state.index.pushRight();
intersect(state, splitPackedValue, cellMaxPacked);
state.index.pop();
}
}
/** Copies the split value for this node into the provided byte array */
public void copySplitValue(int nodeID, byte[] splitPackedValue) {
int address = nodeID * bytesPerIndexEntry;
int splitDim;
if (numDims == 1) {
splitDim = 0;
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
// skip over wastefully encoded 0 splitDim:
assert splitPackedValues[address] == 0;
address++;
}
} else {
splitDim = splitPackedValues[address++] & 0xff;
}
assert splitDim < numDims;
System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
}
@Override
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(splitPackedValues) +
RamUsageEstimator.sizeOf(leafBlockFPs);
if (packedIndex != null) {
return packedIndex.length;
} else {
return RamUsageEstimator.sizeOf(splitPackedValues) + RamUsageEstimator.sizeOf(leafBlockFPs);
}
}
@Override
@ -30,9 +30,12 @@ import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.store.TrackingDirectoryWrapper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@ -83,7 +86,8 @@ public class BKDWriter implements Closeable {
public static final int VERSION_COMPRESSED_DOC_IDS = 1;
public static final int VERSION_COMPRESSED_VALUES = 2;
public static final int VERSION_IMPLICIT_SPLIT_DIM_1D = 3;
public static final int VERSION_CURRENT = VERSION_IMPLICIT_SPLIT_DIM_1D;
public static final int VERSION_PACKED_INDEX = 4;
public static final int VERSION_CURRENT = VERSION_PACKED_INDEX;
/** How many bytes each docs takes in the fixed-width offline format */
private final int bytesPerDoc;
@ -325,15 +329,10 @@ public class BKDWriter implements Closeable {
bkd.numDims,
bkd.packedBytesLength,
bkd.maxPointsInLeafNode,
null,
null);
this.docMap = docMap;
long minFP = Long.MAX_VALUE;
//System.out.println("MR.init " + this + " bkdreader=" + bkd + " leafBlockFPs.length=" + bkd.leafBlockFPs.length);
for(long fp : bkd.leafBlockFPs) {
minFP = Math.min(minFP, fp);
//System.out.println(" leaf fp=" + fp);
}
state.in.seek(minFP);
state.in.seek(bkd.getMinLeafBlockFP());
this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
}
@ -341,7 +340,7 @@ public class BKDWriter implements Closeable {
//System.out.println("MR.next this=" + this);
while (true) {
if (docBlockUpto == docsInBlock) {
if (blockID == bkd.leafBlockFPs.length) {
if (blockID == bkd.leafNodeOffset) {
//System.out.println(" done!");
return false;
}
@ -481,15 +480,14 @@ public class BKDWriter implements Closeable {
}
build(1, numLeaves, values, 0, Math.toIntExact(pointCount), out,
minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
new int[maxPointsInLeafNode]);
minPackedValue, maxPackedValue, splitPackedValues, leafBlockFPs,
new int[maxPointsInLeafNode]);
long indexFP = out.getFilePointer();
writeIndex(out, leafBlockFPs, splitPackedValues);
return indexFP;
}
/* In the 1D case, we can simply sort points in ascending order and use the
* same writing logic as we use at merge time. */
private long writeField1Dim(IndexOutput out, String fieldName, MutablePointValues reader) throws IOException {
@ -560,6 +558,9 @@ public class BKDWriter implements Closeable {
return oneDimWriter.finish();
}
// reused when writing leaf blocks
private final GrowableByteArrayDataOutput scratchOut = new GrowableByteArrayDataOutput(32*1024);
private class OneDimensionBKDWriter {
final IndexOutput out;
@ -567,8 +568,8 @@ public class BKDWriter implements Closeable {
final List<byte[]> leafBlockStartValues = new ArrayList<>();
final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
final int[] leafDocs = new int[maxPointsInLeafNode];
long valueCount;
int leafCount;
private long valueCount;
private int leafCount;
OneDimensionBKDWriter(IndexOutput out) {
if (numDims != 1) {
@ -593,7 +594,7 @@ public class BKDWriter implements Closeable {
// for asserts
final byte[] lastPackedValue;
int lastDocID;
private int lastDocID;
void add(byte[] packedValue, int docID) throws IOException {
assert valueInOrder(valueCount + leafCount,
@ -610,8 +611,7 @@ public class BKDWriter implements Closeable {
if (leafCount == maxPointsInLeafNode) {
// We write a block once we hit exactly the max count ... this is different from
// when we flush a new segment, where we write between max/2 and max per leaf block,
// so merged segments will behave differently from newly flushed segments:
// when we write N > 1 dimensional points, where we write between max/2 and max per leaf block
writeLeafBlock();
leafCount = 0;
}
@ -663,42 +663,39 @@ public class BKDWriter implements Closeable {
leafBlockFPs.add(out.getFilePointer());
checkMaxLeafNodeCount(leafBlockFPs.size());
Arrays.fill(commonPrefixLengths, bytesPerDim);
// Find per-dim common prefix:
for(int dim=0;dim<numDims;dim++) {
int offset1 = dim * bytesPerDim;
int offset2 = (leafCount - 1) * packedBytesLength + offset1;
for(int j=0;j<commonPrefixLengths[dim];j++) {
if (leafValues[offset1+j] != leafValues[offset2+j]) {
commonPrefixLengths[dim] = j;
break;
}
int prefix = bytesPerDim;
int offset = (leafCount - 1) * packedBytesLength;
for(int j=0;j<bytesPerDim;j++) {
if (leafValues[j] != leafValues[offset+j]) {
prefix = j;
break;
}
}
writeLeafBlockDocs(out, leafDocs, 0, leafCount);
writeCommonPrefixes(out, commonPrefixLengths, leafValues);
commonPrefixLengths[0] = prefix;
assert scratchOut.getPosition() == 0;
writeLeafBlockDocs(scratchOut, leafDocs, 0, leafCount);
writeCommonPrefixes(scratchOut, commonPrefixLengths, leafValues);
scratchBytesRef1.length = packedBytesLength;
scratchBytesRef1.bytes = leafValues;
final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
final BytesRef scratch = new BytesRef();
{
scratch.length = packedBytesLength;
scratch.bytes = leafValues;
}
@Override
public BytesRef apply(int i) {
scratch.offset = packedBytesLength * i;
return scratch;
scratchBytesRef1.offset = packedBytesLength * i;
return scratchBytesRef1;
}
};
assert valuesInOrderAndBounds(leafCount, 0, Arrays.copyOf(leafValues, packedBytesLength),
Arrays.copyOfRange(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
packedValues, leafDocs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
scratchOut.reset();
}
}
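The pattern used by writeLeafBlock above is: serialize the whole leaf block into a reusable heap buffer, then copy it to the IndexOutput with a single writeBytes call. A minimal sketch of that pattern, assuming only the GrowableByteArrayDataOutput APIs used in this patch; the class below and its doc-ID encoding are simplified illustrations, not the real BKD leaf format:

import java.io.IOException;
import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.store.IndexOutput;

final class LeafBlockBufferSketch {
  // Reused across leaf blocks, like scratchOut above.
  private final GrowableByteArrayDataOutput scratch = new GrowableByteArrayDataOutput(32 * 1024);

  void writeBlock(IndexOutput out, int[] docIDs, int count) throws IOException {
    assert scratch.getPosition() == 0;
    scratch.writeVInt(count);
    for (int i = 0; i < count; i++) {
      scratch.writeVInt(docIDs[i]); // simplified; the real code delegates to DocIdsWriter
    }
    out.writeBytes(scratch.getBytes(), 0, scratch.getPosition());
    scratch.reset(); // the heap buffer is reused for the next leaf block
  }
}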
// TODO: there must be a simpler way?
@ -811,6 +808,24 @@ public class BKDWriter implements Closeable {
}.sort(0, pointCount);
}
// useful for debugging:
/*
private void printPathSlice(String desc, PathSlice slice, int dim) throws IOException {
System.out.println(" " + desc + " dim=" + dim + " count=" + slice.count + ":");
try(PointReader r = slice.writer.getReader(slice.start, slice.count)) {
int count = 0;
while (r.next()) {
byte[] v = r.packedValue();
System.out.println(" " + count + ": " + new BytesRef(v, dim*bytesPerDim, bytesPerDim));
count++;
if (count == slice.count) {
break;
}
}
}
}
*/
private PointWriter sort(int dim) throws IOException {
assert dim >= 0 && dim < numDims;
@ -1019,46 +1034,238 @@ public class BKDWriter implements Closeable {
return indexFP;
}
/** Subclass can change how it writes the index. */
protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
/** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure. */
private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
int numLeaves = leafBlockFPs.length;
// Possibly rotate the leaf block FPs, if the index is not a fully balanced binary tree (only happens
// if it was created by OneDimensionBKDWriter). In this case the leaf nodes may straddle the two bottom
// levels of the binary tree:
if (numDims == 1 && numLeaves > 1) {
int levelCount = 2;
while (true) {
if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
int lastLevel = 2*(numLeaves - levelCount);
assert lastLevel >= 0;
if (lastLevel != 0) {
// Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, when packing the index,
// rather than after loading at read-time, so that we can still delta code them on disk at write:
long[] newLeafBlockFPs = new long[numLeaves];
System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
leafBlockFPs = newLeafBlockFPs;
}
break;
}
levelCount *= 2;
}
}
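A worked example of the rotation above: with numLeaves = 5, levelCount settles at 4, lastLevel = 2*(5-4) = 2, and the leaf FPs {f0..f4} become {f2, f3, f4, f0, f1}. The standalone sketch below only mirrors the loop above with made-up file pointers:

import java.util.Arrays;

public class LeafFPRotationSketch {
  static long[] rotate(long[] leafBlockFPs) {
    int numLeaves = leafBlockFPs.length;
    if (numLeaves <= 1) {
      return leafBlockFPs; // mirrors the numLeaves > 1 guard above
    }
    int levelCount = 2;
    while (numLeaves < levelCount || numLeaves > 2 * levelCount) {
      levelCount *= 2;
    }
    int lastLevel = 2 * (numLeaves - levelCount);
    if (lastLevel == 0) {
      return leafBlockFPs; // the bottom level is full, no rotation needed
    }
    long[] rotated = new long[numLeaves];
    System.arraycopy(leafBlockFPs, lastLevel, rotated, 0, numLeaves - lastLevel);
    System.arraycopy(leafBlockFPs, 0, rotated, numLeaves - lastLevel, lastLevel);
    return rotated;
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(rotate(new long[] {0, 10, 20, 30, 40}))); // [20, 30, 40, 0, 10]
  }
}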
/** Reused while packing the index */
RAMOutputStream writeBuffer = new RAMOutputStream();
// This is the "file" we append the byte[] to:
List<byte[]> blocks = new ArrayList<>();
byte[] lastSplitValues = new byte[bytesPerDim * numDims];
//System.out.println("\npack index");
int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0l, blocks, 1, lastSplitValues, new boolean[numDims], false);
// Compact the byte[] blocks into a single byte[] index:
byte[] index = new byte[totalSize];
int upto = 0;
for(byte[] block : blocks) {
System.arraycopy(block, 0, index, upto, block.length);
upto += block.length;
}
assert upto == totalSize;
return index;
}
/** Appends the current contents of writeBuffer as another block on the growing in-memory file */
private int appendBlock(RAMOutputStream writeBuffer, List<byte[]> blocks) throws IOException {
int pos = Math.toIntExact(writeBuffer.getFilePointer());
byte[] bytes = new byte[pos];
writeBuffer.writeTo(bytes, 0);
writeBuffer.reset();
blocks.add(bytes);
return pos;
}
/**
* lastSplitValues holds the per-dimension split value previously seen; we use this to prefix-code the split byte[] on each inner node
*/
private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List<byte[]> blocks,
int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException {
if (nodeID >= leafBlockFPs.length) {
int leafID = nodeID - leafBlockFPs.length;
//System.out.println("recursePack leaf nodeID=" + nodeID);
// In the unbalanced case it's possible the left most node only has one child:
if (leafID < leafBlockFPs.length) {
long delta = leafBlockFPs[leafID] - minBlockFP;
if (isLeft) {
assert delta == 0;
return 0;
} else {
assert nodeID == 1 || delta > 0: "nodeID=" + nodeID;
writeBuffer.writeVLong(delta);
return appendBlock(writeBuffer, blocks);
}
} else {
return 0;
}
} else {
long leftBlockFP;
if (isLeft == false) {
leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID);
long delta = leftBlockFP - minBlockFP;
assert nodeID == 1 || delta > 0;
writeBuffer.writeVLong(delta);
} else {
// The left tree's left most leaf block FP is always the minimal FP:
leftBlockFP = minBlockFP;
}
int address = nodeID * (1+bytesPerDim);
int splitDim = splitPackedValues[address++] & 0xff;
//System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
// find common prefix with last split value in this dim:
int prefix = 0;
for(;prefix<bytesPerDim;prefix++) {
if (splitPackedValues[address+prefix] != lastSplitValues[splitDim * bytesPerDim + prefix]) {
break;
}
}
//System.out.println("writeNodeData nodeID=" + nodeID + " splitDim=" + splitDim + " numDims=" + numDims + " bytesPerDim=" + bytesPerDim + " prefix=" + prefix);
int firstDiffByteDelta;
if (prefix < bytesPerDim) {
//System.out.println(" delta byte cur=" + Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + Integer.toHexString(lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF) + " negated?=" + negativeDeltas[splitDim]);
firstDiffByteDelta = (splitPackedValues[address+prefix]&0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF);
if (negativeDeltas[splitDim]) {
firstDiffByteDelta = -firstDiffByteDelta;
}
//System.out.println(" delta=" + firstDiffByteDelta);
assert firstDiffByteDelta > 0;
} else {
firstDiffByteDelta = 0;
}
// pack the prefix, splitDim and delta first diff byte into a single vInt:
int code = (firstDiffByteDelta * (1+bytesPerDim) + prefix) * numDims + splitDim;
//System.out.println(" code=" + code);
//System.out.println(" splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
writeBuffer.writeVInt(code);
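For a concrete feel of the vInt packing above, here is a tiny round-trip sketch with made-up values (bytesPerDim = 4, numDims = 2). The decode arithmetic is what a reader would be expected to do; it is not code from this patch:

public class SplitCodeSketch {
  public static void main(String[] args) {
    int bytesPerDim = 4, numDims = 2;
    int prefix = 1, splitDim = 1, firstDiffByteDelta = 3;

    int code = (firstDiffByteDelta * (1 + bytesPerDim) + prefix) * numDims + splitDim;
    System.out.println(code); // 33

    // Unpack in the reverse order:
    int decodedSplitDim = code % numDims;         // 1
    int rest = code / numDims;                    // 16
    int decodedPrefix = rest % (1 + bytesPerDim); // 1
    int decodedDelta = rest / (1 + bytesPerDim);  // 3
    System.out.println(decodedSplitDim + " " + decodedPrefix + " " + decodedDelta);
  }
}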
// write the split value, prefix coded vs. our parent's split value:
int suffix = bytesPerDim - prefix;
byte[] savSplitValue = new byte[suffix];
if (suffix > 1) {
writeBuffer.writeBytes(splitPackedValues, address+prefix+1, suffix-1);
}
byte[] cmp = lastSplitValues.clone();
System.arraycopy(lastSplitValues, splitDim * bytesPerDim + prefix, savSplitValue, 0, suffix);
// copy our split value into lastSplitValues for our children to prefix-code against
System.arraycopy(splitPackedValues, address+prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
int numBytes = appendBlock(writeBuffer, blocks);
// placeholder for the left-tree numBytes; we need this so that, at search time, if we only need to recurse into the
// right sub-tree we can quickly seek to its starting point
int idxSav = blocks.size();
blocks.add(null);
boolean savNegativeDelta = negativeDeltas[splitDim];
negativeDeltas[splitDim] = true;
int leftNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID, lastSplitValues, negativeDeltas, true);
if (nodeID * 2 < leafBlockFPs.length) {
writeBuffer.writeVInt(leftNumBytes);
} else {
assert leftNumBytes == 0: "leftNumBytes=" + leftNumBytes;
}
int numBytes2 = Math.toIntExact(writeBuffer.getFilePointer());
byte[] bytes2 = new byte[numBytes2];
writeBuffer.writeTo(bytes2, 0);
writeBuffer.reset();
// replace our placeholder:
blocks.set(idxSav, bytes2);
negativeDeltas[splitDim] = false;
int rightNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2*nodeID+1, lastSplitValues, negativeDeltas, false);
negativeDeltas[splitDim] = savNegativeDelta;
// restore lastSplitValues to what caller originally passed us:
System.arraycopy(savSplitValue, 0, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
assert Arrays.equals(lastSplitValues, cmp);
return numBytes + numBytes2 + leftNumBytes + rightNumBytes;
}
}
private long getLeftMostLeafBlockFP(long[] leafBlockFPs, int nodeID) {
int nodeIDIn = nodeID;
// TODO: can we do this cheaper, e.g. a closed form solution instead of while loop? Or
// change the recursion while packing the index to return this left-most leaf block FP
// from each recursion instead?
//
// Still, the overall cost here is minor: this method's cost is O(log(N)), and while writing
// we call it O(N) times (N = number of leaf blocks)
while (nodeID < leafBlockFPs.length) {
nodeID *= 2;
}
int leafID = nodeID - leafBlockFPs.length;
long result = leafBlockFPs[leafID];
if (result < 0) {
throw new AssertionError(result + " for leaf " + leafID);
}
return result;
}
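For illustration only (not part of this patch), one possible closed-form answer to the TODO above: the left-most leaf under nodeID is nodeID << k for the smallest k such that nodeID << k >= numLeaves, and that k is the bit length of (numLeaves - 1) / nodeID. A hedged sketch:

public class LeftMostLeafSketch {
  static int leftMostLeafNodeID(int nodeID, int numLeaves) {
    int k = 32 - Integer.numberOfLeadingZeros((numLeaves - 1) / nodeID);
    return nodeID << k;
  }

  public static void main(String[] args) {
    // A few hand-checked cases against the while-loop version above, for numLeaves = 5:
    System.out.println(leftMostLeafNodeID(1, 5)); // 8
    System.out.println(leftMostLeafNodeID(3, 5)); // 6
    System.out.println(leftMostLeafNodeID(5, 5)); // 5
  }
}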
private void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException {
byte[] packedIndex = packIndex(leafBlockFPs, splitPackedValues);
writeIndex(out, leafBlockFPs.length, packedIndex);
}
private void writeIndex(IndexOutput out, int numLeaves, byte[] packedIndex) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
out.writeVInt(numDims);
out.writeVInt(maxPointsInLeafNode);
out.writeVInt(bytesPerDim);
assert leafBlockFPs.length > 0;
out.writeVInt(leafBlockFPs.length);
assert numLeaves > 0;
out.writeVInt(numLeaves);
out.writeBytes(minPackedValue, 0, packedBytesLength);
out.writeBytes(maxPackedValue, 0, packedBytesLength);
out.writeVLong(pointCount);
out.writeVInt(docsSeen.cardinality());
// NOTE: splitPackedValues[0] is unused, because nodeID is 1-based:
if (numDims == 1) {
// write the index, skipping the byte used to store the split dim since it is always 0
for (int i = 1; i < splitPackedValues.length; i += 1 + bytesPerDim) {
out.writeBytes(splitPackedValues, i, bytesPerDim);
}
} else {
out.writeBytes(splitPackedValues, 0, splitPackedValues.length);
}
long lastFP = 0;
for (int i=0;i<leafBlockFPs.length;i++) {
long delta = leafBlockFPs[i]-lastFP;
out.writeVLong(delta);
lastFP = leafBlockFPs[i];
}
out.writeVInt(packedIndex.length);
out.writeBytes(packedIndex, 0, packedIndex.length);
}
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
private void writeLeafBlockDocs(DataOutput out, int[] docIDs, int start, int count) throws IOException {
assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
out.writeVInt(count);
DocIdsWriter.writeDocIds(docIDs, start, count, out);
}
protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
if (prefixLenSum == packedBytesLength) {
// all values in this block are equal
@ -1083,7 +1290,7 @@ public class BKDWriter implements Closeable {
}
}
private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
for (int i = start; i < end; ++i) {
BytesRef ref = packedValues.apply(i);
assert ref.length == packedBytesLength;
@ -1109,7 +1316,7 @@ public class BKDWriter implements Closeable {
return end - start;
}
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
private void writeCommonPrefixes(DataOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
for(int dim=0;dim<numDims;dim++) {
out.writeVInt(commonPrefixes[dim]);
//System.out.println(commonPrefixes[dim] + " of " + bytesPerDim);
@ -1177,7 +1384,7 @@ public class BKDWriter implements Closeable {
// TODO: find a way to also checksum this reader? If we changed to markLeftTree, and scanned the final chunk, it could work?
try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
boolean result = reader.next();
assert result;
assert result: "rightCount=" + rightCount + " source.count=" + source.count + " source.writer=" + source.writer;
System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
if (numDims > 1) {
assert ordBitSet.get(reader.ord()) == false;
@ -1242,14 +1449,15 @@ public class BKDWriter implements Closeable {
}
}
/* Recursively reorders the provided reader and writes the bkd-tree on the fly. */
/* Recursively reorders the provided reader and writes the bkd-tree on the fly; this method is used
* when we are writing a new segment directly from IndexWriter's indexing buffer (MutablePointsReader). */
private void build(int nodeID, int leafNodeOffset,
MutablePointValues reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
MutablePointValues reader, int from, int to,
IndexOutput out,
byte[] minPackedValue, byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs,
int[] spareDocIds) throws IOException {
if (nodeID >= leafNodeOffset) {
// leaf node
@ -1306,17 +1514,20 @@ public class BKDWriter implements Closeable {
// Save the block file pointer:
leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
assert scratchOut.getPosition() == 0;
// Write doc IDs
int[] docIDs = spareDocIds;
for (int i = from; i < to; ++i) {
docIDs[i - from] = reader.getDocID(i);
}
writeLeafBlockDocs(out, docIDs, 0, count);
//System.out.println("writeLeafBlock pos=" + out.getFilePointer());
writeLeafBlockDocs(scratchOut, docIDs, 0, count);
// Write the common prefixes:
reader.getValue(from, scratchBytesRef1);
System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset, scratch1, 0, packedBytesLength);
writeCommonPrefixes(out, commonPrefixLengths, scratch1);
writeCommonPrefixes(scratchOut, commonPrefixLengths, scratch1);
// Write the full values:
IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
@ -1328,7 +1539,10 @@ public class BKDWriter implements Closeable {
};
assert valuesInOrderAndBounds(count, sortedDim, minPackedValue, maxPackedValue, packedValues,
docIDs, 0);
writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, count, sortedDim, packedValues);
out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
scratchOut.reset();
} else {
// inner node
@ -1344,6 +1558,7 @@ public class BKDWriter implements Closeable {
break;
}
}
MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);
@ -1368,7 +1583,8 @@ public class BKDWriter implements Closeable {
}
}
/** The array (sized numDims) of PathSlice describe the cell we have currently recursed to. */
/** The array (sized numDims) of PathSlice describes the cell we have currently recursed to.
 *  This method is used when we are merging previously written segments, in the numDims > 1 case. */
private void build(int nodeID, int leafNodeOffset,
PathSlice[] slices,
LongBitSet ordBitSet,
@ -1381,7 +1597,7 @@ public class BKDWriter implements Closeable {
for(PathSlice slice : slices) {
assert slice.count == slices[0].count;
}
if (numDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
// Special case for 1D, to cutover to heap once we recurse deeply enough:
slices[0] = switchToHeap(slices[0], toCloseHeroically);

View File

@ -19,14 +19,14 @@ package org.apache.lucene.util.bkd;
import java.io.IOException;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
class DocIdsWriter {
private DocIdsWriter() {}
static void writeDocIds(int[] docIds, int start, int count, IndexOutput out) throws IOException {
static void writeDocIds(int[] docIds, int start, int count, DataOutput out) throws IOException {
// docs can be sorted either when all docs in a block have the same value
// or when a segment is sorted
boolean sorted = true;
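The rest of writeDocIds is not shown in this hunk; the idea stated in the comment can be sketched as below. The exact on-disk markers and encodings of Lucene's DocIdsWriter are not reproduced here, only the sorted/unsorted branch:

import java.io.IOException;
import org.apache.lucene.store.DataOutput;

final class DocIdsSketch {
  static void write(int[] docIds, int start, int count, DataOutput out) throws IOException {
    boolean sorted = true;
    for (int i = 1; i < count; i++) {
      if (docIds[start + i - 1] > docIds[start + i]) {
        sorted = false;
        break;
      }
    }
    if (sorted) {
      out.writeByte((byte) 0); // illustrative marker, not the real format
      int previous = 0;
      for (int i = 0; i < count; i++) {
        int doc = docIds[start + i];
        out.writeVInt(doc - previous); // ascending doc IDs compress well as deltas
        previous = doc;
      }
    } else {
      out.writeByte((byte) 1); // illustrative marker
      for (int i = 0; i < count; i++) {
        out.writeInt(docIds[start + i]);
      }
    }
  }
}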

View File

@ -18,7 +18,10 @@ package org.apache.lucene.util.bkd;
import java.util.List;
final class HeapPointReader extends PointReader {
/** Utility class to read buffered points from in-heap arrays.
*
* @lucene.internal */
public final class HeapPointReader extends PointReader {
private int curRead;
final List<byte[]> blocks;
final int valuesPerBlock;
@ -30,7 +33,7 @@ final class HeapPointReader extends PointReader {
final byte[] scratch;
final boolean singleValuePerDoc;
HeapPointReader(List<byte[]> blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end, boolean singleValuePerDoc) {
public HeapPointReader(List<byte[]> blocks, int valuesPerBlock, int packedBytesLength, int[] ords, long[] ordsLong, int[] docIDs, int start, int end, boolean singleValuePerDoc) {
this.blocks = blocks;
this.valuesPerBlock = valuesPerBlock;
this.singleValuePerDoc = singleValuePerDoc;

View File

@ -24,18 +24,21 @@ import java.util.List;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
final class HeapPointWriter implements PointWriter {
int[] docIDs;
long[] ordsLong;
int[] ords;
/** Utility class to write new points into in-heap arrays.
*
* @lucene.internal */
public final class HeapPointWriter implements PointWriter {
public int[] docIDs;
public long[] ordsLong;
public int[] ords;
private int nextWrite;
private boolean closed;
final int maxSize;
final int valuesPerBlock;
public final int valuesPerBlock;
final int packedBytesLength;
final boolean singleValuePerDoc;
// NOTE: can't use ByteBlockPool because we need random-write access when sorting in heap
final List<byte[]> blocks = new ArrayList<>();
public final List<byte[]> blocks = new ArrayList<>();
public HeapPointWriter(int initSize, int maxSize, int packedBytesLength, boolean longOrds, boolean singleValuePerDoc) {
docIDs = new int[initSize];
@ -77,7 +80,7 @@ final class HeapPointWriter implements PointWriter {
nextWrite = other.nextWrite;
}
void readPackedValue(int index, byte[] bytes) {
public void readPackedValue(int index, byte[] bytes) {
assert bytes.length == packedBytesLength;
int block = index / valuesPerBlock;
int blockIndex = index % valuesPerBlock;
@ -85,7 +88,7 @@ final class HeapPointWriter implements PointWriter {
}
/** Returns a reference, in <code>result</code>, to the byte[] slice holding this value */
void getPackedValueSlice(int index, BytesRef result) {
public void getPackedValueSlice(int index, BytesRef result) {
int block = index / valuesPerBlock;
int blockIndex = index % valuesPerBlock;
result.bytes = blocks.get(block);
@ -138,7 +141,8 @@ final class HeapPointWriter implements PointWriter {
@Override
public PointReader getReader(long start, long length) {
assert start + length <= docIDs.length: "start=" + start + " length=" + length + " docIDs.length=" + docIDs.length;
return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, nextWrite, singleValuePerDoc);
assert start + length <= nextWrite: "start=" + start + " length=" + length + " nextWrite=" + nextWrite;
return new HeapPointReader(blocks, valuesPerBlock, packedBytesLength, ords, ordsLong, docIDs, (int) start, Math.toIntExact(start+length), singleValuePerDoc);
}
@Override

View File

@ -26,13 +26,16 @@ import org.apache.lucene.util.Selector;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.PackedInts;
final class MutablePointsReaderUtils {
/** Utility APIs for sorting and partitioning buffered points.
*
* @lucene.internal */
public final class MutablePointsReaderUtils {
MutablePointsReaderUtils() {}
/** Sort the given {@link MutablePointValues} based on its packed value then doc ID. */
static void sort(int maxDoc, int packedBytesLength,
MutablePointValues reader, int from, int to) {
public static void sort(int maxDoc, int packedBytesLength,
MutablePointValues reader, int from, int to) {
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);
new MSBRadixSorter(packedBytesLength + (bitsPerDocId + 7) / 8) {
@ -88,9 +91,9 @@ final class MutablePointsReaderUtils {
}
/** Sort points on the given dimension. */
static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
MutablePointValues reader, int from, int to,
BytesRef scratch1, BytesRef scratch2) {
public static void sortByDim(int sortedDim, int bytesPerDim, int[] commonPrefixLengths,
MutablePointValues reader, int from, int to,
BytesRef scratch1, BytesRef scratch2) {
// No need for a fancy radix sort here, this is called on the leaves only so
// there are not many values to sort
@ -127,9 +130,9 @@ final class MutablePointsReaderUtils {
/** Partition points around {@code mid}. All values on the left must be less
* than or equal to it and all values on the right must be greater than or
* equal to it. */
static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
MutablePointValues reader, int from, int to, int mid,
BytesRef scratch1, BytesRef scratch2) {
public static void partition(int maxDoc, int splitDim, int bytesPerDim, int commonPrefixLen,
MutablePointValues reader, int from, int to, int mid,
BytesRef scratch1, BytesRef scratch2) {
final int offset = splitDim * bytesPerDim + commonPrefixLen;
final int cmpBytes = bytesPerDim - commonPrefixLen;
final int bitsPerDocId = PackedInts.bitsRequired(maxDoc - 1);

View File

@ -27,8 +27,10 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LongBitSet;
/** Reads points from disk in a fixed-width format, previously written with {@link OfflinePointWriter}. */
final class OfflinePointReader extends PointReader {
/** Reads points from disk in a fixed-width format, previously written with {@link OfflinePointWriter}.
*
* @lucene.internal */
public final class OfflinePointReader extends PointReader {
long countLeft;
final IndexInput in;
private final byte[] packedValue;
@ -43,7 +45,7 @@ final class OfflinePointReader extends PointReader {
// File name we are reading
final String name;
OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
public OfflinePointReader(Directory tempDir, String tempFileName, int packedBytesLength, long start, long length,
boolean longOrds, boolean singleValuePerDoc) throws IOException {
this.singleValuePerDoc = singleValuePerDoc;
int bytesPerDoc = packedBytesLength + Integer.BYTES;

View File

@ -26,12 +26,14 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
/** Writes points to disk in a fixed-width format. */
final class OfflinePointWriter implements PointWriter {
/** Writes points to disk in a fixed-width format.
*
* @lucene.internal */
public final class OfflinePointWriter implements PointWriter {
final Directory tempDir;
final IndexOutput out;
final String name;
public final IndexOutput out;
public final String name;
final int packedBytesLength;
final boolean singleValuePerDoc;
long count;

View File

@ -24,20 +24,22 @@ import org.apache.lucene.util.LongBitSet;
/** One pass iterator through all points previously written with a
* {@link PointWriter}, abstracting away whether points are read
* from (offline) disk or simple arrays in heap. */
abstract class PointReader implements Closeable {
* from (offline) disk or simple arrays in heap.
*
* @lucene.internal */
public abstract class PointReader implements Closeable {
/** Returns false once iteration is done, else true. */
abstract boolean next() throws IOException;
public abstract boolean next() throws IOException;
/** Returns the packed byte[] value */
abstract byte[] packedValue();
public abstract byte[] packedValue();
/** Point ordinal */
abstract long ord();
public abstract long ord();
/** DocID for this point */
abstract int docID();
public abstract int docID();
/** Iterates through the next {@code count} ords, marking them in the provided {@code ordBitSet}. */
public void markOrds(long count, LongBitSet ordBitSet) throws IOException {

View File

@ -23,8 +23,10 @@ import java.util.List;
/** Appends many points, and then at the end provides a {@link PointReader} to iterate
* those points. This abstracts away whether we write to disk, or use simple arrays
* in heap. */
interface PointWriter extends Closeable {
* in heap.
*
* @lucene.internal */
public interface PointWriter extends Closeable {
/** Add a new point */
void append(byte[] packedValue, long ord, int docID) throws IOException;

View File

@ -21,7 +21,7 @@ import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.DataOutput;
/**
* Class for writing packed integers to be directly read from Directory.
@ -44,7 +44,7 @@ import org.apache.lucene.store.IndexOutput;
public final class DirectWriter {
final int bitsPerValue;
final long numValues;
final IndexOutput output;
final DataOutput output;
long count;
boolean finished;
@ -56,7 +56,7 @@ public final class DirectWriter {
final BulkOperation encoder;
final int iterations;
DirectWriter(IndexOutput output, long numValues, int bitsPerValue) {
DirectWriter(DataOutput output, long numValues, int bitsPerValue) {
this.output = output;
this.numValues = numValues;
this.bitsPerValue = bitsPerValue;
@ -103,7 +103,7 @@ public final class DirectWriter {
}
/** Returns an instance suitable for encoding {@code numValues} using {@code bitsPerValue} */
public static DirectWriter getInstance(IndexOutput output, long numValues, int bitsPerValue) {
public static DirectWriter getInstance(DataOutput output, long numValues, int bitsPerValue) {
if (Arrays.binarySearch(SUPPORTED_BITS_PER_VALUE, bitsPerValue) < 0) {
throw new IllegalArgumentException("Unsupported bitsPerValue " + bitsPerValue + ". Did you use bitsRequired?");
}
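Since DirectWriter now accepts any DataOutput, it can write packed values straight into an in-heap buffer. A small usage sketch, assuming only the DirectWriter and GrowableByteArrayDataOutput APIs that appear elsewhere in this patch:

import java.io.IOException;
import org.apache.lucene.store.GrowableByteArrayDataOutput;
import org.apache.lucene.util.packed.DirectWriter;

public class DirectWriterSketch {
  public static void main(String[] args) throws IOException {
    long[] values = {3, 7, 1, 15, 0};
    int bitsPerValue = DirectWriter.bitsRequired(15); // 4
    GrowableByteArrayDataOutput out = new GrowableByteArrayDataOutput(128);
    DirectWriter writer = DirectWriter.getInstance(out, values.length, bitsPerValue);
    for (long v : values) {
      writer.add(v);
    }
    writer.finish();
    System.out.println("encoded " + out.getPosition() + " bytes with " + bitsPerValue + " bits per value");
  }
}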

View File

@ -25,6 +25,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
import org.apache.lucene.analysis.MockAnalyzer;
@ -61,6 +62,7 @@ import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMFile;
@ -534,4 +536,154 @@ public class TestLucene70DocValuesFormat extends BaseCompressingDocValuesFormatT
dir.close();
}
}
@Slow
public void testSortedNumericBlocksOfVariousBitsPerValue() throws Exception {
doTestSortedNumericBlocksOfVariousBitsPerValue(() -> TestUtil.nextInt(random(), 1, 3));
}
@Slow
public void testSparseSortedNumericBlocksOfVariousBitsPerValue() throws Exception {
doTestSortedNumericBlocksOfVariousBitsPerValue(() -> TestUtil.nextInt(random(), 0, 2));
}
@Slow
public void testNumericBlocksOfVariousBitsPerValue() throws Exception {
doTestSparseNumericBlocksOfVariousBitsPerValue(1);
}
@Slow
public void testSparseNumericBlocksOfVariousBitsPerValue() throws Exception {
doTestSparseNumericBlocksOfVariousBitsPerValue(random().nextDouble());
}
private static LongSupplier blocksOfVariousBPV() {
final long mul = TestUtil.nextInt(random(), 1, 100);
final long min = random().nextInt();
return new LongSupplier() {
int i = Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE;
int maxDelta;
@Override
public long getAsLong() {
if (i == Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE) {
maxDelta = 1 << random().nextInt(5);
i = 0;
}
i++;
return min + mul * random().nextInt(maxDelta);
}
};
}
private void doTestSortedNumericBlocksOfVariousBitsPerValue(LongSupplier counts) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMaxBufferedDocs(atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE));
conf.setRAMBufferSizeMB(-1);
conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
IndexWriter writer = new IndexWriter(dir, conf);
final int numDocs = atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
final LongSupplier values = blocksOfVariousBPV();
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
int valueCount = (int) counts.getAsLong();
long valueArray[] = new long[valueCount];
for (int j = 0; j < valueCount; j++) {
long value = values.getAsLong();
valueArray[j] = value;
doc.add(new SortedNumericDocValuesField("dv", value));
}
Arrays.sort(valueArray);
for (int j = 0; j < valueCount; j++) {
doc.add(new StoredField("stored", Long.toString(valueArray[j])));
}
writer.addDocument(doc);
if (random().nextInt(31) == 0) {
writer.commit();
}
}
writer.forceMerge(1);
writer.close();
// compare
DirectoryReader ir = DirectoryReader.open(dir);
TestUtil.checkReader(ir);
for (LeafReaderContext context : ir.leaves()) {
LeafReader r = context.reader();
SortedNumericDocValues docValues = DocValues.getSortedNumeric(r, "dv");
for (int i = 0; i < r.maxDoc(); i++) {
if (i > docValues.docID()) {
docValues.nextDoc();
}
String expected[] = r.document(i).getValues("stored");
if (i < docValues.docID()) {
assertEquals(0, expected.length);
} else {
String actual[] = new String[docValues.docValueCount()];
for (int j = 0; j < actual.length; j++) {
actual[j] = Long.toString(docValues.nextValue());
}
assertArrayEquals(expected, actual);
}
}
}
ir.close();
dir.close();
}
private void doTestSparseNumericBlocksOfVariousBitsPerValue(double density) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
conf.setMaxBufferedDocs(atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE));
conf.setRAMBufferSizeMB(-1);
conf.setMergePolicy(newLogMergePolicy(random().nextBoolean()));
IndexWriter writer = new IndexWriter(dir, conf);
Document doc = new Document();
Field storedField = newStringField("stored", "", Field.Store.YES);
Field dvField = new NumericDocValuesField("dv", 0);
doc.add(storedField);
doc.add(dvField);
final int numDocs = atLeast(Lucene70DocValuesFormat.NUMERIC_BLOCK_SIZE*3);
final LongSupplier longs = blocksOfVariousBPV();
for (int i = 0; i < numDocs; i++) {
if (random().nextDouble() > density) {
writer.addDocument(new Document());
continue;
}
long value = longs.getAsLong();
storedField.setStringValue(Long.toString(value));
dvField.setLongValue(value);
writer.addDocument(doc);
}
writer.forceMerge(1);
writer.close();
// compare
DirectoryReader ir = DirectoryReader.open(dir);
TestUtil.checkReader(ir);
for (LeafReaderContext context : ir.leaves()) {
LeafReader r = context.reader();
NumericDocValues docValues = DocValues.getNumeric(r, "dv");
docValues.nextDoc();
for (int i = 0; i < r.maxDoc(); i++) {
String storedValue = r.document(i).get("stored");
if (storedValue == null) {
assertTrue(docValues.docID() > i);
} else {
assertEquals(i, docValues.docID());
assertEquals(Long.parseLong(storedValue), docValues.longValue());
docValues.nextDoc();
}
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
}
ir.close();
dir.close();
}
}

View File

@ -1700,6 +1700,29 @@ public class TestIndexSorting extends LuceneTestCase {
dir.close();
}
// docvalues fields involved in the index sort cannot be updated
public void testBadDVUpdate() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
Sort indexSort = new Sort(new SortField("foo", SortField.Type.LONG));
iwc.setIndexSort(indexSort);
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new StringField("id", new BytesRef("0"), Store.NO));
doc.add(new NumericDocValuesField("foo", random().nextInt()));
w.addDocument(doc);
w.commit();
IllegalArgumentException exc = expectThrows(IllegalArgumentException.class,
() -> w.updateDocValues(new Term("id", "0"), new NumericDocValuesField("foo", -1)));
assertEquals(exc.getMessage(), "cannot update docvalues field involved in the index sort, field=foo, sort=<long: \"foo\">");
exc = expectThrows(IllegalArgumentException.class,
() -> w.updateNumericDocValue(new Term("id", "0"), "foo", -1));
assertEquals(exc.getMessage(), "cannot update docvalues field involved in the index sort, field=foo, sort=<long: \"foo\">");
w.close();
dir.close();
}
static class DVUpdateRunnable implements Runnable {
private final int numDocs;
@ -1727,7 +1750,7 @@ public class TestIndexSorting extends LuceneTestCase {
final long value = random.nextInt(20);
synchronized (values) {
w.updateDocValues(new Term("id", Integer.toString(id)), new NumericDocValuesField("foo", value));
w.updateDocValues(new Term("id", Integer.toString(id)), new NumericDocValuesField("bar", value));
values.put(id, value);
}
@ -1762,7 +1785,8 @@ public class TestIndexSorting extends LuceneTestCase {
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
doc.add(new StringField("id", Integer.toString(i), Store.NO));
doc.add(new NumericDocValuesField("foo", -1));
doc.add(new NumericDocValuesField("foo", random().nextInt()));
doc.add(new NumericDocValuesField("bar", -1));
w.addDocument(doc);
values.put(i, -1L);
}
@ -1786,7 +1810,7 @@ public class TestIndexSorting extends LuceneTestCase {
for (int i = 0; i < numDocs; ++i) {
final TopDocs topDocs = searcher.search(new TermQuery(new Term("id", Integer.toString(i))), 1);
assertEquals(1, topDocs.totalHits);
NumericDocValues dvs = MultiDocValues.getNumericValues(reader, "foo");
NumericDocValues dvs = MultiDocValues.getNumericValues(reader, "bar");
int hitDoc = topDocs.scoreDocs[0].doc;
assertEquals(hitDoc, dvs.advance(hitDoc));
assertEquals(values.get(i).longValue(), dvs.longValue());

View File

@ -998,4 +998,22 @@ public class TestTermsEnum extends LuceneTestCase {
}
dir.close();
}
// LUCENE-7576
public void testIntersectRegexp() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document doc = new Document();
doc.add(newStringField("field", "foobar", Field.Store.NO));
w.addDocument(doc);
IndexReader r = w.getReader();
Fields fields = MultiFields.getFields(r);
CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
Terms terms = fields.terms("field");
String message = expectThrows(IllegalArgumentException.class, () -> {terms.intersect(automaton, null);}).getMessage();
assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
r.close();
w.close();
d.close();
}
}
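As the new exception message says, the supported path for such automata is CompiledAutomaton.getTermsEnum, which handles the special cases (e.g. a match-nothing automaton) itself. A hedged sketch of that call:

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

final class IntersectAlternativeSketch {
  static TermsEnum matchingTerms(Terms terms, String regexp) throws IOException {
    CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(regexp).toAutomaton());
    return automaton.getTermsEnum(terms); // returns an empty enum for a match-nothing automaton
  }
}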

View File

@ -14,13 +14,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase;
@ -92,4 +94,69 @@ public class TestTragicIndexWriterDeadlock extends LuceneTestCase {
w.close();
dir.close();
}
// LUCENE-7570
public void testDeadlockStalledMerges() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig();
// so we merge every 2 segments:
LogMergePolicy mp = new LogDocMergePolicy();
mp.setMergeFactor(2);
iwc.setMergePolicy(mp);
CountDownLatch done = new CountDownLatch(1);
ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler() {
@Override
protected void doMerge(IndexWriter writer, MergePolicy.OneMerge merge) throws IOException {
// let the merge take forever, until the commit thread is stalled
try {
done.await();
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
throw new RuntimeException(ie);
}
super.doMerge(writer, merge);
}
@Override
protected synchronized void doStall() {
done.countDown();
super.doStall();
}
@Override
protected void handleMergeException(Directory dir, Throwable exc) {
}
};
// so we stall once the 2nd merge wants to run:
cms.setMaxMergesAndThreads(1, 1);
iwc.setMergeScheduler(cms);
// so we write a segment every 2 indexed docs:
iwc.setMaxBufferedDocs(2);
final IndexWriter w = new IndexWriter(dir, iwc) {
@Override
void mergeSuccess(MergePolicy.OneMerge merge) {
// tragedy strikes!
throw new OutOfMemoryError();
}
};
w.addDocument(new Document());
w.addDocument(new Document());
// w writes first segment
w.addDocument(new Document());
w.addDocument(new Document());
// w writes second segment, and kicks off merge, that takes forever (done.await)
w.addDocument(new Document());
w.addDocument(new Document());
// w writes third segment
w.addDocument(new Document());
w.commit();
// w writes fourth segment, and commit flushes and kicks off merge that stalls
w.close();
dir.close();
}
}

View File

@ -621,6 +621,9 @@ public class TestPointQueries extends LuceneTestCase {
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
int sameValuePct = random().nextInt(100);
if (VERBOSE) {
System.out.println("TEST: sameValuePct=" + sameValuePct);
}
byte[][][] docValues = new byte[numValues][][];

View File

@ -14,8 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.compressing;
package org.apache.lucene.store;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
@ -43,13 +43,13 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
//explicitly write utf8 len so that we know how many bytes it occupies
dataOutput.writeVInt(len);
int vintLen = dataOutput.length;
int vintLen = dataOutput.getPosition();
// now write the string which will internally write number of bytes as a vint and then utf8 bytes
dataOutput.writeString(unicode);
assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.getPosition());
for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
assertEquals(utf8[j], dataOutput.bytes[k]);
assertEquals(utf8[j], dataOutput.getBytes()[k]);
}
}
}
@ -67,13 +67,13 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
//explicitly write utf8 len so that we know how many bytes it occupies
dataOutput.writeVInt(len);
int vintLen = dataOutput.length;
int vintLen = dataOutput.getPosition();
// now write the string which will internally write number of bytes as a vint and then utf8 bytes
dataOutput.writeString(unicode);
assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.getPosition());
for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
assertEquals(utf8[j], dataOutput.bytes[k]);
assertEquals(utf8[j], dataOutput.getBytes()[k]);
}
}
}

View File

@ -16,6 +16,7 @@
*/
package org.apache.lucene.util.bkd;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
@ -64,7 +65,10 @@ public class Test2BBKDPoints extends LuceneTestCase {
IndexInput in = dir.openInput("1d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
BKDReader r = new BKDReader(in);
r.verify(numDocs);
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("1d", numDocs, r);
r.intersect(visitor);
assertEquals(r.size(), visitor.getPointCountSeen());
assertEquals(r.getDocCount(), visitor.getDocCountSeen());
in.close();
dir.close();
}
@ -101,7 +105,10 @@ public class Test2BBKDPoints extends LuceneTestCase {
IndexInput in = dir.openInput("2d.bkd", IOContext.DEFAULT);
in.seek(indexFP);
BKDReader r = new BKDReader(in);
r.verify(numDocs);
CheckIndex.VerifyPointsVisitor visitor = new CheckIndex.VerifyPointsVisitor("2d", numDocs, r);
r.intersect(visitor);
assertEquals(r.size(), visitor.getPointCountSeen());
assertEquals(r.getDocCount(), visitor.getDocCountSeen());
in.close();
dir.close();
}

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.CorruptingIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
@ -1010,4 +1011,57 @@ public class TestBKD extends LuceneTestCase {
}
}
// Claims 16 bytes per dim, but only uses the bottom 1-3 bytes; this would happen e.g. if a user indexes what are actually just short
// values as a LongPoint:
public void testWastedLeadingBytes() throws Exception {
int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
int bytesPerDim = PointValues.MAX_NUM_BYTES;
int bytesUsed = TestUtil.nextInt(random(), 1, 3);
Directory dir = newFSDirectory(createTempDir());
int numDocs = 100000;
BKDWriter w = new BKDWriter(numDocs+1, dir, "tmp", numDims, bytesPerDim, 32, 1f, numDocs, true);
byte[] tmp = new byte[bytesUsed];
byte[] buffer = new byte[numDims * bytesPerDim];
for(int i=0;i<numDocs;i++) {
for(int dim=0;dim<numDims;dim++) {
random().nextBytes(tmp);
System.arraycopy(tmp, 0, buffer, dim*bytesPerDim+(bytesPerDim-bytesUsed), tmp.length);
}
w.add(buffer, i);
}
IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT);
long fp = w.finish(out);
out.close();
IndexInput in = dir.openInput("bkd", IOContext.DEFAULT);
in.seek(fp);
BKDReader r = new BKDReader(in);
int[] count = new int[1];
r.intersect(new IntersectVisitor() {
@Override
public void visit(int docID) {
count[0]++;
}
@Override
public void visit(int docID, byte[] packedValue) {
visit(docID);
}
@Override
public Relation compare(byte[] minPacked, byte[] maxPacked) {
if (random().nextInt(7) == 1) {
return Relation.CELL_CROSSES_QUERY;
} else {
return Relation.CELL_INSIDE_QUERY;
}
}
});
assertEquals(numDocs, count[0]);
in.close();
dir.close();
}
}

View File

@ -228,7 +228,7 @@ public class TestFSTs extends LuceneTestCase {
final long value = lastOutput + TestUtil.nextInt(random(), 1, 1000);
lastOutput = value;
pairs.add(new FSTTester.InputOutput<>(terms[idx],
outputs.newPair((long) idx, value)));
}
new FSTTester<>(random(), dir, inputMode, pairs, outputs, false).doTest(true);
}

View File

@ -23,6 +23,7 @@ import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.FilteringTokenFilter;
@ -49,7 +50,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
private final LeafReader leafReader;
private final CharacterRunAutomaton preMemIndexFilterAutomaton;
public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
public MemoryIndexOffsetStrategy(String field, Predicate<String> fieldMatcher, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
CharacterRunAutomaton[] automata, Analyzer analyzer,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
super(field, extractedTerms, phraseHelper, automata, analyzer);
@ -57,13 +58,14 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
// preFilter for MemoryIndex
preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
preMemIndexFilterAutomaton = buildCombinedAutomaton(fieldMatcher, terms, this.automata, phraseHelper, multiTermQueryRewrite);
}
/**
* Build one {@link CharacterRunAutomaton} matching any term the query might match.
*/
private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
private static CharacterRunAutomaton buildCombinedAutomaton(Predicate<String> fieldMatcher,
BytesRef[] terms,
CharacterRunAutomaton[] automata,
PhraseHelper strictPhrases,
Function<Query, Collection<Query>> multiTermQueryRewrite) {
@ -74,7 +76,7 @@ public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
Collections.addAll(allAutomata, automata);
for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
Collections.addAll(allAutomata,
MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite));//true==lookInSpan
}
if (allAutomata.size() == 1) {
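buildCombinedAutomaton above unions the automata of every sub-query into one CharacterRunAutomaton used to pre-filter tokens before they are indexed into the MemoryIndex. A minimal standalone sketch of that idea with made-up terms; it does not reproduce the method's handling of span queries or binary terms:

import java.util.Arrays;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public class CombinedAutomatonSketch {
  public static void main(String[] args) {
    Automaton union = Operations.union(Arrays.asList(
        Automata.makeString("foo"),
        Automata.makeString("bar")));
    CharacterRunAutomaton filter = new CharacterRunAutomaton(union);
    System.out.println(filter.run("foo")); // true
    System.out.println(filter.run("baz")); // false
  }
}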

View File

@ -22,6 +22,7 @@ import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
@ -56,50 +57,52 @@ class MultiTermHighlighting {
}
/**
* Extracts all MultiTermQueries for {@code field}, and returns equivalent
* automata that will match terms.
* Extracts MultiTermQueries that match the provided field predicate.
* Returns equivalent automata that will match terms.
*/
public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan,
public static CharacterRunAutomaton[] extractAutomata(Query query,
Predicate<String> fieldMatcher,
boolean lookInSpan,
Function<Query, Collection<Query>> preRewriteFunc) {
List<CharacterRunAutomaton> list = new ArrayList<>();
Collection<Query> customSubQueries = preRewriteFunc.apply(query);
if (customSubQueries != null) {
for (Query sub : customSubQueries) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (query instanceof BooleanQuery) {
for (BooleanClause clause : (BooleanQuery) query) {
if (!clause.isProhibited()) {
list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), fieldMatcher, lookInSpan, preRewriteFunc)));
}
}
} else if (query instanceof ConstantScoreQuery) {
list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan,
list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), fieldMatcher, lookInSpan,
preRewriteFunc)));
} else if (query instanceof DisjunctionMaxQuery) {
for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (lookInSpan && query instanceof SpanOrQuery) {
for (Query sub : ((SpanOrQuery) query).getClauses()) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (lookInSpan && query instanceof SpanNearQuery) {
for (Query sub : ((SpanNearQuery) query).getClauses()) {
list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(sub, fieldMatcher, lookInSpan, preRewriteFunc)));
}
} else if (lookInSpan && query instanceof SpanNotQuery) {
list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan,
list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), fieldMatcher, lookInSpan,
preRewriteFunc)));
} else if (lookInSpan && query instanceof SpanPositionCheckQuery) {
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan,
list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), fieldMatcher, lookInSpan,
preRewriteFunc)));
} else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) {
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field,
lookInSpan, preRewriteFunc)));
list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(),
fieldMatcher, lookInSpan, preRewriteFunc)));
} else if (query instanceof AutomatonQuery) {
final AutomatonQuery aq = (AutomatonQuery) query;
if (aq.getField().equals(field)) {
if (fieldMatcher.test(aq.getField())) {
list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
@Override
public String toString() {
@ -110,7 +113,7 @@ class MultiTermHighlighting {
} else if (query instanceof PrefixQuery) {
final PrefixQuery pq = (PrefixQuery) query;
Term prefix = pq.getPrefix();
if (prefix.field().equals(field)) {
if (fieldMatcher.test(prefix.field())) {
list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
Automata.makeAnyString())) {
@Override
@ -121,7 +124,7 @@ class MultiTermHighlighting {
}
} else if (query instanceof FuzzyQuery) {
final FuzzyQuery fq = (FuzzyQuery) query;
if (fq.getField().equals(field)) {
if (fieldMatcher.test(fq.getField())) {
String utf16 = fq.getTerm().text();
int termText[] = new int[utf16.codePointCount(0, utf16.length())];
for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
@ -142,7 +145,7 @@ class MultiTermHighlighting {
}
} else if (query instanceof TermRangeQuery) {
final TermRangeQuery tq = (TermRangeQuery) query;
if (tq.getField().equals(field)) {
if (fieldMatcher.test(tq.getField())) {
final CharsRef lowerBound;
if (tq.getLowerTerm() == null) {
lowerBound = null;

View File

@ -16,17 +16,50 @@
*/
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.*;
import java.util.function.Function;
/**
* Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlight phrases correctly).
* This is a stateful class holding information about the query, but it can (and is) re-used across highlighting
@ -40,7 +73,7 @@ import java.util.function.Function;
public class PhraseHelper {
public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
spanQuery -> null, query -> null, true);
(s) -> false, spanQuery -> null, query -> null, true);
//TODO it seems this ought to be a general thing on Spans?
private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
@ -59,10 +92,11 @@ public class PhraseHelper {
}
};
private final String fieldName; // if non-null, only look at queries/terms for this field
private final String fieldName;
private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
private final Set<SpanQuery> spanQueries;
private final boolean willRewrite;
private final Predicate<String> fieldMatcher;
/**
* Constructor.
@ -73,14 +107,15 @@ public class PhraseHelper {
* to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked.
* {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is
* usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones.
* {@code fieldMatcher} The field name predicate to use for extracting the query part that must be highlighted.
*/
public PhraseHelper(Query query, String field, Function<SpanQuery, Boolean> rewriteQueryPred,
public PhraseHelper(Query query, String field, Predicate<String> fieldMatcher, Function<SpanQuery, Boolean> rewriteQueryPred,
Function<Query, Collection<Query>> preExtractRewriteFunction,
boolean ignoreQueriesNeedingRewrite) {
this.fieldName = field; // if null then don't require field match
this.fieldName = field;
this.fieldMatcher = fieldMatcher;
// filter terms to those we want
positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>();
// requireFieldMatch optional
positionInsensitiveTerms = new FieldFilteringTermSet();
spanQueries = new HashSet<>();
// TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
@ -131,11 +166,11 @@ public class PhraseHelper {
@Override
protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
float boost) throws IOException {
if (field != null) {
// if this span query isn't for this field, skip it.
Set<String> fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1
collectSpanQueryFields(spanQuery, fieldNameSet);
if (!fieldNameSet.contains(field)) {
// if this span query isn't for this field, skip it.
Set<String> fieldNameSet = new HashSet<>();//TODO reuse. note: almost always size 1
collectSpanQueryFields(spanQuery, fieldNameSet);
for (String spanField : fieldNameSet) {
if (!fieldMatcher.test(spanField)) {
return;
}
}
@ -190,10 +225,11 @@ public class PhraseHelper {
if (spanQueries.isEmpty()) {
return Collections.emptyMap();
}
final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName);
// for each SpanQuery, collect the member spans into a map.
Map<BytesRef, Spans> result = new HashMap<>();
for (SpanQuery spanQuery : spanQueries) {
getTermToSpans(spanQuery, leafReader.getContext(), doc, result);
getTermToSpans(spanQuery, filteredReader.getContext(), doc, result);
}
return result;
}
@ -203,15 +239,14 @@ public class PhraseHelper {
int doc, Map<BytesRef, Spans> result)
throws IOException {
// note: in WSTE there was some field-specific looping that seemed pointless, so that isn't here.
final IndexSearcher searcher = new IndexSearcher(readerContext);
final IndexSearcher searcher = new IndexSearcher(readerContext.reader());
searcher.setQueryCache(null);
if (willRewrite) {
spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
}
// Get the underlying query terms
TreeSet<Term> termSet = new TreeSet<>(); // sorted so we can loop over results in order shortly...
TreeSet<Term> termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly...
searcher.createWeight(spanQuery, false, 1.0f).extractTerms(termSet);//needsScores==false
// Get Spans by running the query against the reader
@ -240,9 +275,6 @@ public class PhraseHelper {
for (final Term queryTerm : termSet) {
// note: we expect that at least one query term will pass these filters. This is because the collected
// spanQuery list were already filtered by these conditions.
if (fieldName != null && fieldName.equals(queryTerm.field()) == false) {
continue;
}
if (positionInsensitiveTerms.contains(queryTerm)) {
continue;
}
@ -375,19 +407,17 @@ public class PhraseHelper {
}
/**
* Simple HashSet that filters out Terms not matching a desired field on {@code add()}.
* Simple TreeSet that filters out Terms not matching the provided predicate on {@code add()}.
*/
private static class FieldFilteringTermHashSet extends HashSet<Term> {
private final String field;
FieldFilteringTermHashSet(String field) {
this.field = field;
}
private class FieldFilteringTermSet extends TreeSet<Term> {
@Override
public boolean add(Term term) {
if (term.field().equals(field)) {
return super.add(term);
if (fieldMatcher.test(term.field())) {
if (term.field().equals(fieldName)) {
return super.add(term);
} else {
return super.add(new Term(fieldName, term.bytes()));
}
} else {
return false;
}
@ -499,6 +529,64 @@ public class PhraseHelper {
}
}
/**
* This reader will just delegate every call to a single field in the wrapped
* LeafReader. This way we ensure that all queries going through this reader target the same field.
*/
static final class SingleFieldFilterLeafReader extends FilterLeafReader {
final String fieldName;
SingleFieldFilterLeafReader(LeafReader in, String fieldName) {
super(in);
this.fieldName = fieldName;
}
@Override
public FieldInfos getFieldInfos() {
throw new UnsupportedOperationException();
}
@Override
public Fields fields() throws IOException {
return new FilterFields(super.fields()) {
@Override
public Terms terms(String field) throws IOException {
return super.terms(fieldName);
}
@Override
public Iterator<String> iterator() {
return Collections.singletonList(fieldName).iterator();
}
@Override
public int size() {
return 1;
}
};
}
@Override
public NumericDocValues getNumericDocValues(String field) throws IOException {
return super.getNumericDocValues(fieldName);
}
@Override
public BinaryDocValues getBinaryDocValues(String field) throws IOException {
return super.getBinaryDocValues(fieldName);
}
@Override
public SortedDocValues getSortedDocValues(String field) throws IOException {
return super.getSortedDocValues(fieldName);
}
@Override
public NumericDocValues getNormValues(String field) throws IOException {
return super.getNormValues(fieldName);
}
}
/**
* A Spans based on a list of cached spans for one doc. It is pre-positioned to this doc.
*/

View File

@ -24,6 +24,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -31,6 +32,7 @@ import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
@ -58,7 +60,6 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
@ -119,13 +120,13 @@ public class UnifiedHighlighter {
private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy
// private boolean defaultRequireFieldMatch = true; TODO
private int maxLength = DEFAULT_MAX_LENGTH;
// BreakIterator is stateful so we use a Supplier factory method
private Supplier<BreakIterator> defaultBreakIterator = () -> BreakIterator.getSentenceInstance(Locale.ROOT);
private Predicate<String> defaultFieldMatcher;
private PassageScorer defaultScorer = new PassageScorer();
private PassageFormatter defaultFormatter = new DefaultPassageFormatter();
@ -140,8 +141,8 @@ public class UnifiedHighlighter {
/**
* Calls {@link Weight#extractTerms(Set)} on an empty index for the query.
*/
protected static SortedSet<Term> extractTerms(Query query) throws IOException {
SortedSet<Term> queryTerms = new TreeSet<>();
protected static Set<Term> extractTerms(Query query) throws IOException {
Set<Term> queryTerms = new HashSet<>();
EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
return queryTerms;
}
@ -197,6 +198,10 @@ public class UnifiedHighlighter {
this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
}
public void setFieldMatcher(Predicate<String> predicate) {
this.defaultFieldMatcher = predicate;
}
/**
* Returns whether {@link MultiTermQuery} derivatives will be highlighted. By default it's enabled. MTQ
* highlighting can be expensive, particularly when using offsets in postings.
@ -220,6 +225,18 @@ public class UnifiedHighlighter {
return defaultPassageRelevancyOverSpeed;
}
/**
* Returns the predicate to use for extracting the query part that must be highlighted.
* By default only queries that target the current field are kept. (AKA requireFieldMatch)
*/
protected Predicate<String> getFieldMatcher(String field) {
if (defaultFieldMatcher != null) {
return defaultFieldMatcher;
} else {
// requireFieldMatch = true
return (qf) -> field.equals(qf);
}
}
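As a usage sketch (the searcher, analyzer, query and topDocs variables and the "body" field name are assumptions, not part of this patch), the new hook is driven like this:

  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
  highlighter.setFieldMatcher(field -> true); // requireFieldMatch == false: accept query terms from any field
  String[] snippets = highlighter.highlight("body", query, topDocs, 10);

Passing null to setFieldMatcher restores the default per-field matching, as the tests below do between assertions.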
/**
* The maximum content size to process. Content will be truncated to this size before highlighting. Typically
@ -548,7 +565,7 @@ public class UnifiedHighlighter {
copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params
// Init field highlighters (where most of the highlight logic lives, and on a per field basis)
SortedSet<Term> queryTerms = extractTerms(query);
Set<Term> queryTerms = extractTerms(query);
FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
int numTermVectors = 0;
int numPostings = 0;
@ -718,13 +735,13 @@ public class UnifiedHighlighter {
getClass().getSimpleName() + " without an IndexSearcher.");
}
Objects.requireNonNull(content, "content is required");
SortedSet<Term> queryTerms = extractTerms(query);
Set<Term> queryTerms = extractTerms(query);
return getFieldHighlighter(field, query, queryTerms, maxPassages)
.highlightFieldForDoc(null, -1, content);
}
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(field, allTerms);
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
BytesRef[] terms = filterExtractedTerms(getFieldMatcher(field), allTerms);
Set<HighlightFlag> highlightFlags = getFlags(field);
PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
@ -738,19 +755,15 @@ public class UnifiedHighlighter {
getFormatter(field));
}
protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
// TODO consider requireFieldMatch
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
// Strip off the redundant field:
BytesRef[] terms = new BytesRef[fieldTerms.size()];
int termUpto = 0;
for (Term term : fieldTerms) {
terms[termUpto++] = term.bytes();
protected static BytesRef[] filterExtractedTerms(Predicate<String> fieldMatcher, Set<Term> queryTerms) {
// Strip off the redundant field and sort the remaining terms
SortedSet<BytesRef> filteredTerms = new TreeSet<>();
for (Term term : queryTerms) {
if (fieldMatcher.test(term.field())) {
filteredTerms.add(term.bytes());
}
}
return terms;
return filteredTerms.toArray(new BytesRef[filteredTerms.size()]);
}
protected Set<HighlightFlag> getFlags(String field) {
@ -771,14 +784,13 @@ public class UnifiedHighlighter {
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly ?
new PhraseHelper(query, field, this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) :
PhraseHelper.NONE;
new PhraseHelper(query, field, getFieldMatcher(field),
this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE;
}
protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
this::preMultiTermQueryRewrite)
? MultiTermHighlighting.extractAutomata(query, getFieldMatcher(field), !highlightFlags.contains(HighlightFlag.PHRASES), this::preMultiTermQueryRewrite)
: ZERO_LEN_AUTOMATA_ARRAY;
}
@ -826,7 +838,7 @@ public class UnifiedHighlighter {
//skip using a memory index since it's pure term filtering
return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
} else {
return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
return new MemoryIndexOffsetStrategy(field, getFieldMatcher(field), terms, phraseHelper, automata, getIndexAnalyzer(),
this::preMultiTermQueryRewrite);
}
case NONE_NEEDED:

View File

@ -25,6 +25,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.apache.lucene.analysis.MockAnalyzer;
@ -32,14 +33,17 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
@ -959,4 +963,275 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
ir.close();
}
private IndexReader indexSomeFields() throws IOException {
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
FieldType ft = new FieldType();
ft.setIndexOptions(IndexOptions.NONE);
ft.setTokenized(false);
ft.setStored(true);
ft.freeze();
Field title = new Field("title", "", fieldType);
Field text = new Field("text", "", fieldType);
Field category = new Field("category", "", fieldType);
Document doc = new Document();
doc.add(title);
doc.add(text);
doc.add(category);
title.setStringValue("This is the title field.");
text.setStringValue("This is the text field. You can put some text if you want.");
category.setStringValue("This is the category field.");
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
return ir;
}
public void testFieldMatcherTermQuery() throws Exception {
IndexReader ir = indexSomeFields();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) {
@Override
protected Predicate<String> getFieldMatcher(String field) {
// requireFieldMatch=false
return (qf) -> true;
}
};
UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery.Builder queryBuilder =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("text", "some")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("text", "field")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("text", "this")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "is")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "this")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("category", "this")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("category", "some")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("category", "category")), BooleanClause.Occur.SHOULD);
Query query = queryBuilder.build();
// title
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the title <b>field</b>.", snippets[0]);
snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the title field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> is the title <b>field</b>.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
// text
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> is the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the text field. ", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
// category
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the <b>category</b> <b>field</b>.", snippets[0]);
snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> is the <b>category</b> field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the category field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
ir.close();
}
public void testFieldMatcherMultiTermQuery() throws Exception {
IndexReader ir = indexSomeFields();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) {
@Override
protected Predicate<String> getFieldMatcher(String field) {
// requireFieldMatch=false
return (qf) -> true;
}
};
UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery.Builder queryBuilder =
new BooleanQuery.Builder()
.add(new FuzzyQuery(new Term("text", "sime"), 1), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("text", "fie")), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("text", "thi")), BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term("title", "is")), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("title", "thi")), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("category", "thi")), BooleanClause.Occur.SHOULD)
.add(new FuzzyQuery(new Term("category", "sime"), 1), BooleanClause.Occur.SHOULD)
.add(new PrefixQuery(new Term("category", "categ")), BooleanClause.Occur.SHOULD);
Query query = queryBuilder.build();
// title
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the title <b>field</b>.", snippets[0]);
snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the title field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> is the title <b>field</b>.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
// text
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> is the text <b>field</b>. You can put <b>some</b> text if you want.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the text field. ", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
// category
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the <b>category</b> <b>field</b>.", snippets[0]);
snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> is the <b>category</b> field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the category field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
ir.close();
}
public void testFieldMatcherPhraseQuery() throws Exception {
IndexReader ir = indexSomeFields();
IndexSearcher searcher = newSearcher(ir);
UnifiedHighlighter highlighterNoFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer) {
@Override
protected Predicate<String> getFieldMatcher(String field) {
// requireFieldMatch=false
return (qf) -> true;
}
};
UnifiedHighlighter highlighterFieldMatch = new UnifiedHighlighter(searcher, indexAnalyzer);
BooleanQuery.Builder queryBuilder =
new BooleanQuery.Builder()
.add(new PhraseQuery("title", "this", "is", "the", "title"), BooleanClause.Occur.SHOULD)
.add(new PhraseQuery(2, "category", "this", "is", "the", "field"), BooleanClause.Occur.SHOULD)
.add(new PhraseQuery("text", "this", "is"), BooleanClause.Occur.SHOULD)
.add(new PhraseQuery("category", "this", "is"), BooleanClause.Occur.SHOULD)
.add(new PhraseQuery(1, "text", "you", "can", "put", "text"), BooleanClause.Occur.SHOULD);
Query query = queryBuilder.build();
// title
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>title</b> <b>field</b>.", snippets[0]);
snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>title</b> field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
snippets = highlighterFieldMatch.highlight("title", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the title field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
// text
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> <b>the</b> <b>text</b> <b>field</b>. <b>You</b> <b>can</b> <b>put</b> some <b>text</b> if you want.", snippets[0]);
snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the <b>text</b> field. <b>You</b> <b>can</b> <b>put</b> some <b>text</b> if you want.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "title".equals(fq));
snippets = highlighterFieldMatch.highlight("text", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("This is the text field. You can put some text if you want.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
// category
{
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.totalHits);
String[] snippets = highlighterNoFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> <b>the</b> category <b>field</b>.", snippets[0]);
snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> <b>the</b> category <b>field</b>.", snippets[0]);
highlighterFieldMatch.setFieldMatcher((fq) -> "text".equals(fq));
snippets = highlighterFieldMatch.highlight("category", query, topDocs, 10);
assertEquals(1, snippets.length);
assertEquals("<b>This</b> <b>is</b> the category field.", snippets[0]);
highlighterFieldMatch.setFieldMatcher(null);
}
ir.close();
}
}

View File

@ -23,7 +23,6 @@ import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
@ -144,7 +143,7 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
}
@Override
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
protected FieldHighlighter getFieldHighlighter(String field, Query query, Set<Term> allTerms, int maxPassages) {
return super.getFieldHighlighter(field, query, allTerms, maxPassages);
}

View File

@ -62,7 +62,7 @@ com.sun.jersey.version = 1.9
/commons-collections/commons-collections = 3.2.2
/commons-configuration/commons-configuration = 1.6
/commons-digester/commons-digester = 2.1
/commons-fileupload/commons-fileupload = 1.3.1
/commons-fileupload/commons-fileupload = 1.3.2
/commons-io/commons-io = 2.5
/commons-lang/commons-lang = 2.6
/commons-logging/commons-logging = 1.1.3

View File

@ -0,0 +1,202 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
/** Holds statistics for a DocValues field. */
public abstract class DocValuesStats<T> {
private int missing = 0;
private int count = 0;
protected final String field;
protected T min;
protected T max;
protected DocValuesStats(String field, T initialMin, T initialMax) {
this.field = field;
this.min = initialMin;
this.max = initialMax;
}
/**
* Called after {@link DocValuesStats#accumulate(int)} has verified that the document has a value for
* the field. Implementations should update the statistics based on the value of the current document.
*
* @param count
* the updated number of documents with value for this field.
*/
protected abstract void doAccumulate(int count) throws IOException;
/**
* Initializes this object with the given reader context. Returns whether stats can be computed for this segment (i.e.
* it does have the requested DocValues field).
*/
protected abstract boolean init(LeafReaderContext context) throws IOException;
/** Returns whether the given document has a value for the requested DocValues field. */
protected abstract boolean hasValue(int doc) throws IOException;
final void accumulate(int doc) throws IOException {
if (hasValue(doc)) {
++count;
doAccumulate(count);
} else {
++missing;
}
}
final void addMissing() {
++missing;
}
/** The field for which these stats were computed. */
public final String field() {
return field;
}
/** The number of documents which have a value of the field. */
public final int count() {
return count;
}
/** The number of documents which do not have a value of the field. */
public final int missing() {
return missing;
}
/** The minimum value of the field. Undefined when {@link #count} is zero. */
public final T min() {
return min;
}
/** The maximum value of the field. Undefined when {@link #count} is zero. */
public final T max() {
return max;
}
/** Holds statistics for a numeric DocValues field. */
public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
protected double mean = 0.0;
protected double variance = 0.0;
protected NumericDocValues ndv;
protected NumericDocValuesStats(String field, T initialMin, T initialMax) {
super(field, initialMin, initialMax);
}
@Override
protected final boolean init(LeafReaderContext context) throws IOException {
ndv = context.reader().getNumericDocValues(field);
return ndv != null;
}
@Override
protected boolean hasValue(int doc) throws IOException {
return ndv.advanceExact(doc);
}
/** The mean of all values of the field. */
public final double mean() {
return mean;
}
/** Returns the variance of all values of the field. */
public final double variance() {
int count = count();
return count > 0 ? variance / count : 0;
}
/** Returns the stdev of all values of the field. */
public final double stdev() {
return Math.sqrt(variance());
}
/** Returns the sum of values of the field. Note that if the values are large, the {@code sum} might overflow. */
public abstract T sum();
}
/** Holds DocValues statistics for a numeric field storing {@code long} values. */
public static final class LongDocValuesStats extends NumericDocValuesStats<Long> {
// To avoid boxing 'long' to 'Long' while the sum is computed, declare it as a private variable.
private long sum = 0;
public LongDocValuesStats(String field) {
super(field, Long.MAX_VALUE, Long.MIN_VALUE);
}
@Override
protected void doAccumulate(int count) throws IOException {
long val = ndv.longValue();
if (val > max) {
max = val;
}
if (val < min) {
min = val;
}
sum += val;
double oldMean = mean;
mean += (val - mean) / count;
variance += (val - mean) * (val - oldMean);
}
@Override
public Long sum() {
return sum;
}
}
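The doAccumulate update above is the standard online (Welford-style) recurrence, restated here as a sketch with n the running count of documents that have a value:

  // mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n
  // M2_n   = M2_{n-1} + (x_n - mean_n) * (x_n - mean_{n-1})   // accumulated in the 'variance' field
  // variance() then reports M2_n / n, the population variance of the collected values.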
/** Holds DocValues statistics for a numeric field storing {@code double} values. */
public static final class DoubleDocValuesStats extends NumericDocValuesStats<Double> {
// To avoid boxing 'double' to 'Double' while the sum is computed, declare it as a private variable.
private double sum = 0;
public DoubleDocValuesStats(String field) {
super(field, Double.MAX_VALUE, Double.MIN_VALUE);
}
@Override
protected void doAccumulate(int count) throws IOException {
double val = Double.longBitsToDouble(ndv.longValue());
if (Double.compare(val, max) > 0) {
max = val;
}
if (Double.compare(val, min) < 0) {
min = val;
}
sum += val;
double oldMean = mean;
mean += (val - mean) / count;
variance += (val - mean) * (val - oldMean);
}
@Override
public Double sum() {
return sum;
}
}
}

View File

@ -0,0 +1,64 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
/** A {@link Collector} which computes statistics for a DocValues field. */
public class DocValuesStatsCollector implements Collector {
private final DocValuesStats<?> stats;
/** Creates a collector to compute statistics for a DocValues field using the given {@code stats}. */
public DocValuesStatsCollector(DocValuesStats<?> stats) {
this.stats = stats;
}
@Override
public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
boolean shouldProcess = stats.init(context);
if (!shouldProcess) {
// Stats cannot be computed for this segment, therefore consider all matching documents as a 'miss'.
return new LeafCollector() {
@Override public void setScorer(Scorer scorer) throws IOException {}
@Override
public void collect(int doc) throws IOException {
// All matching documents in this reader are missing a value
stats.addMissing();
}
};
}
return new LeafCollector() {
@Override public void setScorer(Scorer scorer) throws IOException {}
@Override
public void collect(int doc) throws IOException {
stats.accumulate(doc);
}
};
}
@Override
public boolean needsScores() {
return false;
}
}
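A minimal driving sketch (the "price" field name and the searcher variable are illustrative; the pattern mirrors TestDocValuesStatsCollector below):

  DocValuesStats.LongDocValuesStats stats = new DocValuesStats.LongDocValuesStats("price");
  searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
  if (stats.count() > 0) {
    double mean = stats.mean(); // also available: min(), max(), sum(), variance(), stdev()
  }
  int docsWithoutValue = stats.missing();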

View File

@ -0,0 +1,212 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.DoubleSummaryStatistics;
import java.util.LongSummaryStatistics;
import java.util.stream.DoubleStream;
import java.util.stream.LongStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleDocValuesField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** Unit tests for {@link DocValuesStatsCollector}. */
public class TestDocValuesStatsCollector extends LuceneTestCase {
public void testNoDocsWithField() throws IOException {
try (Directory dir = newDirectory();
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
int numDocs = TestUtil.nextInt(random(), 1, 100);
for (int i = 0; i < numDocs; i++) {
indexWriter.addDocument(new Document());
}
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
IndexSearcher searcher = new IndexSearcher(reader);
LongDocValuesStats stats = new LongDocValuesStats("foo");
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
assertEquals(0, stats.count());
assertEquals(numDocs, stats.missing());
}
}
}
public void testOneDoc() throws IOException {
try (Directory dir = newDirectory();
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
String field = "numeric";
Document doc = new Document();
doc.add(new NumericDocValuesField(field, 1));
doc.add(new StringField("id", "doc1", Store.NO));
indexWriter.addDocument(doc);
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
IndexSearcher searcher = new IndexSearcher(reader);
LongDocValuesStats stats = new LongDocValuesStats(field);
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
assertEquals(1, stats.count());
assertEquals(0, stats.missing());
assertEquals(1, stats.max().longValue());
assertEquals(1, stats.min().longValue());
assertEquals(1, stats.sum().longValue());
assertEquals(1, stats.mean(), 0.0001);
assertEquals(0, stats.variance(), 0.0001);
assertEquals(0, stats.stdev(), 0.0001);
}
}
}
public void testDocsWithLongValues() throws IOException {
try (Directory dir = newDirectory();
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
String field = "numeric";
int numDocs = TestUtil.nextInt(random(), 1, 100);
long[] docValues = new long[numDocs];
int nextVal = 1;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
if (random().nextBoolean()) { // not all documents have a value
doc.add(new NumericDocValuesField(field, nextVal));
doc.add(new StringField("id", "doc" + i, Store.NO));
docValues[i] = nextVal;
++nextVal;
}
indexWriter.addDocument(doc);
}
// 20% of cases delete some docs
if (random().nextDouble() < 0.2) {
for (int i = 0; i < numDocs; i++) {
if (random().nextBoolean()) {
indexWriter.deleteDocuments(new Term("id", "doc" + i));
docValues[i] = 0;
}
}
}
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
IndexSearcher searcher = new IndexSearcher(reader);
LongDocValuesStats stats = new LongDocValuesStats(field);
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count();
assertEquals(expCount, stats.count());
assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing());
if (stats.count() > 0) {
LongSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics();
assertEquals(sumStats.getMax(), stats.max().longValue());
assertEquals(sumStats.getMin(), stats.min().longValue());
assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
assertEquals(sumStats.getSum(), stats.sum().longValue());
double variance = computeVariance(docValues, stats.mean, stats.count());
assertEquals(variance, stats.variance(), 0.00001);
assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
}
}
}
}
public void testDocsWithDoubleValues() throws IOException {
try (Directory dir = newDirectory();
IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
String field = "numeric";
int numDocs = TestUtil.nextInt(random(), 1, 100);
double[] docValues = new double[numDocs];
double nextVal = 1.0;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
if (random().nextBoolean()) { // not all documents have a value
doc.add(new DoubleDocValuesField(field, nextVal));
doc.add(new StringField("id", "doc" + i, Store.NO));
docValues[i] = nextVal;
++nextVal;
}
indexWriter.addDocument(doc);
}
// 20% of cases delete some docs
if (random().nextDouble() < 0.2) {
for (int i = 0; i < numDocs; i++) {
if (random().nextBoolean()) {
indexWriter.deleteDocuments(new Term("id", "doc" + i));
docValues[i] = 0;
}
}
}
try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
IndexSearcher searcher = new IndexSearcher(reader);
DoubleDocValuesStats stats = new DoubleDocValuesStats(field);
searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count();
assertEquals(expCount, stats.count());
assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing());
if (stats.count() > 0) {
DoubleSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics();
assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001);
assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001);
assertEquals(sumStats.getAverage(), stats.mean(), 0.00001);
assertEquals(sumStats.getSum(), stats.sum(), 0.00001);
double variance = computeVariance(docValues, stats.mean, stats.count());
assertEquals(variance, stats.variance(), 0.00001);
assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001);
}
}
}
}
private static LongStream getPositiveValues(long[] docValues) {
return Arrays.stream(docValues).filter(v -> v > 0);
}
private static DoubleStream getPositiveValues(double[] docValues) {
return Arrays.stream(docValues).filter(v -> v > 0);
}
private static LongStream getZeroValues(long[] docValues) {
return Arrays.stream(docValues).filter(v -> v == 0);
}
private static DoubleStream getZeroValues(double[] docValues) {
return Arrays.stream(docValues).filter(v -> v == 0);
}
private static double computeVariance(long[] values, double mean, int count) {
return getPositiveValues(values).mapToDouble(v -> (v - mean) * (v-mean)).sum() / count;
}
private static double computeVariance(double[] values, double mean, int count) {
return getPositiveValues(values).map(v -> (v - mean) * (v-mean)).sum() / count;
}
}

View File

@ -26,7 +26,10 @@ import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SloppyMath;
import org.apache.lucene.util.bkd.BKDReader.IndexTree;
import org.apache.lucene.util.bkd.BKDReader.IntersectState;
import org.apache.lucene.util.bkd.BKDReader;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
@ -41,16 +44,16 @@ class NearestNeighbor {
static class Cell implements Comparable<Cell> {
final int readerIndex;
final int nodeID;
final byte[] minPacked;
final byte[] maxPacked;
final IndexTree index;
/** The closest possible distance of all points in this cell */
final double distanceMeters;
public Cell(int readerIndex, int nodeID, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceMeters) {
this.index = index;
this.readerIndex = readerIndex;
this.nodeID = nodeID;
this.minPacked = minPacked.clone();
this.maxPacked = maxPacked.clone();
this.distanceMeters = distanceMeters;
@ -66,7 +69,7 @@ class NearestNeighbor {
double minLon = decodeLongitude(minPacked, Integer.BYTES);
double maxLat = decodeLatitude(maxPacked, 0);
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
return "Cell(readerIndex=" + readerIndex + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceMeters=" + distanceMeters + ")";
}
}
@ -219,13 +222,21 @@ class NearestNeighbor {
List<BKDReader.IntersectState> states = new ArrayList<>();
// Add root cell for each reader into the queue:
int bytesPerDim = -1;
for(int i=0;i<readers.size();i++) {
BKDReader reader = readers.get(i);
if (bytesPerDim == -1) {
bytesPerDim = reader.getBytesPerDimension();
} else if (bytesPerDim != reader.getBytesPerDimension()) {
throw new IllegalStateException("bytesPerDim changed from " + bytesPerDim + " to " + reader.getBytesPerDimension() + " across readers");
}
byte[] minPackedValue = reader.getMinPackedValue();
byte[] maxPackedValue = reader.getMaxPackedValue();
states.add(reader.getIntersectState(visitor));
IntersectState state = reader.getIntersectState(visitor);
states.add(state);
cellQueue.offer(new Cell(i, 1, reader.getMinPackedValue(), reader.getMaxPackedValue(),
cellQueue.offer(new Cell(state.index, i, reader.getMinPackedValue(), reader.getMaxPackedValue(),
approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
}
@ -236,12 +247,12 @@ class NearestNeighbor {
// TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
BKDReader reader = readers.get(cell.readerIndex);
if (reader.isLeafNode(cell.nodeID)) {
if (cell.index.isLeafNode()) {
//System.out.println(" leaf");
// Leaf block: visit all points and possibly collect them:
visitor.curDocBase = docBases.get(cell.readerIndex);
visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
reader.visitLeafBlockValues(cell.nodeID, states.get(cell.readerIndex));
reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex));
//System.out.println(" now " + hitQueue.size() + " hits");
} else {
//System.out.println(" non-leaf");
@ -257,14 +268,23 @@ class NearestNeighbor {
continue;
}
BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue());
int splitDim = cell.index.getSplitDim();
// we must clone the index so that we can recurse left and right "concurrently":
IndexTree newIndex = cell.index.clone();
byte[] splitPackedValue = cell.maxPacked.clone();
reader.copySplitValue(cell.nodeID, splitPackedValue);
cellQueue.offer(new Cell(cell.readerIndex, 2*cell.nodeID, cell.minPacked, splitPackedValue,
System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
cell.index.pushLeft();
cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue,
approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));
splitPackedValue = cell.minPacked.clone();
reader.copySplitValue(cell.nodeID, splitPackedValue);
cellQueue.offer(new Cell(cell.readerIndex, 2*cell.nodeID+1, splitPackedValue, cell.maxPacked,
System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
newIndex.pushRight();
cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked,
approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
}
}

View File

@ -165,6 +165,7 @@ abstract class RangeFieldQuery extends Query {
FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo == null) {
// no docs in this segment indexed this field
return null;
}
checkFieldInfo(fieldInfo);
boolean allDocsMatch = true;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
@ -45,11 +46,16 @@ import org.apache.lucene.index.SortedNumericDocValues;
public class DocValuesNumbersQuery extends Query {
private final String field;
private final Set<Long> numbers;
private final LongHashSet numbers;
public DocValuesNumbersQuery(String field, Set<Long> numbers) {
public DocValuesNumbersQuery(String field, long[] numbers) {
this.field = Objects.requireNonNull(field);
this.numbers = Objects.requireNonNull(numbers, "Set of numbers must not be null");
this.numbers = new LongHashSet(numbers);
}
public DocValuesNumbersQuery(String field, Collection<Long> numbers) {
this.field = Objects.requireNonNull(field);
this.numbers = new LongHashSet(numbers.stream().mapToLong(Long::longValue).toArray());
}
public DocValuesNumbersQuery(String field, Long... numbers) {
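As an illustrative sketch (field name and values are made up), the new primitive-friendly constructors are used like this:

  Query byArray   = new DocValuesNumbersQuery("dvField", new long[] {1L, 5L, 42L});
  Query byVarargs = new DocValuesNumbersQuery("dvField", 1L, 5L, 42L);

Both end up in the same LongHashSet-backed representation introduced further down in this commit.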
@ -82,15 +88,11 @@ public class DocValuesNumbersQuery extends Query {
@Override
public String toString(String defaultField) {
StringBuilder sb = new StringBuilder();
sb.append(field).append(": [");
for (Long number : numbers) {
sb.append(number).append(", ");
}
if (numbers.size() > 0) {
sb.setLength(sb.length() - 2);
}
return sb.append(']').toString();
return new StringBuilder()
.append(field)
.append(": ")
.append(numbers.toString())
.toString();
}
@Override

View File

@ -25,7 +25,10 @@ import java.util.Objects;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
@ -91,13 +94,24 @@ import org.apache.lucene.util.LongBitSet;
public class DocValuesTermsQuery extends Query {
private final String field;
private final BytesRef[] terms;
private final PrefixCodedTerms termData;
private final int termDataHashCode; // cached hashcode of termData
public DocValuesTermsQuery(String field, Collection<BytesRef> terms) {
this.field = Objects.requireNonNull(field);
Objects.requireNonNull(terms, "Collection of terms must not be null");
this.terms = terms.toArray(new BytesRef[terms.size()]);
ArrayUtil.timSort(this.terms);
BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
ArrayUtil.timSort(sortedTerms);
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
BytesRef previous = null;
for (BytesRef term : sortedTerms) {
if (term.equals(previous) == false) {
builder.add(field, term);
}
previous = term;
}
termData = builder.finish();
termDataHashCode = termData.hashCode();
}
public DocValuesTermsQuery(String field, BytesRef... terms) {
@ -124,26 +138,30 @@ public class DocValuesTermsQuery extends Query {
}
private boolean equalsTo(DocValuesTermsQuery other) {
return field.equals(other.field) &&
Arrays.equals(terms, other.terms);
// termData might be heavy to compare so check the hash code first
return termDataHashCode == other.termDataHashCode &&
termData.equals(other.termData);
}
@Override
public int hashCode() {
return 31 * classHash() + Objects.hash(field, Arrays.asList(terms));
return 31 * classHash() + termDataHashCode;
}
@Override
public String toString(String defaultField) {
StringBuilder sb = new StringBuilder();
sb.append(field).append(": [");
for (BytesRef term : terms) {
sb.append(term).append(", ");
StringBuilder builder = new StringBuilder();
boolean first = true;
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
if (!first) {
builder.append(' ');
}
first = false;
builder.append(new Term(iterator.field(), term).toString());
}
if (terms.length > 0) {
sb.setLength(sb.length() - 2);
}
return sb.append(']').toString();
return builder.toString();
}
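For reference, a hedged construction sketch (field and terms illustrative only):

  Query q = new DocValuesTermsQuery("category", new BytesRef("books"), new BytesRef("music"));

Duplicate and unordered terms now collapse into the same PrefixCodedTerms, which is why the equals() assertions added to TestDocValuesTermsQuery below hold.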
@Override
@ -155,7 +173,8 @@ public class DocValuesTermsQuery extends Query {
final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field);
final LongBitSet bits = new LongBitSet(values.getValueCount());
boolean matchesAtLeastOneTerm = false;
for (BytesRef term : terms) {
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
final long ord = values.lookupTerm(term);
if (ord >= 0) {
matchesAtLeastOneTerm = true;

View File

@ -0,0 +1,156 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.util.AbstractSet;
import java.util.Arrays;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.util.packed.PackedInts;
final class LongHashSet extends AbstractSet<Long> {
private static final long MISSING = Long.MIN_VALUE;
final long[] table;
final int mask;
final boolean hasMissingValue;
final int size;
final int hashCode;
LongHashSet(long... values) {
int tableSize = Math.toIntExact(values.length * 3L / 2);
tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
assert tableSize >= values.length * 3L / 2;
table = new long[tableSize];
Arrays.fill(table, MISSING);
mask = tableSize - 1;
boolean hasMissingValue = false;
int size = 0;
int hashCode = 0;
for (long value : values) {
if (value == MISSING || add(value)) {
if (value == MISSING) {
hasMissingValue = true;
}
++size;
hashCode += Long.hashCode(value);
}
}
this.hasMissingValue = hasMissingValue;
this.size = size;
this.hashCode = hashCode;
}
private boolean add(long l) {
assert l != MISSING;
final int slot = Long.hashCode(l) & mask;
for (int i = slot; ; i = (i + 1) & mask) {
if (table[i] == MISSING) {
table[i] = l;
return true;
} else if (table[i] == l) {
// already added
return false;
}
}
}
boolean contains(long l) {
if (l == MISSING) {
return hasMissingValue;
}
final int slot = Long.hashCode(l) & mask;
for (int i = slot; ; i = (i + 1) & mask) {
if (table[i] == MISSING) {
return false;
} else if (table[i] == l) {
return true;
}
}
}
@Override
public int size() {
return size;
}
@Override
public int hashCode() {
return hashCode;
}
@Override
public boolean equals(Object obj) {
if (obj != null && obj.getClass() == LongHashSet.class) {
LongHashSet that = (LongHashSet) obj;
if (hashCode != that.hashCode
|| size != that.size
|| hasMissingValue != that.hasMissingValue) {
return false;
}
for (long v : table) {
if (v != MISSING && that.contains(v) == false) {
return false;
}
}
return true;
}
return super.equals(obj);
}
@Override
public boolean contains(Object o) {
return o instanceof Long && contains(((Long) o).longValue());
}
@Override
public Iterator<Long> iterator() {
return new Iterator<Long>() {
private boolean hasNext = hasMissingValue;
private int i = -1;
private long value = MISSING;
@Override
public boolean hasNext() {
if (hasNext) {
return true;
}
while (++i < table.length) {
value = table[i];
if (value != MISSING) {
return hasNext = true;
}
}
return false;
}
@Override
public Long next() {
if (hasNext() == false) {
throw new NoSuchElementException();
}
hasNext = false;
return value;
}
};
}
}
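A tiny usage sketch of this package-private helper, mirroring LongHashSetTests below:

  LongHashSet set = new LongHashSet(42L, Long.MIN_VALUE, 7L);
  assert set.contains(42L);
  assert set.contains(Long.MIN_VALUE); // MIN_VALUE is handled via the hasMissingValue flag, not the table
  assert set.size() == 3;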

View File

@ -93,7 +93,7 @@ public abstract class BaseRangeFieldQueryTestCase extends LuceneTestCase {
ranges[id] = new Range[] {nextRange(dimensions)};
}
if (x == 17) {
// dome docs don't have a box:
// some docs don't have a box:
ranges[id][0].isMissing = true;
if (VERBOSE) {
System.out.println(" id=" + id + " is missing");

View File

@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import org.apache.lucene.util.LuceneTestCase;
public class LongHashSetTests extends LuceneTestCase {
private void assertEquals(Set<Long> set1, LongHashSet set2) {
LuceneTestCase.assertEquals(set1, set2);
LuceneTestCase.assertEquals(set2, set1);
LuceneTestCase.assertEquals(set2, set2);
assertEquals(set1.hashCode(), set2.hashCode());
if (set1.isEmpty() == false) {
Set<Long> set3 = new HashSet<>(set1);
long removed = set3.iterator().next();
while (true) {
long next = random().nextLong();
if (next != removed && set3.add(next)) {
break;
}
}
assertNotEquals(set3, set2);
}
}
private void assertNotEquals(Set<Long> set1, LongHashSet set2) {
assertFalse(set1.equals(set2));
assertFalse(set2.equals(set1));
LongHashSet set3 = new LongHashSet(set1.stream().mapToLong(Long::longValue).toArray());
assertFalse(set2.equals(set3));
}
public void testEmpty() {
Set<Long> set1 = new HashSet<>();
LongHashSet set2 = new LongHashSet();
assertEquals(set1, set2);
}
public void testOneValue() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L));
LongHashSet set2 = new LongHashSet(42);
assertEquals(set1, set2);
set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE));
set2 = new LongHashSet(Long.MIN_VALUE);
assertEquals(set1, set2);
}
public void testTwoValues() {
Set<Long> set1 = new HashSet<>(Arrays.asList(42L, Long.MAX_VALUE));
LongHashSet set2 = new LongHashSet(42, Long.MAX_VALUE);
assertEquals(set1, set2);
set1 = new HashSet<>(Arrays.asList(Long.MIN_VALUE, 42L));
set2 = new LongHashSet(Long.MIN_VALUE, 42L);
assertEquals(set1, set2);
}
public void testRandom() {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
long[] values = new long[random().nextInt(1 << random().nextInt(16))];
for (int i = 0; i < values.length; ++i) {
if (i == 0 || random().nextInt(10) < 9) {
values[i] = random().nextLong();
} else {
values[i] = values[random().nextInt(i)];
}
}
if (values.length > 0 && random().nextBoolean()) {
values[values.length/2] = Long.MIN_VALUE;
}
Set<Long> set1 = LongStream.of(values).mapToObj(Long::valueOf).collect(Collectors.toCollection(HashSet::new));
LongHashSet set2 = new LongHashSet(values);
assertEquals(set1, set2);
}
}
}

View File

@ -38,6 +38,7 @@ public class TestDocValuesTermsQuery extends LuceneTestCase {
public void testEquals() {
assertEquals(new DocValuesTermsQuery("foo", "bar"), new DocValuesTermsQuery("foo", "bar"));
assertEquals(new DocValuesTermsQuery("foo", "bar"), new DocValuesTermsQuery("foo", "bar", "bar"));
assertEquals(new DocValuesTermsQuery("foo", "bar", "baz"), new DocValuesTermsQuery("foo", "baz", "bar"));
assertFalse(new DocValuesTermsQuery("foo", "bar").equals(new DocValuesTermsQuery("foo2", "bar")));
assertFalse(new DocValuesTermsQuery("foo", "bar").equals(new DocValuesTermsQuery("foo", "baz")));

View File

@ -93,10 +93,6 @@ public class QueryUtils {
public static void checkUnequal(Query q1, Query q2) {
assertFalse(q1 + " equal to " + q2, q1.equals(q2));
assertFalse(q2 + " equal to " + q1, q2.equals(q1));
// possible this test can fail on a hash collision... if that
// happens, please change test to use a different example.
assertTrue(q1.hashCode() != q2.hashCode());
}
/** deep check that explanations of a query 'score' correctly */

View File

@ -50,6 +50,10 @@ Bug Fixes
* SOLR-9262: Connection and read timeouts are being ignored by UpdateShardHandler after SOLR-4509.
(Mark Miller, shalin)
* SOLR-9837: Fix 55% performance regression of FieldCache uninvert time of
numeric fields. (yonik)
Optimizations
----------------------
@ -62,6 +66,11 @@ Optimizations
* SOLR-9579: Make Solr's SchemaField implement Lucene's IndexableFieldType, removing the
creation of a Lucene FieldType every time a field is indexed. (John Call, yonik)
* SOLR-9822: JSON Facet API: Recover performance lost due to the DocValues transition to
an iterator API (LUCENE-7407). This only fixes calculating counts for single-valued
string fields from the FieldCache, resulting in up to 56% better throughput for those cases.
(yonik)
================== 6.4.0 ==================
@ -152,6 +161,12 @@ New Features
* SOLR-9728: Ability to specify Key Store type in solr.in.sh file for SSL (Michael Suzuki, Kevin Risden)
* SOLR-5043: New solr.dns.prevent.reverse.lookup system property that can be used to prevent long core
(re)load delays on systems with misconfigured hostname/DNS (hossman)
* SOLR-9844: FieldCache information fetched via the mbeans handler or seen via the UI now displays the total size used.
The individual cache entries in the response are now formatted better as well. (Varun Thacker)
Optimizations
----------------------
* SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have
@ -216,6 +231,22 @@ Bug Fixes
* SOLR-5260: Facet search on a docvalue field in a multi shard collection (Trym Møller, Erick Erickson)
* SOLR-9768: RecordingJsonParser produces incomplete json (Wojciech Stryszyk via ab)
* SOLR-9616: Solr throws exception when expand=true on empty index (Timo Hund via Ishan Chattopadhyaya)
* SOLR-9832: Schema modifications are not immediately visible on the coordinating node. (Steve Rowe)
* SOLR-9834: A variety of spots in the code can create a collection zk node after the collection has been
removed. (Mark Miller)
* SOLR-9707: Don't forward DeleteByQuery requests to down replicas. (Jessica Cheng Mallet via Varun Thacker)
* SOLR-9823: CoreContainer incorrectly setting MDCLoggingContext for core (Jessica Cheng Mallet via Erick Erickson)
* SOLR-1953: It may be possible for temporary files to accumulate until the Solr process is shut down.
(Karl Wright, Mark Miller)
Other Changes
----------------------
@ -245,6 +276,23 @@ Other Changes
* SOLR-9801: Upgrade jetty to 9.3.14.v20161028 (shalin)
* SOLR-9783: (Search|Top)Group[s]ShardResponseProcessor.process: turned sortWithinGroup null check into assert.
(Christine Poerschke)
* SOLR-9660: in GroupingSpecification factor [group](sort|offset|limit) into [group](sortSpec)
(Judith Silverman, Christine Poerschke)
* SOLR-9819: Upgrade commons-fileupload to 1.3.2, fixing a potential vulnerability CVE-2016-3092 (Anshum Gupta)
* SOLR-9827: ConcurrentUpdateSolrClient creates a RemoteSolrException if the remote host responded with a non-ok
response (instead of a SolrException) and includes the remote error message as part of the exception message
(Tomás Fernández Löbbe)
* SOLR-9846: OverseerAutoReplicaFailoverThread can take too long to stop and leak out of unit tests. (Mark Miller)
* SOLR-8959: Refactored TestSegmentSorting out of TestMiniSolrCloudCluster (hossman)
================== 6.3.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@ -16,12 +16,21 @@
*/
package org.apache.solr.handler.dataimport;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringReader;
import java.sql.Blob;
import java.sql.SQLException;
import java.util.Collections;
import java.util.Properties;
import org.apache.solr.common.util.Utils;
import org.junit.Test;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Test for PlainTextEntityProcessor
*
@ -42,6 +51,103 @@ public class TestPlainTextEntityProcessor extends AbstractDataImportHandlerTestC
assertEquals(DS.s, sw.docs.get(0).getFieldValue("x"));
}
static class BlobImpl implements Blob{
private final byte[] bytes;
BlobImpl(byte[] bytes) {
this.bytes = bytes;
}
@Override
public long length() throws SQLException {
return 0;
}
@Override
public byte[] getBytes(long pos, int length) throws SQLException {
return bytes;
}
@Override
public InputStream getBinaryStream() throws SQLException {
return new ByteArrayInputStream(bytes);
}
@Override
public long position(byte[] pattern, long start) throws SQLException {
return 0;
}
@Override
public long position(Blob pattern, long start) throws SQLException {
return 0;
}
@Override
public int setBytes(long pos, byte[] bytes) throws SQLException {
return 0;
}
@Override
public int setBytes(long pos, byte[] bytes, int offset, int len) throws SQLException {
return 0;
}
@Override
public OutputStream setBinaryStream(long pos) throws SQLException {
return null;
}
@Override
public void truncate(long len) throws SQLException {
}
@Override
public void free() throws SQLException {
}
@Override
public InputStream getBinaryStream(long pos, long length) throws SQLException {
return new ByteArrayInputStream(bytes);
}
}
@Test
public void testSimple2() throws IOException {
DataImporter di = new DataImporter();
MockDataSource.setIterator("select id, name, blob_field from lw_table4", Collections.singletonList(Utils.makeMap("blob_field",new BlobImpl(DS.s.getBytes(UTF_8)) ) ).iterator());
String dc =
" <dataConfig>" +
"<dataSource name=\"ds1\" type=\"MockDataSource\"/>\n" +
" <!-- dataSource for FieldReaderDataSource -->\n" +
" <dataSource dataField=\"root.blob_field\" name=\"fr\" type=\"FieldReaderDataSource\"/>\n" +
"\n" +
" <document name=\"items\">\n" +
" <entity dataSource=\"ds1\" name=\"root\" pk=\"id\" query=\"select id, name, blob_field from lw_table4\" transformer=\"TemplateTransformer\">\n" +
" <field column=\"id\" name=\"id\"/>\n" +
"\n" +
" <entity dataField=\"root.blob_field\" dataSource=\"fr\" format=\"text\" name=\"n1\" processor=\"PlainTextEntityProcessor\" url=\"blob_field\">\n" +
" <field column=\"plainText\" name=\"plainText\"/>\n" +
" </entity>\n" +
"\n" +
" </entity>\n" +
" </document>\n" +
"</dataConfig>";
System.out.println(dc);
di.loadAndInit(dc);
redirectTempProperties(di);
TestDocBuilder.SolrWriterImpl sw = new TestDocBuilder.SolrWriterImpl();
RequestInfo rp = new RequestInfo(null, createMap("command", "full-import"), null);
di.runCmd(rp, sw);
assertEquals(DS.s, sw.docs.get(0).getFieldValue("plainText"));
}
public static class DS extends DataSource {
static String s = "hello world";

View File

@ -390,17 +390,17 @@ About half the time for ranking is spent in the creation of weights for each fea
<!-- Query parser used to rerank top docs with a provided model -->
<queryParser name="ltr" class="org.apache.solr.ltr.search.LTRQParserPlugin">
<int name="threadModule.totalPoolThreads">10</int> <!-- Maximum threads to share for all requests -->
<int name="threadModule.numThreadsPerRequest">5</int> <!-- Maximum threads to use for a single requests-->
<int name="threadModule.numThreadsPerRequest">5</int> <!-- Maximum threads to use for a single request -->
</queryParser>
<!-- Transformer for extracting features -->
<transformer name="features" class="org.apache.solr.ltr.response.transform.LTRFeatureLoggerTransformerFactory">
<int name="threadModule.totalPoolThreads">10</int> <!-- Maximum threads to share for all requests -->
<int name="threadModule.numThreadsPerRequest">5</int> <!-- Maximum threads to use for a single requests-->
<int name="threadModule.numThreadsPerRequest">5</int> <!-- Maximum threads to use for a single request -->
</transformer>
</config>
```
The threadModule.totalPoolThreads option limits the total number of threads to be used across all query instances at any given time. threadModule.numThreadsPerRequest limits the number of threads used to process a single query. In the above example, 10 threads will be used to services all queries and a maximum of 5 threads to service a single query. If the solr instances is expected to receive no more than one query at a time, it is best to set both these numbers to the same value. If multiple queries need to serviced simultaneously, the numbers can be adjusted based on the expected response times. If the value of threadModule.numThreadsPerRequest is higher, the reponse time for a single query will be improved upto a point. If multiple queries are serviced simultaneously, the threadModule.totalPoolThreads imposes a contention between the queries if (threadModule.numThreadsPerRequest*total parallel queries > threadModule.totalPoolThreads).
The threadModule.totalPoolThreads option limits the total number of threads to be used across all query instances at any given time. threadModule.numThreadsPerRequest limits the number of threads used to process a single query. In the above example, 10 threads will be used to service all queries and a maximum of 5 threads to service a single query. If the solr instance is expected to receive no more than one query at a time, it is best to set both these numbers to the same value. If multiple queries need to be serviced simultaneously, the numbers can be adjusted based on the expected response times. If the value of threadModule.numThreadsPerRequest is higher, the response time for a single query will be improved up to a point. If multiple queries are serviced simultaneously, threadModule.totalPoolThreads imposes contention between the queries if (threadModule.numThreadsPerRequest * total parallel queries > threadModule.totalPoolThreads).
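As a worked example with the numbers from the configuration above (purely illustrative):
```
totalPoolThreads = 10, numThreadsPerRequest = 5

1 concurrent query   -> up to 5 threads in use;  5 <= 10, no contention
2 concurrent queries -> 2 * 5 = 10 <= 10,        still no contention
3 concurrent queries -> 3 * 5 = 15 > 10,         queries now contend for pool threads
```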

View File

@ -151,7 +151,6 @@ public abstract class FeatureLogger<FV_TYPE> {
}
public static class CSVFeatureLogger extends FeatureLogger<String> {
StringBuilder sb = new StringBuilder(500);
char keyValueSep = ':';
char featureSep = ';';
@ -171,6 +170,10 @@ public abstract class FeatureLogger<FV_TYPE> {
@Override
public String makeFeatureVector(LTRScoringQuery.FeatureInfo[] featuresInfo) {
// Allocate the buffer to a size based on the number of features instead of the
// default 16. You need space for the name, value, and two separators per feature,
// but not all the features are expected to fire, so this is just a naive estimate.
StringBuilder sb = new StringBuilder(featuresInfo.length * 3);
boolean isDense = featureFormat.equals(FeatureFormat.DENSE);
for (LTRScoringQuery.FeatureInfo featInfo:featuresInfo) {
if (featInfo.isUsed() || isDense){
@ -181,9 +184,8 @@ public abstract class FeatureLogger<FV_TYPE> {
}
}
final String features = (sb.length() > 0 ? sb.substring(0,
sb.length() - 1) : "");
sb.setLength(0);
final String features = (sb.length() > 0 ?
sb.substring(0, sb.length() - 1) : "");
return features;
}
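For reference, a small sketch of the string shape makeFeatureVector builds with the separators declared above (':' between name and value, ';' between features); the feature names and values here are invented:
```
// Illustrative only; not part of the class above.
StringBuilder sb = new StringBuilder(2 * 3);
sb.append("matchedTitle").append(':').append(1.0f).append(';');
sb.append("popularity").append(':').append(3.0f).append(';');
String features = sb.length() > 0 ? sb.substring(0, sb.length() - 1) : "";
// features -> "matchedTitle:1.0;popularity:3.0"
```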

View File

@ -205,10 +205,10 @@ public class LTRScoringQuery extends Query {
List<Feature.FeatureWeight > featureWeights = new ArrayList<>(features.size());
if (querySemaphore == null) {
createWeights(searcher, needsScores, boost, featureWeights, features);
createWeights(searcher, needsScores, featureWeights, features);
}
else{
createWeightsParallel(searcher, needsScores, boost, featureWeights, features);
createWeightsParallel(searcher, needsScores, featureWeights, features);
}
int i=0, j = 0;
if (this.extractAllFeatures) {
@ -228,7 +228,7 @@ public class LTRScoringQuery extends Query {
return new ModelWeight(modelFeaturesWeights, extractedFeatureWeights, allFeatures.size());
}
private void createWeights(IndexSearcher searcher, boolean needsScores, float boost,
private void createWeights(IndexSearcher searcher, boolean needsScores,
List<Feature.FeatureWeight > featureWeights, Collection<Feature> features) throws IOException {
final SolrQueryRequest req = getRequest();
// since the feature store is a linkedhashmap order is preserved
@ -271,7 +271,7 @@ public class LTRScoringQuery extends Query {
}
} // end of call CreateWeightCallable
private void createWeightsParallel(IndexSearcher searcher, boolean needsScores, float boost,
private void createWeightsParallel(IndexSearcher searcher, boolean needsScores,
List<Feature.FeatureWeight > featureWeights, Collection<Feature> features) throws RuntimeException {
final SolrQueryRequest req = getRequest();
@ -401,8 +401,9 @@ public class LTRScoringQuery extends Query {
/**
* Goes through all the stored feature values, and calculates the normalized
* values for all the features that will be used for scoring.
* Then calculates and returns the model's score.
*/
private void makeNormalizedFeatures() {
private float makeNormalizedFeaturesAndScore() {
int pos = 0;
for (final Feature.FeatureWeight feature : modelFeatureWeights) {
final int featureId = feature.getIndex();
@ -415,6 +416,7 @@ public class LTRScoringQuery extends Query {
pos++;
}
ltrScoringModel.normalizeFeaturesInPlace(modelFeatureValuesNormalized);
return ltrScoringModel.score(modelFeatureValuesNormalized);
}
@Override
@ -491,8 +493,8 @@ public class LTRScoringQuery extends Query {
for (final Feature.FeatureWeight.FeatureScorer subSocer : featureScorers) {
subSocer.setDocInfo(docInfo);
}
if (featureScorers.size() <= 1) { // TODO: Allow the use of dense
// features in other cases
if (featureScorers.size() <= 1) {
// future enhancement: allow the use of dense features in other cases
featureTraversalScorer = new DenseModelScorer(weight, featureScorers);
} else {
featureTraversalScorer = new SparseModelScorer(weight, featureScorers);
@ -570,8 +572,7 @@ public class LTRScoringQuery extends Query {
featuresInfo[featureId].setUsed(true);
}
}
makeNormalizedFeatures();
return ltrScoringModel.score(modelFeatureValuesNormalized);
return makeNormalizedFeaturesAndScore();
}
@Override
@ -663,8 +664,7 @@ public class LTRScoringQuery extends Query {
}
}
}
makeNormalizedFeatures();
return ltrScoringModel.score(modelFeatureValuesNormalized);
return makeNormalizedFeaturesAndScore();
}
@Override

View File

@ -29,6 +29,35 @@ import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
/**
* The LTRThreadModule is optionally used by the {@link org.apache.solr.ltr.search.LTRQParserPlugin} and
* {@link org.apache.solr.ltr.response.transform.LTRFeatureLoggerTransformerFactory LTRFeatureLoggerTransformerFactory}
* classes to parallelize the creation of {@link org.apache.solr.ltr.feature.Feature.FeatureWeight Feature.FeatureWeight}
* objects.
* <p>
* Example configuration:
* <pre>
&lt;queryParser name="ltr" class="org.apache.solr.ltr.search.LTRQParserPlugin"&gt;
&lt;int name="threadModule.totalPoolThreads"&gt;10&lt;/int&gt;
&lt;int name="threadModule.numThreadsPerRequest"&gt;5&lt;/int&gt;
&lt;/queryParser&gt;
&lt;transformer name="features" class="org.apache.solr.ltr.response.transform.LTRFeatureLoggerTransformerFactory"&gt;
&lt;int name="threadModule.totalPoolThreads"&gt;10&lt;/int&gt;
&lt;int name="threadModule.numThreadsPerRequest"&gt;5&lt;/int&gt;
&lt;/transformer&gt;
</pre>
* If an individual solr instance is expected to receive no more than one query at a time, it is best
* to set <code>totalPoolThreads</code> and <code>numThreadsPerRequest</code> to the same value.
*
* If multiple queries need to be serviced simultaneously then <code>totalPoolThreads</code> and
* <code>numThreadsPerRequest</code> can be adjusted based on the expected response times.
*
* If the value of <code>numThreadsPerRequest</code> is higher, the response time for a single query
* will be improved up to a point. If multiple queries are serviced simultaneously, the value of
* <code>totalPoolThreads</code> imposes a contention between the queries if
* <code>(totalPoolThreads &lt; numThreadsPerRequest * total parallel queries)</code>.
*/
final public class LTRThreadModule implements NamedListInitializedPlugin {
public static LTRThreadModule getInstance(NamedList args) {
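A hedged sketch of the two-level throttling described in this javadoc; the class, field, and method names below are illustrative and not the actual LTRThreadModule internals:
```
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

class ThrottledWeightCreation {
  private final ExecutorService pool;        // bounded by totalPoolThreads
  private final int numThreadsPerRequest;    // per-query cap

  ThrottledWeightCreation(int totalPoolThreads, int numThreadsPerRequest) {
    this.pool = Executors.newFixedThreadPool(totalPoolThreads);
    this.numThreadsPerRequest = numThreadsPerRequest;
  }

  void createWeightsParallel(List<Runnable> weightTasks) throws InterruptedException {
    // Each query gets its own semaphore, so a single request can never hold
    // more than numThreadsPerRequest tasks in flight on the shared pool.
    Semaphore querySemaphore = new Semaphore(numThreadsPerRequest);
    for (Runnable task : weightTasks) {
      querySemaphore.acquire();
      pool.submit(() -> {
        try {
          task.run();
        } finally {
          querySemaphore.release();
        }
      });
    }
  }
}
```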

View File

@ -258,8 +258,7 @@ public abstract class Feature extends Query {
@Override
public void extractTerms(Set<Term> terms) {
// needs to be implemented by query subclasses
throw new UnsupportedOperationException();
// no-op
}
/**

View File

@ -29,6 +29,7 @@ import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.BoolField;
/**
* This feature returns the value of a field in the current document
@ -119,13 +120,16 @@ public class FieldValueFeature extends Feature {
return number.floatValue();
} else {
final String string = indexableField.stringValue();
// boolean values in the index are encoded with the
// chars T/F
if (string.equals("T")) {
return 1;
}
if (string.equals("F")) {
return 0;
if (string.length() == 1) {
// boolean values in the index are encoded with a
// single char contained in TRUE_TOKEN or FALSE_TOKEN
// (see BoolField)
if (string.charAt(0) == BoolField.TRUE_TOKEN[0]) {
return 1;
}
if (string.charAt(0) == BoolField.FALSE_TOKEN[0]) {
return 0;
}
}
}
} catch (final IOException e) {

View File

@ -19,8 +19,10 @@ package org.apache.solr.ltr.feature;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -76,7 +78,10 @@ public class OriginalScoreFeature extends Feature {
return "OriginalScoreFeature [query:" + originalQuery.toString() + "]";
}
@Override
public void extractTerms(Set<Term> terms) {
w.extractTerms(terms);
}
@Override
public FeatureScorer scorer(LeafReaderContext context) throws IOException {
@ -102,6 +107,11 @@ public class OriginalScoreFeature extends Feature {
return (docInfo.hasOriginalDocScore() ? docInfo.getOriginalDocScore() : originalScorer.score());
}
@Override
public int freq() throws IOException {
return originalScorer.freq();
}
@Override
public int docID() {
return originalScorer.docID();

View File

@ -21,8 +21,10 @@ import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
@ -123,9 +125,9 @@ public class SolrFeature extends Feature {
* Weight for a SolrFeature
**/
public class SolrFeatureWeight extends FeatureWeight {
Weight solrQueryWeight;
Query query;
List<Query> queryAndFilters;
final private Weight solrQueryWeight;
final private Query query;
final private List<Query> queryAndFilters;
public SolrFeatureWeight(IndexSearcher searcher,
SolrQueryRequest request, Query originalQuery, Map<String,String[]> efi) throws IOException {
@ -174,6 +176,8 @@ public class SolrFeature extends Feature {
if (query != null) {
queryAndFilters.add(query);
solrQueryWeight = searcher.createNormalizedWeight(query, true);
} else {
solrQueryWeight = null;
}
} catch (final SyntaxError e) {
throw new FeatureException("Failed to parse feature query.", e);
@ -201,6 +205,13 @@ public class SolrFeature extends Feature {
}
}
@Override
public void extractTerms(Set<Term> terms) {
if (solrQueryWeight != null) {
solrQueryWeight.extractTerms(terms);
}
}
@Override
public FeatureScorer scorer(LeafReaderContext context) throws IOException {
Scorer solrScorer = null;

View File

@ -57,7 +57,6 @@ public class ManagedFeatureStore extends ManagedResource implements ManagedResou
/** the feature store rest endpoint **/
public static final String REST_END_POINT = "/schema/feature-store";
// TODO: reduce from public to package visibility (once tests no longer need public access)
/** name of the attribute containing the feature class **/
static final String CLASS_KEY = "class";

View File

@ -61,7 +61,6 @@ public class ManagedModelStore extends ManagedResource implements ManagedResourc
/** the model store rest endpoint **/
public static final String REST_END_POINT = "/schema/model-store";
// TODO: reduce from public to package visibility (once tests no longer need public access)
/**
* Managed model store: the name of the attribute containing all the models of
@ -124,16 +123,20 @@ public class ManagedModelStore extends ManagedResource implements ManagedResourc
if ((managedData != null) && (managedData instanceof List)) {
final List<Map<String,Object>> up = (List<Map<String,Object>>) managedData;
for (final Map<String,Object> u : up) {
try {
final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, u, managedFeatureStore);
addModel(algo);
} catch (final ModelException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
addModelFromMap(u);
}
}
}
private void addModelFromMap(Map<String,Object> modelMap) {
try {
final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, modelMap, managedFeatureStore);
addModel(algo);
} catch (final ModelException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
}
public synchronized void addModel(LTRScoringModel ltrScoringModel) throws ModelException {
try {
log.info("adding model {}", ltrScoringModel.getName());
@ -146,26 +149,17 @@ public class ManagedModelStore extends ManagedResource implements ManagedResourc
@SuppressWarnings("unchecked")
@Override
protected Object applyUpdatesToManagedData(Object updates) {
if (updates instanceof List) {
final List<Map<String,Object>> up = (List<Map<String,Object>>) updates;
for (final Map<String,Object> u : up) {
try {
final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, u, managedFeatureStore);
addModel(algo);
} catch (final ModelException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
addModelFromMap(u);
}
}
if (updates instanceof Map) {
final Map<String,Object> map = (Map<String,Object>) updates;
try {
final LTRScoringModel algo = fromLTRScoringModelMap(solrResourceLoader, map, managedFeatureStore);
addModel(algo);
} catch (final ModelException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
addModelFromMap(map);
}
return modelsAsManagedResources(store.getModels());

View File

@ -24,6 +24,8 @@
<field name="keywords" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="normHits" type="float" indexed="true" stored="true" />
<field name="isTrendy" type="boolean" indexed="true" stored="true" />
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>

View File

@ -32,21 +32,21 @@ public class TestFieldValueFeature extends TestRerankBase {
setuptest("solrconfig-ltr.xml", "schema.xml");
assertU(adoc("id", "1", "title", "w1", "description", "w1", "popularity",
"1"));
"1","isTrendy","true"));
assertU(adoc("id", "2", "title", "w2 2asd asdd didid", "description",
"w2 2asd asdd didid", "popularity", "2"));
assertU(adoc("id", "3", "title", "w3", "description", "w3", "popularity",
"3"));
"3","isTrendy","true"));
assertU(adoc("id", "4", "title", "w4", "description", "w4", "popularity",
"4"));
"4","isTrendy","false"));
assertU(adoc("id", "5", "title", "w5", "description", "w5", "popularity",
"5"));
"5","isTrendy","true"));
assertU(adoc("id", "6", "title", "w1 w2", "description", "w1 w2",
"popularity", "6"));
"popularity", "6","isTrendy","false"));
assertU(adoc("id", "7", "title", "w1 w2 w3 w4 w5", "description",
"w1 w2 w3 w4 w5 w8", "popularity", "7"));
"w1 w2 w3 w4 w5 w8", "popularity", "7","isTrendy","true"));
assertU(adoc("id", "8", "title", "w1 w1 w1 w2 w2 w8", "description",
"w1 w1 w1 w2 w2", "popularity", "8"));
"w1 w1 w1 w2 w2", "popularity", "8","isTrendy","false"));
// a document without the popularity field
assertU(adoc("id", "42", "title", "NO popularity", "description", "NO popularity"));
@ -169,5 +169,39 @@ public class TestFieldValueFeature extends TestRerankBase {
}
@Test
public void testBooleanValue() throws Exception {
final String fstore = "test_boolean_store";
loadFeature("trendy", FieldValueFeature.class.getCanonicalName(), fstore,
"{\"field\":\"isTrendy\"}");
loadModel("trendy-model", LinearModel.class.getCanonicalName(),
new String[] {"trendy"}, fstore, "{\"weights\":{\"trendy\":1.0}}");
SolrQuery query = new SolrQuery();
query.setQuery("id:4");
query.add("rq", "{!ltr model=trendy-model reRankDocs=4}");
query.add("fl", "[fv]");
assertJQ("/query" + query.toQueryString(),
"/response/docs/[0]/=={'[fv]':'trendy:0.0'}");
query = new SolrQuery();
query.setQuery("id:5");
query.add("rq", "{!ltr model=trendy-model reRankDocs=4}");
query.add("fl", "[fv]");
assertJQ("/query" + query.toQueryString(),
"/response/docs/[0]/=={'[fv]':'trendy:1.0'}");
// check default value is false
query = new SolrQuery();
query.setQuery("id:2");
query.add("rq", "{!ltr model=trendy-model reRankDocs=4}");
query.add("fl", "[fv]");
assertJQ("/query" + query.toQueryString(),
"/response/docs/[0]/=={'[fv]':'trendy:0.0'}");
}
}

View File

@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.ltr.feature;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class TestOriginalScoreScorer extends LuceneTestCase {
@Test
public void testOverridesAbstractScorerMethods() {
final Class<?> ossClass = OriginalScoreFeature.OriginalScoreWeight.OriginalScoreScorer.class;
for (final Method scorerClassMethod : Scorer.class.getDeclaredMethods()) {
final int modifiers = scorerClassMethod.getModifiers();
if (!Modifier.isAbstract(modifiers)) continue;
try {
final Method ossClassMethod = ossClass.getDeclaredMethod(
scorerClassMethod.getName(),
scorerClassMethod.getParameterTypes());
assertEquals("getReturnType() difference",
scorerClassMethod.getReturnType(),
ossClassMethod.getReturnType());
} catch (NoSuchMethodException e) {
fail(ossClass + " needs to override '" + scorerClassMethod + "'");
}
}
}
}

View File

@ -16,7 +16,7 @@
*/
package org.apache.solr.ltr.model;
//import static org.junit.internal.matchers.StringContains.containsString;
import static org.junit.internal.matchers.StringContains.containsString;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.ltr.TestRerankBase;
@ -93,30 +93,28 @@ public class TestMultipleAdditiveTreesModel extends TestRerankBase {
// test out the explain feature, make sure it returns something
query.setParam("debugQuery", "on");
String qryResult = JQ("/query" + query.toQueryString());
String qryResult = JQ("/query" + query.toQueryString());
qryResult = qryResult.replaceAll("\n", " ");
// FIXME containsString doesn't exist.
// assertThat(qryResult, containsString("\"debug\":{"));
// qryResult = qryResult.substring(qryResult.indexOf("debug"));
//
// assertThat(qryResult, containsString("\"explain\":{"));
// qryResult = qryResult.substring(qryResult.indexOf("explain"));
//
// assertThat(qryResult, containsString("multipleadditivetreesmodel"));
// assertThat(qryResult,
// containsString(MultipleAdditiveTreesModel.class.getCanonicalName()));
//
// assertThat(qryResult, containsString("-100.0 = tree 0"));
// assertThat(qryResult, containsString("50.0 = tree 0"));
// assertThat(qryResult, containsString("-20.0 = tree 1"));
// assertThat(qryResult, containsString("'matchedTitle':1.0 > 0.5"));
// assertThat(qryResult, containsString("'matchedTitle':0.0 <= 0.5"));
//
// assertThat(qryResult, containsString(" Go Right "));
// assertThat(qryResult, containsString(" Go Left "));
// assertThat(qryResult,
// containsString("'this_feature_doesnt_exist' does not exist in FV"));
assertThat(qryResult, containsString("\"debug\":{"));
qryResult = qryResult.substring(qryResult.indexOf("debug"));
assertThat(qryResult, containsString("\"explain\":{"));
qryResult = qryResult.substring(qryResult.indexOf("explain"));
assertThat(qryResult, containsString("multipleadditivetreesmodel"));
assertThat(qryResult, containsString(MultipleAdditiveTreesModel.class.getCanonicalName()));
assertThat(qryResult, containsString("-100.0 = tree 0"));
assertThat(qryResult, containsString("50.0 = tree 0"));
assertThat(qryResult, containsString("-20.0 = tree 1"));
assertThat(qryResult, containsString("'matchedTitle':1.0 > 0.5"));
assertThat(qryResult, containsString("'matchedTitle':0.0 <= 0.5"));
assertThat(qryResult, containsString(" Go Right "));
assertThat(qryResult, containsString(" Go Left "));
assertThat(qryResult, containsString("'this_feature_doesnt_exist' does not exist in FV"));
}
@Test

View File

@ -16,12 +16,20 @@
*/
package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.core.ConfigSetService;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class CloudConfigSetService extends ConfigSetService {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final ZkController zkController;
public CloudConfigSetService(SolrResourceLoader loader, ZkController zkController) {
@ -31,8 +39,18 @@ public class CloudConfigSetService extends ConfigSetService {
@Override
public SolrResourceLoader createCoreResourceLoader(CoreDescriptor cd) {
// TODO: Shouldn't the collection node be created by the Collections API?
zkController.createCollectionZkNode(cd.getCloudDescriptor());
try {
// for back compat with cores that can create collections without the collections API
if (!zkController.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + cd.getCollectionName(), true)) {
CreateCollectionCmd.createCollectionZkNode(zkController.getZkClient(), cd.getCollectionName(), cd.getCloudDescriptor().getParams());
}
} catch (KeeperException e) {
SolrException.log(log, null, e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
SolrException.log(log, null, e);
}
String configName = zkController.getZkStateReader().readConfigName(cd.getCollectionName());
return new ZkSolrResourceLoader(cd.getInstanceDir(), configName, parentLoader.getClassLoader(),
cd.getSubstitutableProperties(), zkController);

View File

@ -33,7 +33,7 @@ public class CloudDescriptor {
private String roles = null;
private Integer numShards;
private String nodeName = null;
private Map<String, String> collectionParams = new HashMap<>();
private Map<String,String> collectionParams = new HashMap<>();
private volatile boolean isLeader = false;

View File

@ -25,19 +25,23 @@ import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
import org.apache.solr.cloud.OverseerCollectionMessageHandler.Cmd;
import org.apache.solr.cloud.overseer.ClusterStateMutator;
import org.apache.solr.cloud.rule.ReplicaAssigner;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.ImplicitDocRouter;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkConfigManager;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.cloud.ZooKeeperException;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
@ -46,7 +50,9 @@ import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -64,9 +70,11 @@ import static org.apache.solr.common.util.StrUtils.formatString;
public class CreateCollectionCmd implements Cmd {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final OverseerCollectionMessageHandler ocmh;
private SolrZkClient zkClient;
public CreateCollectionCmd(OverseerCollectionMessageHandler ocmh) {
this.ocmh = ocmh;
this.zkClient = ocmh.zkStateReader.getZkClient();
}
@Override
@ -84,7 +92,6 @@ public class CreateCollectionCmd implements Cmd {
ocmh.validateConfigOrThrowSolrException(configName);
try {
// look at the replication factor and see if it matches reality
// if it does not, find best nodes to create more cores
@ -157,10 +164,20 @@ public class CreateCollectionCmd implements Cmd {
}
ZkStateReader zkStateReader = ocmh.zkStateReader;
boolean isLegacyCloud = Overseer.isLegacy(zkStateReader);
boolean isLegacyCloud = Overseer.isLegacy(zkStateReader);
ocmh.createConfNode(configName, collectionName, isLegacyCloud);
Map<String,String> collectionParams = new HashMap<>();
Map<String,Object> collectionProps = message.getProperties();
for (String propName : collectionProps.keySet()) {
if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) {
collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), (String) collectionProps.get(propName));
}
}
createCollectionZkNode(zkClient, collectionName, collectionParams);
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));
// wait for a while until we don't see the collection
@ -288,4 +305,129 @@ public class CreateCollectionCmd implements Cmd {
}
return configName;
}
public static void createCollectionZkNode(SolrZkClient zkClient, String collection, Map<String,String> params) {
log.debug("Check for collection zkNode:" + collection);
String collectionPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
try {
if (!zkClient.exists(collectionPath, true)) {
log.debug("Creating collection in ZooKeeper:" + collection);
try {
Map<String,Object> collectionProps = new HashMap<>();
// TODO: if collection.configName isn't set, and there isn't already a conf in zk, just use that?
String defaultConfigName = System.getProperty(ZkController.COLLECTION_PARAM_PREFIX + ZkController.CONFIGNAME_PROP, collection);
if (params.size() > 0) {
collectionProps.putAll(params);
// if the config name wasn't passed in, use the default
if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP)) {
// users can create the collection node and conf link ahead of time, or this may return another option
getConfName(zkClient, collection, collectionPath, collectionProps);
}
} else if (System.getProperty("bootstrap_confdir") != null) {
// if we are bootstrapping a collection, default the config for
// a new collection to the collection we are bootstrapping
log.info("Setting config for collection:" + collection + " to " + defaultConfigName);
Properties sysProps = System.getProperties();
for (String sprop : System.getProperties().stringPropertyNames()) {
if (sprop.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) {
collectionProps.put(sprop.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), sysProps.getProperty(sprop));
}
}
// if the config name wasn't passed in, use the default
if (!collectionProps.containsKey(ZkController.CONFIGNAME_PROP))
collectionProps.put(ZkController.CONFIGNAME_PROP, defaultConfigName);
} else if (Boolean.getBoolean("bootstrap_conf")) {
// the conf name should be the collection name of this core
collectionProps.put(ZkController.CONFIGNAME_PROP, collection);
} else {
getConfName(zkClient, collection, collectionPath, collectionProps);
}
collectionProps.remove(ZkStateReader.NUM_SHARDS_PROP); // we don't put numShards in the collections properties
ZkNodeProps zkProps = new ZkNodeProps(collectionProps);
zkClient.makePath(collectionPath, Utils.toJSON(zkProps), CreateMode.PERSISTENT, null, true);
} catch (KeeperException e) {
// it's okay if the node already exists
if (e.code() != KeeperException.Code.NODEEXISTS) {
throw e;
}
}
} else {
log.debug("Collection zkNode exists");
}
} catch (KeeperException e) {
// it's okay if another beats us creating the node
if (e.code() == KeeperException.Code.NODEEXISTS) {
return;
}
throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
} catch (InterruptedException e) {
Thread.interrupted();
throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
}
}
private static void getConfName(SolrZkClient zkClient, String collection, String collectionPath, Map<String,Object> collectionProps) throws KeeperException,
InterruptedException {
// check for configName
log.debug("Looking for collection configName");
if (collectionProps.containsKey("configName")) {
log.info("configName was passed as a param {}", collectionProps.get("configName"));
return;
}
List<String> configNames = null;
int retry = 1;
int retryLimt = 6;
for (; retry < retryLimt; retry++) {
if (zkClient.exists(collectionPath, true)) {
ZkNodeProps cProps = ZkNodeProps.load(zkClient.getData(collectionPath, null, null, true));
if (cProps.containsKey(ZkController.CONFIGNAME_PROP)) {
break;
}
}
// if there is only one conf, use that
try {
configNames = zkClient.getChildren(ZkConfigManager.CONFIGS_ZKNODE, null,
true);
} catch (NoNodeException e) {
// just keep trying
}
if (configNames != null && configNames.size() == 1) {
// no config set named, but there is only 1 - use it
log.info("Only one config set found in zk - using it:" + configNames.get(0));
collectionProps.put(ZkController.CONFIGNAME_PROP, configNames.get(0));
break;
}
if (configNames != null && configNames.contains(collection)) {
log.info(
"Could not find explicit collection configName, but found config name matching collection name - using that set.");
collectionProps.put(ZkController.CONFIGNAME_PROP, collection);
break;
}
log.info("Could not find collection configName - pausing for 3 seconds and trying again - try: " + retry);
Thread.sleep(3000);
}
if (retry == retryLimt) {
log.error("Could not find configName for collection " + collection);
throw new ZooKeeperException(
SolrException.ErrorCode.SERVER_ERROR,
"Could not find configName for collection " + collection + " found:" + configNames);
}
}
}
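An illustrative walk-through of the prefix stripping added above, assuming COLLECTION_PARAM_PREFIX is "collection." (the input values here are hypothetical):
```
// Hypothetical input; only properties carrying the "collection." prefix are
// copied into the collection params, with the prefix removed.
import java.util.HashMap;
import java.util.Map;

class CollectionParamPrefixExample {
  public static void main(String[] args) {
    Map<String,Object> messageProps = new HashMap<>();
    messageProps.put("collection.configName", "myconf"); // -> configName=myconf
    messageProps.put("numShards", "2");                   // no prefix -> ignored here

    Map<String,String> collectionParams = new HashMap<>();
    for (String propName : messageProps.keySet()) {
      if (propName.startsWith("collection.")) {
        collectionParams.put(propName.substring("collection.".length()),
            (String) messageProps.get(propName));
      }
    }
    System.out.println(collectionParams); // {configName=myconf}
  }
}
```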

View File

@ -125,17 +125,6 @@ class ShardLeaderElectionContextBase extends ElectionContext {
this.zkClient = zkStateReader.getZkClient();
this.shardId = shardId;
this.collection = collection;
try {
new ZkCmdExecutor(zkStateReader.getZkClient().getZkClientTimeout())
.ensureExists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection,
zkClient);
} catch (KeeperException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
@Override
@ -175,9 +164,16 @@ class ShardLeaderElectionContextBase extends ElectionContext {
void runLeaderProcess(boolean weAreReplacement, int pauseBeforeStartMs)
throws KeeperException, InterruptedException, IOException {
// register as leader - if an ephemeral is already there, wait to see if it goes away
if (!zkClient.exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) {
log.info("Will not register as leader because collection appears to be gone.");
return;
}
String parent = new Path(leaderPath).getParent().toString();
ZkCmdExecutor zcmd = new ZkCmdExecutor(30000);
zcmd.ensureExists(parent, zkClient);
// only if /collections/{collection} exists already do we succeed in creating this path
zcmd.ensureExists(parent, (byte[])null, CreateMode.PERSISTENT, zkClient, 2);
try {
RetryUtil.retryOnThrowable(NodeExistsException.class, 60000, 5000, () -> {

View File

@ -360,8 +360,13 @@ public class LeaderElector {
public void setup(final ElectionContext context) throws InterruptedException,
KeeperException {
String electZKPath = context.electionPath + LeaderElector.ELECTION_NODE;
zkCmdExecutor.ensureExists(electZKPath, zkClient);
if (context instanceof OverseerElectionContext) {
zkCmdExecutor.ensureExists(electZKPath, zkClient);
} else {
// pass 2 for the last param so that a replica won't create /collections/{collection} if it doesn't exist
zkCmdExecutor.ensureExists(electZKPath, (byte[])null, CreateMode.PERSISTENT, zkClient, 2);
}
this.context = context;
}
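A hedged sketch of the intent behind the "won't create /collections/{collection}" comments in the two hunks above, using only the SolrZkClient calls already shown elsewhere in this diff; the helper name is made up and this is not the actual ZkCmdExecutor code:
```
// Illustrative only: create a ZooKeeper path only when a required ancestor
// (here /collections/{collection}) still exists, so a replica can never
// resurrect a deleted collection's tree as a side effect of leader election.
static void ensureExistsIfAncestorPresent(SolrZkClient zkClient, String requiredAncestor,
    String path) throws KeeperException, InterruptedException {
  if (!zkClient.exists(requiredAncestor, true)) {
    return; // the collection was removed; do not recreate anything under it
  }
  try {
    zkClient.makePath(path, (byte[]) null, CreateMode.PERSISTENT, null, true);
  } catch (KeeperException.NodeExistsException e) {
    // fine: another node created it first
  }
}
```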

View File

@ -89,6 +89,8 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
private final int workLoopDelay;
private final int waitAfterExpiration;
private volatile Thread thread;
public OverseerAutoReplicaFailoverThread(CloudConfig config, ZkStateReader zkStateReader,
UpdateShardHandler updateShardHandler) {
@ -118,7 +120,7 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
@Override
public void run() {
this.thread = Thread.currentThread();
while (!this.isClosed) {
// work loop
log.debug("do " + this.getClass().getSimpleName() + " work loop");
@ -136,7 +138,6 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
try {
Thread.sleep(workLoopDelay);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
}
}
@ -480,6 +481,10 @@ public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
@Override
public void close() {
isClosed = true;
Thread lThread = thread;
if (lThread != null) {
lThread.interrupt();
}
}
public boolean isClosed() {

View File

@ -34,7 +34,6 @@ import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
@ -1273,130 +1272,6 @@ public class ZkController {
zkClient.printLayoutToStdOut();
}
public void createCollectionZkNode(CloudDescriptor cd) {
String collection = cd.getCollectionName();
log.debug("Check for collection zkNode:" + collection);
String collectionPath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
try {
if (!zkClient.exists(collectionPath, true)) {
log.debug("Creating collection in ZooKeeper:" + collection);
try {
Map<String, Object> collectionProps = new HashMap<>();
// TODO: if collection.configName isn't set, and there isn't already a conf in zk, just use that?
String defaultConfigName = System.getProperty(COLLECTION_PARAM_PREFIX + CONFIGNAME_PROP, collection);
// params passed in - currently only done via core admin (create core commmand).
if (cd.getParams().size() > 0) {
collectionProps.putAll(cd.getParams());
// if the config name wasn't passed in, use the default
if (!collectionProps.containsKey(CONFIGNAME_PROP)) {
// TODO: getting the configName from the collectionPath should fail since we already know it doesn't exist?
getConfName(collection, collectionPath, collectionProps);
}
} else if (System.getProperty("bootstrap_confdir") != null) {
// if we are bootstrapping a collection, default the config for
// a new collection to the collection we are bootstrapping
log.info("Setting config for collection:" + collection + " to " + defaultConfigName);
Properties sysProps = System.getProperties();
for (String sprop : System.getProperties().stringPropertyNames()) {
if (sprop.startsWith(COLLECTION_PARAM_PREFIX)) {
collectionProps.put(sprop.substring(COLLECTION_PARAM_PREFIX.length()), sysProps.getProperty(sprop));
}
}
// if the config name wasn't passed in, use the default
if (!collectionProps.containsKey(CONFIGNAME_PROP))
collectionProps.put(CONFIGNAME_PROP, defaultConfigName);
} else if (Boolean.getBoolean("bootstrap_conf")) {
// the conf name should should be the collection name of this core
collectionProps.put(CONFIGNAME_PROP, cd.getCollectionName());
} else {
getConfName(collection, collectionPath, collectionProps);
}
collectionProps.remove(ZkStateReader.NUM_SHARDS_PROP); // we don't put numShards in the collections properties
ZkNodeProps zkProps = new ZkNodeProps(collectionProps);
zkClient.makePath(collectionPath, Utils.toJSON(zkProps), CreateMode.PERSISTENT, null, true);
} catch (KeeperException e) {
// it's okay if the node already exists
if (e.code() != KeeperException.Code.NODEEXISTS) {
throw e;
}
}
} else {
log.debug("Collection zkNode exists");
}
} catch (KeeperException e) {
// it's okay if another beats us creating the node
if (e.code() == KeeperException.Code.NODEEXISTS) {
return;
}
throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
} catch (InterruptedException e) {
Thread.interrupted();
throw new SolrException(ErrorCode.SERVER_ERROR, "Error creating collection node in Zookeeper", e);
}
}
private void getConfName(String collection, String collectionPath,
Map<String, Object> collectionProps) throws KeeperException,
InterruptedException {
// check for configName
log.debug("Looking for collection configName");
List<String> configNames = null;
int retry = 1;
int retryLimt = 6;
for (; retry < retryLimt; retry++) {
if (zkClient.exists(collectionPath, true)) {
ZkNodeProps cProps = ZkNodeProps.load(zkClient.getData(collectionPath, null, null, true));
if (cProps.containsKey(CONFIGNAME_PROP)) {
break;
}
}
// if there is only one conf, use that
try {
configNames = zkClient.getChildren(ZkConfigManager.CONFIGS_ZKNODE, null,
true);
} catch (NoNodeException e) {
// just keep trying
}
if (configNames != null && configNames.size() == 1) {
// no config set named, but there is only 1 - use it
log.info("Only one config set found in zk - using it:" + configNames.get(0));
collectionProps.put(CONFIGNAME_PROP, configNames.get(0));
break;
}
if (configNames != null && configNames.contains(collection)) {
log.info("Could not find explicit collection configName, but found config name matching collection name - using that set.");
collectionProps.put(CONFIGNAME_PROP, collection);
break;
}
log.info("Could not find collection configName - pausing for 3 seconds and trying again - try: " + retry);
Thread.sleep(3000);
}
if (retry == retryLimt) {
log.error("Could not find configName for collection " + collection);
throw new ZooKeeperException(
SolrException.ErrorCode.SERVER_ERROR,
"Could not find configName for collection " + collection + " found:" + configNames);
}
}
public ZkStateReader getZkStateReader() {
return zkStateReader;
}
@ -2175,7 +2050,8 @@ public class ZkController {
} else {
String parentZNodePath = getLeaderInitiatedRecoveryZnodePath(collection, shardId);
try {
zkClient.makePath(parentZNodePath, retryOnConnLoss);
// pass 2 for the last param to make sure we don't create /collections/{collection} if it does not exist
zkClient.makePath(parentZNodePath, (byte[]) null, CreateMode.PERSISTENT, (Watcher) null, true, retryOnConnLoss, 2);
} catch (KeeperException.NodeExistsException nee) {
// if it exists, that's great!
}

View File

@ -109,7 +109,7 @@ public class ZkSolrResourceLoader extends SolrResourceLoader {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Error opening " + file, e);
} catch (KeeperException e) {
} catch (Exception e) {
throw new IOException("Error opening " + file, e);
}
}

View File

@ -78,11 +78,10 @@ public abstract class ConfigSetService {
IndexSchema schema = createIndexSchema(dcore, solrConfig);
NamedList properties = createConfigSetProperties(dcore, coreLoader);
return new ConfigSet(configName(dcore), solrConfig, schema, properties);
}
catch (Exception e) {
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Could not load conf for core " + dcore.getName() +
": " + e.getMessage(), e);
"Could not load conf for core " + dcore.getName() +
": " + e.getMessage(), e);
}
}

View File

@ -857,7 +857,7 @@ public class CoreContainer {
SolrCore core = null;
try {
MDCLoggingContext.setCore(core);
MDCLoggingContext.setCoreDescriptor(dcore);
SolrIdentifierValidator.validateCoreName(dcore.getName());
if (zkSys.getZkController() != null) {
zkSys.getZkController().preRegister(dcore);

View File

@ -119,7 +119,6 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.IndexSchemaFactory;
import org.apache.solr.schema.ManagedIndexSchema;
import org.apache.solr.schema.SchemaManager;
import org.apache.solr.schema.SimilarityFactory;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.SolrFieldCacheMBean;
@ -2720,13 +2719,6 @@ public final class SolrCore implements SolrInfoMBean, Closeable {
if (checkStale(zkClient, overlayPath, solrConfigversion) ||
checkStale(zkClient, solrConfigPath, overlayVersion) ||
checkStale(zkClient, managedSchmaResourcePath, managedSchemaVersion)) {
try (SolrCore solrCore = cc.solrCores.getCoreFromAnyList(coreName, true)) {
solrCore.setLatestSchema(SchemaManager.getFreshManagedSchema(solrCore));
} catch (Exception e) {
log.warn("", SolrZkClient.checkInterrupted(e));
}
log.info("core reload {}", coreName);
try {
cc.reload(coreName);

View File

@ -346,9 +346,11 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
try {
String path = ZkStateReader.CONFIGS_ZKNODE + "/" + SYSTEM_COLL + "/schema.xml";
byte[] data = IOUtils.toByteArray(Thread.currentThread().getContextClassLoader().getResourceAsStream("SystemCollectionSchema.xml"));
assert data != null && data.length > 0;
cmdExecutor.ensureExists(path, data, CreateMode.PERSISTENT, zk);
path = ZkStateReader.CONFIGS_ZKNODE + "/" + SYSTEM_COLL + "/solrconfig.xml";
data = IOUtils.toByteArray(Thread.currentThread().getContextClassLoader().getResourceAsStream("SystemCollectionSolrConfig.xml"));
assert data != null && data.length > 0;
cmdExecutor.ensureExists(path, data, CreateMode.PERSISTENT, zk);
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);

Some files were not shown because too many files have changed in this diff