diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f90bc126a0e..5b9fee7bba4 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -55,7 +55,30 @@ Other
 
 * LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
 
 ======================= Lucene 6.4.0 =======================
-(No Changes)
+
+New features
+
+* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)
+
+Improvements
+
+* LUCENE-6824: TermAutomatonQuery now rewrites to TermQuery,
+  PhraseQuery or MultiPhraseQuery when the word automaton is simple
+  (Mike McCandless)
+
+* LUCENE-7431: Allow a certain amount of overlap to be specified between the include
+  and exclude arguments of SpanNotQuery via negative pre and/or post arguments.
+  (Marc Morissette via David Smiley)
+
+* LUCENE-7544: UnifiedHighlighter: add extension points for handling custom queries.
+  (Michael Braun, David Smiley)
+
+* LUCENE-7538: Asking IndexWriter to store a too-massive text field
+  now throws IllegalArgumentException instead of a cryptic exception
+  that closes your IndexWriter (Steve Chen via Mike McCandless)
+
+* LUCENE-7524: Added more detailed explanation of how IDF is computed in
+  ClassicSimilarity and BM25Similarity. (Adrien Grand)
 
 ======================= Lucene 6.3.0 =======================
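Note on the LUCENE-6824 entry above — a minimal sketch of the new rewrite behavior, not part of the patch (TermAutomatonQuery lives in the sandbox module; the field name, terms, and the `reader` variable here are illustrative):

  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.TermAutomatonQuery;

  TermAutomatonQuery q = new TermAutomatonQuery("body");
  int s0 = q.createState();
  int s1 = q.createState();
  int s2 = q.createState();
  q.addTransition(s0, s1, "fast");
  q.addTransition(s1, s2, "wifi");
  q.setAccept(s2, true);
  q.finish();
  // With LUCENE-6824, rewriting this single-path automaton yields an
  // equivalent PhraseQuery("body", "fast", "wifi") instead of running
  // the slower automaton scorer:
  Query rewritten = q.rewrite(reader); // `reader` is an assumed open IndexReader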
diff --git a/lucene/analysis/kuromoji/ivy.xml b/lucene/analysis/kuromoji/ivy.xml
index 10eba4ee83d..eb085098be9 100644
--- a/lucene/analysis/kuromoji/ivy.xml
+++ b/lucene/analysis/kuromoji/ivy.xml
@@ -27,7 +27,7 @@
-
+
diff --git a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
index 8935809c88a..6cfad720383 100644
Binary files a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat and b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat differ
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
index 54382edce03..6ad8a689de9 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/ConnectionCostsWriter.java
@@ -28,7 +28,6 @@ import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.OutputStreamDataOutput;
-import org.apache.lucene.util.BitUtil;
 
 public final class ConnectionCostsWriter {
 
diff --git a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
index 61d6f270226..1b8abbba64e 100644
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@@ -33,12 +33,10 @@ import java.util.Comparator;
 import java.util.List;
 
 import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
-import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
-import org.apache.lucene.util.packed.PackedInts;
 
 import com.ibm.icu.text.Normalizer2;
 
@@ -133,7 +131,7 @@ public class TokenInfoDictionaryBuilder {
     System.out.println("  encode...");
 
     PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
-    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, PackedInts.DEFAULT, true, 15);
+    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, 0, 0, true, true, Integer.MAX_VALUE, fstOutput, true, 15);
     IntsRefBuilder scratch = new IntsRefBuilder();
     long ord = -1; // first ord will be 0
     String lastValue = null;
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
index 8bf71412345..d77f84bf24f 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@@ -231,7 +231,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     "6.2.0-cfs",
     "6.2.0-nocfs",
     "6.2.1-cfs",
-    "6.2.1-nocfs"
+    "6.2.1-nocfs",
+    "6.3.0-cfs",
+    "6.3.0-nocfs"
   };
 
   final String[] unsupportedNames = {
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.6.3.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.6.3.0-cfs.zip
new file mode 100644
index 00000000000..737054d23af
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.6.3.0-cfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.6.3.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.6.3.0-nocfs.zip
new file mode 100644
index 00000000000..14a82d7c647
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.6.3.0-nocfs.zip differ
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
index fb682fd3ed6..b16bb1566cb 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
@@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.packed.PackedInts;
 
 /*
   TODO:
@@ -363,8 +362,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
       final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
                                                          0, 0, true, false, Integer.MAX_VALUE,
-                                                         FST_OUTPUTS, false,
-                                                         PackedInts.COMPACT, true, 15);
+                                                         FST_OUTPUTS, true, 15);
       //if (DEBUG) {
       //  System.out.println("  compile index for prefix=" + prefix);
       //}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
index 1427decaed3..2f7176542cd 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java
@@ -81,9 +81,6 @@ import org.apache.lucene.util.packed.PackedInts;
 
 // loads itself in ram?
 public final class MemoryPostingsFormat extends PostingsFormat {
 
-  private final boolean doPackFST;
-  private final float acceptableOverheadRatio;
-
   public MemoryPostingsFormat() {
     this(false, PackedInts.DEFAULT);
   }
@@ -97,13 +94,11 @@ public final class MemoryPostingsFormat extends PostingsFormat {
    */
   public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) {
     super("Memory");
-    this.doPackFST = doPackFST;
-    this.acceptableOverheadRatio = acceptableOverheadRatio;
   }
 
   @Override
   public String toString() {
-    return "PostingsFormat(name=" + getName() + " doPackFST= " + doPackFST + ")";
+    return "PostingsFormat(name=" + getName() + ")";
   }
 
   private final static class TermsWriter {
@@ -111,16 +106,12 @@ public final class MemoryPostingsFormat extends PostingsFormat {
     private final FieldInfo field;
     private final Builder<BytesRef> builder;
     private final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
-    private final boolean doPackFST;
-    private final float acceptableOverheadRatio;
     private int termCount;
 
-    public TermsWriter(IndexOutput out, FieldInfo field, boolean doPackFST, float acceptableOverheadRatio) {
+    public TermsWriter(IndexOutput out, FieldInfo field) {
       this.out = out;
       this.field = field;
-      this.doPackFST = doPackFST;
-      this.acceptableOverheadRatio = acceptableOverheadRatio;
-      builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doPackFST, acceptableOverheadRatio, true, 15);
+      builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
     }
 
     private class PostingsWriter {
@@ -307,8 +298,7 @@ public final class MemoryPostingsFormat extends PostingsFormat {
 
     TermsEnum termsEnum = terms.iterator();
 
     FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
-    TermsWriter termsWriter = new TermsWriter(out, fieldInfo,
-                                              doPackFST, acceptableOverheadRatio);
+    TermsWriter termsWriter = new TermsWriter(out, fieldInfo);
 
     FixedBitSet docsSeen = new FixedBitSet(state.segmentInfo.maxDoc());
     long sumTotalTermFreq = 0;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
index a4a150ba14b..bdacc22325f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -48,7 +48,6 @@ import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
-import org.apache.lucene.util.packed.PackedInts;
 
 /*
   TODO:
@@ -456,8 +455,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer {
       final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
       final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
                                                            0, 0, true, false, Integer.MAX_VALUE,
-                                                           outputs, false,
-                                                           PackedInts.COMPACT, true, 15);
+                                                           outputs, true, 15);
       //if (DEBUG) {
       //  System.out.println("  compile index for prefix=" + prefix);
       //}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
index 67cfab6c7fb..ec551d14d1f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
@@ -64,7 +64,7 @@ public final class GrowableByteArrayDataOutput extends DataOutput {
 
   @Override
   public void writeString(String string) throws IOException {
-    int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+    int maxLen = UnicodeUtil.maxUTF8Length(string.length());
     if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
       // string is small enough that we don't need to save memory by falling back to double-pass approach
       // this is just an optimized writeString() that re-uses scratchBytes.
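For context (not from the patch): the writeString() change above guards against int overflow when sizing the scratch buffer. A small sketch of the failure mode, using plain Java and illustrative values:

  // UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR is 3, so for ~1G chars the old
  // multiplication silently wraps around to a negative length:
  int utf16Length = Integer.MAX_VALUE / 2;
  int oldMaxLen = utf16Length * 3;                    // overflows to a negative int
  int newMaxLen = Math.multiplyExact(utf16Length, 3); // throws ArithmeticException instead

The new UnicodeUtil.maxUTF8Length() (added later in this patch) wraps the Math.multiplyExact() form so callers fail fast rather than allocating a mis-sized array.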
diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
index 9611a03c265..0344c58b35c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
@@ -24,11 +24,9 @@ import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
@@ -48,7 +46,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
   private final Counter iwBytesUsed;
   private final PackedLongValues.Builder lengths;
-  private FixedBitSet docsWithField;
+  private DocsWithFieldSet docsWithField;
   private final FieldInfo fieldInfo;
   private long bytesUsed;
   private int lastDocID = -1;
@@ -60,7 +58,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
     this.bytesOut = bytes.getDataOutput();
     this.lengths = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
     this.iwBytesUsed = iwBytesUsed;
-    this.docsWithField = new FixedBitSet(64);
+    this.docsWithField = new DocsWithFieldSet();
     this.bytesUsed = lengths.ramBytesUsed() + docsWithField.ramBytesUsed();
     iwBytesUsed.addAndGet(bytesUsed);
   }
@@ -84,8 +82,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
       // Should never happen!
       throw new RuntimeException(ioe);
     }
-    docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
-    docsWithField.set(docID);
+    docsWithField.add(docID);
     updateBytesUsed();
 
     lastDocID = docID;
@@ -112,7 +109,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
           if (fieldInfoIn != fieldInfo) {
             throw new IllegalArgumentException("wrong fieldInfo");
           }
-          return new BufferedBinaryDocValues(lengths, maxLength, bytes.getDataInput(), docsWithField);
+          return new BufferedBinaryDocValues(lengths, maxLength, bytes.getDataInput(), docsWithField.iterator());
         }
       });
   }
@@ -124,12 +121,12 @@ class BinaryDocValuesWriter extends DocValuesWriter {
     final DocIdSetIterator docsWithField;
     final DataInput bytesIterator;
 
-    BufferedBinaryDocValues(PackedLongValues lengths, int maxLength, DataInput bytesIterator, FixedBitSet docsWithFields) {
+    BufferedBinaryDocValues(PackedLongValues lengths, int maxLength, DataInput bytesIterator, DocIdSetIterator docsWithFields) {
      this.value = new BytesRefBuilder();
      this.value.grow(maxLength);
      this.lengthsIterator = lengths.iterator();
      this.bytesIterator = bytesIterator;
-      this.docsWithField = new BitSetIterator(docsWithFields, lengths.size());
+      this.docsWithField = docsWithFields;
    }
 
    @Override
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index e2ece543506..15de682dec4 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -430,6 +430,10 @@ final class DefaultIndexingChain extends DocConsumer {
         fp = getOrAddField(fieldName, fieldType, false);
       }
       if (fieldType.stored()) {
+        String value = field.stringValue();
+        if (value != null && value.length() > IndexWriter.MAX_STORED_STRING_LENGTH) {
+          throw new IllegalArgumentException("stored field \"" + field.name() + "\" is too large (" + value.length() + " characters) to store");
+        }
         try {
           storedFieldsWriter.writeField(fp.fieldInfo, field);
         } catch (Throwable th) {
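With this check (LUCENE-7538), an oversized stored string fails fast with a catchable exception instead of hitting the tragic-event path that closes the writer. A hedged usage sketch — `writer` is an assumed open IndexWriter, and note the document needs on the order of 1.4 GB of heap just for the string:

  Document doc = new Document();
  char[] chars = new char[IndexWriter.MAX_STORED_STRING_LENGTH + 1];
  Arrays.fill(chars, 'x');
  doc.add(new StoredField("body", new String(chars)));
  try {
    writer.addDocument(doc);
  } catch (IllegalArgumentException e) {
    // expected: stored field "body" is too large ... to store;
    // the IndexWriter remains open and usable
  }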
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
new file mode 100644
index 00000000000..6c0d6ddefd6
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/DocsWithFieldSet.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/** Accumulator for documents that have a value for a field. This is optimized
+ *  for the case that all documents have a value. */
+final class DocsWithFieldSet extends DocIdSet {
+
+  private static long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(DocsWithFieldSet.class);
+
+  private FixedBitSet set;
+  private int cost = 0;
+  private int lastDocId = -1;
+
+  void add(int docID) {
+    if (docID <= lastDocId) {
+      throw new IllegalArgumentException("Out of order doc ids: last=" + lastDocId + ", next=" + docID);
+    }
+    if (set != null) {
+      set = FixedBitSet.ensureCapacity(set, docID);
+      set.set(docID);
+    } else if (docID != cost) {
+      // migrate to a sparse encoding using a bit set
+      set = new FixedBitSet(docID + 1);
+      set.set(0, cost);
+      set.set(docID);
+    }
+    lastDocId = docID;
+    cost++;
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    return BASE_RAM_BYTES_USED + (set == null ? 0 : set.ramBytesUsed());
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    return set != null ? new BitSetIterator(set, cost) : DocIdSetIterator.all(cost);
+  }
+
+}
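DocsWithFieldSet stays allocation-free in the common dense case: as long as each added docID equals the running count, only `cost` is incremented and iterator() returns DocIdSetIterator.all(cost). The first gap triggers migration to a FixedBitSet. An illustrative sketch of the two encodings (not part of the patch):

  DocsWithFieldSet dense = new DocsWithFieldSet();
  dense.add(0); dense.add(1); dense.add(2); // docID == cost each time; no bit set allocated
  // dense.iterator() is DocIdSetIterator.all(3)

  DocsWithFieldSet sparse = new DocsWithFieldSet();
  sparse.add(0);
  sparse.add(5); // gap: migrates to a FixedBitSet holding {0, 5}
  // sparse.iterator() is a BitSetIterator with cost 2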
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index 4517294f4c3..68f3b3b6b2d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -62,6 +62,7 @@ import org.apache.lucene.store.MergeInfo;
 import org.apache.lucene.store.RateLimitedIndexOutput;
 import org.apache.lucene.store.TrackingDirectoryWrapper;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CloseableThreadLocal;
@@ -70,6 +71,7 @@ import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.InfoStream;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.Version;
 
 /**
@@ -258,6 +260,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
    *  IndexWriterConfig#setInfoStream(InfoStream)}). */
   public final static int MAX_TERM_LENGTH = DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8;
 
+  /**
+   * Maximum length string for a stored field.
+   */
+  public final static int MAX_STORED_STRING_LENGTH = ArrayUtil.MAX_ARRAY_LENGTH / UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+
   // when unrecoverable disaster strikes, we populate this with the reason that we had to close IndexWriter
   volatile Throwable tragedy;
diff --git a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
index b0d05e4859f..4923f3ba1f3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NormValuesWriter.java
@@ -22,9 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.codecs.NormsConsumer;
 import org.apache.lucene.codecs.NormsProducer;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 
@@ -32,7 +30,7 @@ import org.apache.lucene.util.packed.PackedLongValues;
  *  segment flushes. */
 class NormValuesWriter {
 
-  private FixedBitSet docsWithField;
+  private DocsWithFieldSet docsWithField;
   private PackedLongValues.Builder pending;
   private final Counter iwBytesUsed;
   private long bytesUsed;
@@ -40,7 +38,7 @@ class NormValuesWriter {
   private int lastDocID = -1;
 
   public NormValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
-    docsWithField = new FixedBitSet(64);
+    docsWithField = new DocsWithFieldSet();
     pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
     bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
     this.fieldInfo = fieldInfo;
@@ -54,8 +52,7 @@ class NormValuesWriter {
     }
 
     pending.add(value);
-    docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
-    docsWithField.set(docID);
+    docsWithField.add(docID);
 
     updateBytesUsed();
 
@@ -82,7 +79,7 @@ class NormValuesWriter {
           if (fieldInfo != NormValuesWriter.this.fieldInfo) {
             throw new IllegalArgumentException("wrong fieldInfo");
           }
-          return new BufferedNorms(values, docsWithField);
+          return new BufferedNorms(values, docsWithField.iterator());
         }
 
         @Override
@@ -108,9 +105,9 @@ class NormValuesWriter {
     final DocIdSetIterator docsWithField;
     private long value;
 
-    BufferedNorms(PackedLongValues values, FixedBitSet docsWithFields) {
+    BufferedNorms(PackedLongValues values, DocIdSetIterator docsWithFields) {
       this.iter = values.iterator();
-      this.docsWithField = new BitSetIterator(docsWithFields, values.size());
+      this.docsWithField = docsWithFields;
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
index 24a701064cb..d4dd66ae73e 100644
--- a/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/NumericDocValuesWriter.java
@@ -21,9 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 
@@ -34,13 +32,13 @@ class NumericDocValuesWriter extends DocValuesWriter {
   private PackedLongValues.Builder pending;
   private final Counter iwBytesUsed;
   private long bytesUsed;
-  private FixedBitSet docsWithField;
+  private DocsWithFieldSet docsWithField;
   private final FieldInfo fieldInfo;
   private int lastDocID = -1;
 
   public NumericDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
     pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
-    docsWithField = new FixedBitSet(64);
+    docsWithField = new DocsWithFieldSet();
     bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = iwBytesUsed;
@@ -53,8 +51,7 @@ class NumericDocValuesWriter extends DocValuesWriter {
     }
 
     pending.add(value);
-    docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
-    docsWithField.set(docID);
+    docsWithField.add(docID);
 
     updateBytesUsed();
 
@@ -83,7 +80,7 @@ class NumericDocValuesWriter extends DocValuesWriter {
           if (fieldInfo != NumericDocValuesWriter.this.fieldInfo) {
             throw new IllegalArgumentException("wrong fieldInfo");
           }
-          return new BufferedNumericDocValues(values, docsWithField);
+          return new BufferedNumericDocValues(values, docsWithField.iterator());
         }
       });
   }
@@ -94,9 +91,9 @@ class NumericDocValuesWriter extends DocValuesWriter {
     final DocIdSetIterator docsWithField;
     private long value;
 
-    BufferedNumericDocValues(PackedLongValues values, FixedBitSet docsWithFields) {
+    BufferedNumericDocValues(PackedLongValues values, DocIdSetIterator docsWithFields) {
       this.iter = values.iterator();
-      this.docsWithField = new BitSetIterator(docsWithFields, values.size());
+      this.docsWithField = docsWithFields;
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 7e43e497f7b..e439caf6bda 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -22,13 +22,11 @@ import java.io.IOException;
 
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 
@@ -37,7 +35,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
   final BytesRefHash hash;
   private PackedLongValues.Builder pending;
-  private FixedBitSet docsWithField;
+  private DocsWithFieldSet docsWithField;
   private final Counter iwBytesUsed;
   private long bytesUsed; // this currently only tracks differences in 'pending'
   private final FieldInfo fieldInfo;
@@ -52,7 +50,7 @@
         BytesRefHash.DEFAULT_CAPACITY, new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
     pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
-    docsWithField = new FixedBitSet(64);
+    docsWithField = new DocsWithFieldSet();
     bytesUsed = pending.ramBytesUsed() + docsWithField.ramBytesUsed();
     iwBytesUsed.addAndGet(bytesUsed);
   }
@@ -69,8 +67,7 @@
     }
 
     addOneValue(value);
-    docsWithField = FixedBitSet.ensureCapacity(docsWithField, docID);
-    docsWithField.set(docID);
+    docsWithField.add(docID);
 
     lastDocID = docID;
   }
@@ -121,7 +118,7 @@
           if (fieldInfoIn != fieldInfo) {
             throw new IllegalArgumentException("wrong fieldInfo");
           }
-          return new BufferedSortedDocValues(hash, valueCount, ords, sortedValues, ordMap, docsWithField);
+          return new BufferedSortedDocValues(hash, valueCount, ords, sortedValues, ordMap, docsWithField.iterator());
         }
       });
   }
@@ -136,13 +133,13 @@
     final PackedLongValues.Iterator iter;
     final DocIdSetIterator docsWithField;
 
-    public BufferedSortedDocValues(BytesRefHash hash, int valueCount, PackedLongValues docToOrd, int[] sortedValues, int[] ordMap, FixedBitSet docsWithField) {
+    public BufferedSortedDocValues(BytesRefHash hash, int valueCount, PackedLongValues docToOrd, int[] sortedValues, int[] ordMap, DocIdSetIterator docsWithField) {
       this.hash = hash;
       this.valueCount = valueCount;
       this.sortedValues = sortedValues;
       this.iter = docToOrd.iterator();
       this.ordMap = ordMap;
-      this.docsWithField = new BitSetIterator(docsWithField, docToOrd.size());
+      this.docsWithField = docsWithField;
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
index 3f50623d4ea..75236cc335a 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedNumericDocValuesWriter.java
@@ -23,9 +23,7 @@ import java.util.Arrays;
 
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.RamUsageEstimator;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
@@ -34,7 +32,7 @@ class SortedNumericDocValuesWriter extends DocValuesWriter {
   private PackedLongValues.Builder pending; // stream of all values
   private PackedLongValues.Builder pendingCounts; // count of values per doc
-  private FixedBitSet docsWithField;
+  private DocsWithFieldSet docsWithField;
   private final Counter iwBytesUsed;
   private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
   private final FieldInfo fieldInfo;
@@ -47,7 +45,7 @@
     this.iwBytesUsed = iwBytesUsed;
     pending = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
     pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
-    docsWithField = new FixedBitSet(64);
+    docsWithField = new DocsWithFieldSet();
     bytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed() + docsWithField.ramBytesUsed() + RamUsageEstimator.sizeOf(currentValues);
     iwBytesUsed.addAndGet(bytesUsed);
   }
@@ -76,8 +74,7 @@
     pendingCounts.add(currentUpto);
     currentUpto = 0;
-    docsWithField = FixedBitSet.ensureCapacity(docsWithField, currentDoc);
-    docsWithField.set(currentDoc);
+    docsWithField.add(currentDoc);
   }
 
   @Override
@@ -112,7 +109,7 @@
           if (fieldInfoIn != fieldInfo) {
             throw new IllegalArgumentException("wrong fieldInfo");
           }
-          return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField);
+          return new BufferedSortedNumericDocValues(values, valueCounts, docsWithField.iterator());
         }
       });
   }
@@ -124,10 +121,10 @@
     private int valueCount;
     private int valueUpto;
 
-    public BufferedSortedNumericDocValues(PackedLongValues values, PackedLongValues valueCounts, FixedBitSet docsWithField) {
+    public BufferedSortedNumericDocValues(PackedLongValues values, PackedLongValues valueCounts, DocIdSetIterator docsWithField) {
       valuesIter = values.iterator();
       valueCountsIter = valueCounts.iterator();
-      this.docsWithField = new BitSetIterator(docsWithField, values.size());
+      this.docsWithField = docsWithField;
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index 35157d49c16..0f4fb5e5c71 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -24,13 +24,11 @@ import java.util.Arrays;
 
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.packed.PackedInts;
 import org.apache.lucene.util.packed.PackedLongValues;
 
@@ -40,7 +38,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
   final BytesRefHash hash;
   private PackedLongValues.Builder pending; // stream of all termIDs
   private PackedLongValues.Builder pendingCounts; // termIDs per doc
-  private FixedBitSet docsWithField;
+  private DocsWithFieldSet docsWithField;
   private final Counter iwBytesUsed;
   private long bytesUsed; // this only tracks differences in 'pending' and 'pendingCounts'
   private final FieldInfo fieldInfo;
@@ -59,7 +57,7 @@
         new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
     pending = PackedLongValues.packedBuilder(PackedInts.COMPACT);
     pendingCounts = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
-    docsWithField = new FixedBitSet(64);
+    docsWithField = new DocsWithFieldSet();
     bytesUsed = pending.ramBytesUsed() + pendingCounts.ramBytesUsed();
     iwBytesUsed.addAndGet(bytesUsed);
   }
@@ -103,8 +101,7 @@
     pendingCounts.add(count);
     maxCount = Math.max(maxCount, count);
     currentUpto = 0;
-    docsWithField = FixedBitSet.ensureCapacity(docsWithField, currentDoc);
-    docsWithField.set(currentDoc);
+    docsWithField.add(currentDoc);
   }
 
   @Override
@@ -158,7 +155,7 @@
           if (fieldInfoIn != fieldInfo) {
             throw new IllegalArgumentException("wrong fieldInfo");
          }
-          return new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField);
+          return new BufferedSortedSetDocValues(sortedValues, ordMap, hash, ords, ordCounts, maxCount, docsWithField.iterator());
         }
       });
   }
@@ -176,14 +173,14 @@
     private int ordCount;
     private int ordUpto;
 
-    public BufferedSortedSetDocValues(int[] sortedValues, int[] ordMap, BytesRefHash hash, PackedLongValues ords, PackedLongValues ordCounts, int maxCount, FixedBitSet docsWithField) {
+    public BufferedSortedSetDocValues(int[] sortedValues, int[] ordMap, BytesRefHash hash, PackedLongValues ords, PackedLongValues ordCounts, int maxCount, DocIdSetIterator docsWithField) {
       this.currentDoc = new int[maxCount];
       this.sortedValues = sortedValues;
       this.ordMap = ordMap;
       this.hash = hash;
       this.ordsIter = ords.iterator();
       this.ordCountsIter = ordCounts.iterator();
-      this.docsWithField = new BitSetIterator(docsWithField, ordCounts.size());
+      this.docsWithField = docsWithField;
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index 70dbed8d800..d0bf8285cc9 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -265,7 +265,7 @@ public class PhraseQuery extends Query {
 
   /**
    * Returns the relative positions of terms in this phrase.
    */
   public int[] getPositions() {
-      return positions;
+    return positions;
   }
 
   @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index ff390b32f4d..676311806d7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -175,7 +175,9 @@ public class BM25Similarity extends Similarity {
     final long df = termStats.docFreq();
     final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
     final float idf = idf(df, docCount);
-    return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
+    return Explanation.match(idf, "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
+        Explanation.match(df, "docFreq"),
+        Explanation.match(docCount, "docCount"));
   }
 
   /**
@@ -192,16 +194,14 @@ public class BM25Similarity extends Similarity {
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
-    float idf = 0.0f;
+    double idf = 0d; // sum into a double before casting into a float
     List<Explanation> details = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
-      final long df = stat.docFreq();
-      final float termIdf = idf(df, docCount);
-      details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
-      idf += termIdf;
+      Explanation idfExplain = idfExplain(collectionStats, stat);
+      details.add(idfExplain);
+      idf += idfExplain.getValue();
     }
-    return Explanation.match(idf, "idf(), sum of:", details);
+    return Explanation.match((float) idf, "idf(), sum of:", details);
   }
 
   @Override
@@ -303,7 +303,7 @@ public class BM25Similarity extends Similarity {
       subs.add(Explanation.match(0, "parameter b (norms omitted for field)"));
       return Explanation.match(
           (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1),
-          "tfNorm, computed from:", subs);
+          "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1) from:", subs);
     } else {
       byte norm;
       if (norms.advanceExact(doc)) {
@@ -317,7 +317,7 @@ public class BM25Similarity extends Similarity {
       subs.add(Explanation.match(doclen, "fieldLength"));
       return Explanation.match(
           (freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)),
-          "tfNorm, computed from:", subs);
+          "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", subs);
     }
   }
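For reference, the formula the new BM25 explanation spells out, worked with illustrative numbers (a term in 7 of 100 documents):

  long docFreq = 7, docCount = 100;
  double idf = Math.log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5));
  // = log(1 + 93.5 / 7.5) ≈ 2.60; the sub-explanations now expose the
  // docFreq and docCount inputs instead of burying them in the description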
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
new file mode 100644
index 00000000000..a7b7614cf30
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Simple similarity that gives terms a score that is equal to their query
+ * boost. This similarity is typically used with disabled norms since neither
+ * document statistics nor index statistics are used for scoring. That said,
+ * if norms are enabled, they will be computed the same way as
+ * {@link SimilarityBase} and {@link BM25Similarity} with
+ * {@link SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps}
+ * so that the {@link Similarity} can be changed after the index has been
+ * created.
+ */
+public class BooleanSimilarity extends Similarity {
+
+  private static final Similarity BM25_SIM = new BM25Similarity();
+
+  /** Sole constructor */
+  public BooleanSimilarity() {}
+
+  @Override
+  public long computeNorm(FieldInvertState state) {
+    return BM25_SIM.computeNorm(state);
+  }
+
+  @Override
+  public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+    return new BooleanWeight(boost);
+  }
+
+  private static class BooleanWeight extends SimWeight {
+    final float boost;
+
+    BooleanWeight(float boost) {
+      this.boost = boost;
+    }
+  }
+
+  @Override
+  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    final float boost = ((BooleanWeight) weight).boost;
+
+    return new SimScorer() {
+
+      @Override
+      public float score(int doc, float freq) throws IOException {
+        return boost;
+      }
+
+      @Override
+      public Explanation explain(int doc, Explanation freq) throws IOException {
+        Explanation queryBoostExpl = Explanation.match(boost, "query boost");
+        return Explanation.match(
+            queryBoostExpl.getValue(),
+            "score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
+            queryBoostExpl);
+      }
+
+      @Override
+      public float computeSlopFactor(int distance) {
+        return 1f;
+      }
+
+      @Override
+      public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+        return 1f;
+      }
+    };
+  }
+
+}
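A minimal usage sketch for the new similarity (assumes an existing IndexSearcher `searcher`; the field and boost are illustrative). Every matching document gets exactly the query boost as its score; computeNorm() delegates to BM25 only so the similarity can be swapped after indexing without rewriting norms:

  searcher.setSimilarity(new BooleanSimilarity());
  Query q = new BoostQuery(new TermQuery(new Term("tags", "lucene")), 3f);
  TopDocs hits = searcher.search(q, 10); // each hit scores exactly 3.0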
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
index fae85a0c9df..5a1e237ed83 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -18,6 +18,9 @@ package org.apache.lucene.search.similarities;
 
 import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SmallFloat;
 
@@ -121,6 +124,16 @@ public class ClassicSimilarity extends TFIDFSimilarity {
     return 1;
   }
 
+  @Override
+  public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
+    final long df = termStats.docFreq();
+    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
+    final float idf = idf(df, docCount);
+    return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
+        Explanation.match(df, "docFreq"),
+        Explanation.match(docCount, "docCount"));
+  }
+
   /** Implemented as <code>log((docCount+1)/(docFreq+1)) + 1</code>. */
   @Override
   public float idf(long docFreq, long docCount) {
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
index cd8acd693a8..12ab1a2d028 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@@ -484,16 +484,14 @@ public abstract class TFIDFSimilarity extends Similarity {
    *         for each term.
    */
   public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
-    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
-    float idf = 0.0f;
+    double idf = 0d; // sum into a double before casting into a float
     List<Explanation> subs = new ArrayList<>();
     for (final TermStatistics stat : termStats ) {
-      final long df = stat.docFreq();
-      final float termIdf = idf(df, docCount);
-      subs.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
-      idf += termIdf;
+      Explanation idfExplain = idfExplain(collectionStats, stat);
+      subs.add(idfExplain);
+      idf += idfExplain.getValue();
     }
-    return Explanation.match(idf, "idf(), sum of:", subs);
+    return Explanation.match((float) idf, "idf(), sum of:", subs);
   }
 
   /** Computes a score factor based on a term's document frequency (the number
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
index 05d3f8ef481..00bcc4c1ac7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java
@@ -49,19 +49,23 @@ public final class SpanNotQuery extends SpanQuery {
 
   /** Construct a SpanNotQuery matching spans from <code>include</code> which
    * have no overlap with spans from <code>exclude</code> within
-   * <code>dist</code> tokens of <code>include</code>. */
+   * <code>dist</code> tokens of <code>include</code>. Inversely, a negative
+   * <code>dist</code> value may be used to specify a certain amount of allowable
+   * overlap. */
   public SpanNotQuery(SpanQuery include, SpanQuery exclude, int dist) {
     this(include, exclude, dist, dist);
   }
 
   /** Construct a SpanNotQuery matching spans from <code>include</code> which
    * have no overlap with spans from <code>exclude</code> within
-   * <code>pre</code> tokens before or <code>post</code> tokens of <code>include</code>. */
+   * <code>pre</code> tokens before or <code>post</code> tokens of
+   * <code>include</code>. Inversely, negative values for <code>pre</code> and/or
+   * <code>post</code> allow a certain amount of overlap to occur. */
   public SpanNotQuery(SpanQuery include, SpanQuery exclude, int pre, int post) {
     this.include = Objects.requireNonNull(include);
     this.exclude = Objects.requireNonNull(exclude);
-    this.pre = (pre >=0) ? pre : 0;
-    this.post = (post >= 0) ? post : 0;
+    this.pre = pre;
+    this.post = post;
 
     if (include.getField() != null && exclude.getField() != null && !include.getField().equals(exclude.getField()))
       throw new IllegalArgumentException("Clauses must have same field.");
@@ -226,4 +230,4 @@ public final class SpanNotQuery extends SpanQuery {
     return h;
   }
 
-}
\ No newline at end of file
+}
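A sketch of the relaxed SpanNotQuery (hypothetical field and terms; the negative `post` argument is the LUCENE-7431 addition):

  SpanQuery include = new SpanNearQuery(new SpanQuery[] {
      new SpanTermQuery(new Term("body", "world")),
      new SpanTermQuery(new Term("body", "series"))}, 0, true);
  SpanQuery exclude = new SpanTermQuery(new Term("body", "1919"));
  // pre = 0, post = -1: an exclude match may overlap the include span
  // by up to one token from the end; previously negative values were
  // silently clamped to 0
  SpanNotQuery q = new SpanNotQuery(include, exclude, 0, -1);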
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
index c62d6391aea..2fcf28ad99f 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRef.java
@@ -84,7 +84,7 @@ public final class BytesRef implements Comparable<BytesRef>,Cloneable {
    *  unicode text, with no unpaired surrogates. */
   public BytesRef(CharSequence text) {
-    this(new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * text.length()]);
+    this(new byte[UnicodeUtil.maxUTF8Length(text.length())]);
     length = UnicodeUtil.UTF16toUTF8(text, 0, text.length(), bytes);
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
index 2bfa2f2dc70..08fda910a55 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefBuilder.java
@@ -143,7 +143,7 @@ public class BytesRefBuilder {
    * represent the provided text. */
   public void copyChars(CharSequence text, int off, int len) {
-    grow(len * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+    grow(UnicodeUtil.maxUTF8Length(len));
     ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
   }
 
@@ -152,7 +152,7 @@ public class BytesRefBuilder {
    * represent the provided text. */
   public void copyChars(char[] text, int off, int len) {
-    grow(len * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
+    grow(UnicodeUtil.maxUTF8Length(len));
     ref.length = UnicodeUtil.UTF16toUTF8(text, off, len, ref.bytes);
   }
 
diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index a21281f8a0a..20e6249f2a3 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -612,6 +612,11 @@ public final class UnicodeUtil {
     }
     return out_offset;
   }
+
+  /** Returns the maximum number of utf8 bytes required to encode a utf16 (e.g., java char[], String) */
+  public static int maxUTF8Length(int utf16Length) {
+    return Math.multiplyExact(utf16Length, MAX_UTF8_BYTES_PER_CHAR);
+  }
 
   /**
    * Utility method for {@link #UTF8toUTF16(byte[], int, int, char[])}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
index c5ab849f3f8..428edd360de 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/Builder.java
@@ -23,7 +23,6 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
-import org.apache.lucene.util.packed.PackedInts;
 
 // TODO: could we somehow stream an FST to disk while we
 // build it?
@@ -69,10 +68,6 @@ public class Builder<T> {
   private final int shareMaxTailLength;
 
   private final IntsRefBuilder lastInput = new IntsRefBuilder();
-
-  // for packing
-  private final boolean doPackFST;
-  private final float acceptableOverheadRatio;
 
   // NOTE: cutting this over to ArrayList instead loses ~6%
   // in build performance on 9.8M Wikipedia terms; so we
@@ -99,11 +94,10 @@ public class Builder<T> {
   /**
    * Instantiates an FST/FSA builder without any pruning. A shortcut
    * to {@link #Builder(FST.INPUT_TYPE, int, int, boolean,
-   * boolean, int, Outputs, boolean, float,
-   * boolean, int)} with pruning options turned off.
+   * boolean, int, Outputs, boolean, int)} with pruning options turned off.
    */
   public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) {
-    this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, false, PackedInts.COMPACT, true, 15);
+    this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
   }
 
   /**
@@ -143,11 +137,6 @@ public class Builder<T> {
    *    FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the
    *    singleton output object.
    *
-   * @param doPackFST Pass true to create a packed FST.
-   *
-   * @param acceptableOverheadRatio How to trade speed for space when building the FST. This option
-   *    is only relevant when doPackFST is true. @see PackedInts#getMutable(int, int, float)
-   *
    * @param allowArrayArcs Pass false to disable the array arc optimization
    *    while building the FST; this will make the resulting
    *    FST smaller but slower to traverse.
@@ -159,16 +148,13 @@ public class Builder<T> {
    */
   public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2,
                  boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength,
                  Outputs<T> outputs,
-                 boolean doPackFST, float acceptableOverheadRatio, boolean allowArrayArcs,
-                 int bytesPageBits) {
+                 boolean allowArrayArcs, int bytesPageBits) {
     this.minSuffixCount1 = minSuffixCount1;
     this.minSuffixCount2 = minSuffixCount2;
     this.doShareNonSingletonNodes = doShareNonSingletonNodes;
     this.shareMaxTailLength = shareMaxTailLength;
-    this.doPackFST = doPackFST;
-    this.acceptableOverheadRatio = acceptableOverheadRatio;
     this.allowArrayArcs = allowArrayArcs;
-    fst = new FST<>(inputType, outputs, doPackFST, acceptableOverheadRatio, bytesPageBits);
+    fst = new FST<>(inputType, outputs, bytesPageBits);
     bytes = fst.bytes;
     assert bytes != null;
     if (doShareSuffix) {
@@ -496,11 +482,7 @@ public class Builder<T> {
     //if (DEBUG) System.out.println("  builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output);
     fst.finish(compileNode(root, lastInput.length()).node);
 
-    if (doPackFST) {
-      return fst.pack(this, 3, Math.max(10, (int) (getNodeCount()/4)), acceptableOverheadRatio);
-    } else {
-      return fst;
-    }
+    return fst;
   }
 
   private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException {
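With packing gone, Builder loses one axis of configuration and the two-argument shortcut above is typically all that is needed. A minimal end-to-end sketch (illustrative keys/values; inputs must be added in sorted order):

  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
  builder.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
  FST<Long> fst = builder.finish();            // no fst.pack(...) step anymore
  Long v = Util.get(fst, new BytesRef("dog")); // 7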
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index 4a0a3a92cf6..5ea6dabbb0e 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -24,13 +24,9 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 
 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
@@ -38,13 +34,9 @@ import org.apache.lucene.store.InputStreamDataInput;
 import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Constants;
-import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.packed.GrowableWriter;
-import org.apache.lucene.util.packed.PackedInts;
 
 // TODO: break this into WritableFST and ReadOnlyFST.. then
 // we can have subclasses of ReadOnlyFST to handle the
@@ -90,14 +82,6 @@ public final class FST<T> implements Accountable {
 
   static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
 
-  // Arcs are stored as fixed-size (per entry) array, so
-  // that we can find an arc using binary search.  We do
-  // this when number of arcs is > NUM_ARCS_ARRAY:
-
-  // If set, the target node is delta coded vs current
-  // position:
-  private static final int BIT_TARGET_DELTA = 1 << 6;
-
   // We use this as a marker (because this one flag is
   // illegal by itself ...):
   private static final byte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
@@ -137,7 +121,9 @@ public final class FST<T> implements Accountable {
   /** Don't store arcWithOutputCount anymore */
   private static final int VERSION_NO_NODE_ARC_COUNTS = 5;
 
-  private static final int VERSION_CURRENT = VERSION_NO_NODE_ARC_COUNTS;
+  private static final int VERSION_PACKED_REMOVED = 6;
+
+  private static final int VERSION_CURRENT = VERSION_PACKED_REMOVED;
 
   // Never serialized; just used to represent the virtual
   // final node w/ no arcs:
@@ -168,9 +154,6 @@ public final class FST<T> implements Accountable {
 
   public final Outputs<T> outputs;
 
-  private final boolean packed;
-  private PackedInts.Reader nodeRefToAddress;
-
   private Arc<T> cachedRootArcs[];
 
   /** Represents a single arc. */
@@ -273,18 +256,11 @@ public final class FST<T> implements Accountable {
     return (flags & bit) != 0;
   }
 
-  private GrowableWriter nodeAddress;
-
-  // TODO: we could be smarter here, and prune periodically
-  // as we go; high in-count nodes will "usually" become
-  // clear early on:
-  private GrowableWriter inCounts;
-
   private final int version;
 
   // make a new empty FST, for building; Builder invokes
   // this ctor
-  FST(INPUT_TYPE inputType, Outputs<T> outputs, boolean willPackFST, float acceptableOverheadRatio, int bytesPageBits) {
+  FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) {
     this.inputType = inputType;
     this.outputs = outputs;
     version = VERSION_CURRENT;
@@ -293,17 +269,8 @@ public final class FST<T> implements Accountable {
     // pad: ensure no node gets address 0 which is reserved to mean
     // the stop state w/ no arcs
     bytes.writeByte((byte) 0);
-    if (willPackFST) {
-      nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio);
-      inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio);
-    } else {
-      nodeAddress = null;
-      inCounts = null;
-    }
 
     emptyOutput = null;
-    packed = false;
-    nodeRefToAddress = null;
   }
 
   public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
@@ -324,8 +291,12 @@ public final class FST<T> implements Accountable {
 
     // NOTE: only reads most recent format; we don't have
     // back-compat promise for FSTs (they are experimental):
-    version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_NO_NODE_ARC_COUNTS);
-    packed = in.readByte() == 1;
+    version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_CURRENT);
+    if (version < VERSION_PACKED_REMOVED) {
+      if (in.readByte() == 1) {
+        throw new CorruptIndexException("Cannot read packed FSTs anymore", in);
+      }
+    }
     if (in.readByte() == 1) {
       // accepts empty string
       // 1 KB blocks:
@@ -334,17 +305,12 @@ public final class FST<T> implements Accountable {
       emptyBytes.copyBytes(in, numBytes);
 
       // De-serialize empty-string output:
-      BytesReader reader;
-      if (packed) {
-        reader = emptyBytes.getForwardReader();
-      } else {
-        reader = emptyBytes.getReverseReader();
-        // NoOutputs uses 0 bytes when writing its output,
-        // so we have to check here else BytesStore gets
-        // angry:
-        if (numBytes > 0) {
-          reader.setPosition(numBytes-1);
-        }
+      BytesReader reader = emptyBytes.getReverseReader();
+      // NoOutputs uses 0 bytes when writing its output,
+      // so we have to check here else BytesStore gets
+      // angry:
+      if (numBytes > 0) {
+        reader.setPosition(numBytes-1);
       }
       emptyOutput = outputs.readFinalOutput(reader);
     } else {
@@ -364,11 +330,6 @@ public final class FST<T> implements Accountable {
       default:
         throw new IllegalStateException("invalid input type " + t);
     }
-    if (packed) {
-      nodeRefToAddress = PackedInts.getReader(in);
-    } else {
-      nodeRefToAddress = null;
-    }
     startNode = in.readVLong();
     if (version < VERSION_NO_NODE_ARC_COUNTS) {
       in.readVLong();
@@ -424,31 +385,13 @@ public final class FST<T> implements Accountable {
     } else {
       size += bytes.ramBytesUsed();
     }
-    if (packed) {
-      size += nodeRefToAddress.ramBytesUsed();
-    } else if (nodeAddress != null) {
-      size += nodeAddress.ramBytesUsed();
-      size += inCounts.ramBytesUsed();
-    }
     size += cachedArcsBytesUsed;
     return size;
   }
 
-  @Override
-  public Collection<Accountable> getChildResources() {
-    List<Accountable> resources = new ArrayList<>();
-    if (packed) {
-      resources.add(Accountables.namedAccountable("node ref to address", nodeRefToAddress));
-    } else if (nodeAddress != null) {
-      resources.add(Accountables.namedAccountable("node addresses", nodeAddress));
-      resources.add(Accountables.namedAccountable("in counts", inCounts));
-    }
-    return resources;
-  }
-
   @Override
   public String toString() {
-    return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs + ",packed=" + packed;
+    return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs;
   }
 
   void finish(long newStartNode) throws IOException {
@@ -463,16 +406,6 @@ public final class FST<T> implements Accountable {
     bytes.finish();
     cacheRootArcs();
   }
-
-  private long getNodeAddress(long node) {
-    if (nodeAddress != null) {
-      // Deref
-      return nodeAddress.get((int) node);
-    } else {
-      // Straight
-      return node;
-    }
-  }
 
   // Optionally caches first 128 labels
   @SuppressWarnings({"rawtypes","unchecked"})
@@ -527,18 +460,7 @@ public final class FST<T> implements Accountable {
     if (startNode == -1) {
       throw new IllegalStateException("call finish first");
     }
-    if (nodeAddress != null) {
-      throw new IllegalStateException("cannot save an FST pre-packed FST; it must first be packed");
-    }
-    if (packed && !(nodeRefToAddress instanceof PackedInts.Mutable)) {
-      throw new IllegalStateException("cannot save a FST which has been loaded from disk ");
-    }
     CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
-    if (packed) {
-      out.writeByte((byte) 1);
-    } else {
-      out.writeByte((byte) 0);
-    }
     // TODO: really we should encode this as an arc, arriving
     // to the root node, instead of special casing here:
     if (emptyOutput != null) {
@@ -552,16 +474,14 @@ public final class FST<T> implements Accountable {
       byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()];
       ros.writeTo(emptyOutputBytes, 0);
 
-      if (!packed) {
-        // reverse
-        final int stopAt = emptyOutputBytes.length/2;
-        int upto = 0;
-        while(upto < stopAt) {
-          final byte b = emptyOutputBytes[upto];
-          emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
-          emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
-          upto++;
-        }
+      // reverse
+      final int stopAt = emptyOutputBytes.length/2;
+      int upto = 0;
+      while(upto < stopAt) {
+        final byte b = emptyOutputBytes[upto];
+        emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1];
+        emptyOutputBytes[emptyOutputBytes.length-upto-1] = b;
+        upto++;
       }
       out.writeVInt(emptyOutputBytes.length);
       out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
@@ -577,9 +497,6 @@ public final class FST<T> implements Accountable {
       t = 2;
     }
     out.writeByte(t);
-    if (packed) {
-      ((PackedInts.Mutable) nodeRefToAddress).save(out);
-    }
     out.writeVLong(startNode);
     if (bytes != null) {
       long numBytes = bytes.getPosition();
@@ -705,8 +622,6 @@ public final class FST<T> implements Accountable {
 
     if (!targetHasArcs) {
       flags += BIT_STOP_NODE;
-    } else if (inCounts != null) {
-      inCounts.set((int) target.node, inCounts.get((int) target.node) + 1);
     }
 
     if (arc.output != NO_OUTPUT) {
@@ -810,30 +725,8 @@ public final class FST<T> implements Accountable {
 
     builder.bytes.reverse(startAddress, thisNodeAddress);
 
-    // PackedInts uses int as the index, so we cannot handle
-    // > 2.1B nodes when packing:
-    if (nodeAddress != null && builder.nodeCount == Integer.MAX_VALUE) {
-      throw new IllegalStateException("cannot create a packed FST with more than 2.1 billion nodes");
-    }
-
     builder.nodeCount++;
-    final long node;
-    if (nodeAddress != null) {
-
-      // Nodes are addressed by 1+ord:
-      if ((int) builder.nodeCount == nodeAddress.size()) {
-        nodeAddress = nodeAddress.resize(ArrayUtil.oversize(nodeAddress.size() + 1, nodeAddress.getBitsPerValue()));
-        inCounts = inCounts.resize(ArrayUtil.oversize(inCounts.size() + 1, inCounts.getBitsPerValue()));
-      }
-      nodeAddress.set((int) builder.nodeCount, thisNodeAddress);
-      // System.out.println("  write nodeAddress[" + nodeCount + "] = " + endAddress);
-      node = builder.nodeCount;
-    } else {
-      node = thisNodeAddress;
-    }
-
-    return node;
+    return thisNodeAddress;
   }
 
   /** Fills virtual 'start' arc, ie, an empty incoming arc to
@@ -876,13 +769,13 @@ public final class FST<T> implements Accountable {
       arc.flags = BIT_LAST_ARC;
       return arc;
     } else {
-      in.setPosition(getNodeAddress(follow.target));
+      in.setPosition(follow.target);
       arc.node = follow.target;
       final byte b = in.readByte();
       if (b == ARCS_AS_FIXED_ARRAY) {
         // array: jump straight to end
         arc.numArcs = in.readVInt();
-        if (packed || version >= VERSION_VINT_TARGET) {
+        if (version >= VERSION_VINT_TARGET) {
          arc.bytesPerArc = in.readVInt();
         } else {
           arc.bytesPerArc = in.readInt();
@@ -906,8 +799,6 @@ public final class FST<T> implements Accountable {
       }
       if (arc.flag(BIT_STOP_NODE)) {
       } else if (arc.flag(BIT_TARGET_NEXT)) {
-      } else if (packed) {
-        in.readVLong();
       } else {
         readUnpackedNodeTarget(in);
       }
@@
-964,7 +855,7 @@ public final class FST implements Accountable { } public Arc readFirstRealTargetArc(long node, Arc arc, final BytesReader in) throws IOException { - final long address = getNodeAddress(node); + final long address = node; in.setPosition(address); //System.out.println(" readFirstRealTargtArc address=" //+ address); @@ -975,7 +866,7 @@ public final class FST implements Accountable { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); - if (packed || version >= VERSION_VINT_TARGET) { + if (version >= VERSION_VINT_TARGET) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); @@ -1002,7 +893,7 @@ public final class FST implements Accountable { if (!targetHasArcs(follow)) { return false; } else { - in.setPosition(getNodeAddress(follow.target)); + in.setPosition(follow.target); return in.readByte() == ARCS_AS_FIXED_ARRAY; } } @@ -1029,7 +920,7 @@ public final class FST implements Accountable { //System.out.println(" nextArc fake " + //arc.nextArc); - long pos = getNodeAddress(arc.nextArc); + long pos = arc.nextArc; in.setPosition(pos); final byte b = in.readByte(); @@ -1038,7 +929,7 @@ public final class FST implements Accountable { in.readVInt(); // Skip bytesPerArc: - if (packed || version >= VERSION_VINT_TARGET) { + if (version >= VERSION_VINT_TARGET) { in.readVInt(); } else { in.readInt(); @@ -1107,41 +998,18 @@ public final class FST implements Accountable { arc.nextArc = in.getPosition(); // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... - if (nodeAddress == null) { - if (!arc.flag(BIT_LAST_ARC)) { - if (arc.bytesPerArc == 0) { - // must scan - seekToNextNode(in); - } else { - in.setPosition(arc.posArcsStart); - in.skipBytes(arc.bytesPerArc * arc.numArcs); - } - } - arc.target = in.getPosition(); - } else { - arc.target = arc.node - 1; - assert arc.target > 0; - } - } else { - if (packed) { - final long pos = in.getPosition(); - final long code = in.readVLong(); - if (arc.flag(BIT_TARGET_DELTA)) { - // Address is delta-coded from current address: - arc.target = pos + code; - //System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target); - } else if (code < nodeRefToAddress.size()) { - // Deref - arc.target = nodeRefToAddress.get((int) code); - //System.out.println(" deref code=" + code + " target=" + arc.target); + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc == 0) { + // must scan + seekToNextNode(in); } else { - // Absolute - arc.target = code; - //System.out.println(" abs code=" + code); + in.setPosition(arc.posArcsStart); + in.skipBytes(arc.bytesPerArc * arc.numArcs); } - } else { - arc.target = readUnpackedNodeTarget(in); } + arc.target = in.getPosition(); + } else { + arc.target = readUnpackedNodeTarget(in); arc.nextArc = in.getPosition(); } return arc; @@ -1228,7 +1096,7 @@ public final class FST implements Accountable { return null; } - in.setPosition(getNodeAddress(follow.target)); + in.setPosition(follow.target); arc.node = follow.target; @@ -1237,7 +1105,7 @@ public final class FST implements Accountable { if (in.readByte() == ARCS_AS_FIXED_ARRAY) { // Arcs are full array; do binary search: arc.numArcs = in.readVInt(); - if (packed || version >= VERSION_VINT_TARGET) { + if (version >= VERSION_VINT_TARGET) { arc.bytesPerArc = in.readVInt(); } else { arc.bytesPerArc = in.readInt(); @@ -1303,11 +1171,7 @@ public final class FST implements Accountable { } if (!flag(flags, BIT_STOP_NODE) && !flag(flags, 
BIT_TARGET_NEXT)) { - if (packed) { - in.readVLong(); - } else { - readUnpackedNodeTarget(in); - } + readUnpackedNodeTarget(in); } if (flag(flags, BIT_LAST_ARC)) { @@ -1340,18 +1204,10 @@ public final class FST implements Accountable { /** Returns a {@link BytesReader} for this FST, positioned at * position 0. */ public BytesReader getBytesReader() { - if (packed) { - if (bytesArray != null) { - return new ForwardBytesReader(bytesArray); - } else { - return bytes.getForwardReader(); - } + if (bytesArray != null) { + return new ReverseBytesReader(bytesArray); } else { - if (bytesArray != null) { - return new ReverseBytesReader(bytesArray); - } else { - return bytes.getReverseReader(); - } + return bytes.getReverseReader(); } } @@ -1476,395 +1332,4 @@ public final class FST implements Accountable { } */ - // Creates a packed FST - private FST(INPUT_TYPE inputType, Outputs outputs, int bytesPageBits) { - version = VERSION_CURRENT; - packed = true; - this.inputType = inputType; - bytesArray = null; - bytes = new BytesStore(bytesPageBits); - this.outputs = outputs; - } - - /** Expert: creates an FST by packing this one. This - * process requires substantial additional RAM (currently - * up to ~8 bytes per node depending on - * acceptableOverheadRatio), but then should - * produce a smaller FST. - * - *

The implementation of this method uses ideas from - * Smaller Representation of Finite State Automata, - * which describes techniques to reduce the size of a FST. - * However, this is not a strict implementation of the - * algorithms described in this paper. - */ - FST pack(Builder builder, int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio) throws IOException { - - // NOTE: maxDerefNodes is intentionally int: we cannot - // support > 2.1B deref nodes - - // TODO: other things to try - // - renumber the nodes to get more next / better locality? - // - allow multiple input labels on an arc, so - // singular chain of inputs can take one arc (on - // wikipedia terms this could save another ~6%) - // - in the ord case, the output '1' is presumably - // very common (after NO_OUTPUT)... maybe use a bit - // for it..? - // - use spare bits in flags.... for top few labels / - // outputs / targets - - if (nodeAddress == null) { - throw new IllegalArgumentException("this FST was not built with willPackFST=true"); - } - - T NO_OUTPUT = outputs.getNoOutput(); - - Arc arc = new Arc<>(); - - final BytesReader r = getBytesReader(); - - final int topN = Math.min(maxDerefNodes, inCounts.size()); - - // Find top nodes with highest number of incoming arcs: - NodeQueue q = new NodeQueue(topN); - - // TODO: we could use more RAM efficient selection algo here... - NodeAndInCount bottom = null; - for(int node=0; node= minInCountDeref) { - if (bottom == null) { - q.add(new NodeAndInCount(node, (int) inCounts.get(node))); - if (q.size() == topN) { - bottom = q.top(); - } - } else if (inCounts.get(node) > bottom.count) { - q.insertWithOverflow(new NodeAndInCount(node, (int) inCounts.get(node))); - } - } - } - - // Free up RAM: - inCounts = null; - - final Map topNodeMap = new HashMap<>(); - for(int downTo=q.size()-1;downTo>=0;downTo--) { - NodeAndInCount n = q.pop(); - topNodeMap.put(n.node, downTo); - //System.out.println("map node=" + n.node + " inCount=" + n.count + " to newID=" + downTo); - } - - // +1 because node ords start at 1 (0 is reserved as stop node): - final GrowableWriter newNodeAddress = new GrowableWriter( - PackedInts.bitsRequired(builder.bytes.getPosition()), (int) (1 + builder.nodeCount), acceptableOverheadRatio); - - // Fill initial coarse guess: - for(int node=1;node<=builder.nodeCount;node++) { - newNodeAddress.set(node, 1 + builder.bytes.getPosition() - nodeAddress.get(node)); - } - - int absCount; - int deltaCount; - int topCount; - int nextCount; - - FST fst; - - // Iterate until we converge: - while(true) { - - //System.out.println("\nITER"); - boolean changed = false; - - // for assert: - boolean negDelta = false; - - fst = new FST<>(inputType, outputs, builder.bytes.getBlockBits()); - - final BytesStore writer = fst.bytes; - - // Skip 0 byte since 0 is reserved target: - writer.writeByte((byte) 0); - - absCount = deltaCount = topCount = nextCount = 0; - - int changedCount = 0; - - long addressError = 0; - - //int totWasted = 0; - - // Since we re-reverse the bytes, we now write the - // nodes backwards, so that BIT_TARGET_NEXT is - // unchanged: - for(int node=(int) builder.nodeCount;node>=1;node--) { - final long address = writer.getPosition(); - - //System.out.println(" node: " + node + " address=" + address); - if (address != newNodeAddress.get(node)) { - addressError = address - newNodeAddress.get(node); - //System.out.println(" change: " + (address - newNodeAddress[node])); - changed = true; - newNodeAddress.set(node, address); - changedCount++; - } - - int 
nodeArcCount = 0; - int bytesPerArc = 0; - - boolean retry = false; - - // for assert: - boolean anyNegDelta = false; - - // Retry loop: possibly iterate more than once, if - // this is an array'd node and bytesPerArc changes: - writeNode: - while(true) { // retry writing this node - - //System.out.println(" cycle: retry"); - readFirstRealTargetArc(node, arc, r); - - final boolean useArcArray = arc.bytesPerArc != 0; - if (useArcArray) { - // Write false first arc: - if (bytesPerArc == 0) { - bytesPerArc = arc.bytesPerArc; - } - writer.writeByte(ARCS_AS_FIXED_ARRAY); - writer.writeVInt(arc.numArcs); - writer.writeVInt(bytesPerArc); - //System.out.println("node " + node + ": " + arc.numArcs + " arcs"); - } - - int maxBytesPerArc = 0; - //int wasted = 0; - while(true) { // iterate over all arcs for this node - //System.out.println(" cycle next arc"); - - final long arcStartPos = writer.getPosition(); - nodeArcCount++; - - byte flags = 0; - - if (arc.isLast()) { - flags += BIT_LAST_ARC; - } - /* - if (!useArcArray && nodeUpto < nodes.length-1 && arc.target == nodes[nodeUpto+1]) { - flags += BIT_TARGET_NEXT; - } - */ - if (!useArcArray && node != 1 && arc.target == node-1) { - flags += BIT_TARGET_NEXT; - if (!retry) { - nextCount++; - } - } - if (arc.isFinal()) { - flags += BIT_FINAL_ARC; - if (arc.nextFinalOutput != NO_OUTPUT) { - flags += BIT_ARC_HAS_FINAL_OUTPUT; - } - } else { - assert arc.nextFinalOutput == NO_OUTPUT; - } - if (!targetHasArcs(arc)) { - flags += BIT_STOP_NODE; - } - - if (arc.output != NO_OUTPUT) { - flags += BIT_ARC_HAS_OUTPUT; - } - - final long absPtr; - final boolean doWriteTarget = targetHasArcs(arc) && (flags & BIT_TARGET_NEXT) == 0; - if (doWriteTarget) { - - final Integer ptr = topNodeMap.get(arc.target); - if (ptr != null) { - absPtr = ptr; - } else { - absPtr = topNodeMap.size() + newNodeAddress.get((int) arc.target) + addressError; - } - - long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition() - 2; - if (delta < 0) { - //System.out.println("neg: " + delta); - anyNegDelta = true; - delta = 0; - } - - if (delta < absPtr) { - flags |= BIT_TARGET_DELTA; - } - } else { - absPtr = 0; - } - - assert flags != ARCS_AS_FIXED_ARRAY; - writer.writeByte(flags); - - fst.writeLabel(writer, arc.label); - - if (arc.output != NO_OUTPUT) { - outputs.write(arc.output, writer); - } - if (arc.nextFinalOutput != NO_OUTPUT) { - outputs.writeFinalOutput(arc.nextFinalOutput, writer); - } - - if (doWriteTarget) { - - long delta = newNodeAddress.get((int) arc.target) + addressError - writer.getPosition(); - if (delta < 0) { - anyNegDelta = true; - //System.out.println("neg: " + delta); - delta = 0; - } - - if (flag(flags, BIT_TARGET_DELTA)) { - //System.out.println(" delta"); - writer.writeVLong(delta); - if (!retry) { - deltaCount++; - } - } else { - /* - if (ptr != null) { - System.out.println(" deref"); - } else { - System.out.println(" abs"); - } - */ - writer.writeVLong(absPtr); - if (!retry) { - if (absPtr >= topNodeMap.size()) { - absCount++; - } else { - topCount++; - } - } - } - } - - if (useArcArray) { - final int arcBytes = (int) (writer.getPosition() - arcStartPos); - //System.out.println(" " + arcBytes + " bytes"); - maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes); - // NOTE: this may in fact go "backwards", if - // somehow (rarely, possibly never) we use - // more bytesPerArc in this rewrite than the - // incoming FST did... 
but in this case we - // will retry (below) so it's OK to ovewrite - // bytes: - //wasted += bytesPerArc - arcBytes; - writer.skipBytes((int) (arcStartPos + bytesPerArc - writer.getPosition())); - } - - if (arc.isLast()) { - break; - } - - readNextRealArc(arc, r); - } - - if (useArcArray) { - if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc)) { - // converged - //System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted); - //totWasted += wasted; - break; - } - } else { - break; - } - - //System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc); - - // Retry: - bytesPerArc = maxBytesPerArc; - writer.truncate(address); - nodeArcCount = 0; - retry = true; - anyNegDelta = false; - } - - negDelta |= anyNegDelta; - } - - if (!changed) { - // We don't renumber the nodes (just reverse their - // order) so nodes should only point forward to - // other nodes because we only produce acyclic FSTs - // w/ nodes only pointing "forwards": - assert !negDelta; - //System.out.println("TOT wasted=" + totWasted); - // Converged! - break; - } - } - - long maxAddress = 0; - for (long key : topNodeMap.keySet()) { - maxAddress = Math.max(maxAddress, newNodeAddress.get((int) key)); - } - - PackedInts.Mutable nodeRefToAddressIn = PackedInts.getMutable(topNodeMap.size(), - PackedInts.bitsRequired(maxAddress), acceptableOverheadRatio); - for(Map.Entry ent : topNodeMap.entrySet()) { - nodeRefToAddressIn.set(ent.getValue(), newNodeAddress.get(ent.getKey())); - } - fst.nodeRefToAddress = nodeRefToAddressIn; - - fst.startNode = newNodeAddress.get((int) startNode); - //System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode); - - if (emptyOutput != null) { - fst.setEmptyOutput(emptyOutput); - } - - fst.bytes.finish(); - fst.cacheRootArcs(); - - //final int size = fst.sizeInBytes(); - //System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount); - - return fst; - } - - private static class NodeAndInCount implements Comparable { - final int node; - final int count; - - public NodeAndInCount(int node, int count) { - this.node = node; - this.count = count; - } - - @Override - public int compareTo(NodeAndInCount other) { - if (count > other.count) { - return 1; - } else if (count < other.count) { - return -1; - } else { - // Tie-break: smaller node compares as greater than - return other.node - node; - } - } - } - - private static class NodeQueue extends PriorityQueue { - public NodeQueue(int topN) { - super(topN, false); - } - - @Override - public boolean lessThan(NodeAndInCount a, NodeAndInCount b) { - final int cmp = a.compareTo(b); - assert cmp != 0; - return cmp < 0; - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java b/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java index 41426f96426..d9845861528 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/package-info.java @@ -24,7 +24,6 @@ *
<ul>
 *    <li>Fast and low memory overhead construction of the minimal FST
 *        (but inputs must be provided in sorted order)</li>
 *    <li>Low object overhead and quick deserialization (byte[] representation)</li>
- *    <li>Optional two-pass compression: {@link org.apache.lucene.util.fst.FST#pack FST.pack()}</li>
 *    <li>{@link org.apache.lucene.util.fst.Util#getByOutput Lookup-by-output} when the
 *        outputs are in sorted order (e.g., ordinals or file pointers)</li>
 *    <li>Pluggable {@link org.apache.lucene.util.fst.Outputs Outputs} representation</li>
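For context, a minimal usage sketch (illustrative only, not part of this patch) of the slimmed-down Builder constructor that the call sites throughout this patch migrate to; the nine arguments mirror the post-patch call sites such as new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15), with the former doPack and acceptableOverheadRatio parameters gone:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.Builder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    public class FSTUsageSketch {
      public static void main(String[] args) throws Exception {
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        // Same constructor shape as the migrated call sites in this patch:
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
            Integer.MAX_VALUE, outputs, true, 15);
        IntsRefBuilder scratch = new IntsRefBuilder();
        // Inputs must be added in sorted order (see the feature list above):
        builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
        builder.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
        FST<Long> fst = builder.finish();
        System.out.println(Util.get(fst, new BytesRef("dog"))); // prints 7
      }
    }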
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java index 3820733ecca..37a7e4c5822 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java @@ -37,7 +37,7 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase { // create a small string such that the single pass approach is used int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1); String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length); - byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR]; + byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())]; int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8); GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8); @@ -61,7 +61,7 @@ public class TestGrowableByteArrayDataOutput extends LuceneTestCase { int num = atLeast(100); for (int i = 0; i < num; i++) { String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass); - byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR]; + byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())]; int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8); GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocsWithFieldSet.java b/lucene/core/src/test/org/apache/lucene/index/TestDocsWithFieldSet.java new file mode 100644 index 00000000000..b719adfd7c2 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocsWithFieldSet.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.lucene.index; + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestDocsWithFieldSet extends LuceneTestCase { + + public void testDense() throws IOException { + DocsWithFieldSet set = new DocsWithFieldSet(); + DocIdSetIterator it = set.iterator(); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + set.add(0); + it = set.iterator(); + assertEquals(0, it.nextDoc()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + + long ramBytesUsed = set.ramBytesUsed(); + for (int i = 1; i < 1000; ++i) { + set.add(i); + } + assertEquals(ramBytesUsed, set.ramBytesUsed()); + it = set.iterator(); + for (int i = 0; i < 1000; ++i) { + assertEquals(i, it.nextDoc()); + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + } + + public void testSparse() throws IOException { + DocsWithFieldSet set = new DocsWithFieldSet(); + int doc = random().nextInt(10000); + set.add(doc); + DocIdSetIterator it = set.iterator(); + assertEquals(doc, it.nextDoc()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + int doc2 = doc + TestUtil.nextInt(random(), 1, 100); + set.add(doc2); + it = set.iterator(); + assertEquals(doc, it.nextDoc()); + assertEquals(doc2, it.nextDoc()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + } + + public void testDenseThenSparse() throws IOException { + int denseCount = random().nextInt(10000); + int nextDoc = denseCount + random().nextInt(10000); + DocsWithFieldSet set = new DocsWithFieldSet(); + for (int i = 0; i < denseCount; ++i) { + set.add(i); + } + set.add(nextDoc); + DocIdSetIterator it = set.iterator(); + for (int i = 0; i < denseCount; ++i) { + assertEquals(i, it.nextDoc()); + } + assertEquals(nextDoc, it.nextDoc()); + assertEquals(DocIdSetIterator.NO_MORE_DOCS, it.nextDoc()); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 6f112a7f748..a99576305a1 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -97,6 +97,7 @@ import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton; +import org.junit.Ignore; import org.junit.Test; public class TestIndexWriter extends LuceneTestCase { @@ -2768,5 +2769,34 @@ public class TestIndexWriter extends LuceneTestCase { dir.close(); } + @Ignore("requires running tests with biggish heap") + public void testMassiveField() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + final IndexWriter w = new IndexWriter(dir, iwc); + + StringBuilder b = new StringBuilder(); + while (b.length() <= IndexWriter.MAX_STORED_STRING_LENGTH) { + b.append("x "); + } + + final Document doc = new Document(); + //doc.add(new TextField("big", b.toString(), Field.Store.YES)); + doc.add(new StoredField("big", b.toString())); + Exception e = expectThrows(IllegalArgumentException.class, () -> {w.addDocument(doc);}); + assertEquals("stored field \"big\" is too large (" + b.length() + " characters) to store", e.getMessage()); + + // make sure writer is still usable: + Document doc2 = new Document(); + doc2.add(new 
StringField("id", "foo", Field.Store.YES)); + w.addDocument(doc2); + + DirectoryReader r = DirectoryReader.open(w); + assertEquals(1, r.numDocs()); + r.close(); + w.close(); + dir.close(); + } + } diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java new file mode 100644 index 00000000000..15b1448eb6e --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestBooleanSimilarity.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similarities; + +import java.io.IOException; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BoostQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +public class TestBooleanSimilarity extends LuceneTestCase { + + public void testTermScoreIsEqualToBoost() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, + newIndexWriterConfig()); + Document doc = new Document(); + doc.add(new StringField("foo", "bar", Store.NO)); + doc.add(new StringField("foo", "baz", Store.NO)); + w.addDocument(doc); + doc = new Document(); + doc.add(new StringField("foo", "bar", Store.NO)); + doc.add(new StringField("foo", "bar", Store.NO)); + w.addDocument(doc); + + DirectoryReader reader = w.getReader(); + w.close(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(new BooleanSimilarity()); + TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "bar")), 2); + assertEquals(2, topDocs.totalHits); + assertEquals(1f, topDocs.scoreDocs[0].score, 0f); + assertEquals(1f, topDocs.scoreDocs[1].score, 0f); + + topDocs = searcher.search(new TermQuery(new Term("foo", "baz")), 1); + assertEquals(1, topDocs.totalHits); + assertEquals(1f, topDocs.scoreDocs[0].score, 0f); + + topDocs = searcher.search(new BoostQuery(new TermQuery(new Term("foo", "baz")), 3f), 1); + assertEquals(1, topDocs.totalHits); + assertEquals(3f, topDocs.scoreDocs[0].score, 0f); + + reader.close(); + dir.close(); + } + + public void testPhraseScoreIsEqualToBoost() 
throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, + newIndexWriterConfig().setSimilarity(new BooleanSimilarity())); + Document doc = new Document(); + doc.add(new TextField("foo", "bar baz quux", Store.NO)); + w.addDocument(doc); + + DirectoryReader reader = w.getReader(); + w.close(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(new BooleanSimilarity()); + + PhraseQuery query = new PhraseQuery(2, "foo", "bar", "quux"); + + TopDocs topDocs = searcher.search(query, 2); + assertEquals(1, topDocs.totalHits); + assertEquals(1f, topDocs.scoreDocs[0].score, 0f); + + topDocs = searcher.search(new BoostQuery(query, 7), 2); + assertEquals(1, topDocs.totalHits); + assertEquals(7f, topDocs.scoreDocs[0].score, 0f); + + reader.close(); + dir.close(); + } + + public void testSameNormsAsBM25() { + BooleanSimilarity sim1 = new BooleanSimilarity(); + BM25Similarity sim2 = new BM25Similarity(); + sim2.setDiscountOverlaps(true); + for (int iter = 0; iter < 100; ++iter) { + final int length = TestUtil.nextInt(random(), 1, 100); + final int position = random().nextInt(length); + final int numOverlaps = random().nextInt(50); + final float boost = random().nextFloat() * 10; + FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100, boost); + assertEquals( + sim2.computeNorm(state), + sim1.computeNorm(state), + 0f); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java index b18a38df2d5..d699719e478 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java @@ -274,20 +274,42 @@ public class TestBasics extends LuceneTestCase { assertTrue(searcher.explain(query, 849).getValue() > 0.0f); } - public void testSpanNotWindowNeg() throws Exception { + public void testSpanNotWindowNegPost() throws Exception { //test handling of invalid window < 0 SpanQuery near = spanNearOrderedQuery("field", 4, "eight", "one"); SpanQuery or = spanOrQuery("field", "forty"); - SpanQuery query = spanNotQuery(near, or); - + SpanQuery query = spanNotQuery(near, or, 0, -1); checkHits(query, new int[] {801, 821, 831, 851, 861, 871, 881, 891, 1801, 1821, 1831, 1851, 1861, 1871, 1881, 1891}); + query = spanNotQuery(near, or, 0, -2); + checkHits(query, new int[] + {801, 821, 831, 841, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1841, 1851, 1861, 1871, 1881, 1891}); + assertTrue(searcher.explain(query, 801).getValue() > 0.0f); assertTrue(searcher.explain(query, 891).getValue() > 0.0f); } - + + public void testSpanNotWindowNegPre() throws Exception { + //test handling of invalid window < 0 + SpanQuery near = spanNearOrderedQuery("field", 4, "eight", "one"); + SpanQuery or = spanOrQuery("field", "forty"); + SpanQuery query = spanNotQuery(near, or, -2, 0); + checkHits(query, new int[] + {801, 821, 831, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1851, 1861, 1871, 1881, 1891}); + + query = spanNotQuery(near, or, -3, 0); + checkHits(query, new int[] + {801, 821, 831, 841, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1841, 1851, 1861, 1871, 1881, 1891}); + + assertTrue(searcher.explain(query, 801).getValue() > 0.0f); + assertTrue(searcher.explain(query, 891).getValue() > 0.0f); + } + public void testSpanNotWindowDoubleExcludesBefore() throws Exception { //test hitting two excludes before an include SpanQuery near = 
spanNearOrderedQuery("field", 2, "forty", "two"); diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java index 2d5e05cf8e5..2b5b919f385 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java @@ -99,7 +99,6 @@ public class TestSpans extends LuceneTestCase { "s2 s1 s1 xx xx s2 xx s2 xx s1 xx xx xx xx xx s2 xx", "r1 s11", "r1 s21" - }; private void checkHits(Query query, int[] results) throws IOException { @@ -406,42 +405,54 @@ public class TestSpans extends LuceneTestCase { } - - public void testSpanNots() throws Throwable{ - assertEquals("SpanNotIncludeExcludeSame1", 0, spanCount("s2", "s2", 0, 0), 0); - assertEquals("SpanNotIncludeExcludeSame2", 0, spanCount("s2", "s2", 10, 10), 0); - - //focus on behind - assertEquals("SpanNotS2NotS1_6_0", 1, spanCount("s2", "s1", 6, 0)); - assertEquals("SpanNotS2NotS1_5_0", 2, spanCount("s2", "s1", 5, 0)); - assertEquals("SpanNotS2NotS1_3_0", 3, spanCount("s2", "s1", 3, 0)); - assertEquals("SpanNotS2NotS1_2_0", 4, spanCount("s2", "s1", 2, 0)); - assertEquals("SpanNotS2NotS1_0_0", 4, spanCount("s2", "s1", 0, 0)); - - //focus on both - assertEquals("SpanNotS2NotS1_3_1", 2, spanCount("s2", "s1", 3, 1)); - assertEquals("SpanNotS2NotS1_2_1", 3, spanCount("s2", "s1", 2, 1)); - assertEquals("SpanNotS2NotS1_1_1", 3, spanCount("s2", "s1", 1, 1)); - assertEquals("SpanNotS2NotS1_10_10", 0, spanCount("s2", "s1", 10, 10)); - - //focus on ahead - assertEquals("SpanNotS1NotS2_10_10", 0, spanCount("s1", "s2", 10, 10)); - assertEquals("SpanNotS1NotS2_0_1", 3, spanCount("s1", "s2", 0, 1)); - assertEquals("SpanNotS1NotS2_0_2", 3, spanCount("s1", "s2", 0, 2)); - assertEquals("SpanNotS1NotS2_0_3", 2, spanCount("s1", "s2", 0, 3)); - assertEquals("SpanNotS1NotS2_0_4", 1, spanCount("s1", "s2", 0, 4)); - assertEquals("SpanNotS1NotS2_0_8", 0, spanCount("s1", "s2", 0, 8)); - - //exclude doesn't exist - assertEquals("SpanNotS1NotS3_8_8", 3, spanCount("s1", "s3", 8, 8)); - //include doesn't exist - assertEquals("SpanNotS3NotS1_8_8", 0, spanCount("s3", "s1", 8, 8)); + public void testSpanNots() throws Throwable { + assertEquals("SpanNotIncludeExcludeSame1", 0, spanCount("s2", 0, "s2", 0, 0), 0); + assertEquals("SpanNotIncludeExcludeSame2", 0, spanCount("s2", 0, "s2", 10, 10), 0); + + //focus on behind + assertEquals("SpanNotS2NotS1_6_0", 1, spanCount("s2", 0, "s1", 6, 0)); + assertEquals("SpanNotS2NotS1_5_0", 2, spanCount("s2", 0, "s1", 5, 0)); + assertEquals("SpanNotS2NotS1_3_0", 3, spanCount("s2", 0, "s1", 3, 0)); + assertEquals("SpanNotS2NotS1_2_0", 4, spanCount("s2", 0, "s1", 2, 0)); + assertEquals("SpanNotS2NotS1_0_0", 4, spanCount("s2", 0, "s1", 0, 0)); + + //focus on both + assertEquals("SpanNotS2NotS1_3_1", 2, spanCount("s2", 0, "s1", 3, 1)); + assertEquals("SpanNotS2NotS1_2_1", 3, spanCount("s2", 0, "s1", 2, 1)); + assertEquals("SpanNotS2NotS1_1_1", 3, spanCount("s2", 0, "s1", 1, 1)); + assertEquals("SpanNotS2NotS1_10_10", 0, spanCount("s2", 0, "s1", 10, 10)); + + //focus on ahead + assertEquals("SpanNotS1NotS2_10_10", 0, spanCount("s1", 0, "s2", 10, 10)); + assertEquals("SpanNotS1NotS2_0_1", 3, spanCount("s1", 0, "s2", 0, 1)); + assertEquals("SpanNotS1NotS2_0_2", 3, spanCount("s1", 0, "s2", 0, 2)); + assertEquals("SpanNotS1NotS2_0_3", 2, spanCount("s1", 0, "s2", 0, 3)); + assertEquals("SpanNotS1NotS2_0_4", 1, spanCount("s1", 0, "s2", 0, 4)); + assertEquals("SpanNotS1NotS2_0_8", 0, 
spanCount("s1", 0, "s2", 0, 8)); + + //exclude doesn't exist + assertEquals("SpanNotS1NotS3_8_8", 3, spanCount("s1", 0, "s3", 8, 8)); + + //include doesn't exist + assertEquals("SpanNotS3NotS1_8_8", 0, spanCount("s3", 0, "s1", 8, 8)); + + // Negative values + assertEquals("SpanNotS2S1NotXXNeg_0_0", 1, spanCount("s2 s1", 10, "xx", 0, 0)); + assertEquals("SpanNotS2S1NotXXNeg_1_1", 1, spanCount("s2 s1", 10, "xx", -1, -1)); + assertEquals("SpanNotS2S1NotXXNeg_0_2", 2, spanCount("s2 s1", 10, "xx", 0, -2)); + assertEquals("SpanNotS2S1NotXXNeg_1_2", 2, spanCount("s2 s1", 10, "xx", -1, -2)); + assertEquals("SpanNotS2S1NotXXNeg_2_1", 2, spanCount("s2 s1", 10, "xx", -2, -1)); + assertEquals("SpanNotS2S1NotXXNeg_3_1", 2, spanCount("s2 s1", 10, "xx", -3, -1)); + assertEquals("SpanNotS2S1NotXXNeg_1_3", 2, spanCount("s2 s1", 10, "xx", -1, -3)); + assertEquals("SpanNotS2S1NotXXNeg_2_2", 3, spanCount("s2 s1", 10, "xx", -2, -2)); } - - private int spanCount(String include, String exclude, int pre, int post) throws IOException{ - SpanQuery iq = spanTermQuery(field, include); + + + private int spanCount(String include, int slop, String exclude, int pre, int post) throws IOException{ + String[] includeTerms = include.split(" +"); + SpanQuery iq = includeTerms.length == 1 ? spanTermQuery(field, include) : spanNearOrderedQuery(field, slop, includeTerms); SpanQuery eq = spanTermQuery(field, exclude); SpanQuery snq = spanNotQuery(iq, eq, pre, post); Spans spans = snq.createWeight(searcher, false, 1f).getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java index 7bbd606e4e0..15251ad9b5e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java @@ -111,7 +111,7 @@ public class TestUnicodeUtil extends LuceneTestCase { int num = atLeast(50000); for (int i = 0; i < num; i++) { final String s = TestUtil.randomUnicodeString(random()); - final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR]; + final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())]; final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); assertEquals(s.codePointCount(0, s.length()), UnicodeUtil.codePointCount(new BytesRef(utf8, 0, utf8Len))); @@ -137,7 +137,7 @@ public class TestUnicodeUtil extends LuceneTestCase { int num = atLeast(50000); for (int i = 0; i < num; i++) { final String s = TestUtil.randomUnicodeString(random()); - final byte[] utf8 = new byte[s.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR]; + final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())]; final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); utf32 = ArrayUtil.grow(utf32, utf8Len); final int utf32Len = UnicodeUtil.UTF8toUTF32(new BytesRef(utf8, 0, utf8Len), utf32); @@ -208,7 +208,7 @@ public class TestUnicodeUtil extends LuceneTestCase { int num = atLeast(5000); for (int i = 0; i < num; i++) { String unicode = TestUtil.randomUnicodeString(random()); - byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR]; + byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(unicode.length())]; int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8); assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length())); } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java 
b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java index 6434c1cde76..1c1d1d419da 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java @@ -41,7 +41,7 @@ public class TestUTF32ToUTF8 extends LuceneTestCase { private boolean matches(ByteRunAutomaton a, int code) { char[] chars = Character.toChars(code); - byte[] b = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * chars.length]; + byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)]; final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b); return a.run(b, 0, len); } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java index bdec65cbb2d..a02bf8afafe 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java @@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TimeUnits; -import org.apache.lucene.util.packed.PackedInts; import org.junit.Ignore; import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite; @@ -47,16 +46,14 @@ public class Test2BFST extends LuceneTestCase { Directory dir = new MMapDirectory(createTempDir("2BFST")); - for(int doPackIter=0;doPackIter<2;doPackIter++) { - boolean doPack = doPackIter == 1; - + for(int iter=0;iter<1;iter++) { // Build FST w/ NoOutputs and stop when nodeCount > 2.2B - if (!doPack) { + { System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); Outputs outputs = NoOutputs.getSingleton(); Object NO_OUTPUT = outputs.getNoOutput(); final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, - doPack, PackedInts.COMPACT, true, 15); + true, 15); int count = 0; Random r = new Random(seed); @@ -135,10 +132,10 @@ public class Test2BFST extends LuceneTestCase { // Build FST w/ ByteSequenceOutputs and stop when FST // size = 3GB { - System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes"); + System.out.println("\nTEST: 3 GB size; outputs=bytes"); Outputs outputs = ByteSequenceOutputs.getSingleton(); final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, - doPack, PackedInts.COMPACT, true, 15); + true, 15); byte[] outputBytes = new byte[20]; BytesRef output = new BytesRef(outputBytes); @@ -212,10 +209,10 @@ public class Test2BFST extends LuceneTestCase { // Build FST w/ PositiveIntOutputs and stop when FST // size = 3GB { - System.out.println("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long"); + System.out.println("\nTEST: 3 GB size; outputs=long"); Outputs outputs = PositiveIntOutputs.getSingleton(); final Builder b = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, - doPack, PackedInts.COMPACT, true, 15); + true, 15); long output = 1; diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 39b3282a28c..6b218cf3b9d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -76,7 +76,6 @@ import org.apache.lucene.util.fst.FST.Arc; import org.apache.lucene.util.fst.FST.BytesReader; import org.apache.lucene.util.fst.PairOutputs.Pair; 
import org.apache.lucene.util.fst.Util.Result; -import org.apache.lucene.util.packed.PackedInts; import static org.apache.lucene.util.fst.FSTTester.getRandomString; import static org.apache.lucene.util.fst.FSTTester.simpleRandomString; @@ -328,9 +327,7 @@ public class TestFSTs extends LuceneTestCase { writer.close(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final boolean doRewrite = random().nextBoolean(); - - Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, doRewrite, PackedInts.DEFAULT, true, 15); + Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); boolean storeOrd = random().nextBoolean(); if (VERBOSE) { @@ -464,16 +461,14 @@ public class TestFSTs extends LuceneTestCase { private int inputMode; private final Outputs outputs; private final Builder builder; - private final boolean doPack; - public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs outputs, boolean doPack, boolean noArcArrays) { + public VisitTerms(Path dirOut, Path wordsFileIn, int inputMode, int prune, Outputs outputs, boolean noArcArrays) { this.dirOut = dirOut; this.wordsFileIn = wordsFileIn; this.inputMode = inputMode; this.outputs = outputs; - this.doPack = doPack; - builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, doPack, PackedInts.DEFAULT, !noArcArrays, 15); + builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs, !noArcArrays, 15); } protected abstract T getOutput(IntsRef input, int ord) throws IOException; @@ -622,7 +617,6 @@ public class TestFSTs extends LuceneTestCase { boolean storeOrds = false; boolean storeDocFreqs = false; boolean verify = true; - boolean doPack = false; boolean noArcArrays = false; Path wordsFileIn = null; Path dirOut = null; @@ -647,8 +641,6 @@ public class TestFSTs extends LuceneTestCase { storeOrds = true; } else if (args[idx].equals("-noverify")) { verify = false; - } else if (args[idx].equals("-pack")) { - doPack = true; } else if (args[idx].startsWith("-")) { System.err.println("Unrecognized option: " + args[idx]); System.exit(-1); @@ -677,7 +669,7 @@ public class TestFSTs extends LuceneTestCase { final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(); final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(); final PairOutputs outputs = new PairOutputs<>(o1, o2); - new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { + new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { Random rand; @Override public PairOutputs.Pair getOutput(IntsRef input, int ord) { @@ -691,7 +683,7 @@ public class TestFSTs extends LuceneTestCase { } else if (storeOrds) { // Store only ords final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { @Override public Long getOutput(IntsRef input, int ord) { return (long) ord; @@ -700,7 +692,7 @@ public class TestFSTs extends LuceneTestCase { } else if (storeDocFreqs) { // Store only docFreq final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { + new VisitTerms(dirOut, 
wordsFileIn, inputMode, prune, outputs, noArcArrays) { Random rand; @Override public Long getOutput(IntsRef input, int ord) { @@ -714,7 +706,7 @@ public class TestFSTs extends LuceneTestCase { // Store nothing final NoOutputs outputs = NoOutputs.getSingleton(); final Object NO_OUTPUT = outputs.getNoOutput(); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) { + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs, noArcArrays) { @Override public Object getOutput(IntsRef input, int ord) { return NO_OUTPUT; @@ -1118,7 +1110,7 @@ public class TestFSTs extends LuceneTestCase { public void testFinalOutputOnEndState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, random().nextBoolean(), PackedInts.DEFAULT, true, 15); + final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); builder.add(Util.toUTF32("stat", new IntsRefBuilder()), 17L); builder.add(Util.toUTF32("station", new IntsRefBuilder()), 10L); final FST fst = builder.finish(); @@ -1132,8 +1124,7 @@ public class TestFSTs extends LuceneTestCase { public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - final boolean willRewrite = random().nextBoolean(); - final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, willRewrite, PackedInts.DEFAULT, true, 15); + final Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); final FST fst = builder.finish(); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java index 553a636ed6a..6b4cc74a48e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java @@ -19,8 +19,10 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.function.Function; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.FilteringTokenFilter; @@ -30,6 +32,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Terms; import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.Query; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automata; @@ -50,7 +53,9 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { private final LeafReader leafReader; private final CharacterRunAutomaton preMemIndexFilterAutomaton; - public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) { + public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper 
phraseHelper, + CharacterRunAutomaton[] automata, Analyzer analyzer, + Function> multiTermQueryRewrite) { super(field, extractedTerms, phraseHelper, automata); this.analyzer = analyzer; // Automata (Wildcards / MultiTermQuery): @@ -68,7 +73,8 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // preFilter for MemoryIndex - preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases); + preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases, + multiTermQueryRewrite); } else { memoryIndex = null; leafReader = null; @@ -155,7 +161,8 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { */ private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms, CharacterRunAutomaton[] automata, - PhraseHelper strictPhrases) { + PhraseHelper strictPhrases, + Function> multiTermQueryRewrite) { List allAutomata = new ArrayList<>(); if (terms.length > 0) { allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms)))); @@ -163,7 +170,7 @@ public class AnalysisOffsetStrategy extends FieldOffsetStrategy { Collections.addAll(allAutomata, automata); for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) { Collections.addAll(allAutomata, - MultiTermHighlighting.extractAutomata(spanQuery, field, true));//true==lookInSpan + MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan } if (allAutomata.size() == 1) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java index 9498af584a8..e85fa3bffa9 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java @@ -20,8 +20,10 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.Comparator; import java.util.List; +import java.util.function.Function; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -69,34 +71,44 @@ class MultiTermHighlighting { * Extracts all MultiTermQueries for {@code field}, and returns equivalent * automata that will match terms. 
*/ - public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan) { + public static CharacterRunAutomaton[] extractAutomata(Query query, String field, boolean lookInSpan, + Function> preRewriteFunc) { List list = new ArrayList<>(); - if (query instanceof BooleanQuery) { + Collection customSubQueries = preRewriteFunc.apply(query); + if (customSubQueries != null) { + for (Query sub : customSubQueries) { + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); + } + } else if (query instanceof BooleanQuery) { for (BooleanClause clause : (BooleanQuery) query) { if (!clause.isProhibited()) { - list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field, lookInSpan, preRewriteFunc))); } } } else if (query instanceof ConstantScoreQuery) { - list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field, lookInSpan, + preRewriteFunc))); } else if (query instanceof DisjunctionMaxQuery) { for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanOrQuery) { for (Query sub : ((SpanOrQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanNearQuery) { for (Query sub : ((SpanNearQuery) query).getClauses()) { - list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(sub, field, lookInSpan, preRewriteFunc))); } } else if (lookInSpan && query instanceof SpanNotQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field, lookInSpan, + preRewriteFunc))); } else if (lookInSpan && query instanceof SpanPositionCheckQuery) { - list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field, lookInSpan, + preRewriteFunc))); } else if (lookInSpan && query instanceof SpanMultiTermQueryWrapper) { - list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, lookInSpan))); + list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper) query).getWrappedQuery(), field, + lookInSpan, preRewriteFunc))); } else if (query instanceof AutomatonQuery) { final AutomatonQuery aq = (AutomatonQuery) query; if (aq.getField().equals(field)) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java index 5225041f9be..95d51c917da 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java @@ -40,7 +40,7 @@ import java.util.function.Function; public class PhraseHelper { public static final PhraseHelper NONE = new PhraseHelper(new 
MatchAllDocsQuery(), "_ignored_", - spanQuery -> null, true); + spanQuery -> null, query -> null, true); //TODO it seems this ought to be a general thing on Spans? private static final Comparator SPANS_COMPARATOR = (o1, o2) -> { @@ -69,11 +69,14 @@ public class PhraseHelper { * {@code rewriteQueryPred} is an extension hook to override the default choice of * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types are rewritten, * so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten. + * Similarly, {@code preExtractRewriteFunction} is also an extension hook for extract to allow different queries + * to be set before the {@link WeightedSpanTermExtractor}'s extraction is invoked. * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten", which is * usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones. */ public PhraseHelper(Query query, String field, Function rewriteQueryPred, - boolean ignoreQueriesNeedingRewrite) { + Function> preExtractRewriteFunction, + boolean ignoreQueriesNeedingRewrite) { this.fieldName = field; // if null then don't require field match // filter terms to those we want positionInsensitiveTerms = field != null ? new FieldFilteringTermHashSet(field) : new HashSet<>(); @@ -98,6 +101,18 @@ public class PhraseHelper { } } + @Override + protected void extract(Query query, float boost, Map terms) throws IOException { + Collection newQueriesToExtract = preExtractRewriteFunction.apply(query); + if (newQueriesToExtract != null) { + for (Query newQuery : newQueriesToExtract) { + extract(newQuery, boost, terms); + } + } else { + super.extract(query, boost, terms); + } + } + @Override protected boolean isQueryUnsupported(Class clazz) { if (clazz.isAssignableFrom(MultiTermQuery.class)) { diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 72be180c177..5f09d84f033 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; import java.util.List; @@ -732,7 +733,8 @@ public class UnifiedHighlighter { OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata); switch (offsetSource) { case ANALYSIS: - return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer()); + return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(), + this::preMultiTermQueryRewrite); case NONE_NEEDED: return NoOpOffsetStrategy.INSTANCE; case TERM_VECTORS: @@ -776,13 +778,14 @@ public class UnifiedHighlighter { boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES); boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY); return highlightPhrasesStrictly ? 
- new PhraseHelper(query, field, this::requiresRewrite, !handleMultiTermQuery) : + new PhraseHelper(query, field, this::requiresRewrite, this::preSpanQueryRewrite, !handleMultiTermQuery) : PhraseHelper.NONE; } protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) { return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY) - ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES)) + ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES), + this::preMultiTermQueryRewrite) : ZERO_LEN_AUTOMATA_ARRAY; } @@ -830,6 +833,32 @@ public class UnifiedHighlighter { return null; } + /** + * When highlighting phrases accurately, we may need to handle custom queries that aren't supported in the + * {@link org.apache.lucene.search.highlight.WeightedSpanTermExtractor} as called by the {@code PhraseHelper}. + * Should custom query types be needed, this method should be overridden to return a collection of queries if appropriate, + * or null if nothing to do. If the query is not custom, simply returning null will allow the default rules to apply. + * + * @param query Query to be highlighted + * @return A Collection of Query object(s) if the query needs to be rewritten, otherwise null. + */ + protected Collection<Query> preSpanQueryRewrite(Query query) { + return null; + } + + /** + * When dealing with multi term queries / span queries, we may need to handle custom queries that aren't supported + * by the default automata extraction in {@code MultiTermHighlighting}. This can be overridden to return a collection + * of queries if appropriate, or null if nothing to do. If the query is not custom, simply returning null will allow the + * default rules to apply. + * + * @param query Query to be highlighted + * @return A Collection of Query object(s) if the query needs to be rewritten, otherwise null.
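To make the two pre-rewrite hooks concrete: a subclass inspects the incoming query and, if it recognizes a custom type, returns the standard queries wrapped inside it; returning null defers to the built-in handling. A minimal sketch, assuming a searcher and analyzer already in scope and a hypothetical MyWrapperQuery with a getWrapped() accessor (neither name is part of this patch):

    // Hypothetical subclass; MyWrapperQuery / getWrapped() are illustrative only.
    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer) {
      @Override
      protected Collection<Query> preSpanQueryRewrite(Query query) {
        if (query instanceof MyWrapperQuery) {
          // hand back the wrapped query so PhraseHelper can extract spans from it
          return Collections.singletonList(((MyWrapperQuery) query).getWrapped());
        }
        return null; // unknown type: let the default rules apply
      }
    };

The tests added further below (testCustomSpanQueryHighlighting, testPreSpanQueryRewrite) exercise exactly this pattern.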
+ */ + protected Collection<Query> preMultiTermQueryRewrite(Query query) { + return null; + } + private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) { return new DocIdSetIterator() { int idx = -1; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java index 63f0bb1ca5b..ddc9507d62b 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java @@ -20,6 +20,8 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.List; +import java.util.Objects; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.Analyzer; @@ -56,6 +58,7 @@ import org.apache.lucene.search.spans.SpanNotQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; @@ -933,4 +936,89 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase { ir.close(); } + public void testCustomSpanQueryHighlighting() throws Exception { + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); + Document doc = new Document(); + doc.add(new Field("body", "alpha bravo charlie delta echo foxtrot golf hotel india juliet", fieldType)); + doc.add(newTextField("id", "id", Field.Store.YES)); + + iw.addDocument(doc); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected List<Query> preMultiTermQueryRewrite(Query query) { + if (query instanceof MyWrapperSpanQuery) { + return Collections.singletonList(((MyWrapperSpanQuery) query).originalQuery); + } + return null; + } + }; + + int docId = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; + + WildcardQuery wildcardQuery = new WildcardQuery(new Term("body", "foxtr*")); + SpanMultiTermQueryWrapper<WildcardQuery> wildcardQueryWrapper = new SpanMultiTermQueryWrapper<>(wildcardQuery); + + SpanQuery wrappedQuery = new MyWrapperSpanQuery(wildcardQueryWrapper); + + BooleanQuery query = new BooleanQuery.Builder() + .add(wrappedQuery, BooleanClause.Occur.SHOULD) + .build(); + + int[] docIds = new int[]{docId}; + + String snippets[] = highlighter.highlightFields(new String[]{"body"}, query, docIds, new int[]{2}).get("body"); + assertEquals(1, snippets.length); + assertEquals("alpha bravo charlie delta echo foxtrot golf hotel india juliet", snippets[0]); + ir.close(); + } + + private static class MyWrapperSpanQuery extends SpanQuery { + + private final SpanQuery originalQuery; + + private MyWrapperSpanQuery(SpanQuery originalQuery) { + this.originalQuery = Objects.requireNonNull(originalQuery); + } + + @Override + public String getField() { + return originalQuery.getField(); + } + + @Override + public String toString(String field) { + return "(Wrapper[" + originalQuery.toString(field)+"])"; + } + + @Override + public SpanWeight createWeight(IndexSearcher searcher, boolean needsScores, float boost)
throws IOException { + return originalQuery.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newOriginalQuery = originalQuery.rewrite(reader); + if (newOriginalQuery != originalQuery) { + return new MyWrapperSpanQuery((SpanQuery)newOriginalQuery); + } + return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + return originalQuery.equals(((MyWrapperSpanQuery)o).originalQuery); + } + + @Override + public int hashCode() { + return originalQuery.hashCode(); + } + } + } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java index 5fecdc6d5bc..dafb6e23460 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterStrictPhrases.java @@ -17,6 +17,8 @@ package org.apache.lucene.search.uhighlight; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.analysis.MockAnalyzer; @@ -29,14 +31,17 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.search.spans.SpanNearQuery; @@ -401,4 +406,76 @@ public class TestUnifiedHighlighterStrictPhrases extends LuceneTestCase { Object o = highlighter.highlightWithoutSearcher("body", new MatchNoDocsQuery(), content, 1); assertEquals(content, o); } + + public void testPreSpanQueryRewrite() throws IOException { + indexWriter.addDocument(newDoc("There is no accord and satisfaction with this - Consideration of the accord is arbitrary.")); + initReaderSearcherHighlighter(); + + highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { + @Override + protected Collection<Query> preSpanQueryRewrite(Query query) { + if (query instanceof MyQuery) { + return Collections.singletonList(((MyQuery)query).wrapped); + } + return null; + } + }; + highlighter.setHighlightPhrasesStrictly(true); + + BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); + Query phraseQuery = new BoostQuery(new PhraseQuery("body", "accord", "and", "satisfaction"), 2.0f); + Query oredTerms = new BooleanQuery.Builder() + .setMinimumNumberShouldMatch(2) + .add(new TermQuery(new Term("body", "accord")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "satisfaction")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("body", "consideration")), BooleanClause.Occur.SHOULD) + .build(); + Query
proximityBoostingQuery = new MyQuery(oredTerms); + Query totalQuery = bqBuilder + .add(phraseQuery, BooleanClause.Occur.SHOULD) + .add(proximityBoostingQuery, BooleanClause.Occur.SHOULD) + .build(); + TopDocs topDocs = searcher.search(totalQuery, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String[] snippets = highlighter.highlight("body", totalQuery, topDocs); + assertArrayEquals(new String[]{"There is no accord and satisfaction with this - Consideration of the accord is arbitrary."}, snippets); + } + + private static class MyQuery extends Query { + + private final Query wrapped; + + MyQuery(Query wrapped) { + this.wrapped = wrapped; + } + + @Override + public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { + return wrapped.createWeight(searcher, needsScores, boost); + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newWrapped = wrapped.rewrite(reader); + if (newWrapped != wrapped) { + return new MyQuery(newWrapped); + } + return this; + } + + @Override + public String toString(String field) { + return "[[["+wrapped.toString(field)+"]]]"; + } + + @Override + public boolean equals(Object obj) { + return obj != null && obj.getClass() == getClass() && wrapped.equals(((MyQuery)obj).wrapped); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + } } diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index f42e7010672..251ceb2c56c 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -228,7 +228,7 @@ org.bouncycastle.version = 1.45 /org.carrot2.attributes/attributes-binder = 1.3.1 /org.carrot2.shaded/carrot2-guava = 18.0 -/org.carrot2/carrot2-mini = 3.12.0 +/org.carrot2/carrot2-mini = 3.15.0 org.carrot2.morfologik.version = 2.1.1 /org.carrot2/morfologik-fsa = ${org.carrot2.morfologik.version} diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java index b49ea794476..d83b9155e5b 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -50,7 +50,6 @@ import org.apache.lucene.util.fst.PairOutputs.Pair; import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; -import org.apache.lucene.util.packed.PackedInts; /* TODO: @@ -354,8 +353,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, - FST_OUTPUTS, false, - PackedInts.COMPACT, true, 15); + FST_OUTPUTS, true, 15); //if (DEBUG) { // System.out.println(" compile index for prefix=" + prefix); //} diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java index 530a9b63993..fbf3dc3ca0e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonQuery.java @@ -23,9 +23,10 @@ import java.util.List; import java.util.Map; import java.util.Set; -import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.IndexReader; import
org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; @@ -34,6 +35,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Transition; @@ -183,6 +185,10 @@ public class TermAutomatonQuery extends Query { det = Operations.removeDeadStates(Operations.determinize(automaton, maxDeterminizedStates)); + + if (det.isAccept(0)) { + throw new IllegalStateException("cannot accept the empty string"); + } } @Override @@ -396,4 +402,82 @@ public class TermAutomatonQuery extends Query { return null; } } + + public Query rewrite(IndexReader reader) throws IOException { + if (Operations.isEmpty(det)) { + return new MatchNoDocsQuery(); + } + + IntsRef single = Operations.getSingleton(det); + if (single != null && single.length == 1) { + return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset]))); + } + + // TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery? + + // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage: + MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder(); + PhraseQuery.Builder pq = new PhraseQuery.Builder(); + + Transition t = new Transition(); + int state = 0; + int pos = 0; + query: + while (true) { + int count = det.initTransition(state, t); + if (count == 0) { + if (det.isAccept(state) == false) { + mpq = null; + pq = null; + } + break; + } else if (det.isAccept(state)) { + mpq = null; + pq = null; + break; + } + int dest = -1; + List<Term> terms = new ArrayList<>(); + boolean matchesAny = false; + for(int i=0;i<count;i++) { + det.getNextTransition(t); + if (i == 0) { + dest = t.dest; + } else if (dest != t.dest) { + mpq = null; + pq = null; + break query; + } + + matchesAny |= anyTermID >= t.min && anyTermID <= t.max; + + if (matchesAny == false) { + for(int termID=t.min;termID<=t.max;termID++) { + terms.add(new Term(field, idToTerm.get(termID))); + } + } + } + if (matchesAny == false) { + mpq.add(terms.toArray(new Term[terms.size()]), pos); + if (pq != null) { + if (terms.size() == 1) { + pq.add(terms.get(0), pos); + } else { + pq = null; + } + } + } + state = dest; + pos++; + } + + if (pq != null) { + return pq.build(); + } else if (mpq != null) { + return mpq.build(); + } + + // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"?
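The rewrite above only succeeds when the determinized automaton is a linear chain of positions (a "sausage"); any state with branching destinations falls through to the final return of the query itself. A minimal sketch of the effect, mirroring testRewriteSimplePhrase further below (the field and terms are illustrative, and reader is assumed to be an open IndexReader):

    // init --foo--> s1 --bar--> s2 (accept): a two-position sausage
    TermAutomatonQuery q = new TermAutomatonQuery("field");
    int init = q.createState();
    int s1 = q.createState();
    int s2 = q.createState();
    q.addTransition(init, s1, "foo");
    q.addTransition(s1, s2, "bar");
    q.setAccept(s2, true);
    q.finish();
    Query rewritten = q.rewrite(reader); // equivalent PhraseQuery "foo bar"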
+ return this; + } } diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java index 2467e9927c4..6055e0076cb 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java @@ -296,7 +296,6 @@ public class TestTermAutomatonQuery extends LuceneTestCase { while (scorer instanceof AssertingScorer) { scorer = ((AssertingScorer) scorer).getIn(); } - assert scorer instanceof TermAutomatonScorer; } @Override @@ -683,7 +682,7 @@ public class TestTermAutomatonQuery extends LuceneTestCase { w.addDocument(doc); doc = new Document(); - doc.add(newTextField("field", "comes here", Field.Store.NO)); + doc.add(newTextField("field", "comes foo", Field.Store.NO)); w.addDocument(doc); IndexReader r = w.getReader(); IndexSearcher s = newSearcher(r); @@ -691,9 +690,11 @@ public class TestTermAutomatonQuery extends LuceneTestCase { TermAutomatonQuery q = new TermAutomatonQuery("field"); int init = q.createState(); int s1 = q.createState(); + int s2 = q.createState(); q.addTransition(init, s1, "here"); - q.addTransition(s1, init, "comes"); - q.setAccept(init, true); + q.addTransition(s1, s2, "comes"); + q.addTransition(s2, s1, "here"); + q.setAccept(s1, true); q.finish(); assertEquals(1, s.search(q, 1).totalHits); @@ -779,8 +780,186 @@ public class TestTermAutomatonQuery extends LuceneTestCase { // System.out.println("DOT: " + q.toDot()); assertEquals(0, s.search(q, 1).totalHits); - w.close(); - r.close(); - dir.close(); + IOUtils.close(w, r, dir); + } + + public void testEmptyString() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + q.setAccept(initState, true); + try { + q.finish(); + fail("did not hit exc"); + } catch (IllegalStateException ise) { + // expected + } + } + + public void testRewriteNoMatch() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + q.finish(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "x y z", Field.Store.NO)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + assertTrue(q.rewrite(r) instanceof MatchNoDocsQuery); + IOUtils.close(w, r, dir); + } + + public void testRewriteTerm() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + int s1 = q.createState(); + q.addTransition(initState, s1, "foo"); + q.setAccept(s1, true); + q.finish(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "x y z", Field.Store.NO)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + Query rewrite = q.rewrite(r); + assertTrue(rewrite instanceof TermQuery); + assertEquals(new Term("field", "foo"), ((TermQuery) rewrite).getTerm()); + IOUtils.close(w, r, dir); + } + + public void testRewriteSimplePhrase() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + int s1 = q.createState(); + int s2 = q.createState(); + q.addTransition(initState, s1, "foo"); + q.addTransition(s1, s2, "bar"); + q.setAccept(s2, true); + q.finish(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new 
RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "x y z", Field.Store.NO)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + Query rewrite = q.rewrite(r); + assertTrue(rewrite instanceof PhraseQuery); + Term[] terms = ((PhraseQuery) rewrite).getTerms(); + assertEquals(new Term("field", "foo"), terms[0]); + assertEquals(new Term("field", "bar"), terms[1]); + + int[] positions = ((PhraseQuery) rewrite).getPositions(); + assertEquals(0, positions[0]); + assertEquals(1, positions[1]); + + IOUtils.close(w, r, dir); + } + + public void testRewritePhraseWithAny() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + int s1 = q.createState(); + int s2 = q.createState(); + int s3 = q.createState(); + q.addTransition(initState, s1, "foo"); + q.addAnyTransition(s1, s2); + q.addTransition(s2, s3, "bar"); + q.setAccept(s3, true); + q.finish(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "x y z", Field.Store.NO)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + Query rewrite = q.rewrite(r); + assertTrue(rewrite instanceof PhraseQuery); + Term[] terms = ((PhraseQuery) rewrite).getTerms(); + assertEquals(new Term("field", "foo"), terms[0]); + assertEquals(new Term("field", "bar"), terms[1]); + + int[] positions = ((PhraseQuery) rewrite).getPositions(); + assertEquals(0, positions[0]); + assertEquals(2, positions[1]); + + IOUtils.close(w, r, dir); + } + + public void testRewriteSimpleMultiPhrase() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + int s1 = q.createState(); + q.addTransition(initState, s1, "foo"); + q.addTransition(initState, s1, "bar"); + q.setAccept(s1, true); + q.finish(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "x y z", Field.Store.NO)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + Query rewrite = q.rewrite(r); + assertTrue(rewrite instanceof MultiPhraseQuery); + Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays(); + assertEquals(1, terms.length); + assertEquals(2, terms[0].length); + assertEquals(new Term("field", "foo"), terms[0][0]); + assertEquals(new Term("field", "bar"), terms[0][1]); + + int[] positions = ((MultiPhraseQuery) rewrite).getPositions(); + assertEquals(1, positions.length); + assertEquals(0, positions[0]); + + IOUtils.close(w, r, dir); + } + + public void testRewriteMultiPhraseWithAny() throws Exception { + TermAutomatonQuery q = new TermAutomatonQuery("field"); + int initState = q.createState(); + int s1 = q.createState(); + int s2 = q.createState(); + int s3 = q.createState(); + q.addTransition(initState, s1, "foo"); + q.addTransition(initState, s1, "bar"); + q.addAnyTransition(s1, s2); + q.addTransition(s2, s3, "baz"); + q.setAccept(s3, true); + q.finish(); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "x y z", Field.Store.NO)); + w.addDocument(doc); + + IndexReader r = w.getReader(); + Query rewrite = q.rewrite(r); + assertTrue(rewrite instanceof MultiPhraseQuery); + Term[][] terms = ((MultiPhraseQuery) rewrite).getTermArrays(); + assertEquals(2, terms.length); + assertEquals(2, terms[0].length); + 
assertEquals(new Term("field", "foo"), terms[0][0]); + assertEquals(new Term("field", "bar"), terms[0][1]); + assertEquals(1, terms[1].length); + assertEquals(new Term("field", "baz"), terms[1][0]); + + int[] positions = ((MultiPhraseQuery) rewrite).getPositions(); + assertEquals(2, positions.length); + assertEquals(0, positions[0]); + assertEquals(2, positions[1]); + + IOUtils.close(w, r, dir); } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java index 370672494a2..3d20412152a 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java @@ -26,7 +26,6 @@ import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.*; -import org.apache.lucene.util.packed.PackedInts; /** * Finite state automata based implementation of "autocomplete" functionality. @@ -237,8 +236,7 @@ public class FSTCompletionBuilder { final Object empty = outputs.getNoOutput(); final Builder builder = new Builder<>( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, - shareMaxTailLength, outputs, false, - PackedInts.DEFAULT, true, 15); + shareMaxTailLength, outputs, true, 15); BytesRefBuilder scratch = new BytesRefBuilder(); BytesRef entry; diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java index 9a14b5daa73..dee7d8405c0 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/CheckHits.java @@ -368,8 +368,9 @@ public class CheckHits { boolean productOf = descr.endsWith("product of:"); boolean sumOf = descr.endsWith("sum of:"); boolean maxOf = descr.endsWith("max of:"); + boolean computedOf = descr.matches(".*, computed as .* from:"); boolean maxTimesOthers = false; - if (!(productOf || sumOf || maxOf)) { + if (!(productOf || sumOf || maxOf || computedOf)) { // maybe 'max plus x times others' int k1 = descr.indexOf("max plus "); if (k1>=0) { @@ -387,9 +388,9 @@ public class CheckHits { // TODO: this is a TERRIBLE assertion!!!! 
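The new computedOf branch in CheckHits accepts multi-valued explanations whose description has the shape ", computed as ... from:". A small illustration of the match (the sample description text approximates the new similarity explanations and is not quoted from this patch):

    String descr = "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:";
    boolean computedOf = descr.matches(".*, computed as .* from:"); // true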
Assert.assertTrue( q+": multi valued explanation description=\""+descr - +"\" must be 'max of plus x times others' or end with 'product of'" + +"\" must be 'max of plus x times others', 'computed as x from:' or end with 'product of'" +" or 'sum of:' or 'max of:' - "+expl, - productOf || sumOf || maxOf || maxTimesOthers); + productOf || sumOf || maxOf || computedOf || maxTimesOthers); float sum = 0; float product = 1; float max = 0; @@ -410,7 +411,8 @@ public class CheckHits { } else if (maxTimesOthers) { combined = max + x * (sum - max); } else { - Assert.assertTrue("should never get here!",false); + Assert.assertTrue("should never get here!", computedOf); + combined = value; } Assert.assertEquals(q+": actual subDetails combined=="+combined+ " != value="+value+" Explanation: "+expl, diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java index 136d7e508de..43b6c3cae31 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/RandomSimilarity.java @@ -91,6 +91,7 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper { allSims = new ArrayList<>(); allSims.add(new ClassicSimilarity()); allSims.add(new BM25Similarity()); + allSims.add(new BooleanSimilarity()); for (BasicModel basicModel : BASIC_MODELS) { for (AfterEffect afterEffect : AFTER_EFFECTS) { for (Normalization normalization : NORMALIZATIONS) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index 0e4facc5249..96b43534562 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene70.Lucene70Codec; import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import org.apache.lucene.index.RandomCodec; -import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.RandomSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; @@ -213,7 +212,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule { TimeZone randomTimeZone = randomTimeZone(random()); timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone); TimeZone.setDefault(timeZone); - similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random()); + similarity = new RandomSimilarity(random()); // Check codec restrictions once at class level. 
try { diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index 11b132572ed..8e6a4ea023c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -40,7 +40,6 @@ import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.packed.PackedInts; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -273,25 +272,14 @@ public class FSTTester { System.out.println("\nTEST: prune1=" + prune1 + " prune2=" + prune2); } - final boolean willRewrite = random.nextBoolean(); - final Builder builder = new Builder<>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, prune1, prune2, prune1==0 && prune2==0, allowRandomSuffixSharing ? random.nextBoolean() : true, allowRandomSuffixSharing ? TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, outputs, - willRewrite, - PackedInts.DEFAULT, true, 15); - if (LuceneTestCase.VERBOSE) { - if (willRewrite) { - System.out.println("TEST: packed FST"); - } else { - System.out.println("TEST: non-packed FST"); - } - } for(InputOutput pair : pairs) { if (pair.output instanceof List) { @@ -306,7 +294,7 @@ public class FSTTester { } FST fst = builder.finish(); - if (random.nextBoolean() && fst != null && !willRewrite) { + if (random.nextBoolean() && fst != null) { IOContext context = LuceneTestCase.newIOContext(random); IndexOutput out = dir.createOutput("fst.bin", context); fst.save(out); diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index cfe045da50b..efd1c942d27 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -70,7 +70,7 @@ Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this r Versions of Major Components --------------------- Apache Tika 1.13 -Carrot2 3.12.0 +Carrot2 3.15.0 Velocity 1.7 and Velocity Tools 2.0 Apache UIMA 2.3.1 Apache ZooKeeper 3.4.6 @@ -81,6 +81,9 @@ Detailed Change List New Features ---------------------- +* SOLR-9293: Solrj client support for hierarchical clusters and other topics + marker. (Dawid Weiss) + * SOLR-9681: FacetModule / JSON Facet API added the ability to add filters directly to any facet command. The filters are applied after any domain change operations. Example: { type:terms, field:category, filter:"user:yonik" } @@ -96,11 +99,21 @@ New Features * SOLR-8542: Adds Solr Learning to Rank (LTR) plugin for reranking results with machine learning models. (Michael Nilsson, Diego Ceccarelli, Joshua Pantony, Jon Dorando, Naveen Santhapuri, Alessandro Benedetti, David Grohmann, Christine Poerschke) +* SOLR-9055: Make collection backup/restore extensible. (Hrishikesh Gadre, Varun Thacker, Mark Miller) + +* SOLR-9682: JSON Facet API: added "param" query type to facet domain filter specification to obtain + filters via query parameters. (yonik) + +* SOLR-9038: Add a command-line tool to manage the snapshots functionality (Hrishikesh Gadre via yonik) + Optimizations ---------------------- * SOLR-9704: Facet Module / JSON Facet API: Optimize blockChildren facets that have filters specified by using those filters as acceptDocs. (yonik) +* SOLR-9726: Reduce number of lookupOrd calls made by the DocValuesFacets.getCounts method. 
+ (Jonny Marks via Christine Poerschke) + Bug Fixes ---------------------- * SOLR-9701: NPE in export handler when "fl" parameter is omitted. @@ -109,15 +122,43 @@ Bug Fixes * SOLR-9433: SolrCore clean-up logic uses incorrect path to delete dataDir on failure to create a core. (Evan Sayer, shalin) +* SOLR-9360: Solr script not properly checking SOLR_PID + (Alessandro Benedetti via Erick Erickson) + +* SOLR-9716: RecoveryStrategy sends prep recovery command without setting read time out which can cause + replica recovery to hang indefinitely on network partitions. (Cao Manh Dat, shalin) + +* SOLR-9624: In Admin UI, do not attempt to highlight CSV output (Alexandre Rafalovitch) + +* SOLR-9005: In files example, add a guard condition to javascript URP script (Alexandre Rafalovitch) + +* SOLR-9519: JSON Facet API: don't stop at an empty facet bucket if any sub-facets still have a chance + of matching something due to filter exclusions (which can widen the domain again). + (Michael Sun, yonik) + +* SOLR-9740: A bug in macro expansion of multi-valued parameters caused non-expanded values + after the first expanded value in the same multi-valued parameter to be dropped. + (Erik Hatcher, yonik) + + Other Changes ---------------------- +* SOLR-7539: Upgrade the clustering plugin to Carrot2 3.15.0. (Dawid Weiss) + * SOLR-9621: Remove several Guava & Apache Commons calls in favor of java 8 alternatives. (Michael Braun via David Smiley) * SOLR-9720: Refactor Responsewriters to remove dependencies on TupleStream, Tuple, Explanation (noble) +* SOLR-9717: Refactor '/export' to not hardcode the JSON output and to use an API (noble) + +* SOLR-9739: JavabinCodec implements PushWriter interface (noble) + +* SOLR-8332: Factor HttpShardHandler[Factory]'s url shuffling out into a ReplicaListTransformer class. + (Christine Poerschke, Noble Paul) + ================== 6.3.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. diff --git a/solr/bin/solr b/solr/bin/solr index 1d8edfa90f7..880fcef9981 100755 --- a/solr/bin/solr +++ b/solr/bin/solr @@ -495,7 +495,7 @@ function solr_pid_by_port() { # extract the value of the -Djetty.port parameter from a running Solr process function jetty_port() { SOLR_PID="$1" - SOLR_PROC=`ps auxww | grep -w $SOLR_PID | grep start\.jar | grep jetty.port` + SOLR_PROC=`ps auxww | grep -w $SOLR_PID | grep start\.jar | grep jetty\.port` IFS=' ' read -a proc_args <<< "$SOLR_PROC" for arg in "${proc_args[@]}" do @@ -543,10 +543,10 @@ function get_info() { done < <(find "$SOLR_PID_DIR" -name "solr-*.pid" -type f) else # no pid files but check using ps just to be sure - numSolrs=`ps auxww | grep start\.jar | grep solr.solr.home | grep -v grep | wc -l | sed -e 's/^[ \t]*//'` + numSolrs=`ps auxww | grep start\.jar | grep solr\.solr\.home | grep -v grep | wc -l | sed -e 's/^[ \t]*//'` if [ "$numSolrs" != "0" ]; then echo -e "\nFound $numSolrs Solr nodes: " - PROCESSES=$(ps auxww | grep start\.jar | grep solr.solr.home | grep -v grep | awk '{print $2}' | sort -r) + PROCESSES=$(ps auxww | grep start\.jar | grep solr\.solr\.home | grep -v grep | awk '{print $2}' | sort -r) for ID in $PROCESSES do port=`jetty_port "$ID"` @@ -1345,7 +1345,7 @@ if [[ "$SCRIPT_CMD" == "start" ]]; then if [ -z "$SOLR_PID" ]; then # not found using the pid file ... 
but use ps to ensure not found - SOLR_PID=`ps auxww | grep start\.jar | grep -w $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r` + SOLR_PID=`ps auxww | grep start\.jar | grep -w "\-Djetty\.port=$SOLR_PORT" | grep -v grep | awk '{print $2}' | sort -r` fi if [ "$SOLR_PID" != "" ]; then @@ -1358,7 +1358,7 @@ else SOLR_PID=`solr_pid_by_port "$SOLR_PORT"` if [ -z "$SOLR_PID" ]; then # not found using the pid file ... but use ps to ensure not found - SOLR_PID=`ps auxww | grep start\.jar | grep -w $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r` + SOLR_PID=`ps auxww | grep start\.jar | grep -w "\-Djetty\.port=$SOLR_PORT" | grep -v grep | awk '{print $2}' | sort -r` fi if [ "$SOLR_PID" != "" ]; then stop_solr "$SOLR_SERVER_DIR" "$SOLR_PORT" "$STOP_KEY" "$SOLR_PID" @@ -1659,7 +1659,7 @@ function launch_solr() { exit # subshell! fi else - SOLR_PID=`ps auxww | grep start\.jar | grep -w $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r` + SOLR_PID=`ps auxww | grep start\.jar | grep -w "\-Djetty\.port=$SOLR_PORT" | grep -v grep | awk '{print $2}' | sort -r` echo -e "\nStarted Solr server on port $SOLR_PORT (pid=$SOLR_PID). Happy searching!\n" exit # subshell! fi @@ -1668,7 +1668,7 @@ function launch_solr() { else echo -e "NOTE: Please install lsof as this script needs it to determine if Solr is listening on port $SOLR_PORT." sleep 10 - SOLR_PID=`ps auxww | grep start\.jar | grep -w $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r` + SOLR_PID=`ps auxww | grep start\.jar | grep -w "\-Djetty\.port=$SOLR_PORT" | grep -v grep | awk '{print $2}' | sort -r` echo -e "\nStarted Solr server on port $SOLR_PORT (pid=$SOLR_PID). Happy searching!\n" return; fi diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java index 42a2de98ae8..6275c906f02 100644 --- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java +++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java @@ -19,6 +19,7 @@ package org.apache.solr.handler.clustering; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Map; @@ -44,9 +45,6 @@ import org.apache.solr.util.plugin.SolrCoreAware; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.Maps; - - /** * Provides a plugin for performing cluster analysis. This can either be applied to * search results (e.g., via Carrot2) or for @@ -68,12 +66,12 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar /** * Declaration-order list of search clustering engines. */ - private final LinkedHashMap searchClusteringEngines = Maps.newLinkedHashMap(); - + private final LinkedHashMap searchClusteringEngines = new LinkedHashMap<>(); + /** * Declaration order list of document clustering engines. */ - private final LinkedHashMap documentClusteringEngines = Maps.newLinkedHashMap(); + private final LinkedHashMap documentClusteringEngines = new LinkedHashMap<>(); /** * An unmodifiable view of {@link #searchClusteringEngines}. 
@@ -173,7 +171,7 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar if (engine != null) { checkAvailable(name, engine); DocListAndSet results = rb.getResults(); - Map docIds = Maps.newHashMapWithExpectedSize(results.docList.size()); + Map docIds = new HashMap<>(results.docList.size()); SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds); Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req); diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java index a8548eceb8c..951cce5c4b0 100644 --- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java +++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java @@ -58,6 +58,8 @@ import org.carrot2.core.Document; import org.carrot2.core.IClusteringAlgorithm; import org.carrot2.core.LanguageCode; import org.carrot2.core.attribute.AttributeNames; +import org.carrot2.shaded.guava.common.base.MoreObjects; +import org.carrot2.shaded.guava.common.base.Strings; import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor; import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor; import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder; @@ -69,12 +71,6 @@ import org.carrot2.util.resource.ResourceLookup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.base.Objects; -import com.google.common.base.Strings; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - /** * Search results clustering engine based on Carrot2 clustering algorithms. * @@ -155,7 +151,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { // Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute // of this component. This by-name convention lookup is used to simplify configuring algorithms. 
String componentName = initParams.get(ClusteringEngine.ENGINE_NAME); - log.info("Initializing Clustering Engine '" + Objects.firstNonNull(componentName, "") + "'"); + log.info("Initializing Clustering Engine '" + + MoreObjects.firstNonNull(componentName, "") + "'"); if (!Strings.isNullOrEmpty(componentName)) { IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml"); @@ -268,7 +265,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { protected Set getFieldsToLoad(SolrQueryRequest sreq){ SolrParams solrParams = sreq.getParams(); - HashSet fields = Sets.newHashSet(getFieldsForClustering(sreq)); + HashSet fields = new HashSet<>(getFieldsForClustering(sreq)); fields.add(idFieldName); fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url")); fields.addAll(getCustomFieldsMap(solrParams).keySet()); @@ -295,7 +292,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { + " must not be blank."); } - final Set fields = Sets.newHashSet(); + final Set fields = new HashSet<>(); fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]"))); fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]"))); return fields; @@ -319,7 +316,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { Map customFields = getCustomFieldsMap(solrParams); // Parse language code map string into a map - Map languageCodeMap = Maps.newHashMap(); + Map languageCodeMap = new HashMap<>(); if (StringUtils.isNotBlank(languageField)) { for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) { final String[] split = pair.split(":"); @@ -340,7 +337,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { if (produceSummary) { highlighter = HighlightComponent.getHighlighter(core); if (highlighter != null){ - Map args = Maps.newHashMap(); + Map args = new HashMap<>(); snippetFieldAry = snippetFieldSpec.split("[, ]"); args.put(HighlightParams.FIELDS, snippetFieldAry); args.put(HighlightParams.HIGHLIGHT, "true"); @@ -466,10 +463,10 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { * custom field names. */ private Map getCustomFieldsMap(SolrParams solrParams) { - Map customFields = Maps.newHashMap(); + Map customFields = new HashMap<>(); String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME); if (customFieldsSpec != null) { - customFields = Maps.newHashMap(); + customFields = new HashMap<>(); for (String customFieldSpec : customFieldsSpec) { String [] split = customFieldSpec.split(":"); if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) { @@ -501,7 +498,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { private List> clustersToNamedList(List carrotClusters, SolrParams solrParams) { - List> result = Lists.newArrayList(); + List> result = new ArrayList<>(); clustersToNamedList(carrotClusters, result, solrParams.getBool( CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt( CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE)); @@ -534,7 +531,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { // Add documents List docs = outputSubClusters ? 
outCluster.getDocuments() : outCluster.getAllDocuments(); - List docList = Lists.newArrayList(); + List docList = new ArrayList<>(); cluster.add("docs", docList); for (Document doc : docs) { docList.add(doc.getField(SOLR_DOCUMENT_ID)); @@ -542,7 +539,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine { // Add subclusters if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) { - List> subclusters = Lists.newArrayList(); + List> subclusters = new ArrayList<>(); cluster.add("clusters", subclusters); clustersToNamedList(outCluster.getSubclusters(), subclusters, outputSubClusters, maxLabels); diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java index 71a22fe9343..d0fb0d5ded7 100644 --- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java +++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java @@ -16,10 +16,10 @@ */ package org.apache.solr.handler.clustering.carrot2; +import java.util.Arrays; +import java.util.HashSet; import java.util.Set; -import com.google.common.collect.ImmutableSet; - /** * Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration). * @lucene.experimental @@ -50,7 +50,7 @@ public final class CarrotParams { */ public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir"; - static final Set CARROT_PARAM_NAMES = ImmutableSet.of( + static final Set CARROT_PARAM_NAMES = new HashSet<>(Arrays.asList( ALGORITHM, TITLE_FIELD_NAME, @@ -66,8 +66,8 @@ public final class CarrotParams { NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, RESOURCES_DIR, - LANGUAGE_CODE_MAP); - + LANGUAGE_CODE_MAP)); + /** No instances. 
*/ private CarrotParams() {} } diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java index 3caaf54f27a..569b1bb2307 100644 --- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java +++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java @@ -16,7 +16,9 @@ */ package org.apache.solr.handler.clustering.carrot2; -import java.util.Collection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; @@ -26,6 +28,7 @@ import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.solr.analysis.TokenizerChain; import org.apache.solr.core.SolrCore; +import org.apache.solr.schema.IndexSchema; import org.carrot2.core.LanguageCode; import org.carrot2.core.attribute.Init; import org.carrot2.core.attribute.Processing; @@ -37,9 +40,6 @@ import org.carrot2.util.attribute.Attribute; import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Input; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Multimap; - /** * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop * words from a field's StopFilter to the default stop words used in Carrot2, @@ -67,7 +67,7 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFacto /** * A lazily-built cache of stop words per field. */ - private Multimap solrStopWords = HashMultimap.create(); + private HashMap> solrStopWords = new HashMap<>(); /** * Carrot2's default lexical resources to use in addition to Solr's stop @@ -79,31 +79,34 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFacto * Obtains stop words for a field from the associated * {@link StopFilterFactory}, if any. */ - private Collection getSolrStopWordsForField(String fieldName) { + private List getSolrStopWordsForField(String fieldName) { // No need to synchronize here, Carrot2 ensures that instances // of this class are not used by multiple threads at a time. 
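The factory change that follows replaces the Guava Multimap cache with a plain HashMap of lists and, despite the retained comment, now guards it with a monitor. A condensed sketch of the resulting lazily-built per-field cache pattern (the method body is abbreviated relative to the patch):

    private final HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();

    private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
      synchronized (solrStopWords) {
        List<CharArraySet> sets = solrStopWords.get(fieldName);
        if (sets == null) {
          sets = new ArrayList<>();
          solrStopWords.put(fieldName, sets);
          // walk the field's TokenFilterFactory chain and collect the
          // StopFilterFactory / CommonGramsFilterFactory word sets, as below
        }
        return sets;
      }
    }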
- if (!solrStopWords.containsKey(fieldName)) { - final Analyzer fieldAnalyzer = core.getLatestSchema().getFieldType(fieldName) - .getIndexAnalyzer(); - if (fieldAnalyzer instanceof TokenizerChain) { - final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer) - .getTokenFilterFactories(); - for (TokenFilterFactory factory : filterFactories) { - if (factory instanceof StopFilterFactory) { - // StopFilterFactory holds the stop words in a CharArraySet - solrStopWords.put(fieldName, - ((StopFilterFactory) factory).getStopWords()); - } + synchronized (solrStopWords) { + if (!solrStopWords.containsKey(fieldName)) { + solrStopWords.put(fieldName, new ArrayList<>()); - if (factory instanceof CommonGramsFilterFactory) { - solrStopWords.put(fieldName, - ((CommonGramsFilterFactory) factory) - .getCommonWords()); + IndexSchema schema = core.getLatestSchema(); + final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer(); + if (fieldAnalyzer instanceof TokenizerChain) { + final TokenFilterFactory[] filterFactories = + ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories(); + for (TokenFilterFactory factory : filterFactories) { + if (factory instanceof StopFilterFactory) { + // StopFilterFactory holds the stop words in a CharArraySet + CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords(); + solrStopWords.get(fieldName).add(stopWords); + } + + if (factory instanceof CommonGramsFilterFactory) { + CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords(); + solrStopWords.get(fieldName).add(commonWords); + } } } } + return solrStopWords.get(fieldName); } - return solrStopWords.get(fieldName); } @Override diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java index 752570de3c1..3d6f3d39e5f 100644 --- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java +++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java @@ -17,6 +17,9 @@ package org.apache.solr.handler.clustering.carrot2; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -45,9 +48,6 @@ import org.carrot2.core.LanguageCode; import org.carrot2.util.attribute.AttributeUtils; import org.junit.Test; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; - /** * */ @@ -211,7 +211,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase { // stoplabels.mt, so we're expecting only one cluster with label "online". final List> clusters = checkEngine( getClusteringEngine(engineName), 1, params); - assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online")); + assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online")); } @Test @@ -226,7 +226,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase { // only one cluster with label "online". 
final List> clusters = checkEngine( getClusteringEngine("lexical-resource-check"), 1, params); - assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online")); + assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online")); } @Test @@ -243,9 +243,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase { final List> clusters = checkEngine( getClusteringEngine("lexical-resource-check"), 2, params); - assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0))); - assertEquals(ImmutableList.of("solrownstopword"), - getLabels(clusters.get(1))); + assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0))); + assertEquals(Collections.singletonList("solrownstopword"), getLabels(clusters.get(1))); } @Test @@ -395,8 +394,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase { ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default"); Map engines = getSearchClusteringEngines(comp); assertEquals( - Lists.newArrayList("stc", "default", "mock"), - Lists.newArrayList(engines.keySet())); + Arrays.asList("stc", "default", "mock"), + new ArrayList<>(engines.keySet())); assertEquals( LingoClusteringAlgorithm.class, ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass()); @@ -407,8 +406,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase { ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order"); Map engines = getSearchClusteringEngines(comp); assertEquals( - Lists.newArrayList("unavailable", "lingo", "stc", "mock", "default"), - Lists.newArrayList(engines.keySet())); + Arrays.asList("unavailable", "lingo", "stc", "mock", "default"), + new ArrayList<>(engines.keySet())); assertEquals( LingoClusteringAlgorithm.class, ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass()); @@ -419,8 +418,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase { ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-dups"); Map engines = getSearchClusteringEngines(comp); assertEquals( - Lists.newArrayList("", "default"), - Lists.newArrayList(engines.keySet())); + Arrays.asList("", "default"), + new ArrayList<>(engines.keySet())); assertEquals( MockClusteringAlgorithm.class, ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass()); diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java index 5fb13f88519..2c95da3da9a 100644 --- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java +++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java @@ -15,6 +15,7 @@ * limitations under the License. */ package org.apache.solr.handler.clustering.carrot2; +import java.util.ArrayList; import java.util.List; import org.carrot2.core.Cluster; @@ -29,8 +30,6 @@ import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Output; -import com.google.common.collect.Lists; - /** * A mock Carrot2 clustering algorithm that outputs input documents as clusters. * Useful only in tests. 
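The clustering-contrib edits here and below all apply the same SOLR-9621-style substitutions of Guava helpers with plain JDK collections; a compact reference of the mappings used (variable names illustrative):

    Map<String, Integer> m = new HashMap<>();               // was Maps.newHashMap()
    List<String> l = new ArrayList<>();                     // was Lists.newArrayList()
    Set<String> s = new HashSet<>(Arrays.asList("a", "b")); // was ImmutableSet.of("a", "b") -- note: now mutable
    // Objects.firstNonNull(x, "") becomes MoreObjects.firstNonNull(x, "") via Carrot2's shaded Guava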
@@ -56,7 +55,7 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements @Override public void process() throws ProcessingException { - clusters = Lists.newArrayListWithCapacity(documents.size()); + clusters = new ArrayList<>(); for (Document document : documents) { final Cluster cluster = new Cluster(); diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoStemsClusteringAlgorithm.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoStemsClusteringAlgorithm.java index dfd762f5211..f39fcd968b8 100644 --- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoStemsClusteringAlgorithm.java +++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoStemsClusteringAlgorithm.java @@ -16,6 +16,7 @@ */ package org.apache.solr.handler.clustering.carrot2; +import java.util.ArrayList; import java.util.List; import org.carrot2.core.Cluster; @@ -36,8 +37,6 @@ import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Output; -import com.google.common.collect.Lists; - /** * A mock Carrot2 clustering algorithm that outputs stem of each token of each * document as a separate cluster. Useful only in tests. @@ -64,7 +63,7 @@ public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase final AllTokens allTokens = preprocessingContext.allTokens; final AllWords allWords = preprocessingContext.allWords; final AllStems allStems = preprocessingContext.allStems; - clusters = Lists.newArrayListWithCapacity(allTokens.image.length); + clusters = new ArrayList<>(); for (int i = 0; i < allTokens.image.length; i++) { if (allTokens.wordIndex[i] >= 0) { clusters.add(new Cluster(new String( diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoTokensClusteringAlgorithm.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoTokensClusteringAlgorithm.java index 0346209e82e..32e47d82fa6 100644 --- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoTokensClusteringAlgorithm.java +++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoTokensClusteringAlgorithm.java @@ -16,6 +16,7 @@ */ package org.apache.solr.handler.clustering.carrot2; +import java.util.ArrayList; import java.util.List; import org.carrot2.core.Cluster; @@ -33,7 +34,6 @@ import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Output; -import com.google.common.collect.Lists; /** * A mock Carrot2 clustering algorithm that outputs each token of each document @@ -58,8 +58,7 @@ public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase public void process() throws ProcessingException { final PreprocessingContext preprocessingContext = preprocessing.preprocess( documents, "", LanguageCode.ENGLISH); - clusters = Lists - .newArrayListWithCapacity(preprocessingContext.allTokens.image.length); + clusters = new ArrayList<>(); for (char[] token : preprocessingContext.allTokens.image) { if (token != null) { clusters.add(new Cluster(new String(token))); diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java index e32f5af857b..9f69040a9ed 100644 
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java +++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/LexicalResourcesCheckClusteringAlgorithm.java @@ -16,6 +16,7 @@ */ package org.apache.solr.handler.clustering.carrot2; +import java.util.ArrayList; import java.util.List; import org.carrot2.core.Cluster; @@ -33,8 +34,6 @@ import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Output; -import com.google.common.collect.Lists; - /** * A mock implementation of Carrot2 clustering algorithm for testing whether the * customized lexical resource lookup works correctly. This algorithm ignores @@ -60,7 +59,7 @@ public class LexicalResourcesCheckClusteringAlgorithm extends @Override public void process() throws ProcessingException { - clusters = Lists.newArrayList(); + clusters = new ArrayList<>(); if (wordsToCheck == null) { return; } diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java index 899bcbc0e9d..ba978a5dace 100644 --- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java +++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java @@ -15,13 +15,13 @@ * limitations under the License. */ package org.apache.solr.handler.clustering.carrot2; -import com.google.common.collect.Lists; import org.carrot2.core.*; import org.carrot2.core.attribute.AttributeNames; import org.carrot2.core.attribute.Processing; import org.carrot2.util.attribute.*; import org.carrot2.util.attribute.constraint.IntRange; +import java.util.ArrayList; import java.util.List; @Bindable(prefix = "MockClusteringAlgorithm") @@ -62,7 +62,7 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements @Override public void process() throws ProcessingException { - clusters = Lists.newArrayList(); + clusters = new ArrayList<>(); if (documents == null) { return; } diff --git a/solr/core/src/java/org/apache/solr/cloud/BackupCmd.java b/solr/core/src/java/org/apache/solr/cloud/BackupCmd.java index b859d8ea0d6..a4012f05fc7 100644 --- a/solr/core/src/java/org/apache/solr/cloud/BackupCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/BackupCmd.java @@ -26,6 +26,7 @@ import java.util.Map; import java.util.Optional; import java.util.Properties; +import org.apache.lucene.util.Version; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.cloud.ClusterState; @@ -35,6 +36,7 @@ import org.apache.solr.common.cloud.Replica.State; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.ZkNodeProps; +import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; @@ -68,31 +70,13 @@ public class BackupCmd implements OverseerCollectionMessageHandler.Cmd { public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { String collectionName = message.getStr(COLLECTION_PROP); String backupName = message.getStr(NAME); - ShardHandler shardHandler = 
ocmh.shardHandlerFactory.getShardHandler(); - String asyncId = message.getStr(ASYNC); String repo = message.getStr(CoreAdminParams.BACKUP_REPOSITORY); - String commitName = message.getStr(CoreAdminParams.COMMIT_NAME); - Optional snapshotMeta = Optional.empty(); - if (commitName != null) { - SolrZkClient zkClient = ocmh.overseer.getZkController().getZkClient(); - snapshotMeta = SolrSnapshotManager.getCollectionLevelSnapshot(zkClient, collectionName, commitName); - if (!snapshotMeta.isPresent()) { - throw new SolrException(ErrorCode.BAD_REQUEST, "Snapshot with name " + commitName - + " does not exist for collection " + collectionName); - } - if (snapshotMeta.get().getStatus() != SnapshotStatus.Successful) { - throw new SolrException(ErrorCode.BAD_REQUEST, "Snapshot with name " + commitName + " for collection " + collectionName - + " has not completed successfully. The status is " + snapshotMeta.get().getStatus()); - } - } - - Map requestMap = new HashMap<>(); Instant startTime = Instant.now(); CoreContainer cc = ocmh.overseer.getZkController().getCoreContainer(); BackupRepository repository = cc.newBackupRepository(Optional.ofNullable(repo)); - BackupManager backupMgr = new BackupManager(repository, ocmh.zkStateReader, collectionName); + BackupManager backupMgr = new BackupManager(repository, ocmh.zkStateReader); // Backup location URI location = repository.createURI(message.getStr(CoreAdminParams.BACKUP_LOCATION)); @@ -106,50 +90,16 @@ public class BackupCmd implements OverseerCollectionMessageHandler.Cmd { // Create a directory to store backup details. repository.createDirectory(backupPath); - log.info("Starting backup of collection={} with backupName={} at location={}", collectionName, backupName, - backupPath); - - Collection shardsToConsider = Collections.emptySet(); - if (snapshotMeta.isPresent()) { - shardsToConsider = snapshotMeta.get().getShards(); - } - - for (Slice slice : ocmh.zkStateReader.getClusterState().getCollection(collectionName).getActiveSlices()) { - Replica replica = null; - - if (snapshotMeta.isPresent()) { - if (!shardsToConsider.contains(slice.getName())) { - log.warn("Skipping the backup for shard {} since it wasn't part of the collection {} when snapshot {} was created.", - slice.getName(), collectionName, snapshotMeta.get().getName()); - continue; - } - replica = selectReplicaWithSnapshot(snapshotMeta.get(), slice); - } else { - // Note - Actually this can return a null value when there is no leader for this shard. - replica = slice.getLeader(); - if (replica == null) { - throw new SolrException(ErrorCode.SERVER_ERROR, "No 'leader' replica available for shard " + slice.getName() + " of collection " + collectionName); - } + String strategy = message.getStr(CollectionAdminParams.INDEX_BACKUP_STRATEGY, CollectionAdminParams.COPY_FILES_STRATEGY); + switch (strategy) { + case CollectionAdminParams.COPY_FILES_STRATEGY: { + copyIndexFiles(backupPath, message, results); + break; } - - String coreName = replica.getStr(CORE_NAME_PROP); - - ModifiableSolrParams params = new ModifiableSolrParams(); - params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.BACKUPCORE.toString()); - params.set(NAME, slice.getName()); - params.set(CoreAdminParams.BACKUP_REPOSITORY, repo); - params.set(CoreAdminParams.BACKUP_LOCATION, backupPath.toASCIIString()); // note: index dir will be here then the "snapshot." 
+ slice name - params.set(CORE_NAME_PROP, coreName); - if (snapshotMeta.isPresent()) { - params.set(CoreAdminParams.COMMIT_NAME, snapshotMeta.get().getName()); + case CollectionAdminParams.NO_INDEX_BACKUP_STRATEGY: { + break; } - - ocmh.sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap); - log.debug("Sent backup request to core={} for backupName={}", coreName, backupName); } - log.debug("Sent backup requests to all shard leaders for backupName={}", backupName); - - ocmh.processResponses(results, shardHandler, true, "Could not backup all replicas", asyncId, requestMap); log.info("Starting to backup ZK data for backupName={}", backupName); @@ -168,6 +118,7 @@ public class BackupCmd implements OverseerCollectionMessageHandler.Cmd { properties.put(BackupManager.COLLECTION_NAME_PROP, collectionName); properties.put(COLL_CONF, configName); properties.put(BackupManager.START_TIME_PROP, startTime.toString()); + properties.put(BackupManager.INDEX_VERSION_PROP, Version.LATEST.toString()); //TODO: Add MD5 of the configset. If during restore the same name configset exists then we can compare checksums to see if they are the same. //if they are not the same then we can throw an error or have an 'overwriteConfig' flag //TODO save numDocs for the shardLeader. We can use it to sanity check the restore. @@ -202,4 +153,73 @@ public class BackupCmd implements OverseerCollectionMessageHandler.Cmd { return r.get(); } + + private void copyIndexFiles(URI backupPath, ZkNodeProps request, NamedList results) throws Exception { + String collectionName = request.getStr(COLLECTION_PROP); + String backupName = request.getStr(NAME); + String asyncId = request.getStr(ASYNC); + String repoName = request.getStr(CoreAdminParams.BACKUP_REPOSITORY); + ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(); + Map requestMap = new HashMap<>(); + + String commitName = request.getStr(CoreAdminParams.COMMIT_NAME); + Optional snapshotMeta = Optional.empty(); + if (commitName != null) { + SolrZkClient zkClient = ocmh.overseer.getZkController().getZkClient(); + snapshotMeta = SolrSnapshotManager.getCollectionLevelSnapshot(zkClient, collectionName, commitName); + if (!snapshotMeta.isPresent()) { + throw new SolrException(ErrorCode.BAD_REQUEST, "Snapshot with name " + commitName + + " does not exist for collection " + collectionName); + } + if (snapshotMeta.get().getStatus() != SnapshotStatus.Successful) { + throw new SolrException(ErrorCode.BAD_REQUEST, "Snapshot with name " + commitName + " for collection " + collectionName + + " has not completed successfully. The status is " + snapshotMeta.get().getStatus()); + } + } + + log.info("Starting backup of collection={} with backupName={} at location={}", collectionName, backupName, + backupPath); + + Collection shardsToConsider = Collections.emptySet(); + if (snapshotMeta.isPresent()) { + shardsToConsider = snapshotMeta.get().getShards(); + } + + for (Slice slice : ocmh.zkStateReader.getClusterState().getCollection(collectionName).getActiveSlices()) { + Replica replica = null; + + if (snapshotMeta.isPresent()) { + if (!shardsToConsider.contains(slice.getName())) { + log.warn("Skipping the backup for shard {} since it wasn't part of the collection {} when snapshot {} was created.", + slice.getName(), collectionName, snapshotMeta.get().getName()); + continue; + } + replica = selectReplicaWithSnapshot(snapshotMeta.get(), slice); + } else { + // Note - Actually this can return a null value when there is no leader for this shard. 
+ replica = slice.getLeader(); + if (replica == null) { + throw new SolrException(ErrorCode.SERVER_ERROR, "No 'leader' replica available for shard " + slice.getName() + " of collection " + collectionName); + } + } + + String coreName = replica.getStr(CORE_NAME_PROP); + + ModifiableSolrParams params = new ModifiableSolrParams(); + params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.BACKUPCORE.toString()); + params.set(NAME, slice.getName()); + params.set(CoreAdminParams.BACKUP_REPOSITORY, repoName); + params.set(CoreAdminParams.BACKUP_LOCATION, backupPath.toASCIIString()); // note: index dir will be here, then the "snapshot." + slice name + params.set(CORE_NAME_PROP, coreName); + if (snapshotMeta.isPresent()) { + params.set(CoreAdminParams.COMMIT_NAME, snapshotMeta.get().getName()); + } + + ocmh.sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap); + log.debug("Sent backup request to core={} for backupName={}", coreName, backupName); + } + log.debug("Sent backup requests to all shard leaders for backupName={}", backupName); + + ocmh.processResponses(results, shardHandler, true, "Could not backup all replicas", asyncId, requestMap); + } } diff --git a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java index 90e515a9fde..02b7cbd8e66 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/RecoveryStrategy.java @@ -19,6 +19,7 @@ package org.apache.solr.cloud; import java.io.Closeable; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.net.SocketTimeoutException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -572,24 +573,44 @@ public class RecoveryStrategy extends Thread implements Closeable { private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice) throws SolrServerException, IOException, InterruptedException, ExecutionException { - try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) { - client.setConnectionTimeout(30000); - WaitForState prepCmd = new WaitForState(); - prepCmd.setCoreName(leaderCoreName); - prepCmd.setNodeName(zkController.getNodeName()); - prepCmd.setCoreNodeName(coreZkNodeName); - prepCmd.setState(Replica.State.RECOVERING); - prepCmd.setCheckLive(true); - prepCmd.setOnlyIfLeader(true); - final Slice.State state = slice.getState(); - if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) { - prepCmd.setOnlyIfLeaderActive(true); + WaitForState prepCmd = new WaitForState(); + prepCmd.setCoreName(leaderCoreName); + prepCmd.setNodeName(zkController.getNodeName()); + prepCmd.setCoreNodeName(coreZkNodeName); + prepCmd.setState(Replica.State.RECOVERING); + prepCmd.setCheckLive(true); + prepCmd.setOnlyIfLeader(true); + final Slice.State state = slice.getState(); + if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) { + prepCmd.setOnlyIfLeaderActive(true); + } + + final int maxTries = 30; + for (int numTries = 0; numTries < maxTries; numTries++) { + try { + sendPrepRecoveryCmd(leaderBaseUrl, prepCmd); + break; + } catch (ExecutionException e) { + SolrServerException solrException = (SolrServerException) e.getCause(); + if (solrException.getRootCause() instanceof SocketTimeoutException && numTries + 1 < maxTries) { + LOG.warn("Socket timeout when sending prep
recovery cmd, retrying.. "); + continue; + } + throw e; } + } + } + + private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd) + throws SolrServerException, IOException, InterruptedException, ExecutionException { + try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) { + client.setConnectionTimeout(10000); + client.setSoTimeout(10000); HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd); prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest; - + LOG.info("Sending prep recovery command to [{}]; [{}]", leaderBaseUrl, prepCmd.toString()); - + mrr.future.get(); } } diff --git a/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java index 63d56865700..4e7fb581b34 100644 --- a/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/RestoreCmd.java @@ -87,7 +87,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd { URI location = repository.createURI(message.getStr(CoreAdminParams.BACKUP_LOCATION)); URI backupPath = repository.resolve(location, backupName); ZkStateReader zkStateReader = ocmh.zkStateReader; - BackupManager backupMgr = new BackupManager(repository, zkStateReader, restoreCollectionName); + BackupManager backupMgr = new BackupManager(repository, zkStateReader); Properties properties = backupMgr.readBackupProperties(location, backupName); String backupCollection = properties.getProperty(BackupManager.COLLECTION_NAME_PROP); diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java index 16b4e03d0a7..8195a64504c 100644 --- a/solr/core/src/java/org/apache/solr/core/SolrCore.java +++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java @@ -110,7 +110,6 @@ import org.apache.solr.response.RubyResponseWriter; import org.apache.solr.response.SchemaXmlResponseWriter; import org.apache.solr.response.SmileResponseWriter; import org.apache.solr.response.SolrQueryResponse; -import org.apache.solr.response.SortingResponseWriter; import org.apache.solr.response.XMLResponseWriter; import org.apache.solr.response.transform.TransformerFactory; import org.apache.solr.rest.ManagedResourceStorage; @@ -2332,7 +2331,6 @@ public final class SolrCore implements SolrInfoMBean, Closeable { m.put("raw", new RawResponseWriter()); m.put(CommonParams.JAVABIN, new BinaryResponseWriter()); m.put("csv", new CSVResponseWriter()); - m.put("xsort", new SortingResponseWriter()); m.put("schema.xml", new SchemaXmlResponseWriter()); m.put("smile", new SmileResponseWriter()); m.put(ReplicationHandler.FILE_STREAM, getFileStreamWriter()); @@ -2350,12 +2348,21 @@ public final class SolrCore implements SolrInfoMBean, Closeable { @Override public void write(OutputStream out, SolrQueryRequest req, SolrQueryResponse response) throws IOException { RawWriter rawWriter = (RawWriter) response.getValues().get(ReplicationHandler.FILE_STREAM); - if(rawWriter!=null) rawWriter.write(out); + if (rawWriter != null) { + rawWriter.write(out); + if (rawWriter instanceof Closeable) ((Closeable) rawWriter).close(); + } + } @Override public String getContentType(SolrQueryRequest request, SolrQueryResponse response) { - return BinaryResponseParser.BINARY_CONTENT_TYPE; + RawWriter rawWriter = (RawWriter) response.getValues().get(ReplicationHandler.FILE_STREAM); + if (rawWriter != null) { + return rawWriter.getContentType(); + } else { + return BinaryResponseParser.BINARY_CONTENT_TYPE; + } } }; } @@ 
-2365,6 +2372,9 @@ public final class SolrCore implements SolrInfoMBean, Closeable { } public interface RawWriter { + default String getContentType() { + return BinaryResponseParser.BINARY_CONTENT_TYPE; + } void write(OutputStream os) throws IOException ; } diff --git a/solr/core/src/java/org/apache/solr/core/backup/BackupManager.java b/solr/core/src/java/org/apache/solr/core/backup/BackupManager.java index c80b2b7c279..726e5b9799c 100644 --- a/solr/core/src/java/org/apache/solr/core/backup/BackupManager.java +++ b/solr/core/src/java/org/apache/solr/core/backup/BackupManager.java @@ -68,7 +68,7 @@ public class BackupManager { protected final ZkStateReader zkStateReader; protected final BackupRepository repository; - public BackupManager(BackupRepository repository, ZkStateReader zkStateReader, String collectionName) { + public BackupManager(BackupRepository repository, ZkStateReader zkStateReader) { this.repository = Objects.requireNonNull(repository); this.zkStateReader = Objects.requireNonNull(zkStateReader); } @@ -126,6 +126,7 @@ public class BackupManager { * * @param backupLoc The base path used to store the backup data. * @param backupId The unique name for the backup. + * @param collectionName The name of the collection whose meta-data is to be returned. * @return the meta-data information for the backed-up collection. * @throws IOException in case of errors. */ diff --git a/solr/core/src/java/org/apache/solr/core/snapshots/SolrSnapshotsTool.java b/solr/core/src/java/org/apache/solr/core/snapshots/SolrSnapshotsTool.java new file mode 100644 index 00000000000..cb1c52c1a7a --- /dev/null +++ b/solr/core/src/java/org/apache/solr/core/snapshots/SolrSnapshotsTool.java @@ -0,0 +1,468 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.core.snapshots; + +import java.io.Closeable; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Optional; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.apache.hadoop.fs.Path; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import org.apache.solr.client.solrj.response.CollectionAdminResponse; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.params.CollectionAdminParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.snapshots.CollectionSnapshotMetaData.CoreSnapshotMetaData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * This class provides utility functions required for Solr snapshots functionality. 
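+ * <p>Illustrative usage (a sketch only: the collection name, snapshot name and
+ * ZooKeeper address below are placeholders; {@code main} wires the same calls
+ * behind the command line options):
+ * <pre>
+ *   try (SolrSnapshotsTool tool = new SolrSnapshotsTool("localhost:2181")) {
+ *     tool.createSnapshot("books", "snapshot-1");
+ *     tool.listSnapshots("books");
+ *   }
+ * </pre>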
+ */ +public class SolrSnapshotsTool implements Closeable { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final DateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z", Locale.getDefault()); + + private static final String CREATE = "create"; + private static final String DELETE = "delete"; + private static final String LIST = "list"; + private static final String DESCRIBE = "describe"; + private static final String PREPARE_FOR_EXPORT = "prepare-snapshot-export"; + private static final String EXPORT_SNAPSHOT = "export"; + private static final String HELP = "help"; + private static final String COLLECTION = "c"; + private static final String TEMP_DIR = "t"; + private static final String DEST_DIR = "d"; + private static final String SOLR_ZK_ENSEMBLE = "z"; + private static final String HDFS_PATH_PREFIX = "p"; + private static final String BACKUP_REPO_NAME = "r"; + private static final String ASYNC_REQ_ID = "i"; + private static final List<String> OPTION_HELP_ORDER = Arrays.asList(CREATE, DELETE, LIST, DESCRIBE, + PREPARE_FOR_EXPORT, EXPORT_SNAPSHOT, HELP, SOLR_ZK_ENSEMBLE, COLLECTION, DEST_DIR, BACKUP_REPO_NAME, + ASYNC_REQ_ID, TEMP_DIR, HDFS_PATH_PREFIX); + + private final CloudSolrClient solrClient; + + public SolrSnapshotsTool(String solrZkEnsemble) { + solrClient = (new CloudSolrClient.Builder()).withZkHost(solrZkEnsemble).build(); + } + + @Override + public void close() throws IOException { + if (solrClient != null) { + solrClient.close(); + } + } + + public void createSnapshot(String collectionName, String snapshotName) { + CollectionAdminRequest.CreateSnapshot createSnap = new CollectionAdminRequest.CreateSnapshot(collectionName, snapshotName); + CollectionAdminResponse resp; + try { + resp = createSnap.process(solrClient); + Preconditions.checkState(resp.getStatus() == 0, "The CREATESNAPSHOT request failed. The status code is " + resp.getStatus()); + System.out.println("Successfully created snapshot with name " + snapshotName + " for collection " + collectionName); + + } catch (Exception e) { + log.error("Failed to create a snapshot with name " + snapshotName + " for collection " + collectionName, e); + System.out.println("Failed to create a snapshot with name " + snapshotName + " for collection " + collectionName + + " due to the following error: " + e.getLocalizedMessage()); + } + } + + public void deleteSnapshot(String collectionName, String snapshotName) { + CollectionAdminRequest.DeleteSnapshot deleteSnap = new CollectionAdminRequest.DeleteSnapshot(collectionName, snapshotName); + CollectionAdminResponse resp; + try { + resp = deleteSnap.process(solrClient); + Preconditions.checkState(resp.getStatus() == 0, "The DELETESNAPSHOT request failed.
The status code is " + resp.getStatus()); + System.out.println("Successfully deleted snapshot with name " + snapshotName + " for collection " + collectionName); + + } catch (Exception e) { + log.error("Failed to delete a snapshot with name " + snapshotName + " for collection " + collectionName, e); + System.out.println("Failed to delete a snapshot with name " + snapshotName + " for collection " + collectionName + + " due to the following error: " + e.getLocalizedMessage()); + } + } + + @SuppressWarnings("rawtypes") + public void listSnapshots(String collectionName) { + CollectionAdminRequest.ListSnapshots listSnaps = new CollectionAdminRequest.ListSnapshots(collectionName); + CollectionAdminResponse resp; + try { + resp = listSnaps.process(solrClient); + Preconditions.checkState(resp.getStatus() == 0, "The LISTSNAPSHOTS request failed. The status code is " + resp.getStatus()); + + NamedList apiResult = (NamedList) resp.getResponse().get(SolrSnapshotManager.SNAPSHOTS_INFO); + for (int i = 0; i < apiResult.size(); i++) { + System.out.println(apiResult.getName(i)); + } + + } catch (Exception e) { + log.error("Failed to list snapshots for collection " + collectionName, e); + System.out.println("Failed to list snapshots for collection " + collectionName + + " due to the following error: " + e.getLocalizedMessage()); + } + } + + public void describeSnapshot(String collectionName, String snapshotName) { + try { + Collection<CollectionSnapshotMetaData> snaps = listCollectionSnapshots(collectionName); + for (CollectionSnapshotMetaData m : snaps) { + if (snapshotName.equals(m.getName())) { + System.out.println("Name: " + m.getName()); + System.out.println("Status: " + m.getStatus()); + System.out.println("Time of creation: " + dateFormat.format(m.getCreationDate())); + System.out.println("Total number of cores with snapshot: " + m.getReplicaSnapshots().size()); + System.out.println("-----------------------------------"); + for (CoreSnapshotMetaData n : m.getReplicaSnapshots()) { + StringBuilder builder = new StringBuilder(); + builder.append("Core [name="); + builder.append(n.getCoreName()); + builder.append(", leader="); + builder.append(n.isLeader()); + builder.append(", generation="); + builder.append(n.getGenerationNumber()); + builder.append(", indexDirPath="); + builder.append(n.getIndexDirPath()); + builder.append("]\n"); + System.out.println(builder.toString()); + } + } + } + } catch (Exception e) { + log.error("Failed to fetch snapshot details", e); + System.out.println("Failed to fetch snapshot details due to the following error: " + e.getLocalizedMessage()); + } + } + + public Map<String, List<String>> getIndexFilesPathForSnapshot(String collectionName, String snapshotName, Optional<String> pathPrefix) + throws SolrServerException, IOException { + Map<String, List<String>> result = new HashMap<>(); + + Collection<CollectionSnapshotMetaData> snaps = listCollectionSnapshots(collectionName); + Optional<CollectionSnapshotMetaData> meta = Optional.empty(); + for (CollectionSnapshotMetaData m : snaps) { + if (snapshotName.equals(m.getName())) { + meta = Optional.of(m); + } + } + + if (!meta.isPresent()) { + throw new IllegalArgumentException("The snapshot named " + snapshotName + + " was not found for collection " + collectionName); + } + + DocCollection collectionState = solrClient.getZkStateReader().getClusterState().getCollection(collectionName); + for (Slice s : collectionState.getSlices()) { + List<CoreSnapshotMetaData> replicaSnaps = meta.get().getReplicaSnapshotsForShard(s.getName()); + // Prepare a list of *existing* replicas (since one or more replicas could have been deleted after the snapshot creation).
+ List<CoreSnapshotMetaData> availableReplicas = new ArrayList<>(); + for (CoreSnapshotMetaData m : replicaSnaps) { + if (isReplicaAvailable(s, m.getCoreName())) { + availableReplicas.add(m); + } + } + + if (availableReplicas.isEmpty()) { + throw new IllegalArgumentException( + "The snapshot named " + snapshotName + " was not found for shard " + + s.getName() + " of collection " + collectionName); + } + + // Prefer a leader replica (at the time when the snapshot was created). + CoreSnapshotMetaData coreSnap = availableReplicas.get(0); + for (CoreSnapshotMetaData m : availableReplicas) { + if (m.isLeader()) { + coreSnap = m; + } + } + + String indexDirPath = coreSnap.getIndexDirPath(); + if (pathPrefix.isPresent()) { + // If the path prefix is specified, rebuild the path to the index directory. + Path t = new Path(coreSnap.getIndexDirPath()); + indexDirPath = (new Path(pathPrefix.get(), t.toUri().getPath())).toString(); + } + + List<String> paths = new ArrayList<>(); + for (String fileName : coreSnap.getFiles()) { + Path p = new Path(indexDirPath, fileName); + paths.add(p.toString()); + } + + result.put(s.getName(), paths); + } + + return result; + } + + public void buildCopyListings(String collectionName, String snapshotName, String localFsPath, Optional<String> pathPrefix) + throws SolrServerException, IOException { + Map<String, List<String>> paths = getIndexFilesPathForSnapshot(collectionName, snapshotName, pathPrefix); + for (Map.Entry<String, List<String>> entry : paths.entrySet()) { + StringBuilder filesBuilder = new StringBuilder(); + for (String filePath : entry.getValue()) { + filesBuilder.append(filePath); + filesBuilder.append("\n"); + } + + String files = filesBuilder.toString().trim(); + try (Writer w = new OutputStreamWriter(new FileOutputStream(new File(localFsPath, entry.getKey())), StandardCharsets.UTF_8)) { + w.write(files); + } + } + } + + public void backupCollectionMetaData(String collectionName, String snapshotName, String backupLoc) throws SolrServerException, IOException { + // Back up the collection meta-data + CollectionAdminRequest.Backup backup = new CollectionAdminRequest.Backup(collectionName, snapshotName); + backup.setIndexBackupStrategy(CollectionAdminParams.NO_INDEX_BACKUP_STRATEGY); + backup.setLocation(backupLoc); + CollectionAdminResponse resp = backup.process(solrClient); + Preconditions.checkState(resp.getStatus() == 0, "The request failed.
The status code is " + resp.getStatus()); + } + + public void prepareForExport(String collectionName, String snapshotName, String localFsPath, Optional<String> pathPrefix, String destPath) { + try { + buildCopyListings(collectionName, snapshotName, localFsPath, pathPrefix); + System.out.println("Successfully prepared copylisting for the snapshot export."); + } catch (Exception e) { + log.error("Failed to prepare a copylisting for snapshot with name " + snapshotName + " for collection " + + collectionName, e); + System.out.println("Failed to prepare a copylisting for snapshot with name " + snapshotName + " for collection " + + collectionName + " due to the following error: " + e.getLocalizedMessage()); + System.exit(1); + } + + try { + backupCollectionMetaData(collectionName, snapshotName, destPath); + System.out.println("Successfully backed up collection meta-data"); + } catch (Exception e) { + log.error("Failed to backup collection meta-data for collection " + collectionName, e); + System.out.println("Failed to backup collection meta-data for collection " + collectionName + + " due to the following error: " + e.getLocalizedMessage()); + System.exit(1); + } + } + + public void exportSnapshot(String collectionName, String snapshotName, String destPath, Optional<String> backupRepo, + Optional<String> asyncReqId) { + try { + CollectionAdminRequest.Backup backup = new CollectionAdminRequest.Backup(collectionName, snapshotName); + backup.setIndexBackupStrategy(CollectionAdminParams.COPY_FILES_STRATEGY); + backup.setLocation(destPath); + if (backupRepo.isPresent()) { + backup.setRepositoryName(backupRepo.get()); + } + if (asyncReqId.isPresent()) { + backup.setAsyncId(asyncReqId.get()); + } + CollectionAdminResponse resp = backup.process(solrClient); + Preconditions.checkState(resp.getStatus() == 0, "The request failed. The status code is " + resp.getStatus()); + } catch (Exception e) { + log.error("Failed to backup collection meta-data for collection " + collectionName, e); + System.out.println("Failed to backup collection meta-data for collection " + collectionName + + " due to the following error: " + e.getLocalizedMessage()); + System.exit(1); + } + } + + public static void main(String[] args) throws IOException { + CommandLineParser parser = new PosixParser(); + Options options = new Options(); + + options.addOption(null, CREATE, true, "This command will create a snapshot with the specified name"); + options.addOption(null, DELETE, true, "This command will delete a snapshot with the specified name"); + options.addOption(null, LIST, false, "This command will list all the named snapshots for the specified collection."); + options.addOption(null, DESCRIBE, true, "This command will print details for a named snapshot for the specified collection."); + options.addOption(null, PREPARE_FOR_EXPORT, true, "This command will prepare copylistings for the specified snapshot." + + " This command should be used only if Solr is deployed with Hadoop and collection index files are stored on a shared" + + " file-system, e.g. HDFS"); + options.addOption(null, EXPORT_SNAPSHOT, true, "This command will create a backup for the specified snapshot."); + options.addOption(null, HELP, false, "This command will print the help message for the snapshot-related commands."); + options.addOption(TEMP_DIR, true, "This parameter specifies the path of a temporary directory on the local filesystem" + + " during the prepare-snapshot-export command."); + options.addOption(DEST_DIR, true, "This parameter specifies the path on shared file-system (e.g.
HDFS) where the snapshot-related" + " information should be stored."); + options.addOption(COLLECTION, true, "This parameter specifies the name of the collection to be used during the snapshot operation."); + options.addOption(SOLR_ZK_ENSEMBLE, true, "This parameter specifies the Solr ZooKeeper ensemble address."); + options.addOption(HDFS_PATH_PREFIX, true, "This parameter specifies the HDFS URI prefix to be used" + + " during snapshot export preparation. This is applicable only if the Solr collection index files are stored on HDFS."); + options.addOption(BACKUP_REPO_NAME, true, "This parameter specifies the name of the backup repository to be used" + + " during snapshot export preparation."); + options.addOption(ASYNC_REQ_ID, true, "This parameter specifies the async request identifier to be used" + + " during snapshot export preparation."); + + CommandLine cmd = null; + try { + cmd = parser.parse(options, args); + } catch (ParseException e) { + System.out.println(e.getLocalizedMessage()); + printHelp(options); + System.exit(1); + } + + if (cmd.hasOption(CREATE) || cmd.hasOption(DELETE) || cmd.hasOption(LIST) || cmd.hasOption(DESCRIBE) + || cmd.hasOption(PREPARE_FOR_EXPORT) || cmd.hasOption(EXPORT_SNAPSHOT)) { + try (SolrSnapshotsTool tool = new SolrSnapshotsTool(cmd.getOptionValue(SOLR_ZK_ENSEMBLE))) { + if (cmd.hasOption(CREATE)) { + String snapshotName = cmd.getOptionValue(CREATE); + String collectionName = cmd.getOptionValue(COLLECTION); + tool.createSnapshot(collectionName, snapshotName); + + } else if (cmd.hasOption(DELETE)) { + String snapshotName = cmd.getOptionValue(DELETE); + String collectionName = cmd.getOptionValue(COLLECTION); + tool.deleteSnapshot(collectionName, snapshotName); + + } else if (cmd.hasOption(LIST)) { + String collectionName = cmd.getOptionValue(COLLECTION); + tool.listSnapshots(collectionName); + + } else if (cmd.hasOption(DESCRIBE)) { + String snapshotName = cmd.getOptionValue(DESCRIBE); + String collectionName = cmd.getOptionValue(COLLECTION); + tool.describeSnapshot(collectionName, snapshotName); + + } else if (cmd.hasOption(PREPARE_FOR_EXPORT)) { + String snapshotName = cmd.getOptionValue(PREPARE_FOR_EXPORT); + String collectionName = cmd.getOptionValue(COLLECTION); + String localFsDir = requiredArg(options, cmd, TEMP_DIR); + String hdfsOpDir = requiredArg(options, cmd, DEST_DIR); + Optional<String> pathPrefix = Optional.ofNullable(cmd.getOptionValue(HDFS_PATH_PREFIX)); + + if (pathPrefix.isPresent()) { + try { + new URI(pathPrefix.get()); + } catch (URISyntaxException e) { + System.out.println( + "The specified file system path prefix " + pathPrefix.get() + + " is invalid.
The error is " + e.getLocalizedMessage()); + System.exit(1); + } + } + tool.prepareForExport(collectionName, snapshotName, localFsDir, pathPrefix, hdfsOpDir); + + } else if (cmd.hasOption(EXPORT_SNAPSHOT)) { + String snapshotName = cmd.getOptionValue(EXPORT_SNAPSHOT); + String collectionName = cmd.getOptionValue(COLLECTION); + String destDir = requiredArg(options, cmd, DEST_DIR); + Optional<String> backupRepo = Optional.ofNullable(cmd.getOptionValue(BACKUP_REPO_NAME)); + Optional<String> asyncReqId = Optional.ofNullable(cmd.getOptionValue(ASYNC_REQ_ID)); + + tool.exportSnapshot(collectionName, snapshotName, destDir, backupRepo, asyncReqId); + } + } + } else if (cmd.hasOption(HELP)) { + printHelp(options); + } else { + System.out.println("Unknown command specified."); + printHelp(options); + } + } + + private static String requiredArg(Options options, CommandLine cmd, String optVal) { + if (!cmd.hasOption(optVal)) { + System.out.println("Please specify the value for option " + optVal); + printHelp(options); + System.exit(1); + } + return cmd.getOptionValue(optVal); + } + + private static boolean isReplicaAvailable(Slice s, String coreName) { + for (Replica r : s.getReplicas()) { + if (coreName.equals(r.getCoreName())) { + return true; + } + } + return false; + } + + private Collection<CollectionSnapshotMetaData> listCollectionSnapshots(String collectionName) + throws SolrServerException, IOException { + CollectionAdminRequest.ListSnapshots listSnapshots = new CollectionAdminRequest.ListSnapshots(collectionName); + CollectionAdminResponse resp = listSnapshots.process(solrClient); + + Preconditions.checkState(resp.getStatus() == 0); + + NamedList apiResult = (NamedList) resp.getResponse().get(SolrSnapshotManager.SNAPSHOTS_INFO); + + Collection<CollectionSnapshotMetaData> result = new ArrayList<>(); + for (int i = 0; i < apiResult.size(); i++) { + result.add(new CollectionSnapshotMetaData((NamedList)apiResult.getVal(i))); + } + + return result; + } + + private static void printHelp(Options options) { + StringBuilder helpFooter = new StringBuilder(); + helpFooter.append("Examples: \n"); + helpFooter.append("snapshotscli.sh --create snapshot-1 -c books -z localhost:2181 \n"); + helpFooter.append("snapshotscli.sh --list -c books -z localhost:2181 \n"); + helpFooter.append("snapshotscli.sh --describe snapshot-1 -c books -z localhost:2181 \n"); + helpFooter.append("snapshotscli.sh --export snapshot-1 -c books -z localhost:2181 -r repo -d backupPath -i req_0 \n"); + helpFooter.append("snapshotscli.sh --delete snapshot-1 -c books -z localhost:2181 \n"); + + HelpFormatter formatter = new HelpFormatter(); + formatter.setOptionComparator(new OptionComparator<>()); + formatter.printHelp("SolrSnapshotsTool", null, options, helpFooter.toString(), false); + } + + private static class OptionComparator<T extends Option> implements Comparator<T> { + + public int compare(T o1, T o2) { + String s1 = o1.hasLongOpt() ? o1.getLongOpt() : o1.getOpt(); + String s2 = o2.hasLongOpt() ? o2.getLongOpt() : o2.getOpt(); + return OPTION_HELP_ORDER.indexOf(s1) - OPTION_HELP_ORDER.indexOf(s2); + } +} + +} diff --git a/solr/core/src/java/org/apache/solr/handler/ExportHandler.java b/solr/core/src/java/org/apache/solr/handler/ExportHandler.java new file mode 100644 index 00000000000..9c75ef0fc46 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/ExportHandler.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.handler; + + +import java.util.HashMap; +import java.util.Map; + +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.MapSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.handler.component.SearchHandler; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; + +import static org.apache.solr.common.params.CommonParams.JSON; + +public class ExportHandler extends SearchHandler { + @Override + public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { + try { + super.handleRequestBody(req, rsp); + } catch (Exception e) { + rsp.setException(e); + } + String wt = req.getParams().get(CommonParams.WT, JSON); + if("xsort".equals(wt)) wt = JSON; + Map map = new HashMap<>(1); + map.put(CommonParams.WT, ReplicationHandler.FILE_STREAM); + req.setParams(SolrParams.wrapDefaults(new MapSolrParams(map),req.getParams())); + rsp.add(ReplicationHandler.FILE_STREAM, new ExportWriter(req, rsp, wt)); + } +} diff --git a/solr/core/src/java/org/apache/solr/response/SortingResponseWriter.java b/solr/core/src/java/org/apache/solr/handler/ExportWriter.java similarity index 84% rename from solr/core/src/java/org/apache/solr/response/SortingResponseWriter.java rename to solr/core/src/java/org/apache/solr/handler/ExportWriter.java index 56c4f27dd91..98ab22fa839 100644 --- a/solr/core/src/java/org/apache/solr/response/SortingResponseWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/ExportWriter.java @@ -14,17 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.solr.response; +package org.apache.solr.handler; + +import java.io.Closeable; import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; import java.io.PrintWriter; -import java.io.Writer; import java.lang.invoke.MethodHandles; -import java.util.ArrayList; +import java.nio.charset.StandardCharsets; import java.util.Date; import java.util.List; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiDocValues; @@ -40,11 +44,18 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.LongValues; +import org.apache.solr.client.solrj.impl.BinaryResponseParser; +import org.apache.solr.common.IteratorWriter; +import org.apache.solr.common.MapWriter; +import org.apache.solr.common.MapWriter.EntryWriter; +import org.apache.solr.common.PushWriter; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; -import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrRequestInfo; +import org.apache.solr.response.JSONResponseWriter; +import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.BoolField; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; @@ -61,24 +72,65 @@ import org.apache.solr.search.SyntaxError; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; +import static org.apache.solr.common.util.Utils.makeMap; -public class SortingResponseWriter implements QueryResponseWriter { - +public class ExportWriter implements SolrCore.RawWriter, Closeable { private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private OutputStreamWriter respWriter; + final SolrQueryRequest req; + final SolrQueryResponse res; + FieldWriter[] fieldWriters; + int totalHits = 0; + FixedBitSet[] sets = null; + PushWriter writer; + private String wt; + + + ExportWriter(SolrQueryRequest req, SolrQueryResponse res, String wt) { + this.req = req; + this.res = res; + this.wt = wt; - public void init(NamedList args) { - /* NOOP */ } - public String getContentType(SolrQueryRequest req, SolrQueryResponse res) { - return "application/json"; + @Override + public String getContentType() { + if ("javabin".equals(wt)) { + return BinaryResponseParser.BINARY_CONTENT_TYPE; + } else return "json"; } - public void write(Writer writer, SolrQueryRequest req, SolrQueryResponse res) throws IOException { - Exception e1 = res.getException(); - if(e1 != null) { - if(!(e1 instanceof IgnoreException)) { - writeException(e1, writer, false); + @Override + public void close() throws IOException { + if (writer != null) writer.close(); + if (respWriter != null) { + respWriter.flush(); + respWriter.close(); + } + + } + + protected void writeException(Exception e, PushWriter w, boolean log) throws IOException { + w.writeMap(mw -> { + mw.put("responseHeader", singletonMap("status", 400)) + .put("response", makeMap( + "numFound", 0, + "docs", singletonList(singletonMap("EXCEPTION", e.getMessage())))); + }); + if (log) { + SolrException.log(logger, e); + } + } + + public void 
write(OutputStream os) throws IOException { + respWriter = new OutputStreamWriter(os, StandardCharsets.UTF_8); + writer = JSONResponseWriter.getPushWriter(respWriter, req, res); + Exception exception = res.getException(); + if (exception != null) { + if (!(exception instanceof IgnoreException)) { + writeException(exception, writer, false); } return; } @@ -113,8 +165,6 @@ public class SortingResponseWriter implements QueryResponseWriter { // You'll have to uncomment the if below to hit the null pointer exception. // This is such an unusual case (i.e. an empty index) that catching this condition here is probably OK. // This came to light in the very artificial case of indexing a single doc to Cloud. - int totalHits = 0; - FixedBitSet[] sets = null; if (req.getContext().get("totalHits") != null) { totalHits = ((Integer)req.getContext().get("totalHits")).intValue(); sets = (FixedBitSet[]) req.getContext().get("export"); @@ -145,8 +195,6 @@ public class SortingResponseWriter implements QueryResponseWriter { } } - FieldWriter[] fieldWriters = null; - try { fieldWriters = getFieldWriters(fields, req.getSearcher()); } catch (Exception e) { @@ -154,9 +202,17 @@ public class SortingResponseWriter implements QueryResponseWriter { return; } - writer.write("{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":"+totalHits+", \"docs\":["); + writer.writeMap(m -> { + m.put("responseHeader", singletonMap("status", 0)); + m.put("response", (MapWriter) mw -> { + mw.put("numFound", totalHits); + mw.put("docs", (IteratorWriter) iw -> writeDocs(req, iw, sort)); + }); + }); + } + protected void writeDocs(SolrQueryRequest req, IteratorWriter.ItemWriter writer, Sort sort) throws IOException { //Write the data. List<LeafReaderContext> leaves = req.getSearcher().getTopReaderContext().leaves(); SortDoc sortDoc = getSortDoc(req.getSearcher(), sort.getSort()); @@ -165,7 +221,6 @@ public class SortingResponseWriter implements QueryResponseWriter { SortQueue queue = new SortQueue(queueSize, sortDoc); SortDoc[] outDocs = new SortDoc[queueSize]; - boolean commaNeeded = false; while(count < totalHits) { //long begin = System.nanoTime(); queue.reset(); @@ -192,19 +247,17 @@ public class SortingResponseWriter implements QueryResponseWriter { } } - //long end = System.nanoTime(); + //long end = System.nanoTime(); count += (outDocsIndex+1); try { for(int i=outDocsIndex; i>=0; --i) { SortDoc s = outDocs[i]; - if(commaNeeded){writer.write(',');} - writer.write('{'); - writeDoc(s, leaves, fieldWriters, sets, writer); - writer.write('}'); - commaNeeded = true; - s.reset(); + writer.add((MapWriter) ew -> { + writeDoc(s, leaves, ew); + s.reset(); + }); } } catch(Throwable e) { Throwable ex = e; @@ -224,54 +277,24 @@ public class SortingResponseWriter implements QueryResponseWriter { } } } - - //System.out.println("Sort Time 2:"+Long.toString(total/1000000)); - writer.write("]}}"); - writer.flush(); } - public static class IgnoreException extends IOException { - public void printStackTrace(PrintWriter pw) { - pw.print("Early Client Disconnect"); - - } - - public String getMessage() { - return "Early Client Disconnect"; - } - } - - protected void writeDoc(SortDoc sortDoc, List<LeafReaderContext> leaves, - FieldWriter[] fieldWriters, - FixedBitSet[] sets, - Writer out) throws IOException{ + EntryWriter ew) throws IOException { int ord = sortDoc.ord; FixedBitSet set = sets[ord]; set.clear(sortDoc.docId); LeafReaderContext context = leaves.get(ord); int fieldIndex = 0; - for(FieldWriter fieldWriter : fieldWriters) { - if(fieldWriter.write(sortDoc.docId,
context.reader(), out, fieldIndex)){ + for (FieldWriter fieldWriter : fieldWriters) { + if (fieldWriter.write(sortDoc.docId, context.reader(), ew, fieldIndex)) { ++fieldIndex; } } } - protected void writeException(Exception e, Writer out, boolean log) throws IOException{ - out.write("{\"responseHeader\": {\"status\": 400}, \"response\":{\"numFound\":0, \"docs\":["); - out.write("{\"EXCEPTION\":\""); - writeStr(e.getMessage(), out); - out.write("\"}"); - out.write("]}}"); - out.flush(); - if(log) { - SolrException.log(logger, e); - } - } - protected FieldWriter[] getFieldWriters(String[] fields, SolrIndexSearcher searcher) throws IOException { IndexSchema schema = searcher.getSchema(); FieldWriter[] writers = new FieldWriter[fields.length]; @@ -291,50 +314,49 @@ public class SortingResponseWriter implements QueryResponseWriter { boolean multiValued = schemaField.multiValued(); FieldType fieldType = schemaField.getType(); - if(fieldType instanceof TrieIntField) { - if(multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, true); + if (fieldType instanceof TrieIntField) { + if (multiValued) { + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, true); } else { writers[i] = new IntFieldWriter(field); } } else if (fieldType instanceof TrieLongField) { - if(multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, true); + if (multiValued) { + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, true); } else { writers[i] = new LongFieldWriter(field); } } else if (fieldType instanceof TrieFloatField) { - if(multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, true); + if (multiValued) { + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, true); } else { writers[i] = new FloatFieldWriter(field); } - } else if(fieldType instanceof TrieDoubleField) { - if(multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, true); + } else if (fieldType instanceof TrieDoubleField) { + if (multiValued) { + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, true); } else { writers[i] = new DoubleFieldWriter(field); } - } else if(fieldType instanceof StrField) { - if(multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, false); + } else if (fieldType instanceof StrField) { + if (multiValued) { + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, false); } else { writers[i] = new StringFieldWriter(field, fieldType); } } else if (fieldType instanceof TrieDateField) { if (multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, false); + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, false); } else { writers[i] = new DateFieldWriter(field); } - } else if(fieldType instanceof BoolField) { - if(multiValued) { - writers[i] = new MultiFieldWriter(field, fieldType, true); + } else if (fieldType instanceof BoolField) { + if (multiValued) { + writers[i] = new MultiFieldWriter(field, fieldType, schemaField, true); } else { writers[i] = new BoolFieldWriter(field, fieldType); } - } - else { + } else { throw new IOException("Export fields must either be one of the following types: int,float,long,double,string,date,boolean"); } } @@ -398,8 +420,8 @@ public class SortingResponseWriter implements QueryResponseWriter { // _and_ since "F" happens to sort before "T" (thus false sorts "less" than true) // we can just use the existing StringValue here. 
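Every single-valued writer rewritten in the hunk above follows the same shape: advance the per-field doc-values iterator, then push one typed key/value through EntryWriter instead of hand-writing JSON punctuation. A condensed sketch of that shape (ExampleLongFieldWriter is illustrative only, not part of the patch; longValue() is the accessor assumed in the unchanged context lines the hunks elide):

    class ExampleLongFieldWriter extends FieldWriter {
      private final String field;
      ExampleLongFieldWriter(String field) { this.field = field; }
      public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException {
        NumericDocValues vals = DocValues.getNumeric(reader, this.field);
        long val;
        if (vals.advance(docId) == docId) {
          val = vals.longValue();   // assumed accessor for the elided context
        } else {
          val = 0;                  // keeps the patch's existing missing-value default
        }
        ew.put(this.field, val);    // EntryWriter handles JSON/javabin encoding and separators
        return true;
      }
    }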
LeafReader reader = searcher.getSlowAtomicReader(); - SortedDocValues vals = reader.getSortedDocValues(field); - if(reverse) { + SortedDocValues vals = reader.getSortedDocValues(field); + if (reverse) { sortValues[i] = new StringValue(vals, field, new IntDesc()); } else { sortValues[i] = new StringValue(vals, field, new IntAsc()); @@ -439,8 +461,8 @@ public class SortingResponseWriter implements QueryResponseWriter { private void populate() { Object[] heap = getHeapArray(); cache = new SortDoc[heap.length]; - for(int i=1; i0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - out.write(Integer.toString(val)); + ew.put(this.field, val); return true; } } @@ -1328,57 +1343,31 @@ public class SortingResponseWriter implements QueryResponseWriter { class MultiFieldWriter extends FieldWriter { private String field; private FieldType fieldType; + private SchemaField schemaField; private boolean numeric; private CharsRefBuilder cref = new CharsRefBuilder(); - public MultiFieldWriter(String field, FieldType fieldType, boolean numeric) { + public MultiFieldWriter(String field, FieldType fieldType, SchemaField schemaField, boolean numeric) { this.field = field; this.fieldType = fieldType; + this.schemaField = schemaField; this.numeric = numeric; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + + public boolean write(int docId, LeafReader reader, EntryWriter out, int fieldIndex) throws IOException { SortedSetDocValues vals = DocValues.getSortedSet(reader, this.field); - List ords; - if (vals.advance(docId) == docId) { - ords = new ArrayList(); - long o = -1; - while((o = vals.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { - ords.add(o); - } - assert ords.size() > 0; - } else { - return false; - } - - - if(fieldIndex>0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - out.write('['); - int v = 0; - for(long ord : ords) { - BytesRef ref = vals.lookupOrd(ord); - fieldType.indexedToReadable(ref, cref); - if(v > 0) { - out.write(','); - } - - if(!numeric) { - out.write('"'); - } - - writeStr(cref.toString(), out); - - if(!numeric) { - out.write('"'); - } - ++v; - } - out.write("]"); + if (vals.advance(docId) != docId) return false; + out.put(this.field, + (IteratorWriter) w -> { + long o; + while((o = vals.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { + BytesRef ref = vals.lookupOrd(o); + fieldType.indexedToReadable(ref, cref); + IndexableField f = fieldType.createField(schemaField, cref.toString(), 1.0f); + if (f == null) w.add(cref.toString()); + else w.add(fieldType.toObject(f)); + } + }); return true; } } @@ -1390,7 +1379,7 @@ public class SortingResponseWriter implements QueryResponseWriter { this.field = field; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException { NumericDocValues vals = DocValues.getNumeric(reader, this.field); long val; if (vals.advance(docId) == docId) { @@ -1398,14 +1387,7 @@ public class SortingResponseWriter implements QueryResponseWriter { } else { val = 0; } - if(fieldIndex > 0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - out.write(Long.toString(val)); + ew.put(field, val); return true; } } @@ -1417,7 +1399,7 @@ public class SortingResponseWriter implements QueryResponseWriter { this.field = 
field; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException { NumericDocValues vals = DocValues.getNumeric(reader, this.field); long val; if (vals.advance(docId) == docId) { @@ -1425,17 +1407,7 @@ public class SortingResponseWriter implements QueryResponseWriter { } else { val = 0; } - - if (fieldIndex > 0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - out.write('"'); - writeStr(new Date(val).toInstant().toString(), out); - out.write('"'); + ew.put(this.field, new Date(val)); return true; } } @@ -1450,7 +1422,7 @@ public class SortingResponseWriter implements QueryResponseWriter { this.fieldType = fieldType; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException { SortedDocValues vals = DocValues.getSorted(reader, this.field); if (vals.advance(docId) != docId) { return false; @@ -1459,17 +1431,7 @@ public class SortingResponseWriter implements QueryResponseWriter { BytesRef ref = vals.lookupOrd(ord); fieldType.indexedToReadable(ref, cref); - - if (fieldIndex > 0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - //out.write('"'); - writeStr(cref.toString(), out); - //out.write('"'); + ew.put(this.field, "true".equals(cref.toString())); return true; } } @@ -1481,7 +1443,7 @@ public class SortingResponseWriter implements QueryResponseWriter { this.field = field; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException { NumericDocValues vals = DocValues.getNumeric(reader, this.field); int val; if (vals.advance(docId) == docId) { @@ -1489,14 +1451,7 @@ public class SortingResponseWriter implements QueryResponseWriter { } else { val = 0; } - if(fieldIndex > 0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - out.write(Float.toString(Float.intBitsToFloat(val))); + ew.put(this.field, Float.intBitsToFloat(val)); return true; } } @@ -1508,7 +1463,7 @@ public class SortingResponseWriter implements QueryResponseWriter { this.field = field; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException { NumericDocValues vals = DocValues.getNumeric(reader, this.field); long val; if (vals.advance(docId) == docId) { @@ -1516,14 +1471,7 @@ public class SortingResponseWriter implements QueryResponseWriter { } else { val = 0; } - if(fieldIndex > 0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(':'); - out.write(Double.toString(Double.longBitsToDouble(val))); + ew.put(this.field, Double.longBitsToDouble(val)); return true; } } @@ -1538,7 +1486,7 @@ public class SortingResponseWriter implements QueryResponseWriter { this.fieldType = fieldType; } - public boolean write(int docId, LeafReader reader, Writer out, int fieldIndex) throws IOException { + public boolean write(int docId, LeafReader reader, EntryWriter ew, int fieldIndex) throws IOException { SortedDocValues vals = 
DocValues.getSorted(reader, this.field); if (vals.advance(docId) != docId) { return false; @@ -1547,64 +1495,11 @@ public class SortingResponseWriter implements QueryResponseWriter { BytesRef ref = vals.lookupOrd(ord); fieldType.indexedToReadable(ref, cref); - if(fieldIndex > 0) { - out.write(','); - } - out.write('"'); - out.write(this.field); - out.write('"'); - out.write(":"); - out.write('"'); - writeStr(cref.toString(), out); - out.write('"'); + ew.put(this.field, cref.toString()); return true; } } - private void writeStr(String val, Writer writer) throws IOException { - for (int i=0; i '#' && ch != '\\' && ch < '\u2028') || ch == ' ') { // fast path - writer.write(ch); - continue; - } - switch(ch) { - case '"': - case '\\': - writer.write('\\'); - writer.write(ch); - break; - case '\r': writer.write('\\'); writer.write('r'); break; - case '\n': writer.write('\\'); writer.write('n'); break; - case '\t': writer.write('\\'); writer.write('t'); break; - case '\b': writer.write('\\'); writer.write('b'); break; - case '\f': writer.write('\\'); writer.write('f'); break; - case '\u2028': // fallthrough - case '\u2029': - unicodeEscape(writer,ch); - break; - // case '/': - default: { - if (ch <= 0x1F) { - unicodeEscape(writer,ch); - } else { - writer.write(ch); - } - } - } - } - } - - private static char[] hexdigits = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; - protected static void unicodeEscape(Appendable out, int ch) throws IOException { - out.append('\\'); - out.append('u'); - out.append(hexdigits[(ch>>>12) ]); - out.append(hexdigits[(ch>>>8) & 0xf]); - out.append(hexdigits[(ch>>>4) & 0xf]); - out.append(hexdigits[(ch) & 0xf]); - } - public abstract class PriorityQueue { protected int size = 0; protected final int maxSize; @@ -1802,4 +1697,15 @@ public class SortingResponseWriter implements QueryResponseWriter { return (Object[]) heap; } } + + public class IgnoreException extends IOException { + public void printStackTrace(PrintWriter pw) { + pw.print("Early Client Disconnect"); + } + + public String getMessage() { + return "Early Client Disconnect"; + } + } + } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index e290ccb5ca2..01095a1143b 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -734,8 +734,14 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to check the existance of " + uri + ". 
Is it valid?", ex); } + String strategy = req.getParams().get(CollectionAdminParams.INDEX_BACKUP_STRATEGY, CollectionAdminParams.COPY_FILES_STRATEGY); + if (!CollectionAdminParams.INDEX_BACKUP_STRATEGIES.contains(strategy)) { + throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown index backup strategy " + strategy); + } + Map params = req.getParams().getAll(null, NAME, COLLECTION_PROP, CoreAdminParams.COMMIT_NAME); params.put(CoreAdminParams.BACKUP_LOCATION, location); + params.put(CollectionAdminParams.INDEX_BACKUP_STRATEGY, strategy); return params; }), RESTORE_OP(RESTORE, (req, rsp, h) -> { diff --git a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java index 45cfeb14e06..d370beff4f2 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/PrepRecoveryOp.java @@ -37,6 +37,7 @@ import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.update.CommitUpdateCommand; import org.apache.solr.util.RefCounted; +import org.apache.solr.util.TestInjection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,6 +47,8 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp { @Override public void execute(CallInfo it) throws Exception { + assert TestInjection.injectPrepRecoveryOpPauseForever(); + final SolrParams params = it.req.getParams(); String cname = params.get(CoreAdminParams.CORE); diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java index 609e4337667..6a55a0d917b 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandler.java @@ -17,6 +17,7 @@ package org.apache.solr.handler.component; import java.lang.invoke.MethodHandles; import java.net.ConnectException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; @@ -116,7 +117,7 @@ public class HttpShardHandler extends ShardHandler { private List getURLs(String shard, String preferredHostAddress) { List urls = shardToURLs.get(shard); if (urls == null) { - urls = httpShardHandlerFactory.makeURLList(shard); + urls = httpShardHandlerFactory.buildURLList(shard); if (preferredHostAddress != null && urls.size() > 1) { preferCurrentHostForDistributedReq(preferredHostAddress, urls); } @@ -320,6 +321,8 @@ public class HttpShardHandler extends ShardHandler { } } + final ReplicaListTransformer replicaListTransformer = httpShardHandlerFactory.getReplicaListTransformer(req); + if (shards != null) { List lst = StrUtils.splitSmart(shards, ",", true); rb.shards = lst.toArray(new String[lst.size()]); @@ -404,7 +407,11 @@ public class HttpShardHandler extends ShardHandler { for (int i=0; i shardUrls; + if (rb.shards[i] != null) { + shardUrls = StrUtils.splitSmart(rb.shards[i], "|", true); + replicaListTransformer.transform(shardUrls); + } else { if (clusterState == null) { clusterState = zkController.getClusterState(); slices = clusterState.getSlicesMap(cloudDescriptor.getCollectionName()); @@ -421,26 +428,25 @@ public class HttpShardHandler extends ShardHandler { // throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "no such shard: " + sliceName); } - Map sliceShards = slice.getReplicasMap(); - - // For now, recreate the | 
delimited list of equivalent servers - StringBuilder sliceShardsStr = new StringBuilder(); - boolean first = true; - for (Replica replica : sliceShards.values()) { + final Collection allSliceReplicas = slice.getReplicasMap().values(); + final List eligibleSliceReplicas = new ArrayList<>(allSliceReplicas.size()); + for (Replica replica : allSliceReplicas) { if (!clusterState.liveNodesContain(replica.getNodeName()) || replica.getState() != Replica.State.ACTIVE) { continue; } - if (first) { - first = false; - } else { - sliceShardsStr.append('|'); - } - String url = ZkCoreNodeProps.getCoreUrl(replica); - sliceShardsStr.append(url); + eligibleSliceReplicas.add(replica); } - if (sliceShardsStr.length() == 0) { + replicaListTransformer.transform(eligibleSliceReplicas); + + shardUrls = new ArrayList<>(eligibleSliceReplicas.size()); + for (Replica replica : eligibleSliceReplicas) { + String url = ZkCoreNodeProps.getCoreUrl(replica); + shardUrls.add(url); + } + + if (shardUrls.isEmpty()) { boolean tolerant = rb.req.getParams().getBool(ShardParams.SHARDS_TOLERANT, false); if (!tolerant) { // stop the check when there are no replicas available for a shard @@ -448,9 +454,19 @@ public class HttpShardHandler extends ShardHandler { "no servers hosting shard: " + rb.slices[i]); } } - - rb.shards[i] = sliceShardsStr.toString(); } + // And now recreate the | delimited list of equivalent servers + final StringBuilder sliceShardsStr = new StringBuilder(); + boolean first = true; + for (String shardUrl : shardUrls) { + if (first) { + first = false; + } else { + sliceShardsStr.append('|'); + } + sliceShardsStr.append(shardUrl); + } + rb.shards[i] = sliceShardsStr.toString(); } } String shards_rows = params.get(ShardParams.SHARDS_ROWS); diff --git a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java index d1e1ed5d468..e1b743a88fa 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java +++ b/solr/core/src/java/org/apache/solr/handler/component/HttpShardHandlerFactory.java @@ -31,13 +31,13 @@ import org.apache.solr.common.util.StrUtils; import org.apache.solr.common.util.URLUtil; import org.apache.solr.core.PluginInfo; import org.apache.solr.update.UpdateShardHandlerConfig; +import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.util.DefaultSolrThreadFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.util.Collections; import java.util.List; import java.util.Random; import java.util.concurrent.ArrayBlockingQueue; @@ -84,6 +84,8 @@ public class HttpShardHandlerFactory extends ShardHandlerFactory implements org. private final Random r = new Random(); + private final ReplicaListTransformer shufflingReplicaListTransformer = new ShufflingReplicaListTransformer(r); + // URL scheme to be used in distributed search. static final String INIT_URL_SCHEME = "urlScheme"; @@ -227,12 +229,12 @@ public class HttpShardHandlerFactory extends ShardHandlerFactory implements org. } /** - * Creates a randomized list of urls for the given shard. + * Creates a list of urls for the given shard. 
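// buildURLList no longer shuffles: randomization is delegated to the
// ReplicaListTransformer returned by getReplicaListTransformer(req) below,
// whose default ShufflingReplicaListTransformer (seeded with the same Random)
// preserves the old behavior while letting subclasses supply other orderings.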
* * @param shard the urls for the shard, separated by '|' * @return A list of valid urls (including protocol) that are replicas for the shard */ - public List makeURLList(String shard) { + public List buildURLList(String shard) { List urls = StrUtils.splitSmart(shard, "|", true); // convert shard to URL @@ -240,17 +242,14 @@ public class HttpShardHandlerFactory extends ShardHandlerFactory implements org. urls.set(i, buildUrl(urls.get(i))); } - // - // Shuffle the list instead of use round-robin by default. - // This prevents accidental synchronization where multiple shards could get in sync - // and query the same replica at the same time. - // - if (urls.size() > 1) - Collections.shuffle(urls, r); - return urls; } + ReplicaListTransformer getReplicaListTransformer(final SolrQueryRequest req) + { + return shufflingReplicaListTransformer; + } + /** * Creates a new completion service for use by a single set of distributed requests. */ diff --git a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java index 09fc74b8026..c0484eca0dc 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java +++ b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java @@ -973,8 +973,7 @@ public class QueryComponent extends SearchComponent // Merge the docs via a priority queue so we don't have to sort *all* of the // documents... we only need to order the top (rows+start) - ShardFieldSortedHitQueue queue; - queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount(), rb.req.getSearcher()); + final ShardFieldSortedHitQueue queue = new ShardFieldSortedHitQueue(sortFields, ss.getOffset() + ss.getCount(), rb.req.getSearcher()); NamedList shardInfo = null; if(rb.req.getParams().getBool(ShardParams.SHARDS_INFO, false)) { diff --git a/solr/core/src/java/org/apache/solr/handler/component/ReplicaListTransformer.java b/solr/core/src/java/org/apache/solr/handler/component/ReplicaListTransformer.java new file mode 100644 index 00000000000..bf30fa61726 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/component/ReplicaListTransformer.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.component; + +import java.util.List; + +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.params.ShardParams; + +interface ReplicaListTransformer { + + /** + * Transforms the passed in list of choices. Transformations can include (but are not limited to) + * reordering of elements (e.g. via shuffling) and removal of elements (i.e. filtering). 
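// A filtering implementation can simply mutate the list in place. A minimal
// sketch, along the lines of the ToyMatchingReplicaListTransformer defined in
// ReplicaListTransformerTest later in this patch (raw List, Java 8; "regex"
// assumed to be a constructor-supplied field):
//
//   public void transform(List choices) {
//     choices.removeIf(choice -> {
//       final String url = (choice instanceof Replica)
//           ? ((Replica) choice).getCoreUrl() : choice.toString();
//       return !url.matches(regex); // keep only matching choices
//     });
//   }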
+ * + * @param choices - a list of choices to transform, typically the choices are {@link Replica} objects but choices + * can also be {@link String} objects such as URLs passed in via the {@link ShardParams#SHARDS} parameter. + */ + public void transform(List choices); + +} diff --git a/solr/core/src/java/org/apache/solr/handler/component/ShufflingReplicaListTransformer.java b/solr/core/src/java/org/apache/solr/handler/component/ShufflingReplicaListTransformer.java new file mode 100644 index 00000000000..428e3489cf4 --- /dev/null +++ b/solr/core/src/java/org/apache/solr/handler/component/ShufflingReplicaListTransformer.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.component; + +import java.util.Collections; +import java.util.List; +import java.util.Random; + +class ShufflingReplicaListTransformer implements ReplicaListTransformer { + + private final Random r; + + public ShufflingReplicaListTransformer(Random r) + { + this.r = r; + } + + public void transform(List choices) + { + if (choices.size() > 1) { + Collections.shuffle(choices, r); + } + } + +} diff --git a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java index 0dab34bf506..3714bf1b9f6 100644 --- a/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java +++ b/solr/core/src/java/org/apache/solr/request/DocValuesFacets.java @@ -173,17 +173,18 @@ public class DocValuesFacets { int min=mincount-1; // the smallest value in the top 'N' values for (int i=(startTermIndex==-1)?1:0; imin) { // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). 
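// The block added below drops terms that fail the facet.contains check before
// they can enter the top-N priority queue. The semantics assumed of
// SimpleFacets.contains(term, contains, ignoreCase) are a plain substring
// test, roughly:
//
//   static boolean contains(String term, String contains, boolean ignoreCase) {
//     return ignoreCase
//         ? term.toLowerCase(Locale.ROOT).contains(contains.toLowerCase(Locale.ROOT))
//         : term.contains(contains);
//   }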
+ if (contains != null) { + final BytesRef term = si.lookupOrd(startTermIndex+i); + if (!SimpleFacets.contains(term.utf8ToString(), contains, ignoreCase)) { + continue; + } + } + // smaller term numbers sort higher, so subtract the term number instead long pair = (((long)c)<<32) + (Integer.MAX_VALUE - i); boolean displaced = queue.insert(pair); diff --git a/solr/core/src/java/org/apache/solr/request/macro/MacroExpander.java b/solr/core/src/java/org/apache/solr/request/macro/MacroExpander.java index 305a9759f7c..9d432fa6ec6 100644 --- a/solr/core/src/java/org/apache/solr/request/macro/MacroExpander.java +++ b/solr/core/src/java/org/apache/solr/request/macro/MacroExpander.java @@ -71,6 +71,8 @@ public class MacroExpander { newValues.add(vv); } } + } + if (newValues != null) { newValues.add(newV); } } diff --git a/solr/core/src/java/org/apache/solr/response/JSONResponseWriter.java b/solr/core/src/java/org/apache/solr/response/JSONResponseWriter.java index 206bbd6d832..462c6561870 100644 --- a/solr/core/src/java/org/apache/solr/response/JSONResponseWriter.java +++ b/solr/core/src/java/org/apache/solr/response/JSONResponseWriter.java @@ -24,7 +24,11 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.solr.common.IteratorWriter; +import org.apache.solr.common.MapWriter.EntryWriter; +import org.apache.solr.common.PushWriter; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.MapWriter; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; @@ -74,6 +78,11 @@ public class JSONResponseWriter implements QueryResponseWriter { public String getContentType(SolrQueryRequest request, SolrQueryResponse response) { return contentType; } + + public static PushWriter getPushWriter(Writer writer, SolrQueryRequest req, SolrQueryResponse rsp) { + return new JSONWriter(writer, req, rsp); + } + } class JSONWriter extends TextResponseWriter { @@ -507,6 +516,53 @@ class JSONWriter extends TextResponseWriter { } } + @Override + public void writeIterator(IteratorWriter val) throws IOException { + writeArrayOpener(-1); + incLevel(); + val.writeIter(new IteratorWriter.ItemWriter() { + boolean first = true; + + @Override + public IteratorWriter.ItemWriter add(Object o) throws IOException { + if (!first) { + JSONWriter.this.indent(); + JSONWriter.this.writeArraySeparator(); + } + JSONWriter.this.writeVal(null, o); + first = false; + return this; + } + }); + decLevel(); + writeArrayCloser(); + } + + @Override + public void writeMap(MapWriter val) + throws IOException { + writeMapOpener(-1); + incLevel(); + + val.writeMap(new EntryWriter() { + boolean isFirst = true; + + @Override + public EntryWriter put(String k, Object v) throws IOException { + if (isFirst) { + isFirst = false; + } else { + JSONWriter.this.writeMapSeparator(); + } + if (doIndent) JSONWriter.this.indent(); + JSONWriter.this.writeKey(k, true); + JSONWriter.this.writeVal(k, v); + return this; + } + }); + decLevel(); + writeMapCloser(); + } @Override public void writeMap(String name, Map val, boolean excludeOuter, boolean isFirstVal) throws IOException { @@ -544,12 +600,14 @@ class JSONWriter extends TextResponseWriter { public void writeArray(String name, List l) throws IOException { writeArrayOpener(l.size()); writeJsonIter(l.iterator()); + writeArrayCloser(); } @Override public void writeArray(String name, Iterator val) throws IOException { writeArrayOpener(-1); // no trivial way to determine array size 
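// Both writeArray overloads now emit their own writeArrayCloser(), and
// writeJsonIter (below) no longer closes the array it iterates; this keeps
// opener/closer responsibility with the caller, matching how the new
// writeIterator above brackets IteratorWriter output.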
writeJsonIter(val); + writeArrayCloser(); } private void writeJsonIter(Iterator val) throws IOException { @@ -564,7 +622,6 @@ class JSONWriter extends TextResponseWriter { first=false; } decLevel(); - writeArrayCloser(); } // @@ -634,11 +691,6 @@ class ArrayOfNamedValuePairJSONWriter extends JSONWriter { } } - @Override - public void writeArray(String name, List l) throws IOException { - writeArray(name, l.iterator()); - } - @Override public void writeNamedList(String name, NamedList val) throws IOException { diff --git a/solr/core/src/java/org/apache/solr/response/TextResponseWriter.java b/solr/core/src/java/org/apache/solr/response/TextResponseWriter.java index 255d3385b9a..c4c29943f90 100644 --- a/solr/core/src/java/org/apache/solr/response/TextResponseWriter.java +++ b/solr/core/src/java/org/apache/solr/response/TextResponseWriter.java @@ -31,9 +31,12 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexableField; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.EnumFieldValue; +import org.apache.solr.common.IteratorWriter; import org.apache.solr.common.MapSerializable; +import org.apache.solr.common.PushWriter; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.MapWriter; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.util.Base64; import org.apache.solr.common.util.NamedList; @@ -48,7 +51,7 @@ import org.apache.solr.util.FastWriter; * * */ -public abstract class TextResponseWriter { +public abstract class TextResponseWriter implements PushWriter { // indent up to 40 spaces static final char[] indentChars = new char[81]; @@ -138,19 +141,19 @@ public abstract class TextResponseWriter { writeStr(name, f.stringValue(), true); } } else if (val instanceof Number) { - writeNumber(name, (Number)val); + writeNumber(name, (Number) val); } else if (val instanceof Boolean) { - writeBool(name, (Boolean)val); + writeBool(name, (Boolean) val); } else if (val instanceof Date) { - writeDate(name,(Date)val); + writeDate(name, (Date) val); } else if (val instanceof Document) { SolrDocument doc = DocsStreamer.getDoc((Document) val, schema); - writeSolrDocument(name, doc,returnFields, 0 ); + writeSolrDocument(name, doc, returnFields, 0); } else if (val instanceof SolrDocument) { - writeSolrDocument(name, (SolrDocument)val,returnFields, 0); + writeSolrDocument(name, (SolrDocument) val, returnFields, 0); } else if (val instanceof ResultContext) { // requires access to IndexReader - writeDocuments(name, (ResultContext)val); + writeDocuments(name, (ResultContext) val); } else if (val instanceof DocList) { // Should not happen normally ResultContext ctx = new BasicResultContext((DocList)val, returnFields, null, null, req); @@ -168,6 +171,8 @@ public abstract class TextResponseWriter { writeNamedList(name, (NamedList)val); } else if (val instanceof Path) { writeStr(name, ((Path) val).toAbsolutePath().toString(), true); + } else if (val instanceof IteratorWriter) { + writeIterator((IteratorWriter) val); } else if (val instanceof Iterable) { writeArray(name,((Iterable)val).iterator()); } else if (val instanceof Object[]) { @@ -184,6 +189,8 @@ public abstract class TextResponseWriter { writeStr(name, val.toString(), true); } else if (val instanceof WriteableValue) { ((WriteableValue)val).write(name, this); + } else if (val instanceof MapWriter) { + writeMap((MapWriter) val); } else if (val instanceof MapSerializable) { //todo find a better way to reuse the map 
more efficiently writeMap(name, ((MapSerializable) val).toMap(new LinkedHashMap<>()), false, true); @@ -192,6 +199,15 @@ public abstract class TextResponseWriter { writeStr(name, val.getClass().getName() + ':' + val.toString(), true); } } + @Override + public void writeMap(MapWriter mw) throws IOException { + //todo + } + + @Override + public void writeIterator(IteratorWriter iw) throws IOException { + /*todo*/ + } protected void writeBool(String name , Boolean val) throws IOException { writeBool(name, val.toString()); diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java index c1d28824ade..84b11a0cce0 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetProcessor.java @@ -94,20 +94,58 @@ public abstract class FacetProcessor { List qlist = new ArrayList<>(freq.domain.filters.size()); // TODO: prevent parsing filters each time! for (Object rawFilter : freq.domain.filters) { - Query symbolicFilter; if (rawFilter instanceof String) { QParser parser = null; try { parser = QParser.getParser((String)rawFilter, fcontext.req); - symbolicFilter = parser.getQuery(); + Query symbolicFilter = parser.getQuery(); + qlist.add(symbolicFilter); } catch (SyntaxError syntaxError) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, syntaxError); } + } else if (rawFilter instanceof Map) { + + Map m = (Map) rawFilter; + String type; + Object args; + + if (m.size() == 1) { + Map.Entry entry = m.entrySet().iterator().next(); + type = entry.getKey(); + args = entry.getValue(); + } else { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Can't convert map to query:" + rawFilter); + } + + if (!"param".equals(type)) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown type. Can't convert map to query:" + rawFilter); + } + + String tag; + if (!(args instanceof String)) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Can't retrieve non-string param:" + args); + } + tag = (String)args; + + String[] qstrings = fcontext.req.getParams().getParams(tag); + + if (qstrings != null) { + for (String qstring : qstrings) { + QParser parser = null; + try { + parser = QParser.getParser((String) qstring, fcontext.req); + Query symbolicFilter = parser.getQuery(); + qlist.add(symbolicFilter); + } catch (SyntaxError syntaxError) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, syntaxError); + } + } + } + } else { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Bad query (expected a string):" + rawFilter); } - qlist.add(symbolicFilter); } this.filter = fcontext.searcher.getDocSet(qlist); @@ -363,24 +401,29 @@ public abstract class FacetProcessor { void processSubs(SimpleOrderedMap response, Query filter, DocSet domain) throws IOException { - // TODO: what if a zero bucket has a sub-facet with an exclusion that would yield results? - // should we check for domain-altering exclusions, or even ask the sub-facet for - // it's domain and then only skip it if it's 0? - - if (domain == null || domain.size() == 0 && !freq.processEmpty) { - return; - } + boolean emptyDomain = domain == null || domain.size() == 0; for (Map.Entry sub : freq.getSubFacets().entrySet()) { + FacetRequest subRequest = sub.getValue(); + + // This includes a static check if a sub-facet can possibly produce something from + // an empty domain. Should this be changed to a dynamic check as well? 
That would + // probably require actually executing the facet anyway, and dropping it at the + // end if it was unproductive. + if (emptyDomain && !freq.processEmpty && !subRequest.canProduceFromEmpty()) { + continue; + } + // make a new context for each sub-facet since they can change the domain FacetContext subContext = fcontext.sub(filter, domain); - FacetProcessor subProcessor = sub.getValue().createFacetProcessor(subContext); + FacetProcessor subProcessor = subRequest.createFacetProcessor(subContext); + if (fcontext.getDebugInfo() != null) { // if fcontext.debugInfo != null, it means rb.debug() == true FacetDebugInfo fdebug = new FacetDebugInfo(); subContext.setDebugInfo(fdebug); fcontext.getDebugInfo().addChild(fdebug); - fdebug.setReqDescription(sub.getValue().getFacetDescription()); + fdebug.setReqDescription(subRequest.getFacetDescription()); fdebug.setProcessor(subProcessor.getClass().getSimpleName()); if (subContext.filter != null) fdebug.setFilter(subContext.filter.toString()); diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java b/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java index 273466cc622..9f68380dfad 100644 --- a/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java +++ b/solr/core/src/java/org/apache/solr/search/facet/FacetRequest.java @@ -88,6 +88,16 @@ public abstract class FacetRequest { public boolean toChildren; public String parents; // identifies the parent filter... the full set of parent documents for any block join operation public List filters; // list of symbolic filters (JSON query format) + + // True if a starting set of documents can be mapped onto a different set of documents not originally in the starting set. + public boolean canTransformDomain() { + return toParent || toChildren || excludeTags != null; + } + + // Can this domain become non-empty if the input domain is empty? This does not check any sub-facets (see canProduceFromEmpty for that) + public boolean canBecomeNonEmpty() { + return excludeTags != null; + } } public FacetRequest() { @@ -119,6 +129,15 @@ public abstract class FacetRequest { return false; } + /** Returns true if this facet, or any sub-facets can produce results from an empty domain. 
*/ + public boolean canProduceFromEmpty() { + if (domain != null && domain.canBecomeNonEmpty()) return true; + for (FacetRequest freq : subFacets.values()) { + if (freq.canProduceFromEmpty()) return true; + } + return false; + } + public void addStat(String key, AggValueSource stat) { facetStats.put(key, stat); } diff --git a/solr/core/src/java/org/apache/solr/util/TestInjection.java b/solr/core/src/java/org/apache/solr/util/TestInjection.java index efd80bf8de0..0d0c023472d 100644 --- a/solr/core/src/java/org/apache/solr/util/TestInjection.java +++ b/solr/core/src/java/org/apache/solr/util/TestInjection.java @@ -24,6 +24,7 @@ import java.util.Random; import java.util.Set; import java.util.Timer; import java.util.TimerTask; +import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -110,6 +111,8 @@ public class TestInjection { public static String updateRandomPause = null; + public static String prepRecoveryOpPauseForever = null; + public static String randomDelayInCoreCreation = null; public static int randomDelayMaxInCoreCreationInSec = 10; @@ -118,6 +121,8 @@ public class TestInjection { private static Set timers = Collections.synchronizedSet(new HashSet()); + private static AtomicInteger countPrepRecoveryOpPauseForever = new AtomicInteger(0); + public static void reset() { nonGracefullClose = null; failReplicaRequests = null; @@ -127,6 +132,8 @@ public class TestInjection { updateRandomPause = null; randomDelayInCoreCreation = null; splitFailureBeforeReplicaCreation = null; + prepRecoveryOpPauseForever = null; + countPrepRecoveryOpPauseForever = new AtomicInteger(0); for (Timer timer : timers) { timer.cancel(); @@ -289,6 +296,31 @@ public class TestInjection { return true; } + public static boolean injectPrepRecoveryOpPauseForever() { + if (prepRecoveryOpPauseForever != null) { + Random rand = random(); + if (null == rand) return true; + + Pair pair = parseValue(prepRecoveryOpPauseForever); + boolean enabled = pair.first(); + int chanceIn100 = pair.second(); + // Prevent for continuous pause forever + if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 2) { + countPrepRecoveryOpPauseForever.incrementAndGet(); + log.info("inject pause forever for prep recovery op"); + try { + Thread.sleep(Integer.MAX_VALUE); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } else { + countPrepRecoveryOpPauseForever.set(0); + } + } + + return true; + } + public static boolean injectSplitFailureBeforeReplicaCreation() { if (splitFailureBeforeReplicaCreation != null) { Random rand = random(); diff --git a/solr/core/src/resources/ImplicitPlugins.json b/solr/core/src/resources/ImplicitPlugins.json index 8bf21069614..34e5c07170d 100644 --- a/solr/core/src/resources/ImplicitPlugins.json +++ b/solr/core/src/resources/ImplicitPlugins.json @@ -92,14 +92,16 @@ "useParams":"_ADMIN_FILE" }, "/export": { - "class": "solr.SearchHandler", + "class": "solr.ExportHandler", "useParams":"_EXPORT", "components": [ "query" ], + "defaults": { + "wt": "json" + }, "invariants": { "rq": "{!xport}", - "wt": "xsort", "distrib": false } }, diff --git a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java index c7b27450938..f39cfed48d8 100644 --- a/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java +++ 
b/solr/core/src/test/org/apache/solr/cloud/AbstractCloudBackupRestoreTestCase.java @@ -38,6 +38,7 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.ImplicitDocRouter; import org.apache.solr.common.cloud.Slice; +import org.apache.solr.common.params.CollectionAdminParams; import org.apache.solr.common.params.CoreAdminParams; import org.junit.BeforeClass; import org.junit.Test; @@ -124,9 +125,24 @@ public abstract class AbstractCloudBackupRestoreTestCase extends SolrCloudTestCa } testBackupAndRestore(getCollectionName()); + testConfigBackupOnly("conf1", getCollectionName()); testInvalidPath(getCollectionName()); } + /** + * This test validates the backup of collection configuration using + * {@linkplain CollectionAdminParams#NO_INDEX_BACKUP_STRATEGY}. + * + * @param configName The config name for the collection to be backed up. + * @param collectionName The name of the collection to be backed up. + * @throws Exception in case of errors. + */ + protected void testConfigBackupOnly(String configName, String collectionName) throws Exception { + // This is deliberately no-op since we want to run this test only for one of the backup repository + // implementation (mainly to avoid redundant test execution). Currently HDFS backup repository test + // implements this. + } + // This test verifies the system behavior when the backup location cluster property is configured with an invalid // value for the specified repository (and the default backup location is not configured in solr.xml). private void testInvalidPath(String collectionName) throws Exception { diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java index 2a7413c41e2..e2f3bfd244c 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java @@ -37,6 +37,8 @@ import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.core.SolrCore; import org.apache.solr.update.DirectUpdateHandler2; import org.apache.solr.update.UpdateLog; +import org.apache.solr.util.TestInjection; +import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -47,6 +49,7 @@ public class TestCloudRecovery extends SolrCloudTestCase { @BeforeClass public static void setupCluster() throws Exception { + TestInjection.prepRecoveryOpPauseForever = "true:30"; System.setProperty("solr.directoryFactory", "solr.StandardDirectoryFactory"); System.setProperty("solr.ulog.numRecordsToKeep", "1000"); @@ -62,6 +65,11 @@ public class TestCloudRecovery extends SolrCloudTestCase { false, true, 30); } + @AfterClass + public static void afterClass() { + TestInjection.reset(); + } + @Before public void resetCollection() throws IOException, SolrServerException { cluster.getSolrClient().deleteByQuery(COLLECTION, "*:*"); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestHdfsCloudBackupRestore.java b/solr/core/src/test/org/apache/solr/cloud/TestHdfsCloudBackupRestore.java index 5fd7666d411..40a6e30f9d8 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestHdfsCloudBackupRestore.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestHdfsCloudBackupRestore.java @@ -16,10 +16,18 @@ */ package org.apache.solr.cloud; +import static org.apache.solr.cloud.OverseerCollectionMessageHandler.COLL_CONF; +import static org.apache.solr.core.backup.BackupManager.*; + import 
java.io.IOException; import java.lang.invoke.MethodHandles; import java.net.URI; import java.net.URISyntaxException; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakFilters; import org.apache.commons.io.IOUtils; @@ -28,7 +36,14 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction; +import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.cloud.hdfs.HdfsTestUtil; +import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.params.CollectionAdminParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.core.backup.BackupManager; +import org.apache.solr.core.backup.repository.HdfsBackupRepository; import org.apache.solr.util.BadHdfsThreadsFilter; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -144,4 +159,45 @@ public class TestHdfsCloudBackupRestore extends AbstractCloudBackupRestoreTestCa public String getBackupLocation() { return null; } + + protected void testConfigBackupOnly(String configName, String collectionName) throws Exception { + String backupName = "configonlybackup"; + CloudSolrClient solrClient = cluster.getSolrClient(); + + CollectionAdminRequest.Backup backup = CollectionAdminRequest.backupCollection(collectionName, backupName) + .setRepositoryName(getBackupRepoName()) + .setIndexBackupStrategy(CollectionAdminParams.NO_INDEX_BACKUP_STRATEGY); + backup.process(solrClient); + + Map params = new HashMap<>(); + params.put("location", "/backup"); + params.put("solr.hdfs.home", hdfsUri + "/solr"); + + HdfsBackupRepository repo = new HdfsBackupRepository(); + repo.init(new NamedList<>(params)); + BackupManager mgr = new BackupManager(repo, solrClient.getZkStateReader()); + + URI baseLoc = repo.createURI("/backup"); + + Properties props = mgr.readBackupProperties(baseLoc, backupName); + assertNotNull(props); + assertEquals(collectionName, props.getProperty(COLLECTION_NAME_PROP)); + assertEquals(backupName, props.getProperty(BACKUP_NAME_PROP)); + assertEquals(configName, props.getProperty(COLL_CONF)); + + DocCollection collectionState = mgr.readCollectionState(baseLoc, backupName, collectionName); + assertNotNull(collectionState); + assertEquals(collectionName, collectionState.getName()); + + URI configDirLoc = repo.resolve(baseLoc, backupName, ZK_STATE_DIR, CONFIG_STATE_DIR, configName); + assertTrue(repo.exists(configDirLoc)); + + Collection expected = Arrays.asList(BACKUP_PROPS_FILE, ZK_STATE_DIR); + URI backupLoc = repo.resolve(baseLoc, backupName); + String[] dirs = repo.listAll(backupLoc); + for (String d : dirs) { + assertTrue(expected.contains(d)); + } + } + } diff --git a/solr/core/src/test/org/apache/solr/core/SolrCoreTest.java b/solr/core/src/test/org/apache/solr/core/SolrCoreTest.java index 279907841c9..004039c7c37 100644 --- a/solr/core/src/test/org/apache/solr/core/SolrCoreTest.java +++ b/solr/core/src/test/org/apache/solr/core/SolrCoreTest.java @@ -97,7 +97,7 @@ public class SolrCoreTest extends SolrTestCaseJ4 { ++ihCount; assertEquals(pathToClassMap.get("/admin/system"), "solr.SystemInfoHandler"); ++ihCount; assertEquals(pathToClassMap.get("/admin/threads"), "solr.ThreadDumpHandler"); ++ihCount; 
assertEquals(pathToClassMap.get("/config"), "solr.SolrConfigHandler"); - ++ihCount; assertEquals(pathToClassMap.get("/export"), "solr.SearchHandler"); + ++ihCount; assertEquals(pathToClassMap.get("/export"), "solr.ExportHandler"); ++ihCount; assertEquals(pathToClassMap.get("/terms"), "solr.SearchHandler"); ++ihCount; assertEquals(pathToClassMap.get("/get"), "solr.RealTimeGetHandler"); ++ihCount; assertEquals(pathToClassMap.get(ReplicationHandler.PATH), "solr.ReplicationHandler"); diff --git a/solr/core/src/test/org/apache/solr/handler/component/ReplicaListTransformerTest.java b/solr/core/src/test/org/apache/solr/handler/component/ReplicaListTransformerTest.java new file mode 100644 index 00000000000..96d23192962 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/component/ReplicaListTransformerTest.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.solr.handler.component; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.solr.common.cloud.Replica; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.LocalSolrQueryRequest; +import org.apache.solr.request.SolrQueryRequest; +import org.junit.Test; + +public class ReplicaListTransformerTest extends LuceneTestCase { + + // A transformer that keeps only matching choices + private static class ToyMatchingReplicaListTransformer implements ReplicaListTransformer { + + private final String regex; + + public ToyMatchingReplicaListTransformer(String regex) + { + this.regex = regex; + } + + public void transform(List choices) + { + Iterator it = choices.iterator(); + while (it.hasNext()) { + Object choice = it.next(); + final String url; + if (choice instanceof String) { + url = (String)choice; + } + else if (choice instanceof Replica) { + url = ((Replica)choice).getCoreUrl(); + } else { + url = null; + } + if (url == null || !url.matches(regex)) { + it.remove(); + } + } + } + + } + + // A transformer that makes no transformation + private static class ToyNoOpReplicaListTransformer implements ReplicaListTransformer { + + public ToyNoOpReplicaListTransformer() + { + } + + public void transform(List choices) + { + // no-op + } + + } + + @Test + public void testTransform() throws Exception { + + final String regex = ".*" + random().nextInt(10) + ".*"; + + final ReplicaListTransformer transformer; + if (random().nextBoolean()) { + + transformer = new ToyMatchingReplicaListTransformer(regex); + + } else { + + transformer = new HttpShardHandlerFactory() { + + @Override + 
ReplicaListTransformer getReplicaListTransformer(final SolrQueryRequest req) + { + final SolrParams params = req.getParams(); + + if (params.getBool("toyNoTransform", false)) { + return new ToyNoOpReplicaListTransformer(); + } + + final String regex = params.get("toyRegEx"); + if (regex != null) { + return new ToyMatchingReplicaListTransformer(regex); + } + + return super.getReplicaListTransformer(req); + } + + }.getReplicaListTransformer( + new LocalSolrQueryRequest(null, + new ModifiableSolrParams().add("toyRegEx", regex))); + } + + final List inputs = new ArrayList<>(); + final List expectedTransformed = new ArrayList<>(); + + final List urls = createRandomUrls(); + for (int ii=0; ii propMap = new HashMap(); + propMap.put("base_url", url); + // a skeleton replica, good enough for this test's purposes + final Replica replica = new Replica(name, propMap); + + inputs.add(replica); + if (url.matches(regex)) { + expectedTransformed.add(replica); + } + } + + final List actualTransformed = new ArrayList<>(inputs); + transformer.transform(actualTransformed); + + assertEquals(expectedTransformed.size(), actualTransformed.size()); + for (int ii=0; ii urls, final String url) { + if (random().nextBoolean()) { + urls.add(url); + } + } + +} diff --git a/solr/core/src/test/org/apache/solr/handler/component/ShufflingReplicaListTransformerTest.java b/solr/core/src/test/org/apache/solr/handler/component/ShufflingReplicaListTransformerTest.java new file mode 100644 index 00000000000..26bb00879dd --- /dev/null +++ b/solr/core/src/test/org/apache/solr/handler/component/ShufflingReplicaListTransformerTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.solr.handler.component; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.solr.common.cloud.Replica; +import org.junit.Test; + +public class ShufflingReplicaListTransformerTest extends LuceneTestCase { + + private final ShufflingReplicaListTransformer transformer = new ShufflingReplicaListTransformer(random()); + + @Test + public void testTransformReplicas() throws Exception { + final List replicas = new ArrayList<>(); + for (final String url : createRandomUrls()) { + replicas.add(new Replica(url, new HashMap())); + } + implTestTransform(replicas); + } + + @Test + public void testTransformUrls() throws Exception { + final List urls = createRandomUrls(); + implTestTransform(urls); + } + + private void implTestTransform(List inputs) throws Exception { + final List transformedInputs = new ArrayList<>(inputs); + transformer.transform(transformedInputs); + + final Set inputSet = new HashSet<>(inputs); + final Set transformedSet = new HashSet<>(transformedInputs); + + assertTrue(inputSet.equals(transformedSet)); + } + + private final List createRandomUrls() throws Exception { + final List urls = new ArrayList<>(); + maybeAddUrl(urls, "a"+random().nextDouble()); + maybeAddUrl(urls, "bb"+random().nextFloat()); + maybeAddUrl(urls, "ccc"+random().nextGaussian()); + maybeAddUrl(urls, "dddd"+random().nextInt()); + maybeAddUrl(urls, "eeeee"+random().nextLong()); + Collections.shuffle(urls, random()); + return urls; + } + + private final void maybeAddUrl(final List urls, final String url) { + if (random().nextBoolean()) { + urls.add(url); + } + } + +} diff --git a/solr/core/src/test/org/apache/solr/request/macro/TestMacroExpander.java b/solr/core/src/test/org/apache/solr/request/macro/TestMacroExpander.java index 5b16a11e849..e908037cdd8 100644 --- a/solr/core/src/test/org/apache/solr/request/macro/TestMacroExpander.java +++ b/solr/core/src/test/org/apache/solr/request/macro/TestMacroExpander.java @@ -113,4 +113,17 @@ public class TestMacroExpander extends LuceneTestCase { } } + @Test + public void testMap() { // see SOLR-9740, the second fq param was being dropped. 
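// The test below exercises the MacroExpander change earlier in this patch:
// with a multi-valued fq mixing literal values and ${...} references, all
// four expanded values must come back, in order.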
+ final Map request = new HashMap<>(); + request.put("fq", new String[] {"zero", "${one_ref}", "two", "${three_ref}"}); + request.put("one_ref",new String[] {"one"}); + request.put("three_ref",new String[] {"three"}); + Map expanded = MacroExpander.expand(request); + assertEquals("zero", ((String[])expanded.get("fq"))[0]); + assertEquals("one", ((String[])expanded.get("fq"))[1]); + assertEquals("two", ((String[]) expanded.get("fq"))[2]); + assertEquals("three", ((String[]) expanded.get("fq"))[3]); + } + } diff --git a/solr/core/src/test/org/apache/solr/response/JSONWriterTest.java b/solr/core/src/test/org/apache/solr/response/JSONWriterTest.java index b096a094d5a..076d322437f 100644 --- a/solr/core/src/test/org/apache/solr/response/JSONWriterTest.java +++ b/solr/core/src/test/org/apache/solr/response/JSONWriterTest.java @@ -181,15 +181,19 @@ public class JSONWriterTest extends SolrTestCaseJ4 { methodsExpectedNotOverriden.add("writeMapOpener"); methodsExpectedNotOverriden.add("writeMapSeparator"); methodsExpectedNotOverriden.add("writeMapCloser"); + methodsExpectedNotOverriden.add("public void org.apache.solr.response.JSONWriter.writeArray(java.lang.String,java.util.List) throws java.io.IOException"); methodsExpectedNotOverriden.add("writeArrayOpener"); methodsExpectedNotOverriden.add("writeArraySeparator"); methodsExpectedNotOverriden.add("writeArrayCloser"); + methodsExpectedNotOverriden.add("public void org.apache.solr.response.JSONWriter.writeMap(org.apache.solr.common.MapWriter) throws java.io.IOException"); + methodsExpectedNotOverriden.add("public void org.apache.solr.response.JSONWriter.writeIterator(org.apache.solr.common.IteratorWriter) throws java.io.IOException"); final Class subClass = ArrayOfNamedValuePairJSONWriter.class; final Class superClass = subClass.getSuperclass(); for (final Method superClassMethod : superClass.getDeclaredMethods()) { final String methodName = superClassMethod.getName(); + final String methodFullName = superClassMethod.toString(); if (!methodName.startsWith("write")) continue; final int modifiers = superClassMethod.getModifiers(); @@ -197,7 +201,8 @@ public class JSONWriterTest extends SolrTestCaseJ4 { if (Modifier.isStatic(modifiers)) continue; if (Modifier.isPrivate(modifiers)) continue; - final boolean expectOverriden = !methodsExpectedNotOverriden.contains(methodName); + final boolean expectOverriden = !methodsExpectedNotOverriden.contains(methodName) + && !methodsExpectedNotOverriden.contains(methodFullName); try { final Method subClassMethod = subClass.getDeclaredMethod( @@ -215,7 +220,7 @@ public class JSONWriterTest extends SolrTestCaseJ4 { if (expectOverriden) { fail(subClass + " needs to override '" + superClassMethod + "'"); } else { - assertTrue(methodName+" not found in remaining "+methodsExpectedNotOverriden, methodsExpectedNotOverriden.remove(methodName)); + assertTrue(methodName+" not found in remaining "+methodsExpectedNotOverriden, methodsExpectedNotOverriden.remove(methodName)|| methodsExpectedNotOverriden.remove(methodFullName)); } } } diff --git a/solr/core/src/test/org/apache/solr/response/TestSortingResponseWriter.java b/solr/core/src/test/org/apache/solr/response/TestExportWriter.java similarity index 68% rename from solr/core/src/test/org/apache/solr/response/TestSortingResponseWriter.java rename to solr/core/src/test/org/apache/solr/response/TestExportWriter.java index 4b181330e3a..5a303e9893d 100644 --- a/solr/core/src/test/org/apache/solr/response/TestSortingResponseWriter.java +++ 
b/solr/core/src/test/org/apache/solr/response/TestExportWriter.java @@ -17,11 +17,12 @@ package org.apache.solr.response; import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.util.Utils; import org.junit.*; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; @SuppressCodecs({"Lucene3x", "Lucene40","Lucene41","Lucene42","Lucene45"}) -public class TestSortingResponseWriter extends SolrTestCaseJ4 { +public class TestExportWriter extends SolrTestCaseJ4 { @BeforeClass public static void beforeClass() throws Exception { System.setProperty("export.test", "true"); @@ -109,67 +110,71 @@ public class TestSortingResponseWriter extends SolrTestCaseJ4 { //Test single value DocValue output String s = h.query(req("q", "id:1", "qt", "/export", "fl", "floatdv,intdv,stringdv,longdv,doubledv", "sort", "intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"floatdv\":2.1,\"intdv\":1,\"stringdv\":\"hello world\",\"longdv\":323223232323,\"doubledv\":2344.345}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"floatdv\":2.1,\"intdv\":1,\"stringdv\":\"hello world\",\"longdv\":323223232323,\"doubledv\":2344.345}]}}"); //Test null value string: s = h.query(req("q", "id:7", "qt", "/export", "fl", "floatdv,intdv,stringdv,longdv,doubledv", "sort", "intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"floatdv\":2.1,\"intdv\":7,\"longdv\":323223232323,\"doubledv\":2344.345}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"floatdv\":2.1,\"intdv\":7,\"longdv\":323223232323,\"doubledv\":2344.345}]}}"); //Test multiValue docValues output s = h.query(req("q", "id:1", "qt", "/export", "fl", "intdv_m,floatdv_m,doubledv_m,longdv_m,stringdv_m", "sort", "intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"intdv_m\":[100,250],\"floatdv_m\":[123.321,345.123],\"doubledv_m\":[3444.222,23232.2],\"longdv_m\":[343332,43434343434],\"stringdv_m\":[\"Everton\",\"liverpool\",\"manchester \\\"city\\\"\"]}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"intdv_m\":[100,250],\"floatdv_m\":[123.321,345.123],\"doubledv_m\":[3444.222,23232.2],\"longdv_m\":[343332,43434343434],\"stringdv_m\":[\"Everton\",\"liverpool\",\"manchester \\\"city\\\"\"]}]}}"); //Test multiValues docValues output with nulls s = h.query(req("q", "id:7", "qt", "/export", "fl", "intdv_m,floatdv_m,doubledv_m,longdv_m,stringdv_m", "sort", "intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"floatdv_m\":[123.321,345.123],\"doubledv_m\":[3444.222,23232.2],\"longdv_m\":[343332,43434343434]}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"floatdv_m\":[123.321,345.123],\"doubledv_m\":[3444.222,23232.2],\"longdv_m\":[343332,43434343434]}]}}"); //Test single sort param is working s = h.query(req("q", "id:(1 2)", "qt", "/export", "fl", "intdv", "sort", "intdv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":2},{\"intdv\":1}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":2},{\"intdv\":1}]}}"); s = h.query(req("q", "id:(1 2)", "qt", "/export", 
"fl", "intdv", "sort", "intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":1},{\"intdv\":2}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":1},{\"intdv\":2}]}}"); // Test sort on String will null value. Null value should sort last on desc and first on asc. s = h.query(req("q", "id:(1 7)", "qt", "/export", "fl", "intdv", "sort", "stringdv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":1},{\"intdv\":7}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":1},{\"intdv\":7}]}}"); s = h.query(req("q", "id:(1 7)", "qt", "/export", "fl", "intdv", "sort", "stringdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":7},{\"intdv\":1}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":7},{\"intdv\":1}]}}"); //Test multi-sort params s = h.query(req("q", "id:(1 2)", "qt", "/export", "fl", "intdv", "sort", "floatdv asc,intdv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":2},{\"intdv\":1}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":2},{\"intdv\":1}]}}"); s = h.query(req("q", "id:(1 2)", "qt", "/export", "fl", "intdv", "sort", "floatdv desc,intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":1},{\"intdv\":2}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":2, \"docs\":[{\"intdv\":1},{\"intdv\":2}]}}"); //Test three sort fields s = h.query(req("q", "id:(1 2 3)", "qt", "/export", "fl", "intdv", "sort", "floatdv asc,stringdv asc,intdv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":2},{\"intdv\":1}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":2},{\"intdv\":1}]}}"); //Test three sort fields s = h.query(req("q", "id:(1 2 3)", "qt", "/export", "fl", "intdv", "sort", "floatdv asc,stringdv desc,intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":1},{\"intdv\":2},{\"intdv\":3}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":1},{\"intdv\":2},{\"intdv\":3}]}}"); //Test four sort fields s = h.query(req("q", "id:(1 2 3)", "qt", "/export", "fl", "intdv", "sort", "floatdv asc,floatdv desc,floatdv asc,intdv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":2},{\"intdv\":1}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":2},{\"intdv\":1}]}}"); s = h.query(req("q", "id:(1 2 3)", "qt", "/export", "fl", "intdv", "sort", "doubledv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":1},{\"intdv\":2}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, 
\"docs\":[{\"intdv\":3},{\"intdv\":1},{\"intdv\":2}]}}"); s = h.query(req("q", "intdv:[2 TO 1000]", "qt", "/export", "fl", "intdv", "sort", "doubledv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":7},{\"intdv\":2}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":3, \"docs\":[{\"intdv\":3},{\"intdv\":7},{\"intdv\":2}]}}"); s = h.query(req("q", "stringdv:blah", "qt", "/export", "fl", "intdv", "sort", "doubledv desc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":0, \"docs\":[]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":0, \"docs\":[]}}"); s = h.query(req("q", "id:8", "qt", "/export", "fl", "stringdv", "sort", "intdv asc")); - assertEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"stringdv\":\"chello \\\"world\\\"\"}]}}"); + assertJsonEquals(s, "{\"responseHeader\": {\"status\": 0}, \"response\":{\"numFound\":1, \"docs\":[{\"stringdv\":\"chello \\\"world\\\"\"}]}}"); + } + + private void assertJsonEquals(String actual, String expected) { + assertEquals(Utils.toJSONString(Utils.fromJSONString(expected)), Utils.toJSONString(Utils.fromJSONString(actual))); } @Test diff --git a/solr/core/src/test/org/apache/solr/response/TestPushWriter.java b/solr/core/src/test/org/apache/solr/response/TestPushWriter.java new file mode 100644 index 00000000000..043fe92db7a --- /dev/null +++ b/solr/core/src/test/org/apache/solr/response/TestPushWriter.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.response; + + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.lang.invoke.MethodHandles; +import java.util.Map; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.IteratorWriter; +import org.apache.solr.common.MapWriter; +import org.apache.solr.common.PushWriter; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.util.JavaBinCodec; +import org.apache.solr.common.util.Utils; +import org.apache.solr.request.LocalSolrQueryRequest; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Collections.singletonMap; + +public class TestPushWriter extends SolrTestCaseJ4 { + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + + public void testStandardResponse() throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + OutputStreamWriter osw = new OutputStreamWriter(baos, UTF_8); + PushWriter pw = new JSONWriter(osw, + new LocalSolrQueryRequest(null, new ModifiableSolrParams()), new SolrQueryResponse()); + writeData(pw); + osw.flush(); + log.info(new String(baos.toByteArray(), "UTF-8")); + Map m = (Map) Utils.fromJSON(baos.toByteArray()); + checkValues(m); + writeData(new JavaBinCodec(baos= new ByteArrayOutputStream(), null)); + m = (Map) new JavaBinCodec().unmarshal(new ByteArrayInputStream(baos.toByteArray())); + checkValues(m); + } + + protected void checkValues(Map m) { + assertEquals(0, ((Number)Utils.getObjectByPath(m, true, "responseHeader/status")).intValue()); + assertEquals(10, ((Number)Utils.getObjectByPath(m, true, "response/numFound")).intValue()); + assertEquals(1, ((Number)Utils.getObjectByPath(m, true, "response/docs[0]/id")).intValue()); + assertEquals(2, ((Number)Utils.getObjectByPath(m, true, "response/docs[1]/id")).intValue()); + assertEquals(3, ((Number)Utils.getObjectByPath(m, true, "response/docs[2]/id")).intValue()); + } + + protected void writeData(PushWriter pw) throws IOException { + pw.writeMap(m -> { + m.put("responseHeader", singletonMap("status", 0)) + .put("response", (MapWriter) m1 -> { + m1.put("numFound", 10) + .put("docs", (IteratorWriter) w -> { + w.add((MapWriter) m3 -> m3.put("id", 1)) + .add(singletonMap("id", 2)) + .add(singletonMap("id", 3)); + }); }); }); + pw.close(); + } +} diff --git a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java index 32c6ef11f62..32f9dfa9b4b 100644 --- a/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java +++ b/solr/core/src/test/org/apache/solr/search/facet/TestJsonFacets.java @@ -998,6 +998,25 @@ public class TestJsonFacets extends SolrTestCaseHS { "}" ); + // test sub-facets of empty buckets with domain filter exclusions (canProduceFromEmpty) (see SOLR-9519) + client.testJQ(params(p, "q", "*:*", "fq","{!tag=doc3}id:non-exist", "fq","{!tag=CATA}${cat_s}:A" + + , "json.facet", "{" + + "f1:{${terms} type:terms, field:${cat_s}, domain:{excludeTags:doc3} } " + + ",q1 :{type:query, q:'*:*', facet:{ f1:{${terms} type:terms, field:${cat_s}, domain:{excludeTags:doc3} } } } " + // nested under query + ",q1a:{type:query, q:'id:4', facet:{ f1:{${terms} type:terms, field:${cat_s}, domain:{excludeTags:doc3} } } } " + // nested under query, make sure id:4 filter still applies + ",r1 
:{type:range, field:${num_d}, start:0, gap:3, end:5, facet:{ f1:{${terms} type:terms, field:${cat_s}, domain:{excludeTags:doc3} } } } " + // nested under range, make sure range constraints still apply + ",f2:{${terms} type:terms, field:${cat_s}, domain:{filter:'*:*'} } " + // domain filter doesn't widen, so f2 should not appear. + "}" + ) + , "facets=={ count:0, " + + " f1:{ buckets:[ {val:A, count:2} ] }" + + ",q1:{ count:0, f1:{buckets:[{val:A, count:2}]} }" + + ",q1a:{ count:0, f1:{buckets:[{val:A, count:1}]} }" + + ",r1:{ buckets:[ {val:0.0,count:0,f1:{buckets:[{val:A, count:1}]}}, {val:3.0,count:0,f1:{buckets:[{val:A, count:1}]}} ] }" + + "}" + ); + // nested query facets on subset (with excludeTags) client.testJQ(params(p, "q", "*:*", "fq","{!tag=abc}id:(2 3)" , "json.facet", "{ processEmpty:true," + @@ -1165,26 +1184,30 @@ public class TestJsonFacets extends SolrTestCaseHS { // test filter - client.testJQ(params(p, "q", "*:*", "myfilt","${cat_s}:A" + client.testJQ(params(p, "q", "*:*", "myfilt","${cat_s}:A", "ff","-id:1", "ff","-id:2" , "json.facet", "{" + "t:{${terms} type:terms, field:${cat_s}, domain:{filter:[]} }" + // empty filter list ",t_filt:{${terms} type:terms, field:${cat_s}, domain:{filter:'${cat_s}:B'} }" + - ",t_filt2:{${terms} type:terms, field:${cat_s}, domain:{filter:'{!query v=$myfilt}'} }" + // test access to qparser and other query parameters - ",t_filt3:{${terms} type:terms, field:${cat_s}, domain:{filter:['-id:1','-id:2']} }" + + ",t_filt2 :{${terms} type:terms, field:${cat_s}, domain:{filter:'{!query v=$myfilt}'} }" + // test access to qparser and other query parameters + ",t_filt2a:{${terms} type:terms, field:${cat_s}, domain:{filter:{param:myfilt} } }" + // test filter via "param" type + ",t_filt3: {${terms} type:terms, field:${cat_s}, domain:{filter:['-id:1','-id:2']} }" + + ",t_filt3a:{${terms} type:terms, field:${cat_s}, domain:{filter:{param:ff}} }" + // test multi-valued query parameter ",q:{type:query, q:'${cat_s}:B', domain:{filter:['-id:5']} }" + // also tests a top-level negative filter ",r:{type:range, field:${num_d}, start:-5, end:10, gap:5, domain:{filter:'-id:4'} }" + "}" ) , "facets=={ count:6, " + - "t :{ buckets:[ {val:B, count:3}, {val:A, count:2} ] }" + - ",t_filt :{ buckets:[ {val:B, count:3}] } " + - ",t_filt2:{ buckets:[ {val:A, count:2}] } " + - ",t_filt3:{ buckets:[ {val:B, count:2}, {val:A, count:1}] } " + + "t :{ buckets:[ {val:B, count:3}, {val:A, count:2} ] }" + + ",t_filt :{ buckets:[ {val:B, count:3}] } " + + ",t_filt2 :{ buckets:[ {val:A, count:2}] } " + + ",t_filt2a:{ buckets:[ {val:A, count:2}] } " + + ",t_filt3 :{ buckets:[ {val:B, count:2}, {val:A, count:1}] } " + + ",t_filt3a:{ buckets:[ {val:B, count:2}, {val:A, count:1}] } " + ",q:{count:2}" + ",r:{buckets:[ {val:-5.0,count:1}, {val:0.0,count:1}, {val:5.0,count:0} ] }" + "}" ); - + } @Test @@ -1433,6 +1456,24 @@ public class TestJsonFacets extends SolrTestCaseHS { "}" ); + + // test other various ways to get filters + client.testJQ(params(p, "q", "*:*", "f1","-id:3.1", "f2","id:1" + , "json.facet", "{ " + + "pages1:{type:terms, field:v_t, domain:{blockChildren:'type_s:book', filter:[]} }" + + ",pages2:{type:terms, field:v_t, domain:{blockChildren:'type_s:book', filter:{param:f1} } }" + + ",books:{type:terms, field:v_t, domain:{blockParent:'type_s:book', filter:[{param:q},{param:missing_param}]} }" + + ",books2:{type:terms, field:v_t, domain:{blockParent:'type_s:book', filter:[{param:f2}] } }" + + "}" + ) + , "facets=={ count:10" + + ", pages1:{ buckets:[ 
{val:y,count:4},{val:x,count:3},{val:z,count:3} ] }" + + ", pages2:{ buckets:[ {val:y,count:4},{val:z,count:3},{val:x,count:2} ] }" + + ", books:{ buckets:[ {val:q,count:3},{val:e,count:2},{val:w,count:2} ] }" + + ", books2:{ buckets:[ {val:q,count:1} ] }" + + "}" + ); + } diff --git a/solr/example/files/conf/update-script.js b/solr/example/files/conf/update-script.js index 0991c889413..2589968b50d 100644 --- a/solr/example/files/conf/update-script.js +++ b/solr/example/files/conf/update-script.js @@ -72,13 +72,18 @@ function processAdd(cmd) { doc.setField("content_type_subtype_s", ct_subtype); } + var content = doc.getFieldValue("content"); + if (!content) { + return; // No content found, so we are done here + } + var analyzer = req.getCore().getLatestSchema() .getFieldTypeByName("text_email_url") .getIndexAnalyzer(); var token_stream = - analyzer.tokenStream("content", doc.getFieldValue("content")); + analyzer.tokenStream("content", content); var term_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.CharTermAttribute")); var type_att = token_stream.getAttribute(get_class("org.apache.lucene.analysis.tokenattributes.TypeAttribute")); token_stream.reset(); @@ -107,4 +112,4 @@ function processRollback(cmd) { function finish() { // no-op -} \ No newline at end of file +} diff --git a/solr/licenses/carrot2-mini-3.12.0.jar.sha1 b/solr/licenses/carrot2-mini-3.12.0.jar.sha1 deleted file mode 100644 index 5a90da6d079..00000000000 --- a/solr/licenses/carrot2-mini-3.12.0.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9d8b42afe43ba5c0a0c5d67208d5c919e45c3584 diff --git a/solr/licenses/carrot2-mini-3.15.0.jar.sha1 b/solr/licenses/carrot2-mini-3.15.0.jar.sha1 new file mode 100644 index 00000000000..cd87a99aaa0 --- /dev/null +++ b/solr/licenses/carrot2-mini-3.15.0.jar.sha1 @@ -0,0 +1 @@ +5d76ec388711056bfaaacc354ed04ffa6811c7b7 diff --git a/solr/server/scripts/cloud-scripts/snapshotscli.sh b/solr/server/scripts/cloud-scripts/snapshotscli.sh new file mode 100755 index 00000000000..41ae4aa0321 --- /dev/null +++ b/solr/server/scripts/cloud-scripts/snapshotscli.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash + +set -e + +run_solr_snapshot_tool() { + JVM="java" + scriptDir=$(dirname "$0") + if [ -n "$LOG4J_PROPS" ]; then + log4j_config="file:${LOG4J_PROPS}" + else + log4j_config="file:${scriptDir}/log4j.properties" + fi + PATH=${JAVA_HOME}/bin:${PATH} ${JVM} ${ZKCLI_JVM_FLAGS} -Dlog4j.configuration=${log4j_config} \ + -classpath "${solrLibPath}" org.apache.solr.core.snapshots.SolrSnapshotsTool "$@" 2> /dev/null +} + +usage() { + run_solr_snapshot_tool --help +} + +distcp_warning() { + echo "SOLR_USE_DISTCP environment variable is not set. \ + Do you want to use the hadoop distcp tool for exporting the Solr collection snapshot?" +} + +parse_options() { + OPTIND=3 + while getopts ":c:d:s:z:p:r:i:" o ; do + case "${o}" in + d) + destPath=${OPTARG} + ;; + s) + sourcePath=${OPTARG} + ;; + c) + collectionName=${OPTARG} + ;; + z) + solrZkEnsemble=${OPTARG} + ;; + p) + pathPrefix=${OPTARG} + ;; + r) + backupRepoName=${OPTARG} + ;; + i) + asyncReqId=${OPTARG} + ;; + *) + echo "Unknown option ${OPTARG}" + usage 1>&2 + exit 1 + ;; + esac + done +} + +prepare_snapshot_export() { + # Make sure to clean up the temporary files. 
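+ # The scratch directory holds the per-shard copy-listing files that + # SolrSnapshotsTool emits via its -t option; the trap below removes it on EXIT.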
+ scratch=$(mktemp -d -t solrsnaps.XXXXXXXXXX) + function finish { + rm -rf "${scratch}" + } + trap finish EXIT + + if hdfs dfs -test -d "${destPath}" ; then + run_solr_snapshot_tool --prepare-snapshot-export "$@" -t "${scratch}" + + hdfs dfs -mkdir -p "${copyListingDirPath}" > /dev/null + find "${scratch}" -type f -printf "%f\n" | while read shardId; do + echo "Copying the copy-listing for $shardId" + hdfs dfs -copyFromLocal "${scratch}/${shardId}" "${copyListingDirPath}" > /dev/null + done + else + echo "Directory ${destPath} does not exist." + exit 1 + fi +} + +copy_snapshot_files() { + copylisting_dir_path="$1" + + if hdfs dfs -test -d "${copylisting_dir_path}" ; then + for shardId in $(hdfs dfs -stat "%n" "${copylisting_dir_path}/*"); do + oPath="${destPath}/${snapshotName}/snapshot.${shardId}" + echo "Copying the index files for ${shardId} to ${oPath}" + ${distCpCmd} -f "${copylisting_dir_path}/${shardId}" "${oPath}" > /dev/null + done + else + echo "Directory ${copylisting_dir_path} does not exist." + exit 1 + fi +} + +collectionName="" +solrZkEnsemble="" +pathPrefix="" +destPath="" +sourcePath="" +cmd="$1" +snapshotName="$2" +copyListingDirPath="" +distCpCmd="${SOLR_DISTCP_CMD:-hadoop distcp}" +scriptDir=$(dirname "$0") +solrLibPath="${SOLR_LIB_PATH:-${scriptDir}/../../solr-webapp/webapp/WEB-INF/lib/*:${scriptDir}/../../lib/ext/*}" + +case "${cmd}" in + --create) + run_solr_snapshot_tool "$@" + ;; + --delete) + run_solr_snapshot_tool "$@" + ;; + --list) + run_solr_snapshot_tool "$@" + ;; + --describe) + run_solr_snapshot_tool "$@" + ;; + --prepare-snapshot-export) + : "${SOLR_USE_DISTCP:? $(distcp_warning)}" + + parse_options "$@" + + : "${destPath:? Please specify destination directory using -d option}" + + copyListingDirPath="${destPath}/copylistings" + prepare_snapshot_export "${@:2}" + echo "Done. Goodbye!" + ;; + --export) + if [ -z "${SOLR_USE_DISTCP}" ]; then + run_solr_snapshot_tool "$@" + echo "Done. Goodbye!" + exit 0 + fi + + parse_options "$@" + + : "${snapshotName:? Please specify the name of the snapshot}" + : "${destPath:? Please specify destination directory using -d option}" + + if [ -n "${collectionName}" ] && [ -n "${sourcePath}" ]; then + echo "The -c and -s options cannot be specified together" + exit 1 + fi + + if [ -z "${collectionName}" ] && [ -z "${sourcePath}" ]; then + echo "At least one of options (-c or -s) must be specified" + exit 1 + fi + + if [ -n "${collectionName}" ]; then + copyListingDirPath="${destPath}/${snapshotName}/copylistings" + prepare_snapshot_export "${@:2}" + copy_snapshot_files "${destPath}/${snapshotName}/copylistings" + hdfs dfs -rm -r -f -skipTrash "${destPath}/${snapshotName}/copylistings" > /dev/null + else + copy_snapshot_files "${sourcePath}/copylistings" + echo "Copying the collection meta-data to ${destPath}/${snapshotName}" + ${distCpCmd} "${sourcePath}/${snapshotName}/*" "${destPath}/${snapshotName}/" > /dev/null + fi + + echo "Done. Goodbye!" 
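+    # Illustrative invocation (hypothetical ZK ensemble and paths, not part of this patch): + #   SOLR_USE_DISTCP=true snapshotscli.sh --export snap1 \ + #     -c collection1 -z localhost:2181 -d /backups/solr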
+ ;; + --help) + usage 1>&2 + ;; + *) + echo "Unknown command ${cmd}" + usage 1>&2 + exit 1 +esac + diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Tuple.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Tuple.java index 207bc6ad9cb..58d948dd65f 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Tuple.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Tuple.java @@ -16,16 +16,15 @@ */ package org.apache.solr.client.solrj.io; +import java.io.IOException; import java.time.Instant; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; -import java.util.Map; import java.util.List; -import java.util.function.BiConsumer; +import java.util.Map; -import org.apache.solr.common.MapSerializable; -import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.MapWriter; /** * A simple abstraction of a record containing key/value pairs. @@ -34,7 +33,7 @@ import org.apache.solr.common.SolrDocument; * **/ -public class Tuple implements Cloneable, MapSerializable { +public class Tuple implements Cloneable, MapWriter { /** * When EOF field is true the Tuple marks the end of the stream. @@ -92,7 +91,7 @@ public class Tuple implements Cloneable, MapSerializable { } } - // Convenience method since Booleans can be pased around as Strings. + // Convenience method since Booleans can be passed around as Strings. public Boolean getBool(Object key) { Object o = this.fields.get(key); @@ -198,7 +197,13 @@ public class Tuple implements Cloneable, MapSerializable { } @Override - public Map toMap(Map map) { - return fields; + public void writeMap(EntryWriter ew) throws IOException { + fields.forEach((k, v) -> { + try { + ew.put((String)k,v); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); } } diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/TupleStream.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/TupleStream.java index 6f381ec68b5..49a806f1660 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/TupleStream.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/TupleStream.java @@ -19,21 +19,19 @@ package org.apache.solr.client.solrj.io.stream; import java.io.Closeable; import java.io.IOException; import java.io.Serializable; -import java.util.Collections; -import java.util.Iterator; import java.util.List; -import java.util.Map; import java.util.UUID; import org.apache.solr.client.solrj.io.Tuple; import org.apache.solr.client.solrj.io.comp.StreamComparator; import org.apache.solr.client.solrj.io.stream.expr.Explanation; import org.apache.solr.client.solrj.io.stream.expr.StreamFactory; -import org.apache.solr.common.MapSerializable; +import org.apache.solr.common.IteratorWriter; +import org.apache.solr.common.MapWriter; import org.apache.solr.common.SolrException; -public abstract class TupleStream implements Closeable, Serializable, MapSerializable { +public abstract class TupleStream implements Closeable, Serializable, MapWriter { private static final long serialVersionUID = 1; @@ -42,15 +40,6 @@ public abstract class TupleStream implements Closeable, Serializable, MapSeriali public TupleStream() { } -/* - public static void writeStreamOpen(Writer out) throws IOException { - out.write("{\"docs\":["); - } - - public static void writeStreamClose(Writer out) throws IOException { - out.write("]}"); - }*/ - public abstract void setStreamContext(StreamContext context); public abstract List children(); @@ -69,41 +58,25 @@ public abstract 
class TupleStream implements Closeable, Serializable, MapSeriali return 0; } - private boolean isOpen = false; - @Override - public Map toMap(Map map) { - try { - if (!isOpen) { - open(); - isOpen = true; - } - } catch (IOException e) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } - return Collections.singletonMap("docs", new Iterator() { - Tuple tuple; - boolean isEOF = false; - - @Override - public boolean hasNext() { - if (isEOF) return false; - if (tuple != null) return true; - try { - tuple = read(); - if(tuple != null && tuple.EOF) close(); - return tuple != null; - } catch (IOException e) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + public void writeMap(EntryWriter ew) throws IOException { + open(); + ew.put("docs", (IteratorWriter) iw -> { + try { + for (; ; ) { + Tuple tuple = read(); + if (tuple != null) { + iw.add(tuple); + if (tuple.EOF) { + close(); + break; + } + } else { + break; + } } - } - - @Override - public Tuple next() { - Tuple tmp = tuple; - tuple = null; - isEOF = tmp == null || tmp.EOF; - return tmp; + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); } }); } diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/expr/Explanation.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/expr/Explanation.java index 97f0192234c..e72d6edcf96 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/expr/Explanation.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/expr/Explanation.java @@ -17,7 +17,6 @@ package org.apache.solr.client.solrj.io.stream.expr; import java.util.ArrayList; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java index 92ea99bc0b8..0beaa55b644 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/request/CollectionAdminRequest.java @@ -713,10 +713,12 @@ public abstract class CollectionAdminRequest protected Optional repositoryName = Optional.empty(); protected String location; protected Optional commitName = Optional.empty(); + protected Optional indexBackupStrategy = Optional.empty(); public Backup(String collection, String name) { super(CollectionAction.BACKUP, collection); this.name = name; + this.repositoryName = Optional.empty(); } @Override @@ -760,6 +762,15 @@ public abstract class CollectionAdminRequest return this; } + public Optional getIndexBackupStrategy() { + return indexBackupStrategy; + } + + public Backup setIndexBackupStrategy(String indexBackupStrategy) { + this.indexBackupStrategy = Optional.ofNullable(indexBackupStrategy); + return this; + } + @Override public SolrParams getParams() { ModifiableSolrParams params = (ModifiableSolrParams) super.getParams(); @@ -772,6 +783,9 @@ public abstract class CollectionAdminRequest if (commitName.isPresent()) { params.set(CoreAdminParams.COMMIT_NAME, commitName.get()); } + if (indexBackupStrategy.isPresent()) { + params.set(CollectionAdminParams.INDEX_BACKUP_STRATEGY, indexBackupStrategy.get()); + } return params; } diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/Cluster.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/Cluster.java index ae3e529dab4..378e1a72758 100644 
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/Cluster.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/Cluster.java @@ -16,7 +16,9 @@ */ package org.apache.solr.client.solrj.response; +import java.util.Collections; import java.util.List; +import java.util.Objects; /** * This class represents a cluster of Solr Docs . @@ -28,41 +30,43 @@ public class Cluster { private List labels; private double score; private List docIds; + private List subclusters; + private boolean otherTopics; + + public Cluster(List labels, double score, List docIds) { + this(labels, score, docIds, Collections.emptyList(), false); + } /** * @param labels the list of human readable labels associated to the cluster * @param score the score produced by the clustering algorithm for the current cluster * @param docIds the list of document Ids belonging to the cluster */ - public Cluster(List labels, double score, List docIds) { + public Cluster(List labels, double score, List docIds, List subclusters, boolean otherTopics) { this.labels = labels; this.score = score; this.docIds = docIds; + this.subclusters = subclusters; + this.otherTopics = otherTopics; } @Override public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Cluster)) return false; + return o != null && + this.getClass().isInstance(o) && + equalsTo((Cluster) o); + } - Cluster cluster = (Cluster) o; - - if (Double.compare(cluster.score, score) != 0) return false; - if (!docIds.equals(cluster.docIds)) return false; - if (!labels.equals(cluster.labels)) return false; - - return true; + private boolean equalsTo(Cluster o) { + return Double.compare(o.score, score) == 0 && + Objects.equals(o.docIds, docIds) && + Objects.equals(o.labels, labels) && + Objects.equals(o.subclusters, subclusters); } @Override public int hashCode() { - int result; - long temp; - result = labels.hashCode(); - temp = Double.doubleToLongBits(score); - result = 31 * result + (int) (temp ^ (temp >>> 32)); - result = 31 * result + docIds.hashCode(); - return result; + return Objects.hash(subclusters, docIds, labels, score); } public List getLabels() { @@ -89,5 +93,15 @@ public class Cluster { this.docIds = docIds; } + public List getSubclusters() { + return subclusters; + } + /** + * @return If true, the cluster contains references to documents that are not semantically associated + * and form a group of documents not related to any other cluster (or themselves). + */ + public boolean isOtherTopics() { + return otherTopics; + } } \ No newline at end of file diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/response/ClusteringResponse.java b/solr/solrj/src/java/org/apache/solr/client/solrj/response/ClusteringResponse.java index ad6e0484970..73afb6b9e78 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/response/ClusteringResponse.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/response/ClusteringResponse.java @@ -15,8 +15,10 @@ * limitations under the License. 
*/ package org.apache.solr.client.solrj.response; -import java.util.LinkedList; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; import org.apache.solr.common.util.NamedList; @@ -24,21 +26,47 @@ import org.apache.solr.common.util.NamedList; * Encapsulates responses from ClusteringComponent */ public class ClusteringResponse { - + private static final String CLUSTERS_NODE = "clusters"; private static final String LABELS_NODE = "labels"; private static final String DOCS_NODE = "docs"; private static final String SCORE_NODE = "score"; - private List clusters = new LinkedList(); + private static final String IS_OTHER_TOPICS = "other-topics"; + private List clusters; + @SuppressWarnings("unchecked") public ClusteringResponse(List> clusterInfo) { + clusters = new ArrayList(); for (NamedList clusterNode : clusterInfo) { - List labelList; - List docIdList; - labelList = (List) clusterNode.get(LABELS_NODE); - double score = (double) clusterNode.get(SCORE_NODE); - docIdList = (List) clusterNode.get(DOCS_NODE); - Cluster currentCluster = new Cluster(labelList, score, docIdList); - clusters.add(currentCluster); + List labelList, docIdList; + List subclusters = Collections.emptyList(); + labelList = docIdList = Collections.emptyList(); + Double score = 0d; + boolean otherTopics = false; + for (Map.Entry e : clusterNode) { + switch (e.getKey()) { + case LABELS_NODE: + labelList = (List) e.getValue(); + break; + + case DOCS_NODE: + docIdList = (List) e.getValue(); + break; + + case SCORE_NODE: + score = (Double) e.getValue(); + break; + + case CLUSTERS_NODE: + subclusters = new ClusteringResponse((List>) e.getValue()).getClusters(); + break; + + case IS_OTHER_TOPICS: + otherTopics = (Boolean) e.getValue(); + break; + } + } + + clusters.add(new Cluster(labelList, score, docIdList, subclusters, otherTopics)); } } diff --git a/solr/solrj/src/java/org/apache/solr/common/IteratorWriter.java b/solr/solrj/src/java/org/apache/solr/common/IteratorWriter.java new file mode 100644 index 00000000000..0049a5bda99 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/common/IteratorWriter.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.common; + + +import java.io.IOException; + +/** + * Interface to support push writing of an array. + */ +public interface IteratorWriter { + /** + * @param iw after this method returns, the ItemWriter object is invalid. + * Do not hold a reference to it. + */ + void writeIter(ItemWriter iw) throws IOException; + + interface ItemWriter { + /** The item can be of any supported type. + */ + ItemWriter add(Object o) throws IOException; + + default ItemWriter add(int v) throws IOException { + add((Integer) v); + return this; + } + + + default ItemWriter add(long v) throws IOException { + add((Long) v); + return this; + } + + + default ItemWriter add(float v) throws IOException { + add((Float) v); + return this; + } + + default ItemWriter add(double v) throws IOException { + add((Double) v); + return this; + } + + default ItemWriter add(boolean v) throws IOException { + add((Boolean) v); + return this; + } + } +} diff --git a/solr/solrj/src/java/org/apache/solr/common/MapWriter.java b/solr/solrj/src/java/org/apache/solr/common/MapWriter.java new file mode 100644 index 00000000000..8fb9d031b27 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/common/MapWriter.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.common; + + +import java.io.IOException; +import java.util.Map; + +/** + * Use this interface to push all entries of a Map into an output. + * This avoids creating map instances and is supposed to be memory efficient.
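+ * A minimal sketch (names and values are illustrative only): + * <pre> + *   MapWriter pt = ew -> ew.put("x", 1).put("y", 2); + * </pre>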
+ * If the entries are primitives, unnecessary boxing is also avoided + */ +public interface MapWriter extends MapSerializable { + + @Override + default Map toMap(Map map) { + try { + writeMap(new EntryWriter() { + @Override + public EntryWriter put(String k, Object v) throws IOException { + map.put(k, v); + return this; + } + + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + return map; + } + + void writeMap(EntryWriter ew) throws IOException; + + /** + * An interface to push one entry at a time to the output + */ + interface EntryWriter { + + /** + * Writes a key value into the map + * + * @param k The key + * @param v The value can be any supported object + */ + EntryWriter put(String k, Object v) throws IOException; + + default EntryWriter put(String k, int v) throws IOException { + put(k, (Integer) v); + return this; + } + + + default EntryWriter put(String k, long v) throws IOException { + put(k, (Long) v); + return this; + } + + + default EntryWriter put(String k, float v) throws IOException { + put(k, (Float) v); + return this; + } + + default EntryWriter put(String k, double v) throws IOException { + put(k, (Double) v); + return this; + } + + default EntryWriter put(String k, boolean v) throws IOException { + put(k, (Boolean) v); + return this; + } + } +} diff --git a/solr/solrj/src/java/org/apache/solr/common/PushWriter.java b/solr/solrj/src/java/org/apache/solr/common/PushWriter.java new file mode 100644 index 00000000000..ddfac3cca52 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/common/PushWriter.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.common; + + +import java.io.Closeable; +import java.io.IOException; + +/**This is an interface to stream data out using a push API + * + */ +public interface PushWriter extends Closeable { + + /**Write a Map. The map is opened in the beginning of the method + * and closed at the end. All map entries MUST be written before this + * method returns + */ + void writeMap(MapWriter mw) throws IOException; + + /**Write an array. The array is opened at the beginning of this method + * and closed at the end. 
 All array entries must be written before this + * method returns + * + */ + void writeIterator(IteratorWriter iw) throws IOException; + +} diff --git a/solr/solrj/src/java/org/apache/solr/common/params/CollectionAdminParams.java b/solr/solrj/src/java/org/apache/solr/common/params/CollectionAdminParams.java index a8686a12322..98ae2928b09 100644 --- a/solr/solrj/src/java/org/apache/solr/common/params/CollectionAdminParams.java +++ b/solr/solrj/src/java/org/apache/solr/common/params/CollectionAdminParams.java @@ -16,6 +16,9 @@ */ package org.apache.solr.common.params; +import java.util.Arrays; +import java.util.Collection; + public interface CollectionAdminParams { /* Param used by DELETESTATUS call to clear all stored responses */ @@ -26,4 +29,25 @@ public interface CollectionAdminParams { String COUNT_PROP = "count"; + /** + * A parameter to specify the name of the index backup strategy to be used. + */ + String INDEX_BACKUP_STRATEGY = "indexBackup"; + + /** + * This constant defines the index backup strategy based on copying index files to the desired location. + */ + String COPY_FILES_STRATEGY = "copy-files"; + + /** + * This constant defines the strategy to not copy index files (useful for a meta-data only backup). + */ + String NO_INDEX_BACKUP_STRATEGY = "none"; + + /** + * This constant defines the list of valid index backup strategies. + */ + Collection INDEX_BACKUP_STRATEGIES = + Arrays.asList(COPY_FILES_STRATEGY, NO_INDEX_BACKUP_STRATEGY); + } diff --git a/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java b/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java index 737174e820d..9bb5ea6aa9e 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java @@ -33,7 +33,11 @@ import java.util.Map; import java.util.Map.Entry; import org.apache.solr.common.EnumFieldValue; +import org.apache.solr.common.IteratorWriter; +import org.apache.solr.common.IteratorWriter.ItemWriter; import org.apache.solr.common.MapSerializable; +import org.apache.solr.common.MapWriter; +import org.apache.solr.common.PushWriter; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; @@ -55,7 +59,7 @@ import org.noggit.CharArr; *

    * NOTE -- {@link JavaBinCodec} instances cannot be reused for more than one marshall or unmarshall operation. */ -public class JavaBinCodec { +public class JavaBinCodec implements PushWriter { public static final byte NULL = 0, @@ -79,7 +83,7 @@ public class JavaBinCodec { END = 15, SOLRINPUTDOC = 16, - SOLRINPUTDOC_CHILDS = 17, + MAP_ENTRY_ITER = 17, ENUM_FIELD_VALUE = 18, MAP_ENTRY = 19, // types that combine tag + length (or other info) in a single byte @@ -108,6 +112,16 @@ public class JavaBinCodec { writableDocFields =null; } + /** + * Use this constructor to use JavaBinCodec as a PushWriter; ensure that close() is called explicitly after use. + * + * @param os The output stream + */ + public JavaBinCodec(OutputStream os, ObjectResolver resolver) throws IOException { + this.resolver = resolver; + initWrite(os); + } + public JavaBinCodec(ObjectResolver resolver) { this(resolver, null); } @@ -127,17 +141,26 @@ } public void marshal(Object nl, OutputStream os) throws IOException { - assert !alreadyMarshalled; - init(FastOutputStream.wrap(os)); + initWrite(os); try { - daos.writeByte(VERSION); writeVal(nl); } finally { - daos.flushBuffer(); - alreadyMarshalled = true; + finish(); } } + protected void initWrite(OutputStream os) throws IOException { + assert !alreadyMarshalled; + init(FastOutputStream.wrap(os)); + daos.writeByte(VERSION); + } + + protected void finish() throws IOException { + closed = true; + daos.flushBuffer(); + alreadyMarshalled = true; + } + /** expert: sets a new output stream */ public void init(FastOutputStream os) { daos = os; @@ -281,6 +304,8 @@ return readEnumFieldValue(dis); case MAP_ENTRY: return readMapEntry(dis); + case MAP_ENTRY_ITER: + return readMapIter(dis); } throw new RuntimeException("Unknown type " + tagByte); @@ -296,6 +321,10 @@ writeSolrDocumentList((SolrDocumentList) val); return true; } + if (val instanceof IteratorWriter) { + writeIterator((IteratorWriter) val); + return true; + } if (val instanceof Collection) { writeArray((Collection) val); return true; @@ -313,6 +342,10 @@ writeSolrInputDocument((SolrInputDocument)val); return true; } + if (val instanceof MapWriter) { + writeMap((MapWriter) val); + return true; + } if (val instanceof Map) { writeMap((Map) val); return true; @@ -346,6 +379,60 @@ return false; } + private final MapWriter.EntryWriter ew = new MapWriter.EntryWriter() { + @Override + public MapWriter.EntryWriter put(String k, Object v) throws IOException { + writeExternString(k); + JavaBinCodec.this.writeVal(v); + return this; + } + + @Override + public MapWriter.EntryWriter put(String k, int v) throws IOException { + writeExternString(k); + JavaBinCodec.this.writeInt(v); + return this; + } + + @Override + public MapWriter.EntryWriter put(String k, long v) throws IOException { + writeExternString(k); + JavaBinCodec.this.writeLong(v); + return this; + } + + @Override + public MapWriter.EntryWriter put(String k, float v) throws IOException { + writeExternString(k); + JavaBinCodec.this.writeFloat(v); + return this; + } + + @Override + public MapWriter.EntryWriter put(String k, double v) throws IOException { + writeExternString(k); + JavaBinCodec.this.writeDouble(v); + return this; + } + + @Override + public MapWriter.EntryWriter put(String k, boolean v) throws IOException { + writeExternString(k); + writeBoolean(v); + return this; + } + }; + + + public void writeMap(MapWriter val) throws IOException {
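+    // Wire format (the inverse of readMapIter below): a MAP_ENTRY_ITER tag, then + // alternating key/value entries pushed through the EntryWriter above, closed by an END tag.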
+    writeTag(MAP_ENTRY_ITER); + val.writeMap(ew); + writeTag(END); + } + + public void writeTag(byte tag) throws IOException { daos.writeByte(tag); } @@ -503,6 +588,17 @@ } + public Map readMapIter(DataInputInputStream dis) throws IOException { + Map m = new LinkedHashMap<>(); + for (; ; ) { + Object key = readVal(dis); + if (key == END_OBJ) break; + Object val = readVal(dis); + m.put(key, val); + } + return m; + } + public Map readMap(DataInputInputStream dis) throws IOException { int sz = readVInt(dis); @@ -516,12 +612,56 @@ return m; } + private final ItemWriter itemWriter = new ItemWriter() { + @Override + public ItemWriter add(Object o) throws IOException { + writeVal(o); + return this; + } + + @Override + public ItemWriter add(int v) throws IOException { + writeInt(v); + return this; + } + + @Override + public ItemWriter add(long v) throws IOException { + writeLong(v); + return this; + } + + @Override + public ItemWriter add(float v) throws IOException { + writeFloat(v); + return this; + } + + @Override + public ItemWriter add(double v) throws IOException { + writeDouble(v); + return this; + } + + @Override + public ItemWriter add(boolean v) throws IOException { + writeBoolean(v); + return this; + } + }; + + @Override + public void writeIterator(IteratorWriter val) throws IOException { + writeTag(ITERATOR); + val.writeIter(itemWriter); + writeTag(END); + } public void writeIterator(Iterator iter) throws IOException { writeTag(ITERATOR); while (iter.hasNext()) { writeVal(iter.next()); } - writeVal(END_OBJ); + writeTag(END); } public List readIterator(DataInputInputStream fis) throws IOException { @@ -644,7 +784,7 @@ /** * write the string as tag+length, with length being the number of UTF-8 bytes */ - public void writeStr(String s) throws IOException { + public void writeStr(CharSequence s) throws IOException { if (s == null) { writeTag(NULL); return; @@ -745,7 +885,7 @@ if (val == null) { daos.writeByte(NULL); return true; - } else if (val instanceof String) { - writeStr((String) val); + } else if (val instanceof CharSequence) { + writeStr((CharSequence) val); return true; } else if (val instanceof Number) { @@ -760,8 +900,7 @@ writeFloat(((Float) val).floatValue()); return true; } else if (val instanceof Double) { - daos.writeByte(DOUBLE); - daos.writeDouble(((Double) val).doubleValue()); + writeDouble(((Double) val).doubleValue()); return true; } else if (val instanceof Byte) { daos.writeByte(BYTE); @@ -779,8 +918,7 @@ daos.writeLong(((Date) val).getTime()); return true; } else if (val instanceof Boolean) { - if ((Boolean) val) daos.writeByte(BOOL_TRUE); - else daos.writeByte(BOOL_FALSE); + writeBoolean((Boolean) val); return true; } else if (val instanceof byte[]) { writeByteArray((byte[]) val, 0, ((byte[]) val).length); @@ -796,6 +934,16 @@ return false; } + protected void writeBoolean(boolean val) throws IOException { + if (val) daos.writeByte(BOOL_TRUE); + else daos.writeByte(BOOL_FALSE); + } + + protected void writeDouble(double val) throws IOException { + daos.writeByte(DOUBLE); + daos.writeDouble(val); + } + public void writeMap(Map val) throws IOException { writeTag(MAP, val.size()); @@ -1003,4 +1151,12 @@ return hash; } } + + private boolean closed; + + @Override + public void close() throws IOException { + if (closed) return; + finish(); + } } diff --git 
a/solr/solrj/src/test-files/solrj/sampleClusteringResponse.xml b/solr/solrj/src/test-files/solrj/sampleClusteringResponse.xml index 16d6e4a9ed3..ea042c9195c 100644 --- a/solr/solrj/src/test-files/solrj/sampleClusteringResponse.xml +++ b/solr/solrj/src/test-files/solrj/sampleClusteringResponse.xml @@ -58,6 +58,25 @@ <str>id2</str> <str>id3</str> + <arr name="clusters"> + <lst> + <arr name="labels"> + <str>label1.sub1</str> + </arr> + <arr name="docs"> + <str>id1</str> + <str>id2</str> + </arr> + </lst> + <lst> + <arr name="labels"> + <str>label1.sub2</str> + </arr> + <arr name="docs"> + <str>id2</str> + </arr> + </lst> + </arr> diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/response/TestClusteringResponse.java b/solr/solrj/src/test/org/apache/solr/client/solrj/response/TestClusteringResponse.java index 5bc20e14c10..7e789d12aeb 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/response/TestClusteringResponse.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/response/TestClusteringResponse.java @@ -19,7 +19,7 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.StandardCharsets; -import java.util.LinkedList; +import java.util.Arrays; import java.util.List; import org.apache.solr.SolrJettyTestBase; @@ -49,51 +49,22 @@ public class TestClusteringResponse extends SolrJettyTestBase { List clusters = clusteringResponse.getClusters(); Assert.assertEquals(4, clusters.size()); - //First Cluster - Cluster cluster1 = clusters.get(0); - List expectedLabel1 = new LinkedList(); - expectedLabel1.add("label1"); - List expectedDocs1 = new LinkedList(); - expectedDocs1.add("id1"); - expectedDocs1.add("id2"); - expectedDocs1.add("id3"); - Assert.assertEquals(expectedLabel1, cluster1.getLabels()); - Assert.assertEquals(expectedDocs1, cluster1.getDocs()); - Assert.assertEquals(expectedLabel1, cluster1.getLabels()); - Assert.assertEquals(0.6, cluster1.getScore(), 0); - //Second Cluster - Cluster cluster2 = clusters.get(1); - List expectedLabel2 = new LinkedList(); - expectedLabel2.add("label2"); - List expectedDocs2 = new LinkedList(); - expectedDocs2.add("id5"); - expectedDocs2.add("id6"); - Assert.assertEquals(expectedLabel2, cluster2.getLabels()); - Assert.assertEquals(expectedDocs2, cluster2.getDocs()); - Assert.assertEquals(expectedLabel2, cluster2.getLabels()); - Assert.assertEquals(0.93, cluster2.getScore(), 0); - //Third Cluster - Cluster cluster3 = clusters.get(2); - List expectedLabel3 = new LinkedList(); - expectedLabel3.add("label3"); - List expectedDocs3 = new LinkedList(); - expectedDocs3.add("id7"); - expectedDocs3.add("id8"); - Assert.assertEquals(expectedLabel3, cluster3.getLabels()); - Assert.assertEquals(expectedDocs3, cluster3.getDocs()); - Assert.assertEquals(expectedLabel3, cluster3.getLabels()); - Assert.assertEquals(1.26, cluster3.getScore(), 0); - //Fourth Cluster - Cluster cluster4 = clusters.get(3); - List expectedLabel4 = new LinkedList(); - expectedLabel4.add("label4"); - List expectedDocs4 = new LinkedList(); - expectedDocs4.add("id9"); - Assert.assertEquals(expectedLabel4, cluster4.getLabels()); - Assert.assertEquals(expectedDocs4, cluster4.getDocs()); - Assert.assertEquals(expectedLabel4, cluster4.getLabels()); - Assert.assertEquals(0.0, cluster4.getScore(), 0); - + checkCluster(clusters.get(0), Arrays.asList("label1"), Arrays.asList("id1", "id2", "id3"), 0.6d, false); + checkCluster(clusters.get(1), Arrays.asList("label2"), Arrays.asList("id5", "id6"), 0.93d, false); + checkCluster(clusters.get(2), Arrays.asList("label3"), Arrays.asList("id7", "id8"), 1.26d, false); + checkCluster(clusters.get(3), Arrays.asList("label4"), Arrays.asList("id9"), 0d, true); + + List sub = clusters.get(0).getSubclusters();
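+    // The subclusters checked below come from the nested "clusters" node added to the sample XML above.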
+    checkCluster(sub.get(0), Arrays.asList("label1.sub1"), Arrays.asList("id1", "id2"), 0.0d, false); + checkCluster(sub.get(1), Arrays.asList("label1.sub2"), Arrays.asList("id2"), 0.0d, false); + assertEquals(2, sub.size()); } + private void checkCluster(Cluster cluster, List labels, List docRefs, double score, boolean otherTopics) { + Assert.assertEquals(labels, cluster.getLabels()); + Assert.assertEquals(docRefs, cluster.getDocs()); + Assert.assertEquals(score, cluster.getScore(), 0.0d); + Assert.assertEquals(otherTopics, cluster.isOtherTopics()); + } } diff --git a/solr/test-framework/src/java/org/apache/solr/util/TestHarness.java b/solr/test-framework/src/java/org/apache/solr/util/TestHarness.java index 1f4b33b6b1f..261d2ecfff1 100644 --- a/solr/test-framework/src/java/org/apache/solr/util/TestHarness.java +++ b/solr/test-framework/src/java/org/apache/solr/util/TestHarness.java @@ -16,6 +16,7 @@ */ package org.apache.solr.util; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.StringWriter; import java.nio.file.Path; @@ -30,12 +31,22 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList.NamedListEntry; -import org.apache.solr.core.*; +import org.apache.solr.core.CloudConfig; +import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.CoreDescriptor; +import org.apache.solr.core.CorePropertiesLocator; +import org.apache.solr.core.CoresLocator; +import org.apache.solr.core.NodeConfig; +import org.apache.solr.core.SolrConfig; +import org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrResourceLoader; +import org.apache.solr.core.SolrXmlConfig; import org.apache.solr.handler.UpdateRequestHandler; import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.request.SolrRequestInfo; +import org.apache.solr.response.BinaryQueryResponseWriter; import org.apache.solr.response.QueryResponseWriter; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.IndexSchema; @@ -311,10 +322,19 @@ public class TestHarness extends BaseTestHarness { if (rsp.getException() != null) { throw rsp.getException(); } QueryResponseWriter responseWriter = core.getQueryResponseWriter(req); + // Binary response writers (e.g. javabin) emit raw bytes; capture them and decode as UTF-8. + if (responseWriter instanceof BinaryQueryResponseWriter) { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(32000); + BinaryQueryResponseWriter writer = (BinaryQueryResponseWriter) responseWriter; + writer.write(byteArrayOutputStream, req, rsp); + return new String(byteArrayOutputStream.toByteArray(), "UTF-8"); + } else { - StringWriter sw = new StringWriter(32000); - responseWriter.write(sw,req,rsp); - return sw.toString(); + StringWriter sw = new StringWriter(32000); + responseWriter.write(sw,req,rsp); + return sw.toString(); + } + } finally { req.close(); SolrRequestInfo.clearRequestInfo(); diff --git a/solr/webapp/web/js/angular/app.js b/solr/webapp/web/js/angular/app.js index e7491fe63d4..e86b55feee0 100644 --- a/solr/webapp/web/js/angular/app.js +++ b/solr/webapp/web/js/angular/app.js @@ -148,7 +148,7 @@ solrAdminApp.config([ }) .filter('highlight', function($sce) { return function(input, lang) { - if (lang && input && lang!="txt") return hljs.highlight(lang, input).value; + if (lang && input && lang!="txt" && lang!="csv") return hljs.highlight(lang, 
input).value; return input; } })