diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 851ed722bca..cee0335352e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -63,16 +63,42 @@ Other
 
 ======================= Lucene 6.5.0 =======================
 
+API Changes
+
+* LUCENE-7624: TermsQuery has been renamed to TermInSetQuery and moved to core.
+  (Alan Woodward)
+
+* LUCENE-7637: TermInSetQuery requires that all terms come from the same field.
+  (Adrien Grand)
+
 New Features
 
 * LUCENE-7623: Add FunctionScoreQuery and FunctionMatchQuery (Alan Woodward,
   Adrien Grand, David Smiley)
 
+* LUCENE-7619: Add WordDelimiterGraphFilter, just like
+  WordDelimiterFilter except it produces correct token graphs so that
+  proximity queries at search time will produce correct results (Mike
+  McCandless)
+
 Bug Fixes
 
 * LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
   and preserve all attributes. (Nathan Gass via Uwe Schindler)
 
+Improvements
+
+* LUCENE-7055: Added Weight#scorerSupplier, which makes it possible to estimate
+  the cost of a Scorer before actually building it, in order to optimize how
+  the query should be run, e.g. using points or doc values depending on the
+  costs of other parts of the query. (Adrien Grand)
+
+Optimizations
+
+* LUCENE-7641: Optimized point range queries to compute documents that do not
+  match the range on single-valued fields when more than half the documents in
+  the index would match. (Adrien Grand)
+
 ======================= Lucene 6.4.0 =======================
 
 API Changes
@@ -100,9 +126,6 @@ API Changes
 * LUCENE-7611: DocumentValueSourceDictionary now takes a LongValuesSource
   as a parameter, and the ValueSource equivalent is deprecated (Alan Woodward)
 
-* LUCENE-7624: TermsQuery has been renamed as TermInSetQuery and moved to core.
-  (Alan Woodward)
-
 New features
 
 * LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
similarity index 98%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
index c1fa1f7cba1..01e1f6f7dfc 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilter.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
-package org.apache.lucene.analysis.synonym;
+package org.apache.lucene.analysis.core;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -23,6 +23,7 @@ import java.util.List;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -410,8 +411,8 @@ public final class FlattenGraphFilter extends TokenFilter {
     maxLookaheadUsed = 0;
   }
 
-  // for testing
-  int getMaxLookaheadUsed() {
+  /** For testing */
+  public int getMaxLookaheadUsed() {
     return maxLookaheadUsed;
   }
 }
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java
similarity index 97%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java
index a6cba97473e..920ab3dff1d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/FlattenGraphFilterFactory.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.lucene.analysis.synonym;
+package org.apache.lucene.analysis.core;
 
 import java.util.Map;
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
index f80ed8a800d..aef697ce4ff 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.InPlaceMergeSorter;
@@ -80,7 +81,12 @@ import org.apache.lucene.util.InPlaceMergeSorter;
  * the current {@link StandardTokenizer} immediately removes many intra-word
  * delimiters, it is recommended that this filter be used after a tokenizer that
  * does not do this (such as {@link WhitespaceTokenizer}).
+ *
+ * @deprecated Use {@link WordDelimiterGraphFilter} instead: it produces a correct
+ * token graph so that e.g. {@link PhraseQuery} works correctly when it's used in
+ * the search-time analyzer.
  */
+@Deprecated
 public final class WordDelimiterFilter extends TokenFilter {
 
   public static final int LOWER = 0x01;
@@ -116,7 +122,7 @@ public final class WordDelimiterFilter extends TokenFilter {
   /**
    * Causes maximum runs of word parts to be catenated:
    * <p>
-   * "wi-fi" => "wifi"
+   * "500-42" => "50042"
    */
   public static final int CATENATE_NUMBERS = 8;
 
@@ -494,7 +500,6 @@ public final class WordDelimiterFilter extends TokenFilter {
   private void generatePart(boolean isSingleWord) {
     clearAttributes();
     termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
-
     int startOffset = savedStartOffset + iterator.current;
     int endOffset = savedStartOffset + iterator.end;
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
index 6a15b55b6aa..0002d65331c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.search.PhraseQuery;
 
 import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 
@@ -47,7 +48,12 @@ import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
  *             types="wdfftypes.txt" /&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
+ *
+ * @deprecated Use {@link WordDelimiterGraphFilterFactory} instead: it produces a correct
+ * token graph so that e.g. {@link PhraseQuery} works correctly when it's used in
+ * the search-time analyzer.
  */
+@Deprecated
 public class WordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
   public static final String PROTECTED_TOKENS = "protected";
   public static final String TYPES = "types";
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
new file mode 100644
index 00000000000..fe8ed72357c
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -0,0 +1,706 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Splits words into subwords and performs optional transformations on subword
+ * groups, producing a correct token graph so that e.g. {@link PhraseQuery} can
+ * work correctly when this filter is used in the search-time analyzer.  Unlike
+ * the deprecated {@link WordDelimiterFilter}, this token filter produces a
+ * correct token graph as output.  However, it cannot consume an input token
+ * graph correctly.
+ *
+ * <p>
+ * Words are split into subwords with the following rules:
+ * <ul>
+ * <li>split on intra-word delimiters (by default, all non alpha-numeric
+ * characters): "Wi-Fi" → "Wi", "Fi"</li>
+ * <li>split on case transitions: "PowerShot" →
+ * "Power", "Shot"</li>
+ * <li>split on letter-number transitions: "SD500" →
+ * "SD", "500"</li>
+ * <li>leading and trailing intra-word delimiters on each subword are ignored:
+ * "//hello---there, 'dude'" →
+ * "hello", "there", "dude"</li>
+ * <li>trailing "'s" are removed for each subword: "O'Neil's"
+ * → "O", "Neil"</li>
+ * </ul>
+ * <p>
+ * The <b>combinations</b> parameter affects how subwords are combined:
+ * <ul>
+ * <li>combinations="0" causes no subword combinations: "PowerShot"
+ * → 0:"Power", 1:"Shot" (0 and 1 are the token positions)</li>
+ * <li>combinations="1" means that in addition to the subwords, maximum runs of
+ * non-numeric subwords are catenated and produced at the same position of the
+ * last subword in the run:
+ * <ul>
+ * <li>"PowerShot" →
+ * 0:"Power", 1:"Shot" 1:"PowerShot"</li>
+ * <li>"A's+B's&C's" → 0:"A", 1:"B", 2:"C", 2:"ABC"</li>
+ * <li>"Super-Duper-XL500-42-AutoCoder!" →
+ * 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</li>
+ * </ul>
+ * </li>
+ * </ul>
+ * One use for {@link WordDelimiterGraphFilter} is to help match words with different
+ * subword delimiters.  For example, if the source text contained "wi-fi" one may
+ * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.  One way of doing so
+ * is to specify combinations="1" in the analyzer used for indexing, and
+ * combinations="0" (the default) in the analyzer used for querying.  Given that
+ * the current {@link StandardTokenizer} immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that
+ * does not do this (such as {@link WhitespaceTokenizer}).
+ */
+public final class WordDelimiterGraphFilter extends TokenFilter {
+
+  /**
+   * Causes parts of words to be generated:
+   * <p>
+   * "PowerShot" => "Power" "Shot"
+   */
+  public static final int GENERATE_WORD_PARTS = 1;
+
+  /**
+   * Causes number subwords to be generated:
+   * <p>
+   * "500-42" => "500" "42"
+   */
+  public static final int GENERATE_NUMBER_PARTS = 2;
+
+  /**
+   * Causes maximum runs of word parts to be catenated:
+   * <p>
+   * "wi-fi" => "wifi"
+   */
+  public static final int CATENATE_WORDS = 4;
+
+  /**
+   * Causes maximum runs of number parts to be catenated:
+   * <p>
+   * "500-42" => "50042"
+   */
+  public static final int CATENATE_NUMBERS = 8;
+
+  /**
+   * Causes all subword parts to be catenated:
+   * <p>
+   * "wi-fi-4000" => "wifi4000"
+   */
+  public static final int CATENATE_ALL = 16;
+
+  /**
+   * Causes original words to be preserved and added to the subword list (Defaults to false)
+   * <p>
+   * "500-42" => "500" "42" "500-42"
+   */
+  public static final int PRESERVE_ORIGINAL = 32;
+
+  /**
+   * Causes lowercase -> uppercase transition to start a new subword.
+   */
+  public static final int SPLIT_ON_CASE_CHANGE = 64;
+
+  /**
+   * If not set, causes numeric changes to be ignored (subwords will only be generated
+   * given SUBWORD_DELIM tokens).
+   */
+  public static final int SPLIT_ON_NUMERICS = 128;
+
+  /**
+   * Causes trailing "'s" to be removed for each subword:
+   * <p>
+   * "O'Neil's" => "O", "Neil"
+   */
+  public static final int STEM_ENGLISH_POSSESSIVE = 256;
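[Editorial note: for orientation, a minimal sketch of how this filter might be wired into an analysis chain. The flag combination is illustrative, and the trailing FlattenGraphFilter (moved to core in this same commit) is an assumption about index-time usage, since the index cannot represent a token graph; a search-time analyzer would omit that step.]

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

Analyzer indexAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterGraphFilter.CATENATE_ALL;
    // null protWords: no tokens are protected from splitting
    TokenStream result = new WordDelimiterGraphFilter(source, flags, null);
    // index time only: flatten the graph, since postings cannot store it
    result = new FlattenGraphFilter(result);
    return new TokenStreamComponents(source, result);
  }
};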
+  /**
+   * If not null, the set of tokens to protect from being delimited
+   */
+  final CharArraySet protWords;
+
+  private final int flags;
+
+  // packs start pos, end pos, start part, end part (= slice of the term text) for each buffered part:
+  private int[] bufferedParts = new int[16];
+  private int bufferedLen;
+  private int bufferedPos;
+
+  // holds text for each buffered part, or null if it's a simple slice of the original term
+  private char[][] bufferedTermParts = new char[4][];
+
+  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
+  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
+
+  // used for iterating word delimiter breaks
+  private final WordDelimiterIterator iterator;
+
+  // used for concatenating runs of similar typed subwords (word,number)
+  private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+
+  // number of subwords last output by concat.
+  private int lastConcatCount;
+
+  // used for catenate all
+  private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+  // used for accumulating position increment gaps so that we preserve incoming holes:
+  private int accumPosInc;
+
+  private char[] savedTermBuffer = new char[16];
+  private int savedTermLength;
+  private int savedStartOffset;
+  private int savedEndOffset;
+  private AttributeSource.State savedState;
+  private int lastStartOffset;
+
+  // if length by start + end offsets doesn't match the term text then assume
+  // this is a synonym and don't adjust the offsets.
+  private boolean hasIllegalOffsets;
+
+  private int wordPos;
+
+  /**
+   * Creates a new WordDelimiterGraphFilter
+   *
+   * @param in TokenStream to be filtered
+   * @param charTypeTable table containing character types
+   * @param configurationFlags Flags configuring the filter
+   * @param protWords If not null, the set of tokens to protect from being delimited
+   */
+  public WordDelimiterGraphFilter(TokenStream in, byte[] charTypeTable, int configurationFlags, CharArraySet protWords) {
+    super(in);
+    if ((configurationFlags &
+        ~(GENERATE_WORD_PARTS |
+          GENERATE_NUMBER_PARTS |
+          CATENATE_WORDS |
+          CATENATE_NUMBERS |
+          CATENATE_ALL |
+          PRESERVE_ORIGINAL |
+          SPLIT_ON_CASE_CHANGE |
+          SPLIT_ON_NUMERICS |
+          STEM_ENGLISH_POSSESSIVE)) != 0) {
+      throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
+    }
+    this.flags = configurationFlags;
+    this.protWords = protWords;
+    this.iterator = new WordDelimiterIterator(
+        charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
+  }
+
+  /**
+   * Creates a new WordDelimiterGraphFilter using {@link WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE}
+   * as its charTypeTable
+   *
+   * @param in TokenStream to be filtered
+   * @param configurationFlags Flags configuring the filter
+   * @param protWords If not null, the set of tokens to protect from being delimited
+   */
+  public WordDelimiterGraphFilter(TokenStream in, int configurationFlags, CharArraySet protWords) {
+    this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
+  }
+
+  /** Iterates all word parts and concatenations, buffering up the term parts we should return. */
+  private void bufferWordParts() throws IOException {
+
+    saveState();
+
+    // if length by start + end offsets doesn't match the term's text then set offsets for all our word parts/concats to the incoming
+    // offsets.  this can happen if WDGF is applied to an injected synonym, or to a stem'd form, etc:
+    hasIllegalOffsets = (savedEndOffset - savedStartOffset != savedTermLength);
+
+    bufferedLen = 0;
+    lastConcatCount = 0;
+    wordPos = 0;
+
+    if (iterator.isSingleWord()) {
+      buffer(wordPos, wordPos+1, iterator.current, iterator.end);
+      wordPos++;
+      iterator.next();
+    } else {
+
+      // iterate all word parts, possibly buffering them, building up concatenations and possibly buffering them too:
+      while (iterator.end != WordDelimiterIterator.DONE) {
+        int wordType = iterator.type();
+
+        // do we already have queued up incompatible concatenations?
+        if (concat.isNotEmpty() && (concat.type & wordType) == 0) {
+          flushConcatenation(concat);
+        }
+
+        // add subwords depending upon options
+        if (shouldConcatenate(wordType)) {
+          concatenate(concat);
+        }
+
+        // add all subwords (catenateAll)
+        if (has(CATENATE_ALL)) {
+          concatenate(concatAll);
+        }
+
+        // if we should output the word or number part
+        if (shouldGenerateParts(wordType)) {
+          buffer(wordPos, wordPos+1, iterator.current, iterator.end);
+          wordPos++;
+        }
+        iterator.next();
+      }
+
+      if (concat.isNotEmpty()) {
+        // flush final concatenation
+        flushConcatenation(concat);
+      }
+
+      if (concatAll.isNotEmpty()) {
+        // only if we haven't output this same combo above, e.g. PowerShot with CATENATE_WORDS:
+        if (concatAll.subwordCount > lastConcatCount) {
+          if (wordPos == concatAll.startPos) {
+            // we are not generating parts, so we must advance wordPos now
+            wordPos++;
+          }
+          concatAll.write();
+        }
+        concatAll.clear();
+      }
+    }
+
+    if (has(PRESERVE_ORIGINAL)) {
+      if (wordPos == 0) {
+        // can happen w/ strange flag combos and inputs :)
+        wordPos++;
+      }
+      // add the original token now so that we can set the correct end position
+      buffer(0, wordPos, 0, savedTermLength);
+    }
+
+    sorter.sort(0, bufferedLen);
+    wordPos = 0;
+
+    // set back to 0 for iterating from the buffer
+    bufferedPos = 0;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    while (true) {
+      if (savedState == null) {
+
+        // process a new input token
+        if (input.incrementToken() == false) {
+          return false;
+        }
+
+        int termLength = termAttribute.length();
+        char[] termBuffer = termAttribute.buffer();
+
+        accumPosInc += posIncAttribute.getPositionIncrement();
+
+        // iterate & cache all word parts up front:
+        iterator.setText(termBuffer, termLength);
+        iterator.next();
+
+        // word with no delimiters, or protected word: just return it
+        if ((iterator.current == 0 && iterator.end == termLength) ||
+            (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+          posIncAttribute.setPositionIncrement(accumPosInc);
+          accumPosInc = 0;
+          return true;
+        }
+
+        // word consisting only of delimiters: swallow this token, creating a hole, and move on to next token
+        if (iterator.end == WordDelimiterIterator.DONE) {
+          if (has(PRESERVE_ORIGINAL) == false) {
+            continue;
+          } else {
+            return true;
+          }
+        }
+
+        // otherwise, we have delimiters, process & buffer all parts:
+        bufferWordParts();
+      }
+
+      if (bufferedPos < bufferedLen) {
+        clearAttributes();
+        restoreState(savedState);
+
+        char[] termPart = bufferedTermParts[bufferedPos];
+        int startPos = bufferedParts[4*bufferedPos];
+        int endPos = bufferedParts[4*bufferedPos+1];
+        int startPart = bufferedParts[4*bufferedPos+2];
+        int endPart = bufferedParts[4*bufferedPos+3];
+        bufferedPos++;
+
+        int startOffset;
+        int endOffset;
+
+        if (hasIllegalOffsets) {
+          startOffset = savedStartOffset;
+          endOffset = savedEndOffset;
+        } else {
+          startOffset = savedStartOffset + startPart;
+          endOffset = savedStartOffset + endPart;
+        }
+
+        // never let offsets go backwards:
+        startOffset = Math.max(startOffset, lastStartOffset);
+        endOffset = Math.max(endOffset, lastStartOffset);
+
+        offsetAttribute.setOffset(startOffset, endOffset);
+        lastStartOffset = startOffset;
+
+        if (termPart == null) {
+          termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
+        } else {
+          termAttribute.copyBuffer(termPart, 0, termPart.length);
+        }
+
+        posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
+        accumPosInc = 0;
+        posLenAttribute.setPositionLength(endPos - startPos);
+        wordPos = startPos;
+        return true;
+      }
+
+      // no saved concatenations, on to the next input word
+      savedState = null;
+    }
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    accumPosInc = 0;
+    savedState = null;
+    lastStartOffset = 0;
+    concat.clear();
+    concatAll.clear();
+  }
+
+  // ================================================= Helper Methods ================================================
+
+  private class PositionSorter extends InPlaceMergeSorter {
+    @Override
+    protected int compare(int i, int j) {
+      // sort by smaller start position
+      int iPosStart = bufferedParts[4*i];
+      int jPosStart = bufferedParts[4*j];
+      int cmp = Integer.compare(iPosStart, jPosStart);
+      if (cmp != 0) {
+        return cmp;
+      }
+
+      // tie break by longest pos length:
+      int iPosEnd = bufferedParts[4*i+1];
+      int jPosEnd = bufferedParts[4*j+1];
+      return Integer.compare(jPosEnd, iPosEnd);
+    }
+
+    @Override
+    protected void swap(int i, int j) {
+      int iOffset = 4*i;
+      int jOffset = 4*j;
+      for(int x=0;x<4;x++) {
+        int tmp = bufferedParts[iOffset+x];
+        bufferedParts[iOffset+x] = bufferedParts[jOffset+x];
+        bufferedParts[jOffset+x] = tmp;
+      }
+
+      char[] tmp2 = bufferedTermParts[i];
+      bufferedTermParts[i] = bufferedTermParts[j];
+      bufferedTermParts[j] = tmp2;
+    }
+  }
+
+  final PositionSorter sorter = new PositionSorter();
+
+  /**
+   * startPos, endPos -> graph start/end position
+   * startPart, endPart -> slice of the original term for this part
+   */
+
+  void buffer(int startPos, int endPos, int startPart, int endPart) {
+    buffer(null, startPos, endPos, startPart, endPart);
+  }
+
+  /**
+   * a null termPart means it's a simple slice of the original term
+   */
+  void buffer(char[] termPart, int startPos, int endPos, int startPart, int endPart) {
+    /*
+    System.out.println("buffer: pos=" + startPos + "-" + endPos + " part=" + startPart + "-" + endPart);
+    if (termPart != null) {
+      System.out.println("  termIn=" + new String(termPart));
+    } else {
+      System.out.println("  term=" + new String(savedTermBuffer, startPart, endPart-startPart));
+    }
+    */
+    assert endPos > startPos: "startPos=" + startPos + " endPos=" + endPos;
+    assert endPart > startPart || (endPart == 0 && startPart == 0 && savedTermLength == 0): "startPart=" + startPart + " endPart=" + endPart;
+    if ((bufferedLen+1)*4 > bufferedParts.length) {
+      bufferedParts = ArrayUtil.grow(bufferedParts, (bufferedLen+1)*4);
+    }
+    if (bufferedTermParts.length == bufferedLen) {
+      int newSize = ArrayUtil.oversize(bufferedLen+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
+      char[][] newArray = new char[newSize][];
+      System.arraycopy(bufferedTermParts, 0, newArray, 0, bufferedTermParts.length);
+      bufferedTermParts = newArray;
+    }
+    bufferedTermParts[bufferedLen] = termPart;
+    bufferedParts[bufferedLen*4] = startPos;
+    bufferedParts[bufferedLen*4+1] = endPos;
+    bufferedParts[bufferedLen*4+2] = startPart;
+    bufferedParts[bufferedLen*4+3] = endPart;
+    bufferedLen++;
+  }
+
+  /**
+   * Saves the existing attribute states
+   */
+  private void saveState() {
+    savedTermLength = termAttribute.length();
+    savedStartOffset = offsetAttribute.startOffset();
+    savedEndOffset = offsetAttribute.endOffset();
+    savedState = captureState();
+
+    if (savedTermBuffer.length < savedTermLength) {
+      savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)];
+    }
+
+    System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength);
+  }
+
+  /**
+   * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+   *
+   * @param concat WordDelimiterConcatenation that will be flushed
+   */
+  private void flushConcatenation(WordDelimiterConcatenation concat) {
+    if (wordPos == concat.startPos) {
+      // we are not generating parts, so we must advance wordPos now
+      wordPos++;
+    }
+    lastConcatCount = concat.subwordCount;
+    if (concat.subwordCount != 1 || shouldGenerateParts(concat.type) == false) {
+      concat.write();
+    }
+    concat.clear();
+  }
+
+  /**
+   * Determines whether to concatenate a word or number if the current word is the given type
+   *
+   * @param wordType Type of the current word used to determine if it should be concatenated
+   * @return {@code true} if concatenation should occur, {@code false} otherwise
+   */
+  private boolean shouldConcatenate(int wordType) {
+    return (has(CATENATE_WORDS) && WordDelimiterIterator.isAlpha(wordType)) || (has(CATENATE_NUMBERS) && WordDelimiterIterator.isDigit(wordType));
+  }
+
+  /**
+   * Determines whether a word/number part should be generated for a word of the given type
+   *
+   * @param wordType Type of the word used to determine if a word/number part should be generated
+   * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+   */
+  private boolean shouldGenerateParts(int wordType) {
+    return (has(GENERATE_WORD_PARTS) && WordDelimiterIterator.isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && WordDelimiterIterator.isDigit(wordType));
+  }
+
+  /**
+   * Concatenates the saved buffer to the given WordDelimiterConcatenation
+   *
+   * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+   */
+  private void concatenate(WordDelimiterConcatenation concatenation) {
+    if (concatenation.isEmpty()) {
+      concatenation.type = iterator.type();
+      concatenation.startPart = iterator.current;
+      concatenation.startPos = wordPos;
+    }
+    concatenation.append(savedTermBuffer, iterator.current, iterator.end - iterator.current);
+    concatenation.endPart = iterator.end;
+  }
+
+  /**
+   * Determines whether the given flag is set
+   *
+   * @param flag Flag to see if set
+   * @return {@code true} if flag is set
+   */
+  private boolean has(int flag) {
+    return (flags & flag) != 0;
+  }
+
+  // ================================================= Inner Classes =================================================
+
+  /**
+   * A WDF concatenated 'run'
+   */
+  final class WordDelimiterConcatenation {
+    final StringBuilder buffer = new StringBuilder();
+    int startPart;
+    int endPart;
+    int startPos;
+    int type;
+    int subwordCount;
+
+    /**
+     * Appends the given text of the given length, to the concatenation at the given offset
+     *
+     * @param text Text to append
+     * @param offset Offset in the concatenation to add the text
+     * @param length Length of the text to append
+     */
+    void append(char text[], int offset, int length) {
+      buffer.append(text, offset, length);
+      subwordCount++;
+    }
+
+    /**
+     * Writes the concatenation to the part buffer
+     */
+    void write() {
+      char[] termPart = new char[buffer.length()];
+      buffer.getChars(0, buffer.length(), termPart, 0);
+      buffer(termPart, startPos, wordPos, startPart, endPart);
+    }
+
+    /**
+     * Determines if the concatenation is empty
+     *
+     * @return {@code true} if the concatenation is empty, {@code false} otherwise
+     */
+    boolean isEmpty() {
+      return buffer.length() == 0;
+    }
+
+    boolean isNotEmpty() {
+      return isEmpty() == false;
+    }
+
+    /**
+     * Clears the concatenation and resets its state
+     */
+    void clear() {
+      buffer.setLength(0);
+      startPart = endPart = type = subwordCount = 0;
+    }
+  }
+
+  /** Returns string representation of configuration flags */
+  public static String flagsToString(int flags) {
+    StringBuilder b = new StringBuilder();
+    if ((flags & GENERATE_WORD_PARTS) != 0) {
+      b.append("GENERATE_WORD_PARTS");
+    }
+    if ((flags & GENERATE_NUMBER_PARTS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("GENERATE_NUMBER_PARTS");
+    }
+    if ((flags & CATENATE_WORDS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("CATENATE_WORDS");
+    }
+    if ((flags & CATENATE_NUMBERS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("CATENATE_NUMBERS");
+    }
+    if ((flags & CATENATE_ALL) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("CATENATE_ALL");
+    }
+    if ((flags & PRESERVE_ORIGINAL) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("PRESERVE_ORIGINAL");
+    }
+    if ((flags & SPLIT_ON_CASE_CHANGE) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("SPLIT_ON_CASE_CHANGE");
+    }
+    if ((flags & SPLIT_ON_NUMERICS) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("SPLIT_ON_NUMERICS");
+    }
+    if ((flags & STEM_ENGLISH_POSSESSIVE) != 0) {
+      if (b.length() > 0) {
+        b.append(" | ");
+      }
+      b.append("STEM_ENGLISH_POSSESSIVE");
+    }
+
+    return b.toString();
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder b = new StringBuilder();
+    b.append("WordDelimiterGraphFilter(flags=");
+    b.append(flagsToString(flags));
+    b.append(')');
+    return b.toString();
+  }
+
+  // questions:
+  //   negative numbers?  -42 indexed as just 42?
+  //   dollar sign?  $42
+  //   percent sign?  33%
+  //   downsides: if source text is "powershot" then a query of "PowerShot" won't match!
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
new file mode 100644
index 00000000000..a06cc7502d7
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilterFactory.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.analysis.util.ResourceLoaderAware;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.*;
+
+/**
+ * Factory for {@link WordDelimiterGraphFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.WordDelimiterGraphFilterFactory" protected="protectedword.txt"
+ *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
+ *             catenateWords="0" catenateNumbers="0" catenateAll="0"
+ *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
+ *             types="wdfftypes.txt" /&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class WordDelimiterGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
+  public static final String PROTECTED_TOKENS = "protected";
+  public static final String TYPES = "types";
+
+  private final String wordFiles;
+  private final String types;
+  private final int flags;
+  byte[] typeTable = null;
+  private CharArraySet protectedWords = null;
+
+  /** Creates a new WordDelimiterGraphFilterFactory */
+  public WordDelimiterGraphFilterFactory(Map
false
by default */
public void setUnicodeArcs(boolean unicodeArcs) {
@@ -118,7 +124,7 @@ public class TokenStreamToAutomaton {
int maxOffset = 0;
while (in.incrementToken()) {
int posInc = posIncAtt.getPositionIncrement();
- if (!preservePositionIncrements && posInc > 1) {
+ if (preservePositionIncrements == false && posInc > 1) {
posInc = 1;
}
assert pos > -1 || posInc > 0;
@@ -201,10 +207,35 @@ public class TokenStreamToAutomaton {
}
in.end();
+
int endState = -1;
- if (offsetAtt.endOffset() > maxOffset) {
+
+ int endPosInc = posIncAtt.getPositionIncrement();
+
+ if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
+ endPosInc = 1;
+ }
+
+ if (endPosInc > 0) {
+ // there were hole(s) after the last token
endState = builder.createState();
- builder.setAccept(endState, true);
+
+ // add trailing holes now:
+ int lastState = endState;
+ while (true) {
+ int state1 = builder.createState();
+ builder.addTransition(lastState, state1, HOLE);
+ endPosInc--;
+ if (endPosInc == 0) {
+ builder.setAccept(state1, true);
+ break;
+ }
+ int state2 = builder.createState();
+ builder.addTransition(state1, state2, POS_SEP);
+ lastState = state2;
+ }
+ } else {
+ endState = -1;
}
pos++;
@@ -219,7 +250,7 @@ public class TokenStreamToAutomaton {
}
pos++;
}
-
+
return builder.finish();
}
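[Editorial note: a hedged illustration of when the new trailing-hole code above runs; the input text and stopword set are assumptions. If the last token of a stream is removed, e.g. by StopFilter, end() reports a leftover position increment, and the automaton now ends with HOLE transitions instead of simply accepting the last real token's state.]

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.automaton.Automaton;

// "the" is a stopword, so "fast car the" ends with a trailing hole
CharArraySet stopWords = new CharArraySet(Arrays.asList("the"), true);
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("fast car the"));
TokenStream stream = new StopFilter(tokenizer, stopWords);
Automaton automaton = new TokenStreamToAutomaton().toAutomaton(stream);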
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
index cdc5d42e8cf..166d6b21607 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java
@@ -43,7 +43,7 @@ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribut
// OffsetAtt
if (startOffset < 0 || endOffset < startOffset) {
- throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
index c89a37420ca..ad1e23220f1 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PackedTokenAttributeImpl.java
@@ -107,7 +107,7 @@ public class PackedTokenAttributeImpl extends CharTermAttributeImpl
@Override
public void setOffset(int startOffset, int endOffset) {
if (startOffset < 0 || endOffset < startOffset) {
- throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset; got "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
this.startOffset = startOffset;
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
index 4d63d6fb36b..e89fec12715 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
@@ -30,8 +30,7 @@ public class PositionIncrementAttributeImpl extends AttributeImpl implements Pos
@Override
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0) {
- throw new IllegalArgumentException
- ("Increment must be zero or greater: got " + positionIncrement);
+ throw new IllegalArgumentException("Position increment must be zero or greater; got " + positionIncrement);
}
this.positionIncrement = positionIncrement;
}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
index 9bfdb49a5da..d019a2b6d61 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionLengthAttributeImpl.java
@@ -30,8 +30,7 @@ public class PositionLengthAttributeImpl extends AttributeImpl implements Positi
@Override
public void setPositionLength(int positionLength) {
if (positionLength < 1) {
- throw new IllegalArgumentException
- ("Position length must be 1 or greater: got " + positionLength);
+ throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength);
}
this.positionLength = positionLength;
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java
index 38cd440e68c..d9a0b30e89d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/PointsWriter.java
@@ -127,6 +127,11 @@ public abstract class PointsWriter implements Closeable {
}
}
+ @Override
+ public long estimatePointCount(IntersectVisitor visitor) {
+ throw new UnsupportedOperationException();
+ }
+
@Override
public byte[] getMinPackedValue() {
throw new UnsupportedOperationException();
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 3bb10d325b5..f3bdfb0b24f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -42,6 +42,8 @@ import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Sort;
@@ -1810,6 +1812,19 @@ public final class CheckIndex implements Closeable {
long size = values.size();
int docCount = values.getDocCount();
+ final long crossCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_CROSSES_QUERY));
+ if (crossCost < size / 2) {
+ throw new RuntimeException("estimatePointCount should return >= size/2 when all cells match");
+ }
+ final long insideCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_INSIDE_QUERY));
+ if (insideCost < size) {
+ throw new RuntimeException("estimatePointCount should return >= size when all cells fully match");
+ }
+ final long outsideCost = values.estimatePointCount(new ConstantRelationIntersectVisitor(Relation.CELL_OUTSIDE_QUERY));
+ if (outsideCost != 0) {
+ throw new RuntimeException("estimatePointCount should return 0 when no cells match");
+ }
+
VerifyPointsVisitor visitor = new VerifyPointsVisitor(fieldInfo.name, reader.maxDoc(), values);
values.intersect(visitor);
@@ -2002,6 +2017,28 @@ public final class CheckIndex implements Closeable {
}
}
+ private static class ConstantRelationIntersectVisitor implements IntersectVisitor {
+ private final Relation relation;
+
+ ConstantRelationIntersectVisitor(Relation relation) {
+ this.relation = relation;
+ }
+
+ @Override
+ public void visit(int docID) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void visit(int docID, byte[] packedValue) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+ return relation;
+ }
+ }
/**
* Test stored fields.
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index 197ab3155f9..b118c13b0a5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -313,10 +313,7 @@ final class DefaultIndexingChain extends DocConsumer {
@Override
public void abort() {
- try {
- storedFieldsConsumer.abort();
- } catch (Throwable t) {
- }
+ storedFieldsConsumer.abort();
try {
// E.g. close any open files in the term vectors writer:
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index 7f0b97c6bab..0fc2e2476a8 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1034,17 +1034,17 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
/** Confirms that the incoming index sort (if any) matches the existing index sort (if any).
* This is unfortunately just best effort, because it could be the old index only has unsorted flushed segments built
- * before {@link Version#LUCENE_7_0_0} (flushed segments are sorted in Lucene 7.0). */
- private void validateIndexSort() {
+ * before {@link Version#LUCENE_6_5_0} (flushed segments are sorted since Lucene 6.5.0). */
+ private void validateIndexSort() throws CorruptIndexException {
Sort indexSort = config.getIndexSort();
if (indexSort != null) {
for(SegmentCommitInfo info : segmentInfos) {
Sort segmentIndexSort = info.info.getIndexSort();
if (segmentIndexSort != null && indexSort.equals(segmentIndexSort) == false) {
throw new IllegalArgumentException("cannot change previous indexSort=" + segmentIndexSort + " (from segment=" + info + ") to new indexSort=" + indexSort);
- } else if (segmentIndexSort == null) {
- // Flushed segments are not sorted if they were built with a version prior to 7.0
- assert info.info.getVersion().onOrAfter(Version.LUCENE_7_0_0) == false;
+ } else if (segmentIndexSort == null && info.info.getVersion().onOrAfter(Version.LUCENE_6_5_0)) {
+ // Flushed segments are not sorted if they were built with a version prior to 6.5.0
+ throw new CorruptIndexException("segment not sorted with indexSort=" + indexSort, info.info.toString());
}
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValues.java b/lucene/core/src/java/org/apache/lucene/index/PointValues.java
index ffac5f7dbd0..01f77e46509 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValues.java
@@ -26,6 +26,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDWriter;
@@ -220,6 +221,12 @@ public abstract class PointValues {
* to test whether each document is deleted, if necessary. */
public abstract void intersect(IntersectVisitor visitor) throws IOException;
+ /** Estimate the number of points that would be visited by {@link #intersect}
+ * with the given {@link IntersectVisitor}. This should run many times faster
+ * than {@link #intersect(IntersectVisitor)}.
+ * @see DocIdSetIterator#cost */
+ public abstract long estimatePointCount(IntersectVisitor visitor);
+
/** Returns minimum value for each dimension, packed, or null if {@link #size} is 0
*/
public abstract byte[] getMinPackedValue() throws IOException;
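[Editorial note: to make the intended use concrete, a sketch of the kind of decision a Weight might make with this estimate. The helper class and its leadCost parameter are illustrative assumptions, not API from this patch; leadCost would come from the cost() of the other clauses' scorers.]

import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;

final class PointCountHeuristic {
  // Returns true if eagerly intersecting the points index looks cheaper than
  // letting the other clauses lead; estimatePointCount is designed to be much
  // faster than actually running intersect.
  static boolean usePointsIndex(PointValues values, IntersectVisitor visitor, long leadCost) {
    return values.estimatePointCount(visitor) <= leadCost;
  }
}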
diff --git a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
index 07cf293c173..4aaf09552b3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PointValuesWriter.java
@@ -90,6 +90,11 @@ class PointValuesWriter {
}
}
+ @Override
+ public long estimatePointCount(IntersectVisitor visitor) {
+ throw new UnsupportedOperationException();
+ }
+
@Override
public byte[] getMinPackedValue() {
throw new UnsupportedOperationException();
@@ -208,6 +213,11 @@ class PointValuesWriter {
});
}
+ @Override
+ public long estimatePointCount(IntersectVisitor visitor) {
+ return in.estimatePointCount(visitor);
+ }
+
@Override
public byte[] getMinPackedValue() throws IOException {
return in.getMinPackedValue();
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
index a6748b85325..f24a4d0728e 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortingLeafReader.java
@@ -42,7 +42,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
/**
* An {@link org.apache.lucene.index.LeafReader} which supports sorting documents by a given
- * {@link Sort}. This is package private and is only used by Lucene fo BWC when it needs to merge
+ * {@link Sort}. This is package private and is only used by Lucene for BWC when it needs to merge
* an unsorted flushed segment built by an older version (newly flushed segments are sorted since version 7.0).
*
* @lucene.experimental
@@ -327,6 +327,11 @@ class SortingLeafReader extends FilterLeafReader {
});
}
+ @Override
+ public long estimatePointCount(IntersectVisitor visitor) {
+ return in.estimatePointCount(visitor);
+ }
+
@Override
public byte[] getMinPackedValue() throws IOException {
return in.getMinPackedValue();
diff --git a/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java b/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java
new file mode 100644
index 00000000000..4540c852fc6
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.OptionalLong;
+import java.util.stream.Stream;
+
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.util.PriorityQueue;
+
+final class Boolean2ScorerSupplier extends ScorerSupplier {
+
+ private final BooleanWeight weight;
+ private final Map
- * NOTE: be very careful using this query: it is
- * typically much slower than using {@code TermsQuery},
- * but in certain specialized cases may be faster.
+ * NOTE: This query is typically best used within a
+ * {@link IndexOrDocValuesQuery} alongside a query that uses an indexed
+ * structure such as {@link PointValues points} or {@link Terms terms},
+ * which makes it possible to run the query on doc values when that would
+ * be more efficient, and on the index otherwise.
*
* @lucene.experimental
*/
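[Editorial note: a sketch of the composition this NOTE suggests. It assumes the sandbox DocValuesTermsQuery as the doc-values counterpart and uses the single-field TermInSetQuery constructor from LUCENE-7637.]

import java.util.Arrays;
import java.util.List;

import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BytesRef;

List<BytesRef> terms = Arrays.asList(new BytesRef("lucene"), new BytesRef("solr"));
// per segment, the engine can now run whichever side is cheaper
Query query = new IndexOrDocValuesQuery(
    new TermInSetQuery("category", terms),
    new DocValuesTermsQuery("category", terms));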
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/IndexOrDocValuesQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/IndexOrDocValuesQuery.java
new file mode 100644
index 00000000000..0f9e8e3e027
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/IndexOrDocValuesQuery.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+
+/**
+ * A query that uses either an index (points or terms) or doc values in order
+ * to run a range query, depending on which one is more efficient.
+ */
+public final class IndexOrDocValuesQuery extends Query {
+
+ private final Query indexQuery, dvQuery;
+
+ /**
+ * Constructor that takes both a query that executes on an index structure
+ * like the inverted index or the points tree, and another query that
+ * executes on doc values. Both queries must match the same documents and
+ * produce constant scores.
+ */
+ public IndexOrDocValuesQuery(Query indexQuery, Query dvQuery) {
+ this.indexQuery = indexQuery;
+ this.dvQuery = dvQuery;
+ }
+
+ @Override
+ public String toString(String field) {
+ return indexQuery.toString(field);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (sameClassAs(obj) == false) {
+ return false;
+ }
+ IndexOrDocValuesQuery that = (IndexOrDocValuesQuery) obj;
+ return indexQuery.equals(that.indexQuery) && dvQuery.equals(that.dvQuery);
+ }
+
+ @Override
+ public int hashCode() {
+ int h = classHash();
+ h = 31 * h + indexQuery.hashCode();
+ h = 31 * h + dvQuery.hashCode();
+ return h;
+ }
+
+ @Override
+ public Query rewrite(IndexReader reader) throws IOException {
+ Query indexRewrite = indexQuery.rewrite(reader);
+ Query dvRewrite = dvQuery.rewrite(reader);
+ if (indexQuery != indexRewrite || dvQuery != dvRewrite) {
+ return new IndexOrDocValuesQuery(indexRewrite, dvRewrite);
+ }
+ return this;
+ }
+
+ @Override
+ public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
+ final Weight indexWeight = indexQuery.createWeight(searcher, needsScores, boost);
+ final Weight dvWeight = dvQuery.createWeight(searcher, needsScores, boost);
+ return new ConstantScoreWeight(this, boost) {
+ @Override
+ public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
+ return indexWeight.bulkScorer(context);
+ }
+
+ @Override
+ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+ final ScorerSupplier indexScorerSupplier = indexWeight.scorerSupplier(context);
+ final ScorerSupplier dvScorerSupplier = dvWeight.scorerSupplier(context);
+ if (indexScorerSupplier == null || dvScorerSupplier == null) {
+ return null;
+ }
+ return new ScorerSupplier() {
+ @Override
+ public Scorer get(boolean randomAccess) throws IOException {
+ return (randomAccess ? dvScorerSupplier : indexScorerSupplier).get(randomAccess);
+ }
+
+ @Override
+ public long cost() {
+ return Math.min(indexScorerSupplier.cost(), dvScorerSupplier.cost());
+ }
+ };
+ }
+
+ @Override
+ public Scorer scorer(LeafReaderContext context) throws IOException {
+ ScorerSupplier scorerSupplier = scorerSupplier(context);
+ if (scorerSupplier == null) {
+ return null;
+ }
+ return scorerSupplier.get(false);
+ }
+ };
+ }
+
+}
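[Editorial note: for reference, a hedged construction example mirroring the test below. The field name is made up, and DocValuesRangeQuery is assumed to be available from the sandbox as the doc-values side of the range.]

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.Query;

// the points query drives iteration when this clause leads; the doc-values
// query verifies per document when another clause leads
Query q = new IndexOrDocValuesQuery(
    LongPoint.newRangeQuery("timestamp", 0L, 1000L),
    DocValuesRangeQuery.newLongRange("timestamp", 0L, 1000L, true, true));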
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestIndexOrDocValuesQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestIndexOrDocValuesQuery.java
new file mode 100644
index 00000000000..2a16e5d8eac
--- /dev/null
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestIndexOrDocValuesQuery.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestIndexOrDocValuesQuery extends LuceneTestCase {
+
+ public void testUseIndexForSelectiveQueries() throws IOException {
+ Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, newIndexWriterConfig()
+ // relies on costs and PointValues.estimatePointCount so we need the default codec
+ .setCodec(TestUtil.getDefaultCodec()));
+ for (int i = 0; i < 2000; ++i) {
+ Document doc = new Document();
+ if (i == 42) {
+ doc.add(new StringField("f1", "bar", Store.NO));
+ doc.add(new LongPoint("f2", 42L));
+ doc.add(new NumericDocValuesField("f2", 42L));
+ } else if (i == 100) {
+ doc.add(new StringField("f1", "foo", Store.NO));
+ doc.add(new LongPoint("f2", 2L));
+ doc.add(new NumericDocValuesField("f2", 2L));
+ } else {
+ doc.add(new StringField("f1", "bar", Store.NO));
+ doc.add(new LongPoint("f2", 2L));
+ doc.add(new NumericDocValuesField("f2", 2L));
+ }
+ w.addDocument(doc);
+ }
+ w.forceMerge(1);
+ IndexReader reader = DirectoryReader.open(w);
+ IndexSearcher searcher = newSearcher(reader);
+ searcher.setQueryCache(null);
+
+ // The term query is more selective, so the IndexOrDocValuesQuery should use doc values
+ final Query q1 = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term("f1", "foo")), Occur.MUST)
+ .add(new IndexOrDocValuesQuery(LongPoint.newExactQuery("f2", 2), new DocValuesNumbersQuery("f2", 2L)), Occur.MUST)
+ .build();
+
+ final Weight w1 = searcher.createNormalizedWeight(q1, random().nextBoolean());
+ final Scorer s1 = w1.scorer(reader.leaves().get(0));
+ assertNotNull(s1.twoPhaseIterator()); // means we use doc values
+
+ // The term query is less selective, so the IndexOrDocValuesQuery should use points
+ final Query q2 = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term("f1", "bar")), Occur.MUST)
+ .add(new IndexOrDocValuesQuery(LongPoint.newExactQuery("f2", 42), new DocValuesNumbersQuery("f2", 42L)), Occur.MUST)
+ .build();
+
+ final Weight w2 = searcher.createNormalizedWeight(q2, random().nextBoolean());
+ final Scorer s2 = w2.scorer(reader.leaves().get(0));
+ assertNull(s2.twoPhaseIterator()); // means we use points
+
+ reader.close();
+ w.close();
+ dir.close();
+ }
+
+}
diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java
index bb26a2eb5e7..3cdf5e94851 100644
--- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java
+++ b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java
@@ -24,7 +24,6 @@ import java.util.List;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
@@ -36,6 +35,7 @@ import org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape;
import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.junit.Before;
import org.junit.Test;
@@ -127,12 +127,12 @@ public class NumberRangeFacetsTest extends StrategyTestCase {
Collections.shuffle(acceptFieldIds, random());
acceptFieldIds = acceptFieldIds.subList(0, randomInt(acceptFieldIds.size()));
if (!acceptFieldIds.isEmpty()) {
- List