- * Unicode version: 9.0.0.0
- */
-public final class UnicodeProps {
-  private UnicodeProps() {}
-
-  /** Unicode version that was used to generate this file: {@value} */
-  public static final String UNICODE_VERSION = "9.0.0.0";
-
-  /** Bitset with Unicode WHITESPACE code points. */
-  public static final Bits WHITESPACE = createBits(
-    0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003,
-    0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
-
-  private static Bits createBits(final int... codepoints) {
-    final int len = codepoints[codepoints.length - 1] + 1;
-    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
-    for (int i : codepoints) bitset.set(i);
-    return new Bits() {
-      @Override
-      public boolean get(int index) {
-        return index < len && bitset.get(index);
-      }
-
-      @Override
-      public int length() {
-        return 0x10FFFF + 1;
-      }
-    };
-  }
-}
+// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.util;
+
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
+
+/**
+ * This file contains unicode properties used by various {@link CharTokenizer}s.
+ * The data was created using ICU4J v60.2.0.0
+ *
+ * Unicode version: 10.0.0.0
+ */
+public final class UnicodeProps {
+ private UnicodeProps() {}
+
+ /** Unicode version that was used to generate this file: {@value} */
+ public static final String UNICODE_VERSION = "10.0.0.0";
+
+ /** Bitset with Unicode WHITESPACE code points. */
+ public static final Bits WHITESPACE = createBits(
+ 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003,
+ 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
+
+ private static Bits createBits(final int... codepoints) {
+ final int len = codepoints[codepoints.length - 1] + 1;
+ final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
+ for (int i : codepoints) bitset.set(i);
+ return new Bits() {
+ @Override
+ public boolean get(int index) {
+ return index < len && bitset.get(index);
+ }
+
+ @Override
+ public int length() {
+ return 0x10FFFF + 1;
+ }
+ };
+ }
+}
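For orientation (not part of the patch): the generated bitset is probed per code point, and get() guards against indexes past the highest set bit. A minimal sketch, assuming only the class above; the class name WhitespaceProbe is illustrative.

    import org.apache.lucene.analysis.util.UnicodeProps;

    public class WhitespaceProbe {
      public static void main(String[] args) {
        System.out.println(UnicodeProps.WHITESPACE.get(0x3000)); // true: IDEOGRAPHIC SPACE is in the set
        System.out.println(UnicodeProps.WHITESPACE.get('A'));    // false: not whitespace
        System.out.println(UnicodeProps.WHITESPACE.length());    // 0x110000, the full code point range
      }
    }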
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
index ed3abe45b54..67a1bb42920 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
@@ -262,6 +262,21 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
     }
}
}
+
+ public void testLucene8124() throws Exception {
+ InputSource is = new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm());
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+ .getHyphenationTree(is);
+
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+ whitespaceMockTokenizer(
+ "Rindfleisch"),
+ hyphenator);
+
+    // TODO: "Rindfleisch" being returned twice is a separate issue in HyphenationCompoundWordTokenFilter
+ assertTokenStreamContents(tf, new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"});
+ }
+
public static interface MockRetainAttribute extends Attribute {
void setRetain(boolean attr);
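A minimal sketch of wiring the hyphenation-based decompounder into an Analyzer, mirroring the test above; "de-hyphenation.xml" is a hypothetical grammar file (any FOP-compatible hyphenation grammar works), and no dictionary is supplied, as in the test.

    import java.io.IOException;
    import java.io.UncheckedIOException;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.xml.sax.InputSource;

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        try {
          // "de-hyphenation.xml" is a hypothetical grammar file, not from the patch.
          HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
              .getHyphenationTree(new InputSource("de-hyphenation.xml"));
          return new TokenStreamComponents(source,
              new HyphenationCompoundWordTokenFilter(source, hyphenator));
        } catch (IOException e) {
          throw new UncheckedIOException(e);
        }
      }
    };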
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/hyphenation-LUCENE-8124.xml b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/hyphenation-LUCENE-8124.xml
new file mode 100644
index 00000000000..8710eab0872
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/hyphenation-LUCENE-8124.xml
@@ -0,0 +1,61 @@
+
+
+
+
+
+ * A normalizer with additional settings such as a filter that lists characters not
+ * to be normalized can be passed in the constructor.
+ *
  */
 public final class ICUFoldingFilter extends ICUNormalizer2Filter {
-  // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
-  // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
-  private static final Normalizer2 normalizer = Normalizer2.getInstance(
-      ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
-      "utr30", Normalizer2.Mode.COMPOSE);
-
+  /**
+   * A normalizer for search term folding to Unicode text,
+   * applying foldings from UTR#30 Character Foldings.
+   */
+  public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
+      // TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
+      // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
+      ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
+      "utr30", Normalizer2.Mode.COMPOSE);
+
   /**
    * Create a new ICUFoldingFilter on the specified input
    */
   public ICUFoldingFilter(TokenStream input) {
+    super(input, NORMALIZER);
+  }
+
+  /**
+   * Create a new ICUFoldingFilter on the specified input with the specified
+   * normalizer
+   */
+  public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
     super(input, normalizer);
   }
 }
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
index 036874ac9ff..1065cbfac81 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java
@@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
 import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 
-/**
+import com.ibm.icu.text.FilteredNormalizer2;
+import com.ibm.icu.text.Normalizer2;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
  * Factory for {@link ICUFoldingFilter}.
 * <fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100">
@@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
  * @since 3.1.0
  */
 public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+  private final Normalizer2 normalizer;
 
   /** Creates a new ICUFoldingFilterFactory */
   public ICUFoldingFilterFactory(Map<String,String> args) {
     super(args);
+
+    Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
+    String filter = get(args, "filter");
+    if (filter != null) {
+      UnicodeSet set = new UnicodeSet(filter);
+      if (!set.isEmpty()) {
+        set.freeze();
+        normalizer = new FilteredNormalizer2(normalizer, set);
+      }
+    }
     if (!args.isEmpty()) {
       throw new IllegalArgumentException("Unknown parameters: " + args);
     }
+    this.normalizer = normalizer;
   }
 
   @Override
   public TokenStream create(TokenStream input) {
-    return new ICUFoldingFilter(input);
+    return new ICUFoldingFilter(input, normalizer);
   }
 
   @Override
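The new filter parameter takes a UnicodeSet pattern; code points outside the set pass through unfolded. A sketch of the equivalent programmatic wiring, grounded in the factory code above; the class and method names are illustrative, and the input TokenStream is assumed to come from elsewhere.

    import com.ibm.icu.text.FilteredNormalizer2;
    import com.ibm.icu.text.Normalizer2;
    import com.ibm.icu.text.UnicodeSet;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.icu.ICUFoldingFilter;

    class FoldAllButOUmlaut {
      // Equivalent of filter="[^ö]" in the factory: 'ö' is excluded from folding,
      // so "Fönster" keeps its umlaut ("fönster") while "Résumé" still folds to "resume".
      static TokenStream wrap(TokenStream in) {
        UnicodeSet set = new UnicodeSet("[^ö]");
        set.freeze();
        Normalizer2 normalizer = new FilteredNormalizer2(ICUFoldingFilter.NORMALIZER, set);
        return new ICUFoldingFilter(in, normalizer);
      }
    }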
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
index d8ecb77d401..9e5050d55b8 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
@@ -16,152 +16,84 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-
-import java.text.CharacterIterator;
-
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
 
 /**
- * Contain all the issues surrounding BreakIterators in ICU in one place.
- * Basically this boils down to the fact that they aren't very friendly to any
- * sort of OO design.
- *
- * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
- * BreakIterator from RuleBasedBreakIterator
- *
- * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
- * doesn't actually behave as a subclass: it always returns 0 for
- * getRuleStatus():
- * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
- * tags
+ * Wraps RuleBasedBreakIterator, making object reuse convenient and
+ * emitting a rule status for emoji sequences.
  * @lucene.experimental
  */
-abstract class BreakIteratorWrapper {
-  protected final CharArrayIterator textIterator = new CharArrayIterator();
-  protected char text[];
-  protected int start;
-  protected int length;
+final class BreakIteratorWrapper {
+  private final CharArrayIterator textIterator = new CharArrayIterator();
+  private final RuleBasedBreakIterator rbbi;
+  private char text[];
+  private int start;
+  private int status;
+
+  BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
+    this.rbbi = rbbi;
+  }
+
+  int current() {
+    return rbbi.current();
+  }
 
-  abstract int next();
-  abstract int current();
-  abstract int getRuleStatus();
-  abstract void setText(CharacterIterator text);
+  int getRuleStatus() {
+    return status;
+  }
+
+  int next() {
+    int current = rbbi.current();
+    int next = rbbi.next();
+    status = calcStatus(current, next);
+    return next;
+  }
+
+  /** Returns current rule status for the text between breaks. (determines token type) */
+  private int calcStatus(int current, int next) {
+    // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
+    // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
+    if (next != BreakIterator.DONE && isEmoji(current, next)) {
+      return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
+    } else {
+      return rbbi.getRuleStatus();
+    }
+  }
+
+  // See unicode doc L2/16-315 and also the RBBI rules for rationale.
+  // we don't include regional indicators here, because they aren't ambiguous for tagging,
+  // they need only be treated special for segmentation.
+  static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
+
+  /** Returns true if the current text represents emoji character or sequence */
+  private boolean isEmoji(int current, int next) {
+    int begin = start + current;
+    int end = start + next;
+    int codepoint = UTF16.charAt(text, 0, end, begin);
+    // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
+    if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
+      if (EMOJI_RK.contains(codepoint)) {
+        // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
+        // an emoji presentation selector or keycap follows.
+        int trailer = begin + Character.charCount(codepoint);
+        return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
+      } else {
+        return true;
+      }
+    }
+    return false;
+  }
 
   void setText(char text[], int start, int length) {
     this.text = text;
     this.start = start;
-    this.length = length;
     textIterator.setText(text, start, length);
-    setText(textIterator);
-  }
-
-  /**
-   * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
-   * any other BreakIterator, the rulestatus method is not available, so treat
-   * it like a generic BreakIterator.
-   */
-  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
-    if (breakIterator instanceof RuleBasedBreakIterator)
-      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
-    else
-      return new BIWrapper(breakIterator);
-  }
-
-  /**
-   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
-   * a DictionaryBasedBreakIterator) behaves correctly.
-   */
-  static final class RBBIWrapper extends BreakIteratorWrapper {
-    private final RuleBasedBreakIterator rbbi;
-
-    RBBIWrapper(RuleBasedBreakIterator rbbi) {
-      this.rbbi = rbbi;
-    }
-
-    @Override
-    int current() {
-      return rbbi.current();
-    }
-
-    @Override
-    int getRuleStatus() {
-      return rbbi.getRuleStatus();
-    }
-
-    @Override
-    int next() {
-      return rbbi.next();
-    }
-
-    @Override
-    void setText(CharacterIterator text) {
-      rbbi.setText(text);
-    }
-  }
-
-  /**
-   * Generic BreakIterator wrapper: Either the rulestatus method is not
-   * available or always returns 0. Calculate a rulestatus here so it behaves
-   * like RuleBasedBreakIterator.
-   *
-   * Note: This is slower than RuleBasedBreakIterator.
-   */
-  static final class BIWrapper extends BreakIteratorWrapper {
-    private final BreakIterator bi;
-    private int status;
-
-    BIWrapper(BreakIterator bi) {
-      this.bi = bi;
-    }
-
-    @Override
-    int current() {
-      return bi.current();
-    }
-
-    @Override
-    int getRuleStatus() {
-      return status;
-    }
-
-    @Override
-    int next() {
-      int current = bi.current();
-      int next = bi.next();
-      status = calcStatus(current, next);
-      return next;
-    }
-
-    private int calcStatus(int current, int next) {
-      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
-        return RuleBasedBreakIterator.WORD_NONE;
-
-      int begin = start + current;
-      int end = start + next;
-
-      int codepoint;
-      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
-        codepoint = UTF16.charAt(text, 0, end, begin);
-
-        if (UCharacter.isDigit(codepoint))
-          return RuleBasedBreakIterator.WORD_NUMBER;
-        else if (UCharacter.isLetter(codepoint)) {
-          // TODO: try to separately specify ideographic, kana?
-          // [currently all bundled as letter for this case]
-          return RuleBasedBreakIterator.WORD_LETTER;
-        }
-      }
-
-      return RuleBasedBreakIterator.WORD_NONE;
-    }
-
-    @Override
-    void setText(CharacterIterator text) {
-      bi.setText(text);
-      status = RuleBasedBreakIterator.WORD_NONE;
-    }
+    rbbi.setText(textIterator);
+    status = RuleBasedBreakIterator.WORD_NONE;
   }
 }
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
index 096eada2de3..3cb39edb92d 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java
@@ -123,7 +123,7 @@ final class CompositeBreakIterator {
 
   private BreakIteratorWrapper getBreakIterator(int scriptCode) {
     if (wordBreakers[scriptCode] == null)
-      wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
+      wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
    return wordBreakers[scriptCode];
   }
 }
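A standalone sketch of the keycap/presentation-selector heuristic that isEmoji applies above; this is not the patch's API, just the same logic lifted into a runnable form (class and method names are illustrative).

    import com.ibm.icu.lang.UCharacter;
    import com.ibm.icu.lang.UProperty;
    import com.ibm.icu.text.UnicodeSet;

    // Ambiguous [:Emoji:] members (digits, '#', '*', ©, ®, ™, 〰, 〽) only count
    // as emoji when an emoji presentation selector (U+FE0F) or combining
    // keycap (U+20E3) follows; unambiguous emoji always count.
    public class EmojiHeuristic {
      static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();

      static boolean looksLikeEmoji(String s) {
        int cp = s.codePointAt(0);
        if (!UCharacter.hasBinaryProperty(cp, UProperty.EMOJI)) return false;
        if (!EMOJI_RK.contains(cp)) return true;
        int trailer = Character.charCount(cp);
        return trailer < s.length() && (s.charAt(trailer) == 0xFE0F || s.charAt(trailer) == 0x20E3);
      }

      public static void main(String[] args) {
        System.out.println(looksLikeEmoji("3"));                 // false: bare digit
        System.out.println(looksLikeEmoji("3\uFE0F\u20E3"));     // true: keycap sequence
        System.out.println(looksLikeEmoji("\uD83D\uDCA9"));      // true: U+1F4A9
      }
    }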
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
index 50a6b4c71d8..10e6c671817 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@@ -52,6 +52,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
   public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
+  /** Token type for words that appear to be emoji sequences */
+  public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
 
   /*
    * the default breakiterators in use. these can be expensive to
@@ -65,9 +67,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
 
   // the same as ROOT, except no dictionary segmentation for cjk
-  private static final BreakIterator defaultBreakIterator =
+  private static final RuleBasedBreakIterator defaultBreakIterator =
       readBreakIterator("Default.brk");
-  private static final BreakIterator myanmarSyllableIterator =
+  private static final RuleBasedBreakIterator myanmarSyllableIterator =
       readBreakIterator("MyanmarSyllable.brk");
 
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@@ -95,16 +97,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   }
 
   @Override
-  public BreakIterator getBreakIterator(int script) {
+  public RuleBasedBreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
+      case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
       case UScript.MYANMAR:
         if (myanmarAsWords) {
-          return (BreakIterator)defaultBreakIterator.clone();
+          return (RuleBasedBreakIterator)defaultBreakIterator.clone();
         } else {
-          return (BreakIterator)myanmarSyllableIterator.clone();
+          return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
         }
-      default: return (BreakIterator)defaultBreakIterator.clone();
+      default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
     }
   }
 
@@ -119,6 +121,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
         return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
+      case EMOJI_SEQUENCE_STATUS:
+        return WORD_EMOJI;
       default: /* some other custom code */
         return "<OTHER>";
     }
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
index 09415516479..8b62ddbea67 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
    */
   private boolean incrementTokenBuffer() {
     int start = breaker.current();
-    if (start == BreakIterator.DONE)
-      return false; // BreakIterator exhausted
+    assert start != BreakIterator.DONE;
 
     // find the next set of boundaries, skipping over non-tokens (rule status 0)
     int end = breaker.next();
-    while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+    while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
       start = end;
       end = breaker.next();
     }
 
-    if (start == BreakIterator.DONE)
+    if (end == BreakIterator.DONE) {
       return false; // BreakIterator exhausted
+    }
 
     termAtt.copyBuffer(buffer, start, end - start);
     offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
index 69694fc0780..e2d3dae3d75 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java
@@ -16,8 +16,7 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-
-import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
 
 /**
  * Class that allows for tailored Unicode Text Segmentation on
@@ -25,14 +24,16 @@ import com.ibm.icu.text.BreakIterator;
  * @lucene.experimental
  */
 public abstract class ICUTokenizerConfig {
-
+  /** Rule status for emoji sequences */
+  public static final int EMOJI_SEQUENCE_STATUS = 299;
+
   /**
    * Sole constructor. (For invocation by subclass
    * constructors, typically implicit.)
    */
   public ICUTokenizerConfig() {}
   /** Return a breakiterator capable of processing a given script. */
-  public abstract BreakIterator getBreakIterator(int script);
+  public abstract RuleBasedBreakIterator getBreakIterator(int script);
   /** Return a token type value for a given script and BreakIterator
    *  rule status.
    */
  public abstract String getType(int script, int ruleStatus);
diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
index 4d29b0c36bc..0cd4cf28e5f 100644
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java
@@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
       config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
         @Override
-        public BreakIterator getBreakIterator(int script) {
+        public RuleBasedBreakIterator getBreakIterator(int script) {
           if (breakers[script] != null) {
-            return (BreakIterator) breakers[script].clone();
+            return (RuleBasedBreakIterator) breakers[script].clone();
           } else {
             return super.getBreakIterator(script);
           }
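To make the new behavior concrete, a minimal usage sketch: with the default config, emoji sequences now come out as single tokens typed "<EMOJI>". The class name and sample text are illustrative; this assumes only the analysis-icu module.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class EmojiTokens {
      public static void main(String[] args) throws Exception {
        Tokenizer tok = new ICUTokenizer(new DefaultICUTokenizerConfig(true, true));
        tok.setReader(new StringReader("nice 💩 day"));
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        TypeAttribute type = tok.addAttribute(TypeAttribute.class);
        tok.reset();
        while (tok.incrementToken()) {
          // prints: nice <ALPHANUM>, 💩 <EMOJI>, day <ALPHANUM>
          System.out.println(term + " " + type.type());
        }
        tok.end();
        tok.close();
      }
    }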
diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html
index bdace97b4c6..6fa5821c242 100644
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@@ -353,7 +353,7 @@ and Backwards Compatibility
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 8.0). However, some users who wish
+the most recent version of Unicode (currently 10.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
index c94a023c2ce..4a9df159935 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
index c3357efa7ce..a9d0673aa8d 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ
diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
index 1a16f3eb182..1c3de121cad 100644
Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
index 3782216d38c..3e3c5235791 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java
@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
 
 /** basic tests for {@link ICUFoldingFilterFactory} */
 public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
-  
+
   /** basic tests to ensure the folding is working */
   public void test() throws Exception {
     Reader reader = new StringReader("Résumé");
@@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
     stream = factory.create(stream);
     assertTokenStreamContents(stream, new String[] { "resume" });
   }
-  
+
+  /** test to ensure the filter parameter is working */
+  public void testFilter() throws Exception {
+    HashMap<String,String> args = new HashMap<>();
+    args.put("filter", "[^ö]");
+    ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
+
+    Reader reader = new StringReader("Résumé");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "resume" });
+
+    reader = new StringReader("Fönster");
+    stream = whitespaceMockTokenizer(reader);
+    stream = factory.create(stream);
+    assertTokenStreamContents(stream, new String[] { "fönster" });
+  }
+
   /** Test that bogus arguments result in exception */
   public void testBogusArguments() throws Exception {
     IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
index 027baa35705..98939752cbe 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@@ -16,13 +16,10 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
 
 import com.ibm.icu.lang.UScript;
@@ -76,8 +73,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
-        TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
-        return new TokenStreamComponents(tokenizer, filter);
+        return new TokenStreamComponents(tokenizer);
       }
     };
   }
@@ -90,8 +86,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testArmenian() throws Exception {
     assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
-        new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
-        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
+        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
   }
 
   public void testAmharic() throws Exception {
@@ -102,12 +98,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testArabic() throws Exception {
     assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
         new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
-        "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
+        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
   }
 
   public void testAramaic() throws Exception {
     assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
-        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
         "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
   }
 
@@ -125,7 +121,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testGreek() throws Exception {
     assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
-        new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
         "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
   }
 
@@ -156,7 +152,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    */
   public void testChinese() throws Exception {
     assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
-        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
+        new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
   }
 
   public void testHebrew() throws Exception {
@@ -186,8 +182,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   /* Tests from StandardAnalyzer, just to show behavior is similar */
   public void testAlphanumericSA() throws Exception {
     // alphanumeric tokens
-    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
-    assertAnalyzesTo(a, "2B", new String[]{"2b"});
+    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
+    assertAnalyzesTo(a, "2B", new String[]{"2B"});
   }
 
   public void testDelimitersSA() throws Exception {
@@ -199,34 +195,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testApostrophesSA() throws Exception {
     // internal apostrophes: O'Reilly, you're, O'Reilly's
-    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
     assertAnalyzesTo(a, "you're", new String[]{"you're"});
     assertAnalyzesTo(a, "she's", new String[]{"she's"});
-    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
     assertAnalyzesTo(a, "don't", new String[]{"don't"});
-    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
   }
 
   public void testNumericSA() throws Exception {
     // floating point, serial, model numbers, ip addresses, etc.
     // every other segment must have at least one digit
     assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
   }
 
   public void testTextWithNumbersSA() throws Exception {
     // numbers
-    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
   }
 
   public void testVariousTextSA() throws Exception {
     // various
-    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
-    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
-    assertAnalyzesTo(a, "foo bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
-    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "foo bar .  FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
   }
 
   public void testKoreanSA() throws Exception {
@@ -242,14 +238,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testOffsets() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones",
-        new String[] {"david", "has", "5000", "bones"},
+        new String[] {"David", "has", "5000", "bones"},
         new int[] {0, 6, 10, 15},
         new int[] {5, 9, 14, 20});
   }
 
   public void testTypes() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones",
-        new String[] {"david", "has", "5000", "bones"},
+        new String[] {"David", "has", "5000", "bones"},
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
   }
 
@@ -265,6 +261,61 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
         new String[] { " ", " ", " ", " ", " " });
   }
 
+  /** simple emoji */
+  public void testEmoji() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
+        new String[] { "💩", "💩", "💩" },
+        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
+  }
+
+  /** emoji zwj sequence */
+  public void testEmojiSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
+        new String[] { "👩❤️👩" },
+        new String[] { "<EMOJI>" });
+  }
+
+  /** emoji zwj sequence with fitzpatrick modifier */
+  public void testEmojiSequenceWithModifier() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
+        new String[] { "👨🏼⚕️" },
+        new String[] { "<EMOJI>" });
+  }
+
+  /** regional indicator */
+  public void testEmojiRegionalIndicator() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
+        new String[] { "🇺🇸", "🇺🇸" },
+        new String[] { "<EMOJI>", "<EMOJI>" });
+  }
+
+  /** variation sequence */
+  public void testEmojiVariationSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
+        new String[] { "#️⃣" },
+        new String[] { "<EMOJI>" });
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
+        new String[] { "3️⃣",},
+        new String[] { "<EMOJI>" });
+  }
+
+  public void testEmojiTagSequence() throws Exception {
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
+        new String[] { "🏴" },
+        new String[] { "<EMOJI>" });
+  }
+
+  public void testEmojiTokenization() throws Exception {
+    // simple emoji around latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
+        new String[] { "poo", "💩", "poo" },
+        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
+    // simple emoji around non-latin
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
+        new String[] { "💩", "中", "國", "💩" },
+        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
+  }
+
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
index 75481f1924c..d93a8104891 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java
@@ -78,6 +78,15 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
     );
   }
 
+  /**
+   * dictionary segmentation with emoji
+   */
+  public void testSimpleJapaneseWithEmoji() throws Exception {
+    assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
+        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩" }
+    );
+  }
+
   public void testJapaneseTypes() throws Exception {
     assertAnalyzesTo(a, "仮名遣い カタカナ",
         new String[] { "仮名遣い", "カタカナ" },
diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
index 0f2bffecfb0..042fa37a2f4 100644
--- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
+++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
@@ -62,9 +62,9 @@ import java.util.regex.Pattern;
  */
 public class GenerateUTR30DataFiles {
   private static final String ICU_SVN_TAG_URL
-      = "http://source.icu-project.org/repos/icu/icu/tags";
-  private static final String ICU_RELEASE_TAG = "release-58-1";
-  private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
+      = "http://source.icu-project.org/repos/icu/tags";
+  private static final String ICU_RELEASE_TAG = "release-60-2";
+  private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
   private static final String NFC_TXT = "nfc.txt";
   private static final String NFKC_TXT = "nfkc.txt";
   private static final String NFKC_CF_TXT = "nfkc_cf.txt";
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
index 4f768b732f9..7e1d7a11529 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilter.java
@@ -166,9 +166,6 @@ public class JapaneseIterationMarkCharFilter extends CharFilter {
     buffer.reset(input);
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public int read(char[] buffer, int offset, int length) throws IOException {
     int read = 0;
@@ -185,9 +182,6 @@ public class JapaneseIterationMarkCharFilter extends CharFilter {
     return read == 0 ? -1 : read;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public int read() throws IOException {
     int ic = buffer.get(bufferPosition);
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
index 98e7aafcacf..edeb0ee6c1c 100644
--- a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
@@ -293,7 +293,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     "7.1.0-cfs",
     "7.1.0-nocfs",
     "7.2.0-cfs",
-    "7.2.0-nocfs"
+    "7.2.0-nocfs",
+    "7.2.1-cfs",
+    "7.2.1-nocfs"
   };
 
   public static String[] getOldNames() {
@@ -304,7 +306,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     "sorted.7.0.0",
     "sorted.7.0.1",
     "sorted.7.1.0",
-    "sorted.7.2.0"
+    "sorted.7.2.0",
+    "sorted.7.2.1"
   };
 
   public static String[] getOldSortedNames() {
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.7.2.1-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.7.2.1-cfs.zip
new file mode 100644
index 00000000000..e579dabbb48
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.7.2.1-cfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.7.2.1-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.7.2.1-nocfs.zip
new file mode 100644
index 00000000000..68f14a4e095
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.7.2.1-nocfs.zip differ
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/sorted.7.2.1.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/sorted.7.2.1.zip
new file mode 100644
index 00000000000..80e676a5a5f
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/sorted.7.2.1.zip differ
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java
index 1a74416f33c..f03fc5300f0 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/BM25NBClassifier.java
@@ -99,17 +99,11 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
     this.query = query;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
     return assignClassNormalizedList(inputDocument).get(0);
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
     return assignedClasses;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
index 928c0366770..394d15f777d 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/BooleanPerceptronClassifier.java
@@ -195,9 +195,6 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
 
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<Boolean> assignClass(String text)
       throws IOException {
@@ -220,18 +217,12 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
     return new ClassificationResult<>(output >= bias, score);
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<Boolean>> getClasses(String text)
       throws IOException {
     return null;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<Boolean>> getClasses(String text, int max)
       throws IOException {
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
index cbd241b4bb4..941d881a3ef 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestFuzzyClassifier.java
@@ -103,9 +103,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
 
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
     TopDocs knnResults = knnSearch(text);
@@ -121,9 +118,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
     return assignedClass;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
     TopDocs knnResults = knnSearch(text);
     return assignedClasses;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
     TopDocs knnResults = knnSearch(text);
@@ -213,7 +204,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
         ", classFieldName='" + classFieldName + '\'' +
         ", k=" + k +
         ", query=" + query +
-        ", similarity=" + indexSearcher.getSimilarity(true) +
+        ", similarity=" + indexSearcher.getSimilarity() +
         '}';
   }
 }
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
index f0391f4471d..1bc53b0202c 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
@@ -119,9 +119,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
 
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
     return classifyFromTopDocs(knnSearch(text));
@@ -143,9 +140,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
     return assignedClass;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
     TopDocs knnResults = knnSearch(text);
     return assignedClasses;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
     TopDocs knnResults = knnSearch(text);
@@ -251,7 +242,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
         ", classFieldName='" + classFieldName + '\'' +
         ", k=" + k +
         ", query=" + query +
-        ", similarity=" + indexSearcher.getSimilarity(true) +
+        ", similarity=" + indexSearcher.getSimilarity() +
         '}';
   }
 }
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
index 3509df58511..a1546498de6 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
@@ -98,9 +98,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
     this.query = query;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(inputDocument);
@@ -115,9 +112,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
     return assignedClass;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
     return assignedClasses;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java
index d687722c87c..39684ee25e7 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/document/KNearestNeighborDocumentClassifier.java
@@ -72,17 +72,11 @@ public class KNearestNeighborDocumentClassifier extends KNearestNeighborClassifi
     this.field2analyzer = field2analyzer;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<BytesRef> assignClass(Document document) throws IOException {
     return classifyFromTopDocs(knnSearch(document));
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(Document document) throws IOException {
     TopDocs knnResults = knnSearch(document);
     return assignedClasses;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(Document document, int max) throws IOException {
     TopDocs knnResults = knnSearch(document);
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java b/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java
index 6bc8573c094..f6405901384 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/document/SimpleNaiveBayesDocumentClassifier.java
@@ -71,9 +71,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
     this.field2analyzer = field2analyzer;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public ClassificationResult<BytesRef> assignClass(Document document) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
@@ -88,9 +85,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
     return assignedClass;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(Document document) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
@@ -98,9 +92,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
     return assignedClasses;
   }
 
-  /**
-   * {@inheritDoc}
-   */
   @Override
   public List<ClassificationResult<BytesRef>> getClasses(Document document, int max) throws IOException {
     List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
diff --git a/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java b/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java
index d4a26341560..308dcdc84d3 100644
--- a/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java
+++ b/lucene/classification/src/java/org/apache/lucene/classification/utils/NearestFuzzyQuery.java
@@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause;
@@ -210,20 +210,20 @@ public class NearestFuzzyQuery extends Query {
   }
 
   private Query newTermQuery(IndexReader reader, Term term) throws IOException {
-    // we build an artificial TermContext that will give an overall df and ttf
+    // we build an artificial TermStates that will give an overall df and ttf
     // equal to 1
-    TermContext context = new TermContext(reader.getContext());
+    TermStates termStates = new TermStates(reader.getContext());
     for (LeafReaderContext leafContext : reader.leaves()) {
       Terms terms = leafContext.reader().terms(term.field());
       if (terms != null) {
         TermsEnum termsEnum = terms.iterator();
         if (termsEnum.seekExact(term.bytes())) {
-          int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
-          context.register(termsEnum.termState(), leafContext.ord, freq, freq);
+          int freq = 1 - termStates.docFreq(); // we want the total df and ttf to be 1
+          termStates.register(termsEnum.termState(), leafContext.ord, freq, freq);
         }
       }
     }
-    return new TermQuery(term, context);
+    return new TermQuery(term, termStates);
   }
 
   @Override
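The artificial-stats trick in newTermQuery above generalizes: register each segment's TermState while forcing the aggregate df/ttf to exactly 1, so every rewritten term scores as equally rare. A self-contained restatement of that pattern (class and method names are illustrative):

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.LeafReaderContext;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermStates;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;

    final class FlatStats {
      // Build a TermStates whose overall docFreq/totalTermFreq sum to 1.
      static Query flatTermQuery(IndexReader reader, Term term) throws IOException {
        TermStates states = new TermStates(reader.getContext());
        for (LeafReaderContext leaf : reader.leaves()) {
          Terms terms = leaf.reader().terms(term.field());
          if (terms != null) {
            TermsEnum termsEnum = terms.iterator();
            if (termsEnum.seekExact(term.bytes())) {
              int freq = 1 - states.docFreq(); // total df and ttf end up at exactly 1
              states.register(termsEnum.termState(), leaf.ord, freq, freq);
            }
          }
        }
        return new TermQuery(term, states);
      }
    }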
*/ + public static final int EMOJI = 7; /** String token types that correspond to token type int constants */ public static final String [] TOKEN_TYPES = new String [] { @@ -63,7 +65,8 @@ public final class StandardTokenizer extends Tokenizer { " ", " ", " ", - " " + " ", + " " }; /** Absolute maximum sized token */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java index 19e56a40a04..7521763f330 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java @@ -103,11 +103,8 @@ final class IntersectTermsEnum extends TermsEnum { arcs[arcIdx] = new FST.Arc<>(); } - if (fr.index == null) { - fstReader = null; - } else { - fstReader = fr.index.getBytesReader(); - } + + fstReader = fr.index.getBytesReader(); // TODO: if the automaton is "smallish" we really // should use the terms index to seek at least to diff --git a/lucene/core/src/java/org/apache/lucene/index/TermContext.java b/lucene/core/src/java/org/apache/lucene/index/TermStates.java similarity index 63% rename from lucene/core/src/java/org/apache/lucene/index/TermContext.java rename to lucene/core/src/java/org/apache/lucene/index/TermStates.java index 3ba8dd9d848..4bb83fe4e8f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermStates.java @@ -17,34 +17,37 @@ package org.apache.lucene.index; -import org.apache.lucene.util.BytesRef; - import java.io.IOException; import java.util.Arrays; /** * Maintains a {@link IndexReader} {@link TermState} view over * {@link IndexReader} instances containing a single term. The - * {@link TermContext} doesn't track if the given {@link TermState} + * {@link TermStates} doesn't track if the given {@link TermState} * objects are valid, neither if the {@link TermState} instances refer to the * same terms in the associated readers. * * @lucene.experimental */ -public final class TermContext { +public final class TermStates { + + private static final TermState EMPTY_TERMSTATE = new TermState() { + @Override + public void copyFrom(TermState other) { + + } + }; // Important: do NOT keep hard references to index readers private final Object topReaderContextIdentity; private final TermState[] states; + private final Term term; // null if stats are to be used private int docFreq; private long totalTermFreq; //public static boolean DEBUG = BlockTreeTermsWriter.DEBUG; - /** - * Creates an empty {@link TermContext} from a {@link IndexReaderContext} - */ - public TermContext(IndexReaderContext context) { + private TermStates(Term term, IndexReaderContext context) { assert context != null && context.isTopLevel; topReaderContextIdentity = context.identity; docFreq = 0; @@ -56,10 +59,18 @@ public final class TermContext { len = context.leaves().size(); } states = new TermState[len]; + this.term = term; } /** - * Expert: Return whether this {@link TermContext} was built for the given + * Creates an empty {@link TermStates} from a {@link IndexReaderContext} + */ + public TermStates(IndexReaderContext context) { + this(null, context); + } + + /** + * Expert: Return whether this {@link TermStates} was built for the given * {@link IndexReaderContext}. This is typically used for assertions. 
* @lucene.internal */ @@ -68,35 +79,35 @@ public final class TermContext { } /** - * Creates a {@link TermContext} with an initial {@link TermState}, + * Creates a {@link TermStates} with an initial {@link TermState}, * {@link IndexReader} pair. */ - public TermContext(IndexReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) { - this(context); + public TermStates(IndexReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) { + this(null, context); register(state, ord, docFreq, totalTermFreq); } /** - * Creates a {@link TermContext} from a top-level {@link IndexReaderContext} and the + * Creates a {@link TermStates} from a top-level {@link IndexReaderContext} and the * given {@link Term}. This method will lookup the given term in all context's leaf readers - * and register each of the readers containing the term in the returned {@link TermContext} + * and register each of the readers containing the term in the returned {@link TermStates} * using the leaf reader's ordinal. * * Note: the given context must be a top-level context. + * + * @param needsStats if {@code true} then all leaf contexts will be visited up-front to + * collect term statistics. Otherwise, the {@link TermState} objects + * will be built only when requested */ - public static TermContext build(IndexReaderContext context, Term term) + public static TermStates build(IndexReaderContext context, Term term, boolean needsStats) throws IOException { assert context != null && context.isTopLevel; - final String field = term.field(); - final BytesRef bytes = term.bytes(); - final TermContext perReaderTermState = new TermContext(context); - //if (DEBUG) System.out.println("prts.build term=" + term); - for (final LeafReaderContext ctx : context.leaves()) { - //if (DEBUG) System.out.println(" r=" + leaves[i].reader); - final Terms terms = ctx.reader().terms(field); - if (terms != null) { - final TermsEnum termsEnum = terms.iterator(); - if (termsEnum.seekExact(bytes)) { + final TermStates perReaderTermState = new TermStates(needsStats ? null : term, context); + if (needsStats) { + for (final LeafReaderContext ctx : context.leaves()) { + //if (DEBUG) System.out.println(" r=" + leaves[i].reader); + TermsEnum termsEnum = loadTermsEnum(ctx, term); + if (termsEnum != null) { final TermState termState = termsEnum.termState(); //if (DEBUG) System.out.println(" found"); perReaderTermState.register(termState, ctx.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); @@ -106,8 +117,19 @@ public final class TermContext { return perReaderTermState; } + private static TermsEnum loadTermsEnum(LeafReaderContext ctx, Term term) throws IOException { + final Terms terms = ctx.reader().terms(term.field()); + if (terms != null) { + final TermsEnum termsEnum = terms.iterator(); + if (termsEnum.seekExact(term.bytes())) { + return termsEnum; + } + } + return null; + } + /** - * Clears the {@link TermContext} internal state and removes all + * Clears the {@link TermStates} internal state and removes all * registered {@link TermState}s */ public void clear() { @@ -149,17 +171,25 @@ public final class TermContext { } /** - * Returns the {@link TermState} for an leaf ordinal or
null
if no - * {@link TermState} for the ordinal was registered. + * Returns the {@link TermState} for a leaf reader context or null
if no + * {@link TermState} for the context was registered. * - * @param ord * the readers leaf ordinal to get the {@link TermState} for. + * @param ctx * the {@link LeafReaderContext} to get the {@link TermState} for. * @return the {@link TermState} for the given readers ord or null
if no * {@link TermState} for the reader was registered */ - public TermState get(int ord) { - assert ord >= 0 && ord < states.length; - return states[ord]; + public TermState get(LeafReaderContext ctx) throws IOException { + assert ctx.ord >= 0 && ctx.ord < states.length; + if (term == null) + return states[ctx.ord]; + if (this.states[ctx.ord] == null) { + TermsEnum te = loadTermsEnum(ctx, term); + this.states[ctx.ord] = te == null ? EMPTY_TERMSTATE : te.termState(); + } + if (this.states[ctx.ord] == EMPTY_TERMSTATE) + return null; + return this.states[ctx.ord]; } /** @@ -169,6 +199,9 @@ public final class TermContext { * instances passed to {@link #register(TermState, int, int, long)}. */ public int docFreq() { + if (term != null) { + throw new IllegalStateException("Cannot call docFreq() when needsStats=false"); + } return docFreq; } @@ -179,19 +212,23 @@ public final class TermContext { * instances passed to {@link #register(TermState, int, int, long)}. */ public long totalTermFreq() { + if (term != null) { + throw new IllegalStateException("Cannot call totalTermFreq() when needsStats=false"); + } return totalTermFreq; } @Override public String toString() { StringBuilder sb = new StringBuilder(); - sb.append("TermContext\n"); + sb.append("TermStates\n"); for(TermState termState : states) { sb.append(" state="); - sb.append(termState.toString()); + sb.append(termState); sb.append('\n'); } return sb.toString(); } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java index 219d4535827..cca667575a4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/BlendedTermQuery.java @@ -25,7 +25,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.util.ArrayUtil; @@ -53,7 +53,7 @@ public final class BlendedTermQuery extends Query { private int numTerms = 0; private Term[] terms = new Term[0]; private float[] boosts = new float[0]; - private TermContext[] contexts = new TermContext[0]; + private TermStates[] contexts = new TermStates[0]; private RewriteMethod rewriteMethod = DISJUNCTION_MAX_REWRITE; /** Sole constructor. */ @@ -82,10 +82,10 @@ public final class BlendedTermQuery extends Query { /** * Expert: Add a {@link Term} with the provided boost and context. - * This method is useful if you already have a {@link TermContext} + * This method is useful if you already have a {@link TermStates} * object constructed for the given term. 
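 * <p>A sketch of the intended call pattern ({@code topContext} and the boost value are illustrative; {@code topContext} is assumed to be the searcher's top-level {@link IndexReaderContext}): <pre> TermStates states = TermStates.build(topContext, term, true); builder.add(term, 2f, states); </pre>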
*/ - public Builder add(Term term, float boost, TermContext context) { + public Builder add(Term term, float boost, TermStates context) { if (numTerms >= BooleanQuery.getMaxClauseCount()) { throw new BooleanQuery.TooManyClauses(); } @@ -184,10 +184,10 @@ public final class BlendedTermQuery extends Query { private final Term[] terms; private final float[] boosts; - private final TermContext[] contexts; + private final TermStates[] contexts; private final RewriteMethod rewriteMethod; - private BlendedTermQuery(Term[] terms, float[] boosts, TermContext[] contexts, + private BlendedTermQuery(Term[] terms, float[] boosts, TermStates[] contexts, RewriteMethod rewriteMethod) { assert terms.length == boosts.length; assert terms.length == contexts.length; @@ -205,7 +205,7 @@ public final class BlendedTermQuery extends Query { terms[i] = terms[j]; terms[j] = tmpTerm; - TermContext tmpContext = contexts[i]; + TermStates tmpContext = contexts[i]; contexts[i] = contexts[j]; contexts[j] = tmpContext; @@ -263,10 +263,10 @@ public final class BlendedTermQuery extends Query { @Override public final Query rewrite(IndexReader reader) throws IOException { - final TermContext[] contexts = Arrays.copyOf(this.contexts, this.contexts.length); + final TermStates[] contexts = Arrays.copyOf(this.contexts, this.contexts.length); for (int i = 0; i < contexts.length; ++i) { if (contexts[i] == null || contexts[i].wasBuiltFor(reader.getContext()) == false) { - contexts[i] = TermContext.build(reader.getContext(), terms[i]); + contexts[i] = TermStates.build(reader.getContext(), terms[i], true); } } @@ -275,7 +275,7 @@ public final class BlendedTermQuery extends Query { // ttf will be the sum of all total term freqs int df = 0; long ttf = 0; - for (TermContext ctx : contexts) { + for (TermStates ctx : contexts) { df = Math.max(df, ctx.docFreq()); ttf += ctx.totalTermFreq(); } @@ -294,8 +294,8 @@ public final class BlendedTermQuery extends Query { return rewriteMethod.rewrite(termQueries); } - private static TermContext adjustFrequencies(IndexReaderContext readerContext, - TermContext ctx, int artificialDf, long artificialTtf) { + private static TermStates adjustFrequencies(IndexReaderContext readerContext, + TermStates ctx, int artificialDf, long artificialTtf) throws IOException { Listleaves = readerContext.leaves(); final int len; if (leaves == null) { @@ -303,9 +303,9 @@ public final class BlendedTermQuery extends Query { } else { len = leaves.size(); } - TermContext newCtx = new TermContext(readerContext); + TermStates newCtx = new TermStates(readerContext); for (int i = 0; i < len; ++i) { - TermState termState = ctx.get(i); + TermState termState = ctx.get(leaves.get(i)); if (termState == null) { continue; } diff --git a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java index 900a77f076f..fffdd09093f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java +++ b/lucene/core/src/java/org/apache/lucene/search/BooleanWeight.java @@ -48,7 +48,7 @@ final class BooleanWeight extends Weight { super(query); this.query = query; this.scoreMode = scoreMode; - this.similarity = searcher.getSimilarity(scoreMode.needsScores()); + this.similarity = searcher.getSimilarity(); weights = new ArrayList<>(); for (BooleanClause c : query) { Weight w = searcher.createWeight(c.getQuery(), c.isScoring() ? 
scoreMode : ScoreMode.COMPLETE_NO_SCORES, boost); diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java index f4a7ca7be10..e2d6d8047f3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -22,7 +22,6 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.search.similarities.Similarity; final class ExactPhraseScorer extends Scorer { @@ -42,13 +41,13 @@ final class ExactPhraseScorer extends Scorer { private int freq; - private final Similarity.SimScorer docScorer; + private final LeafSimScorer docScorer; private final boolean needsScores, needsTotalHitCount; private float matchCost; private float minCompetitiveScore; ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity.SimScorer docScorer, ScoreMode scoreMode, + LeafSimScorer docScorer, ScoreMode scoreMode, float matchCost) throws IOException { super(weight); this.docScorer = docScorer; @@ -123,7 +122,7 @@ final class ExactPhraseScorer extends Scorer { @Override public float maxScore() { - return docScorer.maxScore(Integer.MAX_VALUE); + return docScorer.maxScore(); } /** Advance the given pos enum to the first doc on or after {@code target}. diff --git a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java index 5ee815cb888..da5ed036ddc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java @@ -32,7 +32,6 @@ import java.util.concurrent.Future; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexWriter; @@ -40,7 +39,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.Terms; import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.Similarity; @@ -75,36 +74,6 @@ import org.apache.lucene.util.ThreadInterruptedException; */ public class IndexSearcher { - /** A search-time {@link Similarity} that does not make use of scoring factors - * and may be used when scores are not needed. */ - private static final Similarity NON_SCORING_SIMILARITY = new Similarity() { - - @Override - public long computeNorm(FieldInvertState state) { - throw new UnsupportedOperationException("This Similarity may only be used for searching, not indexing"); - } - - @Override - public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... 
termStats) { - return new SimWeight() {}; - } - - @Override - public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException { - return new SimScorer() { - @Override - public float score(int doc, float freq) { - return 0f; - } - @Override - public float maxScore(float maxFreq) { - return 0f; - } - }; - } - - }; - private static QueryCache DEFAULT_QUERY_CACHE; private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy(); static { @@ -136,7 +105,7 @@ public class IndexSearcher { * Expert: returns a default Similarity instance. * In general, this method is only called to initialize searchers and writers. * User code and query implementations should respect - * {@link IndexSearcher#getSimilarity(boolean)}. + * {@link IndexSearcher#getSimilarity()}. * @lucene.internal */ public static Similarity getDefaultSimilarity() { @@ -329,15 +298,11 @@ public class IndexSearcher { this.similarity = similarity; } - /** Expert: Get the {@link Similarity} to use to compute scores. When - * {@code needsScores} is {@code false}, this method will return a simple - * {@link Similarity} that does not leverage scoring factors such as norms. - * When {@code needsScores} is {@code true}, this returns the + /** Expert: Get the {@link Similarity} to use to compute scores. This returns the * {@link Similarity} that has been set through {@link #setSimilarity(Similarity)} - * or the {@link #getDefaultSimilarity()} default {@link Similarity} if none - * has been set explicitly. */ - public Similarity getSimilarity(boolean needsScores) { - return needsScores ? similarity : NON_SCORING_SIMILARITY; + * or the default {@link Similarity} if none has been set explicitly. */ + public Similarity getSimilarity() { + return similarity; } /** @@ -774,7 +739,7 @@ public class IndexSearcher { * across a distributed collection. * @lucene.experimental */ - public TermStatistics termStatistics(Term term, TermContext context) throws IOException { + public TermStatistics termStatistics(Term term, TermStates context) throws IOException { if (context.docFreq() == 0) { return null; } else { diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java b/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java new file mode 100644 index 00000000000..5de82951d22 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/search/LeafSimScorer.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.similarities.Similarity.SimScorer; + +/** + * {@link SimScorer} on a specific {@link LeafReader}. 
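+ * <p>A rough usage sketch (variable names are illustrative, not part of this API): <pre> SimScorer simScorer = similarity.scorer(boost, collectionStats, termStats); LeafSimScorer leafScorer = new LeafSimScorer(simScorer, ctx.reader(), true, Integer.MAX_VALUE); float score = leafScorer.score(docID, freq); // norms are looked up internally </pre>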
+ */ +public final class LeafSimScorer { + + private final SimScorer scorer; + private final NumericDocValues norms; + private final float maxScore; + + /** + * Sole constructor: Score documents of {@code reader} with {@code scorer}. + */ + public LeafSimScorer(SimScorer scorer, LeafReader reader, boolean needsScores, float maxFreq) throws IOException { + this.scorer = scorer; + norms = needsScores ? reader.getNormValues(scorer.getField()) : null; + maxScore = needsScores ? scorer.score(maxFreq, 1) : Float.MAX_VALUE; + } + + private long getNormValue(int doc) throws IOException { + if (norms != null) { + boolean found = norms.advanceExact(doc); + assert found; + return norms.longValue(); + } else { + return 1L; // default norm + } + } + + /** Score the provided document assuming the given term document frequency. + * This method must be called on non-decreasing sequences of doc ids. + * @see SimScorer#score(float, long) */ + public float score(int doc, float freq) throws IOException { + return scorer.score(freq, getNormValue(doc)); + } + + /** Explain the score for the provided document assuming the given term document frequency. + * This method must be called on non-decreasing sequences of doc ids. + * @see SimScorer#explain(Explanation, long) */ + public Explanation explain(int doc, Explanation freqExpl) throws IOException { + return scorer.explain(freqExpl, getNormValue(doc)); + } + + /** + * Return an upper bound of the score. + */ + public float maxScore() { + return maxScore; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 34361a728cd..65d6631e9a7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -18,19 +18,26 @@ package org.apache.lucene.search; import java.io.IOException; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -183,36 +190,38 @@ public class MultiPhraseQuery extends Query { private class MultiPhraseWeight extends Weight { private final Similarity similarity; - private final Similarity.SimWeight stats; - private final Map termContexts = new HashMap<>(); + private final Similarity.SimScorer stats; + private final Map termStates = new HashMap<>(); private final ScoreMode scoreMode; public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { super(MultiPhraseQuery.this); this.scoreMode = 
scoreMode; - this.similarity = searcher.getSimilarity(scoreMode.needsScores()); + this.similarity = searcher.getSimilarity(); final IndexReaderContext context = searcher.getTopReaderContext(); // compute idf ArrayList allTermStats = new ArrayList<>(); for(final Term[] terms: termArrays) { for (Term term: terms) { - TermContext termContext = termContexts.get(term); - if (termContext == null) { - termContext = TermContext.build(context, term); - termContexts.put(term, termContext); + TermStates ts = termStates.get(term); + if (ts == null) { + ts = TermStates.build(context, term, scoreMode.needsScores()); + termStates.put(term, ts); } - TermStatistics termStatistics = searcher.termStatistics(term, termContext); - if (termStatistics != null) { - allTermStats.add(termStatistics); + if (scoreMode.needsScores()) { + TermStatistics termStatistics = searcher.termStatistics(term, ts); + if (termStatistics != null) { + allTermStats.add(termStatistics); + } } } } if (allTermStats.isEmpty()) { stats = null; // none of the terms were found, we won't use sim at all } else { - stats = similarity.computeWeight( + stats = similarity.scorer( boost, searcher.collectionStatistics(field), allTermStats.toArray(new TermStatistics[allTermStats.size()])); @@ -253,7 +262,7 @@ public class MultiPhraseQuery extends Query { List postings = new ArrayList<>(); for (Term term : terms) { - TermState termState = termContexts.get(term).get(context.ord); + TermState termState = termStates.get(term).get(context); if (termState != null) { termsEnum.seekExact(term.bytes(), termState); postings.add(termsEnum.postings(null, PostingsEnum.POSITIONS)); @@ -282,11 +291,11 @@ public class MultiPhraseQuery extends Query { if (slop == 0) { return new ExactPhraseScorer(this, postingsFreqs, - similarity.simScorer(stats, context), + new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE), scoreMode, totalMatchCost); } else { return new SloppyPhraseScorer(this, postingsFreqs, slop, - similarity.simScorer(stats, context), + new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE), scoreMode.needsScores(), totalMatchCost); } } @@ -303,7 +312,7 @@ public class MultiPhraseQuery extends Query { int newDoc = scorer.iterator().advance(doc); if (newDoc == doc) { float freq = slop == 0 ? 
((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq(); - SimScorer docScorer = similarity.simScorer(stats, context); + LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE); Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq); Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); return Explanation.match( diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java index d0869d61b7d..636a7d6757a 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -24,7 +24,7 @@ import org.apache.lucene.index.FilteredTermsEnum; // javadocs import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.SingleTermsEnum; // javadocs import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.BooleanQuery.Builder; @@ -166,7 +166,7 @@ public abstract class MultiTermQuery extends Query { } @Override - protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermContext states) { + protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermStates states) { final TermQuery tq = new TermQuery(term, states); topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD); } @@ -218,7 +218,7 @@ public abstract class MultiTermQuery extends Query { @Override protected void addClause(BlendedTermQuery.Builder topLevel, Term term, int docCount, - float boost, TermContext states) { + float boost, TermStates states) { topLevel.add(term, boost, states); } } @@ -262,7 +262,7 @@ public abstract class MultiTermQuery extends Query { } @Override - protected void addClause(BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermContext states) { + protected void addClause(BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermStates states) { final Query q = new ConstantScoreQuery(new TermQuery(term, states)); topLevel.add(new BoostQuery(q, boost), BooleanClause.Occur.SHOULD); } diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java index f82316d9255..3a46b96411c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java @@ -25,7 +25,7 @@ import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -148,9 +148,9 @@ final class MultiTermQueryConstantScoreWrapper extends // build a boolean query BooleanQuery.Builder bq = new BooleanQuery.Builder(); for (TermAndState t : collectedTerms) { - final TermContext termContext = new TermContext(searcher.getTopReaderContext()); - termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq); - bq.add(new 
TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD); + final TermStates termStates = new TermStates(searcher.getTopReaderContext()); + termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq); + bq.add(new TermQuery(new Term(query.field, t.term), termStates), Occur.SHOULD); } Query q = new ConstantScoreQuery(bq.build()); final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score()); diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index 3d359b4f5b3..ff1538820d6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -32,12 +32,11 @@ import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -352,9 +351,9 @@ public class PhraseQuery extends Query { private class PhraseWeight extends Weight { private final Similarity similarity; - private final Similarity.SimWeight stats; + private final Similarity.SimScorer stats; private final ScoreMode scoreMode; - private transient TermContext states[]; + private transient TermStates states[]; public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { @@ -366,21 +365,23 @@ public class PhraseQuery extends Query { throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first"); } this.scoreMode = scoreMode; - this.similarity = searcher.getSimilarity(scoreMode.needsScores()); + this.similarity = searcher.getSimilarity(); final IndexReaderContext context = searcher.getTopReaderContext(); - states = new TermContext[terms.length]; + states = new TermStates[terms.length]; TermStatistics termStats[] = new TermStatistics[terms.length]; int termUpTo = 0; for (int i = 0; i < terms.length; i++) { final Term term = terms[i]; - states[i] = TermContext.build(context, term); - TermStatistics termStatistics = searcher.termStatistics(term, states[i]); - if (termStatistics != null) { - termStats[termUpTo++] = termStatistics; + states[i] = TermStates.build(context, term, scoreMode.needsScores()); + if (scoreMode.needsScores()) { + TermStatistics termStatistics = searcher.termStatistics(term, states[i]); + if (termStatistics != null) { + termStats[termUpTo++] = termStatistics; + } } } if (termUpTo > 0) { - stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo)); + stats = similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo)); } else { stats = null; // no terms at all, we won't use similarity } @@ -415,7 +416,7 @@ public class PhraseQuery extends Query { for (int i = 0; i < terms.length; i++) { final Term t = terms[i]; - final TermState state = states[i].get(context.ord); + final TermState state = states[i].get(context); if (state == null) { /* term doesnt exist in this segment */ assert termNotInReader(reader, t): "no termstate found 
but term exists in reader"; return null; @@ -433,11 +434,11 @@ public class PhraseQuery extends Query { if (slop == 0) { // optimize exact case return new ExactPhraseScorer(this, postingsFreqs, - similarity.simScorer(stats, context), + new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE), scoreMode, totalMatchCost); } else { return new SloppyPhraseScorer(this, postingsFreqs, slop, - similarity.simScorer(stats, context), + new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE), scoreMode.needsScores(), totalMatchCost); } } @@ -459,7 +460,7 @@ public class PhraseQuery extends Query { int newDoc = scorer.iterator().advance(doc); if (newDoc == doc) { float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq(); - SimScorer docScorer = similarity.simScorer(stats, context); + LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE); Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq); Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); return Explanation.match( diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java index 6f54a866b1d..9d02b35e961 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java @@ -20,7 +20,7 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.MultiTermQuery.RewriteMethod; @@ -64,7 +64,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { @Override protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, - float boost, TermContext states) { + float boost, TermStates states) { final TermQuery tq = new TermQuery(term, states); topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD); } @@ -109,7 +109,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { if (size > 0) { final int sort[] = col.terms.sort(); final float[] boost = col.array.boost; - final TermContext[] termStates = col.array.termState; + final TermStates[] termStates = col.array.termState; for (int i = 0; i < size; i++) { final int pos = sort[i]; final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef())); @@ -146,7 +146,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { } else { // new entry: we populate the entry initially array.boost[e] = boostAtt.getBoost(); - array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); + array.termState[e] = new TermStates(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); ScoringRewrite.this.checkMaxClauseCount(terms.size()); } return true; @@ -156,7 +156,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { /** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */ static final class TermFreqBoostByteStart extends DirectBytesStartArray { float[] boost; - TermContext[] termState; + TermStates[] termState; public TermFreqBoostByteStart(int 
initSize) { super(initSize); @@ -166,7 +166,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { public int[] init() { final int[] ord = super.init(); boost = new float[ArrayUtil.oversize(ord.length, Float.BYTES)]; - termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + termState = new TermStates[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; assert termState.length >= ord.length && boost.length >= ord.length; return ord; } @@ -176,7 +176,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { final int[] ord = super.grow(); boost = ArrayUtil.grow(boost, ord.length); if (termState.length < ord.length) { - TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + TermStates[] tmpTermState = new TermStates[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(termState, 0, tmpTermState, 0, termState.length); termState = tmpTermState; } diff --git a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java index dc5490a5342..60b77c5c4ea 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/SloppyPhraseScorer.java @@ -26,7 +26,6 @@ import java.util.HashSet; import java.util.LinkedHashMap; import org.apache.lucene.index.Term; -import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.util.FixedBitSet; final class SloppyPhraseScorer extends Scorer { @@ -36,7 +35,7 @@ final class SloppyPhraseScorer extends Scorer { private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq(). 
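  // docScorer (below) bundles the similarity with this leaf's norms; it also precomputes the score upper bound that maxScore() returns.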
- private final Similarity.SimScorer docScorer; + private final LeafSimScorer docScorer; private final int slop; private final int numPostings; @@ -55,7 +54,7 @@ final class SloppyPhraseScorer extends Scorer { private final float matchCost; SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - int slop, Similarity.SimScorer docScorer, boolean needsScores, + int slop, LeafSimScorer docScorer, boolean needsScores, float matchCost) { super(weight); this.docScorer = docScorer; @@ -558,7 +557,7 @@ final class SloppyPhraseScorer extends Scorer { @Override public float maxScore() { - return docScorer.maxScore(Float.POSITIVE_INFINITY); + return docScorer.maxScore(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java index ce9d6e073b2..d9335cfe28c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java @@ -31,11 +31,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.search.similarities.Similarity.SimScorer; import org.apache.lucene.util.BytesRef; /** @@ -127,28 +126,28 @@ public final class SynonymQuery extends Query { } class SynonymWeight extends Weight { - private final TermContext termContexts[]; + private final TermStates termStates[]; private final Similarity similarity; - private final Similarity.SimWeight simWeight; + private final Similarity.SimScorer simWeight; SynonymWeight(Query query, IndexSearcher searcher, float boost) throws IOException { super(query); CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].field()); long docFreq = 0; long totalTermFreq = 0; - termContexts = new TermContext[terms.length]; - for (int i = 0; i < termContexts.length; i++) { - termContexts[i] = TermContext.build(searcher.getTopReaderContext(), terms[i]); - TermStatistics termStats = searcher.termStatistics(terms[i], termContexts[i]); + termStates = new TermStates[terms.length]; + for (int i = 0; i < termStates.length; i++) { + termStates[i] = TermStates.build(searcher.getTopReaderContext(), terms[i], true); + TermStatistics termStats = searcher.termStatistics(terms[i], termStates[i]); if (termStats != null) { docFreq = Math.max(termStats.docFreq(), docFreq); totalTermFreq += termStats.totalTermFreq(); } } - this.similarity = searcher.getSimilarity(true); + this.similarity = searcher.getSimilarity(); if (docFreq > 0) { TermStatistics pseudoStats = new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq); - this.simWeight = similarity.computeWeight(boost, collectionStats, pseudoStats); + this.simWeight = similarity.scorer(boost, collectionStats, pseudoStats); } else { this.simWeight = null; // no terms exist at all, we won't use similarity } @@ -175,7 +174,7 @@ public final class SynonymQuery extends Query { assert scorer instanceof TermScorer; freq = ((TermScorer)scorer).freq(); } - SimScorer docScorer = similarity.simScorer(simWeight, context); + LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), true, Float.MAX_VALUE); Explanation 
freqExplanation = Explanation.match(freq, "termFreq=" + freq); Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); return Explanation.match( @@ -190,7 +189,6 @@ public final class SynonymQuery extends Query { @Override public Scorer scorer(LeafReaderContext context) throws IOException { - Similarity.SimScorer simScorer = null; IndexOptions indexOptions = IndexOptions.NONE; if (terms.length > 0) { FieldInfo info = context.reader() @@ -202,21 +200,17 @@ public final class SynonymQuery extends Query { } // we use termscorers + disjunction as an impl detail ListsubScorers = new ArrayList<>(); - long maxFreq = 0; + long totalMaxFreq = 0; for (int i = 0; i < terms.length; i++) { - TermState state = termContexts[i].get(context.ord); + TermState state = termStates[i].get(context); if (state != null) { TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator(); termsEnum.seekExact(terms[i].bytes(), state); - - maxFreq += getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq()); - + long termMaxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq()); + totalMaxFreq += termMaxFreq; PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); - // lazy init sim, in case no terms exist - if (simScorer == null) { - simScorer = similarity.simScorer(simWeight, context); - } - subScorers.add(new TermScorer(this, postings, simScorer, Float.POSITIVE_INFINITY)); + LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, termMaxFreq); + subScorers.add(new TermScorer(this, postings, simScorer)); } } if (subScorers.isEmpty()) { @@ -225,7 +219,8 @@ public final class SynonymQuery extends Query { // we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs return subScorers.get(0); } else { - return new SynonymScorer(simScorer, this, subScorers, maxFreq); + LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, totalMaxFreq); + return new SynonymScorer(simScorer, this, subScorers); } } @@ -248,13 +243,11 @@ public final class SynonymQuery extends Query { } static class SynonymScorer extends DisjunctionScorer { - private final Similarity.SimScorer similarity; - private final float maxFreq; + private final LeafSimScorer similarity; - SynonymScorer(Similarity.SimScorer similarity, Weight weight, List subScorers, float maxFreq) { + SynonymScorer(LeafSimScorer similarity, Weight weight, List subScorers) { super(weight, subScorers, true); this.similarity = similarity; - this.maxFreq = maxFreq; } @Override @@ -264,7 +257,7 @@ public final class SynonymQuery extends Query { @Override public float maxScore() { - return similarity.maxScore(maxFreq); + return similarity.maxScore(); } /** combines TF of all subs. 
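 * In other words, the frequencies of all matching synonym postings for a document are summed into a single freq before it is passed to the shared LeafSimScorer.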
*/ diff --git a/lucene/core/src/java/org/apache/lucene/search/TermCollectingRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TermCollectingRewrite.java index fffa5a84fca..86bf34f02fc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermCollectingRewrite.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermCollectingRewrite.java @@ -23,7 +23,7 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.AttributeSource; @@ -43,7 +43,7 @@ abstract class TermCollectingRewrite extends MultiTermQuery.RewriteMethod { addClause(topLevel, term, docCount, boost, null); } - protected abstract void addClause(B topLevel, Term term, int docCount, float boost, TermContext states) throws IOException; + protected abstract void addClause(B topLevel, Term term, int docCount, float boost, TermStates states) throws IOException; final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java index 4049e1052c4..a8bf5b0679c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermInSetQuery.java @@ -33,7 +33,7 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PrefixCodedTerms; import org.apache.lucene.index.PrefixCodedTerms.TermIterator; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -268,9 +268,9 @@ public class TermInSetQuery extends Query implements Accountable { assert builder == null; BooleanQuery.Builder bq = new BooleanQuery.Builder(); for (TermAndState t : matchingTerms) { - final TermContext termContext = new TermContext(searcher.getTopReaderContext()); - termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq); - bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD); + final TermStates termStates = new TermStates(searcher.getTopReaderContext()); + termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq); + bq.add(new TermQuery(new Term(t.field, t.term), termStates), Occur.SHOULD); } Query q = new ConstantScoreQuery(bq.build()); final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score()); diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index 925fe93f3c5..d629acd89a8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -28,12 +28,10 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; -import org.apache.lucene.index.Terms; import 
org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.search.similarities.Similarity.SimScorer; /** * A Query that matches documents containing a term. This may be combined with @@ -42,23 +40,23 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer; public class TermQuery extends Query { private final Term term; - private final TermContext perReaderTermState; + private final TermStates perReaderTermState; final class TermWeight extends Weight { private final Similarity similarity; - private final Similarity.SimWeight stats; - private final TermContext termStates; + private final Similarity.SimScorer simScorer; + private final TermStates termStates; private final boolean needsScores; public TermWeight(IndexSearcher searcher, boolean needsScores, - float boost, TermContext termStates) throws IOException { + float boost, TermStates termStates) throws IOException { super(TermQuery.this); if (needsScores && termStates == null) { throw new IllegalStateException("termStates are required when scores are needed"); } this.needsScores = needsScores; this.termStates = termStates; - this.similarity = searcher.getSimilarity(needsScores); + this.similarity = searcher.getSimilarity(); final CollectionStatistics collectionStats; final TermStatistics termStats; @@ -72,9 +70,9 @@ public class TermQuery extends Query { } if (termStats == null) { - this.stats = null; // term doesn't exist in any segment, we won't use similarity at all + this.simScorer = null; // term doesn't exist in any segment, we won't use similarity at all } else { - this.stats = similarity.computeWeight(boost, collectionStats, termStats); + this.simScorer = similarity.scorer(boost, collectionStats, termStats); } } @@ -101,8 +99,8 @@ public class TermQuery extends Query { .getIndexOptions(); PostingsEnum docs = termsEnum.postings(null, needsScores ? 
PostingsEnum.FREQS : PostingsEnum.NONE); assert docs != null; - return new TermScorer(this, docs, similarity.simScorer(stats, context), - getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq())); + float maxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq()); + return new TermScorer(this, docs, new LeafSimScorer(simScorer, context.reader(), needsScores, maxFreq)); } private long getMaxFreq(IndexOptions indexOptions, long ttf, long df) { @@ -126,30 +124,17 @@ public class TermQuery extends Query { * the term does not exist in the given context */ private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException { - if (termStates != null) { - // TermQuery either used as a Query or the term states have been provided at construction time - assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); - final TermState state = termStates.get(context.ord); - if (state == null) { // term is not present in that reader - assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term; - return null; - } - final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(); - termsEnum.seekExact(term.bytes(), state); - return termsEnum; - } else { - // TermQuery used as a filter, so the term states have not been built up front - Terms terms = context.reader().terms(term.field()); - if (terms == null) { - return null; - } - final TermsEnum termsEnum = terms.iterator(); - if (termsEnum.seekExact(term.bytes())) { - return termsEnum; - } else { - return null; - } + assert termStates != null; + assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : + "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); + final TermState state = termStates.get(context); + if (state == null) { // term is not present in that reader + assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term; + return null; } + final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(); + termsEnum.seekExact(term.bytes(), state); + return termsEnum; } private boolean termNotInReader(LeafReader reader, Term term) throws IOException { @@ -166,7 +151,7 @@ public class TermQuery extends Query { int newDoc = scorer.iterator().advance(doc); if (newDoc == doc) { float freq = scorer.freq(); - SimScorer docScorer = similarity.simScorer(stats, context); + LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Integer.MAX_VALUE); Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document"); Explanation scoreExplanation = docScorer.explain(doc, freqExplanation); return Explanation.match( @@ -190,7 +175,7 @@ public class TermQuery extends Query { * Expert: constructs a TermQuery that will use the provided docFreq instead * of looking up the docFreq against the searcher. 
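 * <p>A sketch of that expert usage (variables are illustrative; the states must be built from the searcher's top-level context): <pre> TermStates states = TermStates.build(searcher.getTopReaderContext(), term, true); Query query = new TermQuery(term, states); </pre>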
*/ - public TermQuery(Term t, TermContext states) { + public TermQuery(Term t, TermStates states) { assert states != null; term = Objects.requireNonNull(t); perReaderTermState = Objects.requireNonNull(states); @@ -204,18 +189,10 @@ public class TermQuery extends Query { @Override public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { final IndexReaderContext context = searcher.getTopReaderContext(); - final TermContext termState; + final TermStates termState; if (perReaderTermState == null || perReaderTermState.wasBuiltFor(context) == false) { - if (scoreMode.needsScores()) { - // make TermQuery single-pass if we don't have a PRTS or if the context - // differs! - termState = TermContext.build(context, term); - } else { - // do not compute the term state, this will help save seeks in the terms - // dict on segments that have a cache entry for this query - termState = null; - } + termState = TermStates.build(context, term, scoreMode.needsScores()); } else { // PRTS was pre-build for this IS termState = this.perReaderTermState; diff --git a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java index a4aeb04eefb..653a60edc9f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java @@ -20,14 +20,12 @@ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.search.similarities.Similarity; /** Expert: A Scorer
for documents matching a Term
. */ final class TermScorer extends Scorer { private final PostingsEnum postingsEnum; - private final Similarity.SimScorer docScorer; - private final float maxFreq; + private final LeafSimScorer docScorer; /** * Construct a TermScorer
. @@ -39,14 +37,11 @@ final class TermScorer { * @param docScorer * The Similarity.SimScorer
implementation * to be used for score computations. - * @param maxFreq - * An upper bound of the term frequency of the searched term in any document. */ - TermScorer(Weight weight, PostingsEnum td, Similarity.SimScorer docScorer, float maxFreq) { + TermScorer(Weight weight, PostingsEnum td, LeafSimScorer docScorer) { super(weight); this.docScorer = docScorer; this.postingsEnum = td; - this.maxFreq = maxFreq; } @Override @@ -71,7 +66,7 @@ final class TermScorer extends Scorer { @Override public float maxScore() { - return docScorer.maxScore(maxFreq); + return docScorer.maxScore(); } /** Returns a string representation of this TermScorer
. */ diff --git a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java index b75836e16b7..dea4b0e4cbf 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -25,7 +25,7 @@ import java.util.PriorityQueue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ArrayUtil; @@ -82,7 +82,7 @@ public abstract class TopTermsRewrite extends TermCollectingRewrite { // lazy init the initial ScoreTerm because comparator is not known on ctor: if (st == null) - st = new ScoreTerm(new TermContext(topReaderContext)); + st = new ScoreTerm(new TermStates(topReaderContext)); boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); } @@ -139,7 +139,7 @@ public abstract class TopTermsRewrite extends TermCollectingRewrite { visitedTerms.remove(st.bytes.get()); st.termState.clear(); // reset the termstate! } else { - st = new ScoreTerm(new TermContext(topReaderContext)); + st = new ScoreTerm(new TermStates(topReaderContext)); } assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize"; // set maxBoostAtt with values to help FuzzyTermsEnum to optimize @@ -193,8 +193,8 @@ public abstract class TopTermsRewrite extends TermCollectingRewrite { static final class ScoreTerm implements Comparable{ public final BytesRefBuilder bytes = new BytesRefBuilder(); public float boost; - public final TermContext termState; - public ScoreTerm(TermContext termState) { + public final TermStates termState; + public ScoreTerm(TermStates termState) { this.termState = termState; } diff --git a/lucene/core/src/java/org/apache/lucene/search/package-info.java b/lucene/core/src/java/org/apache/lucene/search/package-info.java index 69c5c2a053e..7e53da46620 100644 --- a/lucene/core/src/java/org/apache/lucene/search/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/search/package-info.java @@ -378,7 +378,7 @@ * scored the way it was. * Typically a weight such as TermWeight * that scores via a {@link org.apache.lucene.search.similarities.Similarity Similarity} will make use of the Similarity's implementation: - * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(int, Explanation) SimScorer#explain(int doc, Explanation freq)}. + * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(Explanation, long) SimScorer#explain(Explanation freq, long norm)}. * * * @@ -402,7 +402,7 @@ * {@link org.apache.lucene.search.Scorer#score score()} — Return the score of the * current document. This value can be determined in any appropriate way for an application. For instance, the * {@link org.apache.lucene.search.TermScorer TermScorer} simply defers to the configured Similarity: - * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(int, float) SimScorer.score(int doc, float freq)}. + * {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(float, long) SimScorer.score(float freq, long norm)}. 
* * * {@link org.apache.lucene.search.Scorer#getChildren getChildren()} — Returns any child subscorers diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java index 1522e5dc3c5..527c2fdb480 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java @@ -112,18 +112,12 @@ public abstract class Axiomatic extends SimilarityBase { return Math.max(0, score); } - @Override - protected double maxScore(BasicStats stats, double maxFreq) { - // TODO: can we compute a better upper bound on the produced scores - return Double.POSITIVE_INFINITY; - } - @Override protected Explanation explain( - BasicStats stats, int doc, Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { List subs = new ArrayList<>(); double f = freq.getValue().doubleValue(); - explain(subs, stats, doc, f, docLen); + explain(subs, stats, f, docLen); double score = tf(stats, f, docLen) * ln(stats, f, docLen) @@ -132,7 +126,7 @@ public abstract class Axiomatic extends SimilarityBase { - gamma(stats, f, docLen); Explanation explanation = Explanation.match((float) score, - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:", + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:", subs); if (stats.boost != 1f) { explanation = Explanation.match((float) (score * stats.boost), "Boosted score, computed as (score * boost) from:", @@ -148,7 +142,7 @@ public abstract class Axiomatic extends SimilarityBase { } @Override - protected void explain(List subs, BasicStats stats, int doc, + protected void explain(List subs, BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0d) { subs.add(Explanation.match((float) stats.getBoost(), @@ -165,7 +159,7 @@ public abstract class Axiomatic extends SimilarityBase { subs.add(tflnExplain(stats, freq, docLen)); subs.add(idfExplain(stats, freq, docLen)); subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma")); - super.explain(subs, stats, doc, freq, docLen); + super.explain(subs, stats, freq, docLen); } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index dce156bc060..19ab0d29ca6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -17,13 +17,10 @@ package org.apache.lucene.search.similarities; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -176,7 +173,7 @@ public class BM25Similarity extends Similarity { } @Override - public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { Explanation idf = termStats.length == 1 ? 
idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); float avgdl = avgFieldLength(collectionStats); @@ -184,100 +181,17 @@ public class BM25Similarity extends Similarity { for (int i = 0; i < cache.length; i++) { cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl); } - return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache); - } - - @Override - public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { - BM25Stats bm25stats = (BM25Stats) stats; - return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field)); - } - - private class BM25DocScorer extends SimScorer { - private final BM25Stats stats; - private final float weightValue; // boost * idf * (k1 + 1) - private final NumericDocValues norms; - /** precomputed cache for all length values */ - private final float[] lengthCache; - /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */ - private final float[] cache; - - BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException { - this.stats = stats; - this.weightValue = stats.weight; - this.norms = norms; - lengthCache = LENGTH_TABLE; - cache = stats.cache; - } - - @Override - public float score(int doc, float freq) throws IOException { - // if there are no norms, we act as if b=0 - double norm; - if (norms == null) { - norm = k1; - } else { - boolean found = norms.advanceExact(doc); - assert found; - norm = cache[((byte) norms.longValue()) & 0xFF]; - } - return weightValue * (float) (freq / (freq + norm)); - } - - @Override - public float maxScore(float maxFreq) { - // TODO: leverage maxFreq and the min norm from the cache - return weightValue; - } - - @Override - public Explanation explain(int doc, Explanation freq) throws IOException { - List subs = new ArrayList<>(); - subs.addAll(stats.explain()); - Explanation tfExpl = explainTF(doc, freq); - subs.add(tfExpl); - return Explanation.match(stats.weight * tfExpl.getValue().floatValue(), - "score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs); - } - - private Explanation explainTF(int doc, Explanation freq) throws IOException { - List subs = new ArrayList<>(); - subs.add(freq); - subs.add(Explanation.match(k1, "k1, term saturation parameter")); - if (norms == null) { - subs.add(Explanation.match(0, "b, field omits length norms")); - return Explanation.match( - (float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) k1)), - "tf, computed as freq / (freq + k1) from:", subs); - } else { - boolean found = norms.advanceExact(doc); - assert found; - byte norm = (byte) norms.longValue(); - float doclen = lengthCache[norm & 0xff]; - subs.add(Explanation.match(b, "b, length normalization parameter")); - if ((norm & 0xFF) > 39) { - subs.add(Explanation.match(doclen, "dl, length of field (approximate)")); - } else { - subs.add(Explanation.match(doclen, "dl, length of field")); - } - subs.add(Explanation.match(stats.avgdl, "avgdl, average length of field")); - float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl); - return Explanation.match( - (float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)), - "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs); - } - } - + return new BM25Scorer(collectionStats.field(), boost, k1, b, idf, avgdl, cache); } /** Collection statistics for the BM25 model. 
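 * The norm-dependent factor k1 * (1 - b + b * dl / avgdl) is precomputed for each of the 256 possible encoded norms into the cache below, so score(freq, encodedNorm) reduces to weight * freq / (freq + cache[((byte) encodedNorm) & 0xFF]).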
*/ - private static class BM25Stats extends SimWeight { - /** field name, for pulling norms */ - private final String field; + private static class BM25Scorer extends SimScorer { /** query boost */ private final float boost; /** k1 value for scale factor */ private final float k1; + /** b value for length normalization impact */ + private final float b; /** BM25's idf */ private final Explanation idf; /** The average document length. */ @@ -287,17 +201,51 @@ public class BM25Similarity extends Similarity { /** weight (idf * boost) */ private final float weight; - BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) { - this.field = field; + BM25Scorer(String field, float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) { + super(field); this.boost = boost; this.idf = idf; this.avgdl = avgdl; this.k1 = k1; + this.b = b; this.cache = cache; this.weight = (k1 + 1) * boost * idf.getValue().floatValue(); } - private List explain() { + @Override + public float score(float freq, long encodedNorm) { + double norm = cache[((byte) encodedNorm) & 0xFF]; + return weight * (float) (freq / (freq + norm)); + } + + @Override + public Explanation explain(Explanation freq, long encodedNorm) { + List subs = new ArrayList<>(explainConstantFactors()); + Explanation tfExpl = explainTF(freq, encodedNorm); + subs.add(tfExpl); + return Explanation.match(weight * tfExpl.getValue().floatValue(), + "score(freq="+freq.getValue()+"), product of:", subs); + } + + private Explanation explainTF(Explanation freq, long norm) { + List subs = new ArrayList<>(); + subs.add(freq); + subs.add(Explanation.match(k1, "k1, term saturation parameter")); + float doclen = LENGTH_TABLE[((byte) norm) & 0xff]; + subs.add(Explanation.match(b, "b, length normalization parameter")); + if ((norm & 0xFF) > 39) { + subs.add(Explanation.match(doclen, "dl, length of field (approximate)")); + } else { + subs.add(Explanation.match(doclen, "dl, length of field")); + } + subs.add(Explanation.match(avgdl, "avgdl, average length of field")); + float normValue = k1 * ((1 - b) + b * doclen / avgdl); + return Explanation.match( + (float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)), + "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs); + } + + private List explainConstantFactors() { List subs = new ArrayList<>(); // scale factor subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1")); @@ -311,7 +259,6 @@ public class BM25Similarity extends Similarity { } } - @Override public String toString() { return "BM25(k1=" + k1 + ",b=" + b + ")"; diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java index cc3cab452fb..dc9356f1504 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BasicStats.java @@ -23,7 +23,7 @@ import org.apache.lucene.index.Terms; * Stores all statistics commonly used ranking methods. * @lucene.experimental */ -public class BasicStats extends Similarity.SimWeight { +public class BasicStats { final String field; /** The number of documents. 
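An aside on the BM25Scorer introduced above: because the encoded norm is now passed directly into score(), the entire length-normalization term can be precomputed into a 256-entry cache, one slot per possible norm byte, leaving a single array lookup per scored document. The standalone sketch below replays that arithmetic outside Lucene; idf, avgdl and lengthTable are invented stand-ins (the real LENGTH_TABLE is decoded from SmallFloat-encoded bytes, shown further down).

```java
/**
 * Standalone sketch of the BM25 arithmetic from the patch above.
 * lengthTable stands in for BM25Similarity's private LENGTH_TABLE;
 * the values of idf and avgdl are made up for illustration.
 */
public class Bm25Sketch {
  public static void main(String[] args) {
    float k1 = 1.2f, b = 0.75f, boost = 1f;
    float idf = 2.0f;    // assumed precomputed idf
    float avgdl = 28f;   // assumed average field length
    float[] lengthTable = new float[256];
    for (int i = 0; i < 256; i++) {
      lengthTable[i] = i; // placeholder decode: identity
    }
    // Built once per scorer() call, exactly as in the diff:
    float[] cache = new float[256];
    for (int i = 0; i < cache.length; i++) {
      cache[i] = k1 * ((1 - b) + b * lengthTable[i] / avgdl);
    }
    float weight = (k1 + 1) * boost * idf;
    // score(freq, encodedNorm): one lookup plus the saturation formula.
    long encodedNorm = 33;
    float freq = 3f;
    double norm = cache[((byte) encodedNorm) & 0xFF];
    System.out.println(weight * (float) (freq / (freq + norm)));
  }
}
```

In application code the two free parameters are simply chosen up front, e.g. `new BM25Similarity(1.2f, 0.75f)` installed on the searcher; nothing about that changes with this refactoring.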
*/ protected long numberOfDocuments; diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java index 713417233bc..3c9206d5b68 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java @@ -16,10 +16,7 @@ */ package org.apache.lucene.search.similarities; -import java.io.IOException; - import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -47,44 +44,31 @@ public class BooleanSimilarity extends Similarity { } @Override - public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - return new BooleanWeight(boost); + public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + return new BooleanWeight(collectionStats.field(), boost); } - private static class BooleanWeight extends SimWeight { + private static class BooleanWeight extends SimScorer { final float boost; - BooleanWeight(float boost) { + BooleanWeight(String field, float boost) { + super(field); this.boost = boost; } + + @Override + public float score(float freq, long norm) { + return boost; + } + + @Override + public Explanation explain(Explanation freq, long norm) { + Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost"); + return Explanation.match( + queryBoostExpl.getValue(), + "score(" + getClass().getSimpleName() + "), computed from:", + queryBoostExpl); + } } - @Override - public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException { - final float boost = ((BooleanWeight) weight).boost; - - return new SimScorer() { - - @Override - public float score(int doc, float freq) throws IOException { - return boost; - } - - @Override - public float maxScore(float maxFreq) { - return boost; - } - - @Override - public Explanation explain(int doc, Explanation freq) throws IOException { - Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost"); - return Explanation.match( - queryBoostExpl.getValue(), - "score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:", - queryBoostExpl); - } - - }; - } - } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index 66f22be0aea..f7f3d523c14 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -64,12 +64,6 @@ public class DFISimilarity extends SimilarityBase { return stats.getBoost() * log2(measure + 1); } - @Override - protected double maxScore(BasicStats stats, double maxFreq) { - // TODO: can we compute a better upper bound on the produced scores - return Double.POSITIVE_INFINITY; - } - /** * Returns the measure of independence */ @@ -79,12 +73,12 @@ public class DFISimilarity extends SimilarityBase { @Override protected Explanation explain( - BasicStats stats, int doc, Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { final double expected = (stats.getTotalTermFreq() + 1) * docLen / 
(stats.getNumberOfFieldTokens() + 1); if (freq.getValue().doubleValue() <= expected){ return Explanation.match((float) 0, "score(" + - getClass().getSimpleName() + ", doc=" + doc + ", freq=" + + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), equals to 0"); } Explanation explExpected = Explanation.match((float) expected, @@ -103,7 +97,7 @@ public class DFISimilarity extends SimilarityBase { return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * log2(measure + 1) from:", Explanation.match( (float)stats.getBoost(), "boost, query boost"), explMeasure); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index a41e35cbcf9..cbe6773361f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -113,15 +113,9 @@ public class DFRSimilarity extends SimilarityBase { return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn); } - @Override - protected double maxScore(BasicStats stats, double maxFreq) { - // TODO: can we compute a better upper bound on the produced scores - return Double.POSITIVE_INFINITY; - } - @Override protected void explain(List subs, - BasicStats stats, int doc, double freq, double docLen) { + BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0d) { subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost")); } @@ -136,13 +130,13 @@ public class DFRSimilarity extends SimilarityBase { @Override protected Explanation explain( - BasicStats stats, int doc, Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { List subs = new ArrayList<>(); - explain(subs, stats, doc, freq.getValue().doubleValue(), docLen); + explain(subs, stats, freq.getValue().doubleValue(), docLen); return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * " + "basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:", subs); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java index 9a576085f67..d08bdabf1d9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java @@ -104,15 +104,9 @@ public class IBSimilarity extends SimilarityBase { lambda.lambda(stats)); } - @Override - protected double maxScore(BasicStats stats, double maxFreq) { - // TODO: can we compute a better upper bound on the produced scores - return Double.POSITIVE_INFINITY; - } - @Override protected void explain( - List subs, BasicStats stats, int doc, double freq, double docLen) { + List subs, BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0d) { subs.add(Explanation.match((float)stats.getBoost(), "boost, query boost")); } @@ -125,13 +119,13 @@ public class IBSimilarity extends SimilarityBase { @Override protected Explanation explain( - BasicStats stats, int doc, 
Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { List subs = new ArrayList<>(); - explain(subs, stats, doc, freq.getValue().doubleValue(), docLen); + explain(subs, stats, freq.getValue().doubleValue(), docLen); return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * " + "distribution.score(stats, normalization.tfn(stats, freq," + " docLen), lambda.lambda(stats)) from:", diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java index c12cba451dc..a66871cfcb2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java @@ -78,13 +78,7 @@ public class LMDirichletSimilarity extends LMSimilarity { } @Override - protected double maxScore(BasicStats stats, double maxFreq) { - // TODO: can we compute a better upper bound on the produced scores - return Double.POSITIVE_INFINITY; - } - - @Override - protected void explain(List subs, BasicStats stats, int doc, + protected void explain(List subs, BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0d) { subs.add(Explanation.match((float) stats.getBoost(), "query boost")); @@ -107,18 +101,18 @@ public class LMDirichletSimilarity extends LMSimilarity { (float)Math.log(mu / (docLen + mu)), "document norm, computed as log(mu / (dl + mu))")); subs.add(Explanation.match((float) docLen,"dl, length of field")); - super.explain(subs, stats, doc, freq, docLen); + super.explain(subs, stats, freq, docLen); } @Override protected Explanation explain( - BasicStats stats, int doc, Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { List subs = new ArrayList<>(); - explain(subs, stats, doc, freq.getValue().doubleValue(), docLen); + explain(subs, stats, freq.getValue().doubleValue(), docLen); return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * " + "(term weight + document norm) from:", subs); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java index 42e5a7bc319..3f4f41abc3f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -68,13 +68,7 @@ public class LMJelinekMercerSimilarity extends LMSimilarity { } @Override - protected double maxScore(BasicStats stats, double maxFreq) { - // TODO: can we compute a better upper bound on the produced scores - return Double.POSITIVE_INFINITY; - } - - @Override - protected void explain(List subs, BasicStats stats, int doc, + protected void explain(List subs, BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0d) { subs.add(Explanation.match((float) stats.getBoost(), "boost")); @@ -88,18 +82,18 @@ public class LMJelinekMercerSimilarity extends LMSimilarity { "freq, number of 
occurrences of term in the document"); subs.add(explFreq); subs.add(Explanation.match((float) docLen,"dl, length of field")); - super.explain(subs, stats, doc, freq, docLen); + super.explain(subs, stats, freq, docLen); } @Override protected Explanation explain( - BasicStats stats, int doc, Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { List subs = new ArrayList<>(); - explain(subs, stats, doc, freq.getValue().doubleValue(), docLen); + explain(subs, stats, freq.getValue().doubleValue(), docLen); return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * " + "log(1 + ((1 - lambda) * freq / dl) /(lambda * P)) from:", subs); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java index 81548061e5c..73a1276501d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -70,7 +70,7 @@ public abstract class LMSimilarity extends SimilarityBase { } @Override - protected void explain(List subExpls, BasicStats stats, int doc, + protected void explain(List subExpls, BasicStats stats, double freq, double docLen) { subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats), "collection probability")); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java index 2f48cc69333..e558c6ec463 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/MultiSimilarity.java @@ -17,12 +17,10 @@ package org.apache.lucene.search.similarities; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -49,64 +47,39 @@ public class MultiSimilarity extends Similarity { } @Override - public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - SimWeight subStats[] = new SimWeight[sims.length]; - for (int i = 0; i < subStats.length; i++) { - subStats[i] = sims[i].computeWeight(boost, collectionStats, termStats); - } - return new MultiStats(subStats); - } - - @Override - public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { + public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... 
termStats) { SimScorer subScorers[] = new SimScorer[sims.length]; for (int i = 0; i < subScorers.length; i++) { - subScorers[i] = sims[i].simScorer(((MultiStats)stats).subStats[i], context); + subScorers[i] = sims[i].scorer(boost, collectionStats, termStats); } - return new MultiSimScorer(subScorers); + return new MultiSimScorer(collectionStats.field(), subScorers); } static class MultiSimScorer extends SimScorer { private final SimScorer subScorers[]; - MultiSimScorer(SimScorer subScorers[]) { + MultiSimScorer(String field, SimScorer subScorers[]) { + super(field); this.subScorers = subScorers; } @Override - public float score(int doc, float freq) throws IOException { + public float score(float freq, long norm) { float sum = 0.0f; for (SimScorer subScorer : subScorers) { - sum += subScorer.score(doc, freq); + sum += subScorer.score(freq, norm); } return sum; } @Override - public float maxScore(float freq) { - float sumMaxScore = 0; - for (SimScorer subScorer : subScorers) { - sumMaxScore += subScorer.maxScore(freq); - } - return sumMaxScore; - } - - @Override - public Explanation explain(int doc, Explanation freq) throws IOException { + public Explanation explain(Explanation freq, long norm) { List subs = new ArrayList<>(); for (SimScorer subScorer : subScorers) { - subs.add(subScorer.explain(doc, freq)); + subs.add(subScorer.explain(freq, norm)); } - return Explanation.match(score(doc, freq.getValue().floatValue()), "sum of:", subs); + return Explanation.match(score(freq.getValue().floatValue(), norm), "sum of:", subs); } } - - static class MultiStats extends SimWeight { - final SimWeight subStats[]; - - MultiStats(SimWeight subStats[]) { - this.subStats = subStats; - } - } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java b/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java index 6c05616485c..ee2381f6cda 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/PerFieldSimilarityWrapper.java @@ -17,9 +17,6 @@ package org.apache.lucene.search.similarities; -import java.io.IOException; - -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.TermStatistics; @@ -46,26 +43,13 @@ public abstract class PerFieldSimilarityWrapper extends Similarity { } @Override - public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - PerFieldSimWeight weight = new PerFieldSimWeight(); - weight.delegate = get(collectionStats.field()); - weight.delegateWeight = weight.delegate.computeWeight(boost, collectionStats, termStats); - return weight; - } - - @Override - public final SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException { - PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight; - return perFieldWeight.delegate.simScorer(perFieldWeight.delegateWeight, context); + public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + return get(collectionStats.field()).scorer(boost, collectionStats, termStats); } /** * Returns a {@link Similarity} for scoring a field. 
*/ public abstract Similarity get(String name); - - static class PerFieldSimWeight extends SimWeight { - Similarity delegate; - SimWeight delegateWeight; - } + } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java index 5f0bcd0bcb2..f296c02b523 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java @@ -17,18 +17,15 @@ package org.apache.lucene.search.similarities; -import java.io.IOException; import java.util.Collections; +import java.util.Objects; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermStatistics; -import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.util.SmallFloat; /** @@ -38,9 +35,9 @@ import org.apache.lucene.util.SmallFloat; * * This is a low-level API, you should only extend this API if you want to implement * an information retrieval model. If you are instead looking for a convenient way - * to alter Lucene's scoring, consider extending a higher-level implementation - * such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or - * just tweaking the default implementation: {@link BM25Similarity}. + * to alter Lucene's scoring, consider just tweaking the default implementation: + * {@link BM25Similarity} or extending {@link SimilarityBase}, which makes it easy to compute + * a score from index statistics. *<br>
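With the SimWeight indirection gone, PerFieldSimilarityWrapper (just above) reduces to one delegating scorer() call, so wiring different models per field stays a one-method exercise. A hypothetical setup using two of the implementations touched by this patch; the field names are invented:

```java
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.BooleanSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;

/** Hypothetical per-field choice: BM25 for body text, constant scores for tags. */
public class MySimilarities extends PerFieldSimilarityWrapper {
  private final Similarity body = new BM25Similarity();
  private final Similarity tags = new BooleanSimilarity();

  @Override
  public Similarity get(String field) {
    return "tags".equals(field) ? tags : body;
  }
}
```

The same instance would typically be passed to both IndexWriterConfig.setSimilarity and IndexSearcher.setSimilarity so that norms written at index time match the model used at search time.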
* Similarity determines how Lucene weights terms, and Lucene interacts with * this class at both index-time and @@ -49,23 +46,22 @@ import org.apache.lucene.util.SmallFloat; * Indexing Time * At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing * the Similarity implementation to set a per-document value for the field that will - * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption - * about what is in this norm, but it is most useful for encoding length normalization - * information. + * be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}. + * Lucene makes no assumption about what is in this norm, but it is most useful for + * encoding length normalization information. *
* Implementations should carefully consider how the normalization is encoded: while - * Lucene's {@link BM25Similarity} encodes a combination of index-time boost - * and length normalization information with {@link SmallFloat} into a single byte, this - * might not be suitable for all purposes. + * Lucene's {@link BM25Similarity} encodes length normalization information with + * {@link SmallFloat} into a single byte, this might not be suitable for all purposes. *
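For reference, the single-byte encoding mentioned here is exposed by SmallFloat. A minimal round-trip, which also shows why the BM25 explanation above labels lengths whose norm byte exceeds 39 as approximate:

```java
import org.apache.lucene.util.SmallFloat;

/** Round-trips field lengths through the lossy one-byte norm encoding. */
public class NormEncodingDemo {
  public static void main(String[] args) {
    for (int length : new int[] {1, 7, 39, 40, 1000, 250_000}) {
      byte encoded = SmallFloat.intToByte4(length);
      int decoded = SmallFloat.byte4ToInt(encoded);
      // Small lengths survive exactly; larger ones are approximated.
      System.out.println(length + " -> " + encoded + " -> " + decoded);
    }
  }
}
```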
* Many formulas require the use of average document length, which can be computed via a * combination of {@link CollectionStatistics#sumTotalTermFreq()} and - * {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()}, - * depending upon whether the average should reflect field sparsity. + * {@link CollectionStatistics#docCount()}. *
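A sketch of that computation, mirroring the avgFieldLength helpers in BM25Similarity and SimilarityBase (their guards for degenerate statistics are omitted here):

```java
import org.apache.lucene.search.CollectionStatistics;

/** Average number of tokens per document that has the field. */
final class AvgFieldLength {
  static float avgFieldLength(CollectionStatistics stats) {
    // sumTotalTermFreq: total term occurrences in the field across the index;
    // docCount: number of documents that actually contain the field.
    return (float) (stats.sumTotalTermFreq() / (double) stats.docCount());
  }
}
```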
- * Additional scoring factors can be stored in named - *
NumericDocValuesField
s and accessed - * at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}. + * Additional scoring factors can be stored in named {@link NumericDocValuesField}s and + * accessed at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}. + * However this should not be done in the {@link Similarity} but externally, for instance + * by using FunctionScoreQuery. ** Finally, using index-time boosts (either via folding into the normalization byte or * via DocValues), is an inefficient way to boost the scores of different fields if the @@ -76,14 +72,13 @@ import org.apache.lucene.util.SmallFloat; * Query time * At query-time, Queries interact with the Similarity via these steps: *
- *
*- The {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} method is called a single time, + *
- The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a single time, * allowing the implementation to compute any statistics (such as IDF, average document length, etc) * across the entire collection. The {@link TermStatistics} and {@link CollectionStatistics} passed in * already contain all of the raw statistics involved, so a Similarity can freely use any combination * of statistics without causing any additional I/O. Lucene makes no assumption about what is - * stored in the returned {@link Similarity.SimWeight} object. - *
- For each segment in the index, the Query creates a {@link #simScorer(SimWeight, org.apache.lucene.index.LeafReaderContext)} - * The score() method is called for each matching document. + * stored in the returned {@link Similarity.SimScorer} object. + *
- Then {@link SimScorer#score(float, long)} is called for every matching document to compute its score. *
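Taken together, the two steps look like this from calling code. The statistics below are invented for illustration; in real searches the IndexSearcher supplies them:

```java
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BytesRef;

/** Walks the query-time lifecycle described above with made-up statistics. */
public class ScorerLifecycle {
  public static void main(String[] args) {
    // field "body": 100 docs, 2800 tokens total, 2000 doc/term pairings
    CollectionStatistics collection =
        new CollectionStatistics("body", 100, 100, 2800, 2000);
    TermStatistics term = new TermStatistics(new BytesRef("lucene"), 10, 30);
    // Step 1: one scorer per query, holding idf and the norm cache.
    SimScorer scorer = new BM25Similarity().scorer(1f, collection, term);
    // Step 2: each match is scored from its frequency and encoded norm.
    System.out.println(scorer.score(3f, 33L));
  }
}
```

Note there is no per-segment step any more: the same SimScorer is reused across every segment, which is what allows the LeafReaderContext parameter and the simScorer() method to go away.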
* Explanations @@ -110,7 +105,17 @@ public abstract class Similarity { *
Matches in longer fields are less precise, so implementations of this * method usually set smaller values when
state.getLength()
is large, * and larger values whenstate.getLength()
is small. - * + * + *Note that for a given term-document frequency, greater unsigned norms + * must produce scores that are lower or equal, ie. for two encoded norms + * {@code n1} and {@code n2} so that + * {@code Long.compareUnsigned(n1, n2) > 0} then + * {@code SimScorer.score(freq, n1) <= SimScorer.score(freq, n2)} + * for any legal {@code freq}. + * + *
{@code 0} is not a legal norm, so {@code 1} is the norm that produces + * the highest scores. + * * @lucene.experimental * * @param state current processing state for this field @@ -126,71 +131,68 @@ public abstract class Similarity { * @param termStats term-level statistics, such as the document frequency of a term across the collection. - * @return SimWeight object with the information this Similarity needs to score a query. + * @return SimScorer object with the information this Similarity needs to score a query. */ - public abstract SimWeight computeWeight(float boost, + public abstract SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats); - - /** - * Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index. - * @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} - * @param context segment of the inverted index to be scored. - * @return SloppySimScorer for scoring documents across <code>
context
- * @throws IOException if there is a low-level I/O error - */ - public abstract SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException; - - /** - * API for scoring "sloppy" queries such as {@link TermQuery}, - * {@link SpanQuery}, and {@link PhraseQuery}. - */ - public static abstract class SimScorer { - - /** - * Sole constructor. (For invocation by subclass - * constructors, typically implicit.) - */ - public SimScorer() {} - - /** - * Score a single document - * @param doc document id within the inverted index segment - * @param freq sloppy term frequency - * @return document's score - */ - public abstract float score(int doc, float freq) throws IOException; - - /** - * Return the maximum score that this scorer may produce for freqs in {@code ]0, maxFreq]}. - * {@code Float.POSITIVE_INFINITY} is a fine return value if scores are not bounded. - * @param maxFreq the maximum frequency - */ - public abstract float maxScore(float maxFreq); - - /** - * Explain the score for a single document - * @param doc document id within the inverted index segment - * @param freq Explanation of how the sloppy term frequency was computed - * @return document's score - */ - public Explanation explain(int doc, Explanation freq) throws IOException { - return Explanation.match( - score(doc, freq.getValue().floatValue()), - "score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:", - Collections.singleton(freq)); - } - } /** Stores the weight for a query across the indexed collection. This abstract * implementation is empty; descendants of {@code Similarity} should * subclass {@code SimWeight} and define the statistics they require in the * subclass. Examples include idf, average field length, etc. */ - public static abstract class SimWeight { - + public static abstract class SimScorer { + + private final String field; + /** * Sole constructor. (For invocation by subclass - * constructors, typically implicit.) + * constructors.) */ - public SimWeight() {} + public SimScorer(String field) { + this.field = Objects.requireNonNull(field); + } + + /** Return the field that this {@link SimScorer} operates on. */ + public final String getField() { + return field; + } + + /** + * Score a single document. {@code freq} is the document-term sloppy + * frequency and must be finite and positive. {@code norm} is the + * encoded normalization factor as computed by + * {@link Similarity#computeNorm(FieldInvertState)} at index time, or + * {@code 1} if norms are disabled. {@code norm} is never {@code 0}. + *+ * Score must not decrease when {@code freq} increases, ie. if + * {@code freq1 > freq2}, then {@code score(freq1, norm) >= + * score(freq2, norm)} for any value of {@code norm} that may be produced + * by {@link Similarity#computeNorm(FieldInvertState)}. + *
+ * Score must not increase when the unsigned {@code norm} increases, ie. if + * {@code Long.compareUnsigned(norm1, norm2) > 0} then + * {@code score(freq, norm1) <= score(freq, norm2)} for any legal + * {@code freq}. + *
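These two monotonicity contracts are what later allows the engine to reason about maximum scores without the per-similarity maxScore() methods this patch deletes. A hypothetical property check, assuming single-byte norms so that the unsigned order is simply 1..255:

```java
import org.apache.lucene.search.similarities.Similarity.SimScorer;

/** Sanity-checks the norm contract for a scorer with one-byte norms. */
final class SimScorerContracts {
  static void checkNormMonotonicity(SimScorer scorer, float freq) {
    float previous = Float.POSITIVE_INFINITY;
    for (int norm = 1; norm <= 255; norm++) {
      float score = scorer.score(freq, (long) norm);
      if (score > previous) {
        throw new AssertionError("score increased at norm=" + norm);
      }
      previous = score;
    }
  }
}
```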
+ * As a consequence, the maximum score that this scorer can produce is bound + * by {@code score(Float.MAX_VALUE, 1)}. + * @param freq sloppy term frequency, must be finite and positive + * @param norm encoded normalization factor or {@code 1} if norms are disabled + * @return document's score + */ + public abstract float score(float freq, long norm); + + /** + * Explain the score for a single document + * @param freq Explanation of how the sloppy term frequency was computed + * @param norm encoded normalization factor, as returned by {@link Similarity#computeNorm}, or {@code 1} if norms are disabled + * @return document's score + */ + public Explanation explain(Explanation freq, long norm) { + return Explanation.match( + score(freq.getValue().floatValue(), norm), + "score(freq=" + freq.getValue() +"), with freq of:", + Collections.singleton(freq)); + } } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index f227f38fd36..1aefaed7c8f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -17,13 +17,10 @@ package org.apache.lucene.search.similarities; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -33,7 +30,7 @@ import org.apache.lucene.util.SmallFloat; * A subclass of {@code Similarity} that provides a simplified API for its * descendants. Subclasses are only required to implement the {@link #score} * and {@link #toString()} methods. Implementing - * {@link #explain(List, BasicStats, int, double, double)} is optional, + * {@link #explain(List, BasicStats, double, double)} is optional, * inasmuch as SimilarityBase already provides a basic explanation of the score * and the term frequency. However, implementers of a subclass are encouraged to * include as much detail about the scoring method as possible. @@ -82,13 +79,18 @@ public abstract class SimilarityBase extends Similarity { } @Override - public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { - BasicStats stats[] = new BasicStats[termStats.length]; + public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + SimScorer weights[] = new SimScorer[termStats.length]; for (int i = 0; i < termStats.length; i++) { - stats[i] = newStats(collectionStats.field(), boost); - fillBasicStats(stats[i], collectionStats, termStats[i]); + BasicStats stats = newStats(collectionStats.field(), boost); + fillBasicStats(stats, collectionStats, termStats[i]); + weights[i] = new BasicSimScorer(stats); + } + if (weights.length == 1) { + return weights[0]; + } else { + return new MultiSimilarity.MultiSimScorer(collectionStats.field(), weights); } - return stats.length == 1 ? 
stats[0] : new MultiSimilarity.MultiStats(stats); } /** Factory method to return a custom stats object */ @@ -121,13 +123,6 @@ public abstract class SimilarityBase extends Similarity { */ protected abstract double score(BasicStats stats, double freq, double docLen); - /** - * Return the maximum value that may be returned by {@link #score(BasicStats, double, double)} - * for the given stats. - * @see org.apache.lucene.search.similarities.Similarity.SimScorer#maxScore(float) - */ - protected abstract double maxScore(BasicStats stats, double maxFreq); - /** * Subclasses should implement this method to explain the score. {@code expl} * already contains the score, the name of the class and the doc id, as well @@ -137,12 +132,11 @@ public abstract class SimilarityBase extends Similarity { * * @param subExpls the list of details of the explanation to extend * @param stats the corpus level statistics. - * @param doc the document id. * @param freq the term frequency. * @param docLen the document length. */ protected void explain( - List
subExpls, BasicStats stats, int doc, double freq, double docLen) {} + List subExpls, BasicStats stats, double freq, double docLen) {} /** * Explains the score. The implementation here provides a basic explanation @@ -151,43 +145,24 @@ public abstract class SimilarityBase extends Similarity { * attaches the score (computed via the {@link #score(BasicStats, double, double)} * method) and the explanation for the term frequency. Subclasses content with * this format may add additional details in - * {@link #explain(List, BasicStats, int, double, double)}. + * {@link #explain(List, BasicStats, double, double)}. * * @param stats the corpus level statistics. - * @param doc the document id. * @param freq the term frequency and its explanation. * @param docLen the document length. * @return the explanation. */ protected Explanation explain( - BasicStats stats, int doc, Explanation freq, double docLen) { + BasicStats stats, Explanation freq, double docLen) { List subs = new ArrayList<>(); - explain(subs, stats, doc, freq.getValue().floatValue(), docLen); + explain(subs, stats, freq.getValue().floatValue(), docLen); return Explanation.match( (float) score(stats, freq.getValue().floatValue(), docLen), - "score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:", + "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:", subs); } - @Override - public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { - if (stats instanceof MultiSimilarity.MultiStats) { - // a multi term query (e.g. phrase). return the summation, - // scoring almost as if it were boolean query - SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats; - SimScorer subScorers[] = new SimScorer[subStats.length]; - for (int i = 0; i < subScorers.length; i++) { - BasicStats basicstats = (BasicStats) subStats[i]; - subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field)); - } - return new MultiSimilarity.MultiSimScorer(subScorers); - } else { - BasicStats basicstats = (BasicStats) stats; - return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field)); - } - } - /** * Subclasses must override this method to return the name of the Similarity * and preferably the values of parameters (if any) as well. @@ -227,43 +202,32 @@ public abstract class SimilarityBase extends Similarity { // --------------------------------- Classes --------------------------------- - /** Delegates the {@link #score(int, float)} and - * {@link #explain(int, Explanation)} methods to + /** Delegates the {@link #score(float, long)} and + * {@link #explain(Explanation, long)} methods to * {@link SimilarityBase#score(BasicStats, double, double)} and - * {@link SimilarityBase#explain(BasicStats, int, Explanation, double)}, + * {@link SimilarityBase#explain(BasicStats, Explanation, double)}, * respectively. 
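With maxScore() gone and the doc parameter dropped everywhere, a SimilarityBase subclass is down to two required methods. A hypothetical example (not part of this patch); the formula is chosen to respect both contracts, growing with freq and shrinking with docLen:

```java
import org.apache.lucene.search.similarities.BasicStats;
import org.apache.lucene.search.similarities.SimilarityBase;

/** Hypothetical model: boosted term frequency damped by field length. */
public class FreqOverLengthSimilarity extends SimilarityBase {
  @Override
  protected double score(BasicStats stats, double freq, double docLen) {
    return stats.getBoost() * freq / (freq + docLen);
  }

  @Override
  public String toString() {
    return "FreqOverLength";
  }
}
```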
*/ final class BasicSimScorer extends SimScorer { - private final BasicStats stats; - private final NumericDocValues norms; + final BasicStats stats; - BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException { + BasicSimScorer(BasicStats stats) { + super(stats.field); this.stats = stats; - this.norms = norms; } - double getLengthValue(int doc) throws IOException { - if (norms == null) { - return 1D; - } - boolean found = norms.advanceExact(doc); - assert found; - return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())]; + double getLengthValue(long norm) { + return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)]; } @Override - public float score(int doc, float freq) throws IOException { - return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc)); + public float score(float freq, long norm) { + return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm)); } @Override - public float maxScore(float maxFreq) { - return (float) SimilarityBase.this.maxScore(stats, maxFreq); - } - - @Override - public Explanation explain(int doc, Explanation freq) throws IOException { - return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc)); + public Explanation explain(Explanation freq, long norm) { + return SimilarityBase.this.explain(stats, freq, getLengthValue(norm)); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index 0452371abdc..d3224abb3d9 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -17,13 +17,10 @@ package org.apache.lucene.search.similarities; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; @@ -511,7 +508,7 @@ public abstract class TFIDFSimilarity extends Similarity { } @Override - public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { + public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { final Explanation idf = termStats.length == 1 ? 
idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); @@ -522,110 +519,59 @@ public abstract class TFIDFSimilarity extends Similarity { normTable[i] = norm; } normTable[0] = 1f / normTable[255]; - return new IDFStats(collectionStats.field(), boost, idf, normTable); + return new TFIDFScorer(collectionStats.field(), boost, idf, normTable); } - @Override - public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException { - IDFStats idfstats = (IDFStats) stats; - // the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented - final float[] normTable = idfstats.normTable; - return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable); - } - - private final class TFIDFSimScorer extends SimScorer { - private final IDFStats stats; - private final float weightValue; - private final NumericDocValues norms; - private final float[] normTable; - - TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException { - this.stats = stats; - this.weightValue = stats.queryWeight; - this.norms = norms; - this.normTable = normTable; - } - - @Override - public float score(int doc, float freq) throws IOException { - final float raw = tf(freq) * weightValue; // compute tf(f)*weight - - if (norms == null) { - return raw; - } else { - boolean found = norms.advanceExact(doc); - assert found; - float normValue = normTable[(int) (norms.longValue() & 0xFF)]; - return raw * normValue; // normalize for field - } - } - - @Override - public float maxScore(float maxFreq) { - final float raw = tf(maxFreq) * weightValue; - if (norms == null) { - return raw; - } else { - float maxNormValue = Float.NEGATIVE_INFINITY; - for (float norm : normTable) { - maxNormValue = Math.max(maxNormValue, norm); - } - return raw * maxNormValue; - } - } - - @Override - public Explanation explain(int doc, Explanation freq) throws IOException { - return explainScore(doc, freq, stats, norms, normTable); - } - } /** Collection statistics for the TF-IDF model. The only statistic of interest * to this model is idf. */ - static class IDFStats extends SimWeight { - private final String field; + class TFIDFScorer extends SimScorer { /** The idf and its explanation */ private final Explanation idf; private final float boost; private final float queryWeight; final float[] normTable; - public IDFStats(String field, float boost, Explanation idf, float[] normTable) { + public TFIDFScorer(String field, float boost, Explanation idf, float[] normTable) { + super(field); // TODO: Validate? 
- this.field = field; this.idf = idf; this.boost = boost; this.queryWeight = boost * idf.getValue().floatValue(); this.normTable = normTable; } + + @Override + public float score(float freq, long norm) { + final float raw = tf(freq) * queryWeight; // compute tf(f)*weight + float normValue = normTable[(int) (norm & 0xFF)]; + return raw * normValue; // normalize for field + } + + @Override + public Explanation explain(Explanation freq, long norm) { + return explainScore(freq, norm, normTable); + } + + private Explanation explainScore(Explanation freq, long encodedNorm, float[] normTable) { + List subs = new ArrayList (); + if (boost != 1F) { + subs.add(Explanation.match(boost, "boost")); + } + subs.add(idf); + Explanation tf = Explanation.match(tf(freq.getValue().floatValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq); + subs.add(tf); + + float norm = normTable[(int) (encodedNorm & 0xFF)]; + + Explanation fieldNorm = Explanation.match(norm, "fieldNorm"); + subs.add(fieldNorm); + + return Explanation.match( + queryWeight * tf.getValue().floatValue() * norm, + "score(freq="+freq.getValue()+"), product of:", + subs); + } } - private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException { - List subs = new ArrayList (); - if (stats.boost != 1F) { - subs.add(Explanation.match(stats.boost, "boost")); - } - subs.add(stats.idf); - Explanation tf = Explanation.match(tf(freq.getValue().floatValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq); - subs.add(tf); - - float norm; - if (norms == null) { - norm = 1f; - } else { - boolean found = norms.advanceExact(doc); - assert found; - norm = normTable[(int) (norms.longValue() & 0xFF)]; - } - - Explanation fieldNorm = Explanation.match( - norm, - "fieldNorm(doc=" + doc + ")"); - subs.add(fieldNorm); - - return Explanation.match( - stats.queryWeight * tf.getValue().floatValue() * norm, - "score(doc="+doc+",freq="+freq.getValue()+"), product of:", - subs); - } } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainQuery.java index 8bb263338ca..23c1e2b8292 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainQuery.java @@ -20,7 +20,7 @@ package org.apache.lucene.search.spans; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -61,7 +61,7 @@ abstract class SpanContainQuery extends SpanQuery implements Cloneable { final SpanWeight bigWeight; final SpanWeight littleWeight; - public SpanContainWeight(IndexSearcher searcher, Map terms, + public SpanContainWeight(IndexSearcher searcher, Map terms, SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException { super(SpanContainQuery.this, searcher, terms, boost); this.bigWeight = bigWeight; @@ -93,9 +93,9 @@ abstract class SpanContainQuery extends SpanQuery implements Cloneable { } @Override - public void extractTermContexts(Map contexts) { - bigWeight.extractTermContexts(contexts); - littleWeight.extractTermContexts(contexts); + public void extractTermStates(Map contexts) { + bigWeight.extractTermStates(contexts); + 
littleWeight.extractTermStates(contexts); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainingQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainingQuery.java index 0d62f749fb0..63662994bf1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainingQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanContainingQuery.java @@ -23,7 +23,7 @@ import java.util.Map; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreMode; @@ -45,15 +45,15 @@ public final class SpanContainingQuery extends SpanContainQuery { @Override public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - SpanWeight bigWeight = big.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost); - SpanWeight littleWeight = little.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost); - return new SpanContainingWeight(searcher, scoreMode.needsScores() ? getTermContexts(bigWeight, littleWeight) : null, + SpanWeight bigWeight = big.createWeight(searcher, scoreMode, boost); + SpanWeight littleWeight = little.createWeight(searcher, scoreMode, boost); + return new SpanContainingWeight(searcher, scoreMode.needsScores() ? getTermStates(bigWeight, littleWeight) : null, bigWeight, littleWeight, boost); } public class SpanContainingWeight extends SpanContainWeight { - public SpanContainingWeight(IndexSearcher searcher, Map terms, + public SpanContainingWeight(IndexSearcher searcher, Map terms, SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException { super(searcher, terms, bigWeight, littleWeight, boost); } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java index ee3f5deda3d..088e73092de 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java @@ -24,7 +24,7 @@ import java.util.Objects; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MultiTermQuery; @@ -163,7 +163,7 @@ public class SpanMultiTermQueryWrapper extends SpanQue } @Override - protected void addClause(ListtopLevel, Term term, int docCount, float boost, TermContext states) { + protected void addClause(List topLevel, Term term, int docCount, float boost, TermStates states) { final SpanTermQuery q = new SpanTermQuery(term, states); topLevel.add(q); } @@ -211,7 +211,7 @@ public class SpanMultiTermQueryWrapper extends SpanQue } @Override - protected void addClause(ListtopLevel, Term term, int docFreq, float boost, TermContext states) { + protected void addClause(List topLevel, Term term, int docFreq, float boost, TermStates states) { final SpanTermQuery q = new SpanTermQuery(term, states); topLevel.add(q); } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java index 24a047fce51..17b9e515130 100644 --- 
a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNearQuery.java @@ -29,7 +29,7 @@ import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.index.Terms; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -181,24 +181,24 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { List subWeights = new ArrayList<>(); for (SpanQuery q : clauses) { - subWeights.add(q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost)); + subWeights.add(q.createWeight(searcher, scoreMode, boost)); } - return new SpanNearWeight(subWeights, searcher, scoreMode.needsScores() ? getTermContexts(subWeights) : null, boost); + return new SpanNearWeight(subWeights, searcher, scoreMode.needsScores() ? getTermStates(subWeights) : null, boost); } public class SpanNearWeight extends SpanWeight { final List subWeights; - public SpanNearWeight(List subWeights, IndexSearcher searcher, Map terms, float boost) throws IOException { + public SpanNearWeight(List subWeights, IndexSearcher searcher, Map terms, float boost) throws IOException { super(SpanNearQuery.this, searcher, terms, boost); this.subWeights = subWeights; } @Override - public void extractTermContexts(Map contexts) { + public void extractTermStates(Map contexts) { for (SpanWeight w : subWeights) { - w.extractTermContexts(contexts); + w.extractTermStates(contexts); } } @@ -318,7 +318,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable { } @Override - public void extractTermContexts(Map contexts) { + public void extractTermStates(Map contexts) { } diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java index 5b97f8da178..6c56df3abee 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java @@ -25,7 +25,7 @@ import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermStates; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; @@ -99,9 +99,9 @@ public final class SpanNotQuery extends SpanQuery { @Override public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - SpanWeight includeWeight = include.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost); + SpanWeight includeWeight = include.createWeight(searcher, scoreMode, boost); SpanWeight excludeWeight = exclude.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost); - return new SpanNotWeight(searcher, scoreMode.needsScores() ? getTermContexts(includeWeight, excludeWeight) : null, + return new SpanNotWeight(searcher, scoreMode.needsScores() ? 
        getTermStates(includeWeight) : null, includeWeight, excludeWeight, boost);
  }

@@ -110,7 +110,7 @@ public final class SpanNotQuery extends SpanQuery {
     final SpanWeight includeWeight;
     final SpanWeight excludeWeight;
 
-    public SpanNotWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
+    public SpanNotWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
                          SpanWeight includeWeight, SpanWeight excludeWeight, float boost) throws IOException {
       super(SpanNotQuery.this, searcher, terms, boost);
       this.includeWeight = includeWeight;
@@ -118,8 +118,8 @@ public final class SpanNotQuery extends SpanQuery {
     }
 
     @Override
-    public void extractTermContexts(Map<Term, TermContext> contexts) {
-      includeWeight.extractTermContexts(contexts);
+    public void extractTermStates(Map<Term, TermStates> contexts) {
+      includeWeight.extractTermStates(contexts);
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
index 2e15c92f29e..849edaa30e6 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanOrQuery.java
@@ -27,7 +27,7 @@ import java.util.Set;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.DisiPriorityQueue;
 import org.apache.lucene.search.DisiWrapper;
 import org.apache.lucene.search.DisjunctionDISIApproximation;
@@ -119,16 +119,16 @@ public final class SpanOrQuery extends SpanQuery {
   public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
     List<SpanWeight> subWeights = new ArrayList<>(clauses.size());
     for (SpanQuery q : clauses) {
-      subWeights.add(q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost));
+      subWeights.add(q.createWeight(searcher, scoreMode, boost));
     }
-    return new SpanOrWeight(searcher, scoreMode.needsScores() ? getTermContexts(subWeights) : null, subWeights, boost);
+    return new SpanOrWeight(searcher, scoreMode.needsScores() ? getTermStates(subWeights) : null, subWeights, boost);
   }
 
   public class SpanOrWeight extends SpanWeight {
 
     final List<SpanWeight> subWeights;
 
-    public SpanOrWeight(IndexSearcher searcher, Map<Term, TermContext> terms, List<SpanWeight> subWeights, float boost) throws IOException {
+    public SpanOrWeight(IndexSearcher searcher, Map<Term, TermStates> terms, List<SpanWeight> subWeights, float boost) throws IOException {
       super(SpanOrQuery.this, searcher, terms, boost);
       this.subWeights = subWeights;
     }
@@ -150,9 +150,9 @@ public final class SpanOrQuery extends SpanQuery {
     }
 
     @Override
-    public void extractTermContexts(Map<Term, TermContext> contexts) {
+    public void extractTermStates(Map<Term, TermStates> contexts) {
       for (SpanWeight w : subWeights) {
-        w.extractTermContexts(contexts);
+        w.extractTermStates(contexts);
       }
     }
 
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionCheckQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionCheckQuery.java
index f9b76972026..099b627e1ee 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionCheckQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanPositionCheckQuery.java
@@ -25,7 +25,7 @@ import java.util.Set;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreMode;
@@ -69,15 +69,15 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
 
   @Override
   public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    SpanWeight matchWeight = match.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
-    return new SpanPositionCheckWeight(matchWeight, searcher, scoreMode.needsScores() ? getTermContexts(matchWeight) : null, boost);
+    SpanWeight matchWeight = match.createWeight(searcher, scoreMode, boost);
+    return new SpanPositionCheckWeight(matchWeight, searcher, scoreMode.needsScores() ? getTermStates(matchWeight) : null, boost);
   }
 
   public class SpanPositionCheckWeight extends SpanWeight {
 
     final SpanWeight matchWeight;
 
-    public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
+    public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
       super(SpanPositionCheckQuery.this, searcher, terms, boost);
       this.matchWeight = matchWeight;
     }
@@ -93,8 +93,8 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
     }
 
     @Override
-    public void extractTermContexts(Map<Term, TermContext> contexts) {
-      matchWeight.extractTermContexts(contexts);
+    public void extractTermStates(Map<Term, TermStates> contexts) {
+      matchWeight.extractTermStates(contexts);
     }
 
     @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanQuery.java
index 607a3755513..ca657b6cff1 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanQuery.java
@@ -23,7 +23,7 @@ import java.util.Map;
 import java.util.TreeMap;
 
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreMode;
@@ -40,25 +40,25 @@ public abstract class SpanQuery extends Query {
   public abstract SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException;
 
   /**
-   * Build a map of terms to termcontexts, for use in constructing SpanWeights
+   * Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
    * @lucene.internal
    */
-  public static Map<Term, TermContext> getTermContexts(SpanWeight... weights) {
-    Map<Term, TermContext> terms = new TreeMap<>();
+  public static Map<Term, TermStates> getTermStates(SpanWeight... weights) {
+    Map<Term, TermStates> terms = new TreeMap<>();
     for (SpanWeight w : weights) {
-      w.extractTermContexts(terms);
+      w.extractTermStates(terms);
     }
     return terms;
   }
 
   /**
-   * Build a map of terms to termcontexts, for use in constructing SpanWeights
+   * Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
    * @lucene.internal
    */
-  public static Map<Term, TermContext> getTermContexts(Collection<SpanWeight> weights) {
-    Map<Term, TermContext> terms = new TreeMap<>();
+  public static Map<Term, TermStates> getTermStates(Collection<SpanWeight> weights) {
+    Map<Term, TermStates> terms = new TreeMap<>();
     for (SpanWeight w : weights) {
-      w.extractTermContexts(terms);
+      w.extractTermStates(terms);
     }
     return terms;
   }
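
With getTermContexts() renamed to getTermStates(), composite span queries now collect a Map<Term, TermStates> from their sub-weights. A minimal sketch of the new calling convention — the searcher and the two clause queries here are illustrative names, not part of this patch:

    SpanWeight w1 = new SpanTermQuery(new Term("body", "quick"))
        .createWeight(searcher, ScoreMode.COMPLETE, 1f);
    SpanWeight w2 = new SpanTermQuery(new Term("body", "fox"))
        .createWeight(searcher, ScoreMode.COMPLETE, 1f);
    // each sub-weight contributes its terms via extractTermStates(...)
    Map<Term, TermStates> states = SpanQuery.getTermStates(w1, w2);
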
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java
index 57a68e493a8..044ac7a5960 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanScorer.java
@@ -21,9 +21,9 @@ import java.io.IOException;
 import java.util.Objects;
 
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.TwoPhaseIterator;
-import org.apache.lucene.search.similarities.Similarity;
 
 /**
  * A basic {@link Scorer} over {@link Spans}.
@@ -32,7 +32,7 @@ import org.apache.lucene.search.similarities.Similarity;
 public class SpanScorer extends Scorer {
 
   protected final Spans spans;
-  protected final Similarity.SimScorer docScorer;
+  protected final LeafSimScorer docScorer;
 
   /** accumulated sloppy freq (computed in setFreqCurrentDoc) */
   private float freq;
@@ -41,7 +41,7 @@ public class SpanScorer extends Scorer {
   private int lastScoredDoc = -1; // last doc we called setFreqCurrentDoc() for
 
   /** Sole constructor. */
-  public SpanScorer(SpanWeight weight, Spans spans, Similarity.SimScorer docScorer) {
+  public SpanScorer(SpanWeight weight, Spans spans, LeafSimScorer docScorer) {
     super(weight);
     this.spans = Objects.requireNonNull(spans);
     this.docScorer = docScorer;
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
index 9eea3aac177..9ac7afb81ee 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanTermQuery.java
@@ -28,7 +28,7 @@ import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
@@ -41,21 +41,21 @@ import org.apache.lucene.search.ScoreMode;
 public class SpanTermQuery extends SpanQuery {
 
   protected final Term term;
-  protected final TermContext termContext;
+  protected final TermStates termStates;
 
   /** Construct a SpanTermQuery matching the named term's spans. */
   public SpanTermQuery(Term term) {
     this.term = Objects.requireNonNull(term);
-    this.termContext = null;
+    this.termStates = null;
   }
 
   /**
    * Expert: Construct a SpanTermQuery matching the named term's spans, using
-   * the provided TermContext
+   * the provided TermStates
    */
-  public SpanTermQuery(Term term, TermContext context) {
+  public SpanTermQuery(Term term, TermStates termStates) {
     this.term = Objects.requireNonNull(term);
-    this.termContext = context;
+    this.termStates = termStates;
   }
 
   /** Return the term whose spans are matched. */
@@ -66,25 +66,25 @@ public class SpanTermQuery extends SpanQuery {
 
   @Override
   public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    final TermContext context;
+    final TermStates context;
     final IndexReaderContext topContext = searcher.getTopReaderContext();
-    if (termContext == null || termContext.wasBuiltFor(topContext) == false) {
-      context = TermContext.build(topContext, term);
+    if (termStates == null || termStates.wasBuiltFor(topContext) == false) {
+      context = TermStates.build(topContext, term, scoreMode.needsScores());
     }
     else {
-      context = termContext;
+      context = termStates;
     }
     return new SpanTermWeight(context, searcher, scoreMode.needsScores() ? Collections.singletonMap(term, context) : null, boost);
   }
 
   public class SpanTermWeight extends SpanWeight {
 
-    final TermContext termContext;
+    final TermStates termStates;
 
-    public SpanTermWeight(TermContext termContext, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
+    public SpanTermWeight(TermStates termStates, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
       super(SpanTermQuery.this, searcher, terms, boost);
-      this.termContext = termContext;
-      assert termContext != null : "TermContext must not be null";
+      this.termStates = termStates;
+      assert termStates != null : "TermStates must not be null";
     }
 
     @Override
@@ -98,16 +98,16 @@ public class SpanTermQuery extends SpanQuery {
     }
 
     @Override
-    public void extractTermContexts(Map<Term, TermContext> contexts) {
-      contexts.put(term, termContext);
+    public void extractTermStates(Map<Term, TermStates> contexts) {
+      contexts.put(term, termStates);
     }
 
     @Override
     public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
 
-      assert termContext.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
+      assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
 
-      final TermState state = termContext.get(context.ord);
+      final TermState state = termStates.get(context);
       if (state == null) { // term is not present in that reader
         assert context.reader().docFreq(term) == 0 : "no termstate found but term exists in reader term=" + term;
         return null;
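
TermStates.build() now takes a needsScores flag, so term statistics are only gathered when the query will actually be scored, and the per-leaf lookup takes the leaf context itself instead of its ord. A minimal before/after sketch (reader, leafContext and the term are illustrative names only):

    Term term = new Term("field", "value");
    // before: TermContext ctx = TermContext.build(reader.getContext(), term);
    TermStates states = TermStates.build(reader.getContext(), term, true /* needsScores */);
    // before: TermState state = ctx.get(leafContext.ord);
    TermState state = states.get(leafContext);
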
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
index 0dad614bdda..25b58fdc39a 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWeight.java
@@ -24,14 +24,14 @@ import java.util.Map;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.Similarity.SimScorer;
 
 /**
  * Expert-only.  Public for use by other weight implementations
@@ -72,48 +72,48 @@ public abstract class SpanWeight extends Weight {
   }
 
   protected final Similarity similarity;
-  protected final Similarity.SimWeight simWeight;
+  protected final Similarity.SimScorer simScorer;
   protected final String field;
 
   /**
    * Create a new SpanWeight
    * @param query the parent query
    * @param searcher the IndexSearcher to query against
-   * @param termContexts a map of terms to termcontexts for use in building the similarity. May
+   * @param termStates a map of terms to {@link TermStates} for use in building the similarity. May
    *                     be null if scores are not required
    * @throws IOException on error
    */
-  public SpanWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermContext> termContexts, float boost) throws IOException {
+  public SpanWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
     super(query);
     this.field = query.getField();
-    this.similarity = searcher.getSimilarity(termContexts != null);
-    this.simWeight = buildSimWeight(query, searcher, termContexts, boost);
+    this.similarity = searcher.getSimilarity();
+    this.simScorer = buildSimWeight(query, searcher, termStates, boost);
   }
 
-  private Similarity.SimWeight buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermContext> termContexts, float boost) throws IOException {
-    if (termContexts == null || termContexts.size() == 0 || query.getField() == null)
+  private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
+    if (termStates == null || termStates.size() == 0 || query.getField() == null)
       return null;
-    TermStatistics[] termStats = new TermStatistics[termContexts.size()];
+    TermStatistics[] termStats = new TermStatistics[termStates.size()];
     int termUpTo = 0;
-    for (Term term : termContexts.keySet()) {
-      TermStatistics termStatistics = searcher.termStatistics(term, termContexts.get(term));
+    for (Term term : termStates.keySet()) {
+      TermStatistics termStatistics = searcher.termStatistics(term, termStates.get(term));
       if (termStatistics != null) {
         termStats[termUpTo++] = termStatistics;
       }
     }
     CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
     if (termUpTo > 0) {
-      return similarity.computeWeight(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
+      return similarity.scorer(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
     } else {
       return null; // no terms at all exist, we won't use similarity
     }
   }
 
   /**
-   * Collect all TermContexts used by this Weight
-   * @param contexts a map to add the TermContexts to
+   * Collect all TermStates used by this Weight
+   * @param contexts a map to add the TermStates to
    */
-  public abstract void extractTermContexts(Map<Term, TermContext> contexts);
+  public abstract void extractTermStates(Map<Term, TermStates> contexts);
 
   /**
    * Expert: Return a Spans object iterating over matches from this Weight
@@ -129,18 +129,18 @@ public abstract class SpanWeight extends Weight {
     if (spans == null) {
       return null;
     }
-    final Similarity.SimScorer docScorer = getSimScorer(context);
+    final LeafSimScorer docScorer = getSimScorer(context);
     return new SpanScorer(this, spans, docScorer);
   }
 
   /**
-   * Return a SimScorer for this context
+   * Return a LeafSimScorer for this context
    * @param context the LeafReaderContext
    * @return a SimWeight
    * @throws IOException on error
    */
-  public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException {
-    return simWeight == null ? null : similarity.simScorer(simWeight, context);
+  public LeafSimScorer getSimScorer(LeafReaderContext context) throws IOException {
+    return simScorer == null ? null : new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
   }
 
   @Override
@@ -150,7 +150,7 @@ public abstract class SpanWeight extends Weight {
       int newDoc = scorer.iterator().advance(doc);
       if (newDoc == doc) {
         float freq = scorer.sloppyFreq();
-        SimScorer docScorer = similarity.simScorer(simWeight, context);
+        LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
         Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
         Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
         return Explanation.match(scoreExplanation.getValue(),
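
The SimWeight/SimScorer split is gone: Similarity.scorer(boost, collectionStats, termStats) returns the leaf-independent SimScorer up front, and per-segment scoring wraps it in a LeafSimScorer that resolves norms from the leaf reader, as the SpanWeight changes above show. A condensed sketch of the pattern (variable names are illustrative):

    Similarity.SimScorer simScorer = similarity.scorer(boost, collectionStats, termStats);
    // per leaf: 'true' means norms are needed; Float.MAX_VALUE leaves max-score pruning off
    LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
    float score = docScorer.score(doc, freq); // reads the norm for doc, then calls simScorer.score(freq, norm)
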
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWithinQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWithinQuery.java
index 9c618dd2e4c..fba85fe6e86 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanWithinQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanWithinQuery.java
@@ -23,7 +23,7 @@ import java.util.Map;
 
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreMode;
 
@@ -46,15 +46,15 @@ public final class SpanWithinQuery extends SpanContainQuery {
 
   @Override
   public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
-    SpanWeight bigWeight = big.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
-    SpanWeight littleWeight = little.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
-    return new SpanWithinWeight(searcher, scoreMode.needsScores() ? getTermContexts(bigWeight, littleWeight) : null,
+    SpanWeight bigWeight = big.createWeight(searcher, scoreMode, boost);
+    SpanWeight littleWeight = little.createWeight(searcher, scoreMode, boost);
+    return new SpanWithinWeight(searcher, scoreMode.needsScores() ? getTermStates(bigWeight, littleWeight) : null,
         bigWeight, littleWeight, boost);
   }
 
   public class SpanWithinWeight extends SpanContainWeight {
 
-    public SpanWithinWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
+    public SpanWithinWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
                             SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
       super(searcher, terms, bigWeight, littleWeight, boost);
     }
diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java
index f1e1aed6557..625bb0e7010 100644
--- a/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java
+++ b/lucene/core/src/java/org/apache/lucene/search/spans/TermSpans.java
@@ -22,7 +22,7 @@ import java.util.Objects;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.search.LeafSimScorer;
 
 /**
  * Expert:
@@ -39,7 +39,7 @@ public class TermSpans extends Spans {
   protected boolean readPayload;
   private final float positionsCost;
 
-  public TermSpans(Similarity.SimScorer scorer,
+  public TermSpans(LeafSimScorer scorer,
                    PostingsEnum postings, Term term, float positionsCost) {
     this.postings = Objects.requireNonNull(postings);
     this.term = Objects.requireNonNull(term);
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java
index a8111921ff6..7737de18276 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestCustomNorms.java
@@ -103,12 +103,7 @@ public class TestCustomNorms extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
       throw new UnsupportedOperationException();
     }
   }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCustomTermFreq.java b/lucene/core/src/test/org/apache/lucene/index/TestCustomTermFreq.java
index d2eff257648..8bb81d2409c 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestCustomTermFreq.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestCustomTermFreq.java
@@ -17,8 +17,6 @@
 package org.apache.lucene.index;
 
-import java.io.IOException;
-
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -432,12 +430,7 @@ public class TestCustomTermFreq extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
       throw new UnsupportedOperationException();
     }
   }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInvertState.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInvertState.java
index f78b7fa92c5..08635fc79f9 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestFieldInvertState.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInvertState.java
@@ -17,7 +17,6 @@
 package org.apache.lucene.index;
 
-import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -54,12 +53,7 @@ public class TestFieldInvertState extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
       throw new UnsupportedOperationException();
     }
   }
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
index 0a8799d3a29..6b43c162ec7 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
@@ -1947,13 +1947,8 @@ public class TestIndexSorting extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return in.computeWeight(boost, collectionStats, termStats);
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return in.simScorer(weight, context);
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return in.scorer(boost, collectionStats, termStats);
     }
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java b/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
index f391c5a2af2..216dc211d3e 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
@@ -17,7 +17,6 @@
 package org.apache.lucene.index;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -109,24 +108,14 @@ public class TestMaxTermFrequency extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new SimWeight() {};
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-
-      return new SimScorer() {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimScorer(collectionStats.field()) {
         @Override
-        public float score(int doc, float freq) throws IOException {
+        public float score(float freq, long norm) {
           return 0;
         }
-        @Override
-        public float maxScore(float maxFreq) {
-          return 0;
-        }
       };
     }
 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
index 70c7a3237e6..805c7e58474 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
@@ -116,12 +116,7 @@ public class TestNorms extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }
  }
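
These test similarities all follow the same mechanical rewrite: the computeWeight()/simScorer() pair collapses into a single scorer() call, and the returned SimScorer works on (freq, norm) pairs instead of per-document ids, with maxScore() gone. A standalone sketch of the new shape — the class name is made up for illustration:

    public class ToySimilarity extends Similarity {
      @Override
      public long computeNorm(FieldInvertState state) {
        return 1; // this toy ignores length normalization
      }

      @Override
      public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
        return new SimScorer(collectionStats.field()) {
          @Override
          public float score(float freq, long norm) {
            return boost * freq; // was score(int doc, float freq); the norm now arrives as an argument
          }
        };
      }
    }
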
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermStates.java b/lucene/core/src/test/org/apache/lucene/index/TestTermStates.java
new file mode 100644
index 00000000000..a89fe7bb04a
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTermStates.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestTermStates extends LuceneTestCase {
+
+  public void testToStringOnNullTermState() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    w.addDocument(new Document());
+    IndexReader r = w.getReader();
+    TermStates states = TermStates.build(r.getContext(), new Term("foo", "bar"), random().nextBoolean());
+    assertEquals("TermStates\n state=null\n", states.toString());
+    IOUtils.close(r, w, dir);
+  }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestUniqueTermCount.java b/lucene/core/src/test/org/apache/lucene/index/TestUniqueTermCount.java
index a0fca4c62b4..2de02346a27 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestUniqueTermCount.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestUniqueTermCount.java
@@ -17,7 +17,6 @@
 package org.apache.lucene.index;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 
@@ -106,12 +105,7 @@ public class TestUniqueTermCount extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
       throw new UnsupportedOperationException();
     }
   }
diff --git a/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java b/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java
index 0523e2c04aa..c85732ec047 100644
--- a/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java
+++ b/lucene/core/src/test/org/apache/lucene/search/JustCompileSearch.java
@@ -194,12 +194,7 @@ final class JustCompileSearch {
   static final class JustCompileSimilarity extends Similarity {
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      throw new UnsupportedOperationException(UNSUPPORTED_MSG);
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight stats, LeafReaderContext context) {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
       throw new UnsupportedOperationException(UNSUPPORTED_MSG);
     }
 
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
index 95562819aa0..a9e2891140a 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBoolean2.java
@@ -148,7 +148,7 @@ public class TestBoolean2 extends LuceneTestCase {
     }
     singleSegmentReader = DirectoryReader.open(singleSegmentDirectory);
     singleSegmentSearcher = newSearcher(singleSegmentReader);
-    singleSegmentSearcher.setSimilarity(searcher.getSimilarity(true));
+    singleSegmentSearcher.setSimilarity(searcher.getSimilarity());
 
     // Make big index
     dir2 = copyOf(directory);
@@ -379,7 +379,7 @@ public class TestBoolean2 extends LuceneTestCase {
       QueryUtils.check(random(), q1,searcher); // baseline sim
       try {
         // a little hackish, QueryUtils.check is too costly to do on bigSearcher in this loop.
-        searcher.setSimilarity(bigSearcher.getSimilarity(true)); // random sim
+        searcher.setSimilarity(bigSearcher.getSimilarity()); // random sim
         QueryUtils.check(random(), q1, searcher);
       } finally {
         searcher.setSimilarity(new ClassicSimilarity()); // restore
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java
index 19f45f81bb0..de061a2f8d3 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanQueryVisitSubscorers.java
@@ -38,7 +38,6 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.Scorer.ChildScorer;
-import org.apache.lucene.search.similarities.BasicStats;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
@@ -329,21 +328,12 @@ public class TestBooleanQueryVisitSubscorers extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new BasicStats("", boost);
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return new SimScorer() {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimScorer(collectionStats.field()) {
         @Override
-        public float score(int doc, float freq) throws IOException {
+        public float score(float freq, long norm) {
           return freq;
         }
-        @Override
-        public float maxScore(float maxFreq) {
-          return maxFreq;
-        }
       };
     }
   }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java b/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java
index d21f373e045..292dfa9dabf 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestBooleanRewrites.java
@@ -357,7 +357,7 @@ public class TestBooleanRewrites extends LuceneTestCase {
         return original;
       }
     };
-    searcher2.setSimilarity(searcher1.getSimilarity(true));
+    searcher2.setSimilarity(searcher1.getSimilarity());
 
     final int iters = atLeast(1000);
     for (int i = 0; i < iters; ++i) {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java b/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java
index a4e959619e5..4cfa4d3f5a5 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestConjunctions.java
@@ -34,7 +34,6 @@ import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.Similarity;
@@ -100,23 +99,13 @@ public class TestConjunctions extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost,
+    public SimScorer scorer(float boost,
        CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new SimWeight() {};
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return new SimScorer() {
+      return new SimScorer(collectionStats.field()) {
        @Override
-        public float score(int doc, float freq) {
+        public float score(float freq, long norm) {
          return freq;
        }
-
-        @Override
-        public float maxScore(float maxFreq) {
-          return maxFreq;
-        }
      };
    }
  }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesScoring.java b/lucene/core/src/test/org/apache/lucene/search/TestDocValuesScoring.java
deleted file mode 100644
index 88564314fab..00000000000
--- a/lucene/core/src/test/org/apache/lucene/search/TestDocValuesScoring.java
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search;
-
-
-import java.io.IOException;
-
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FloatDocValuesField;
-import org.apache.lucene.index.DocValues;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.NumericDocValues;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
-import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-
-/**
- * Tests the use of indexdocvalues in scoring.
- *
- * In the example, a docvalues field is used as a per-document boost (separate from the norm)
- * @lucene.experimental
- */
-public class TestDocValuesScoring extends LuceneTestCase {
-  private static final float SCORE_EPSILON = 0.001f; /* for comparing floats */
-
-  public void testSimple() throws Exception {
-    Directory dir = newDirectory();
-    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
-    Document doc = new Document();
-    Field field = newTextField("foo", "", Field.Store.NO);
-    doc.add(field);
-    Field dvField = new FloatDocValuesField("foo_boost", 0.0F);
-    doc.add(dvField);
-    Field field2 = newTextField("bar", "", Field.Store.NO);
-    doc.add(field2);
-
-    field.setStringValue("quick brown fox");
-    field2.setStringValue("quick brown fox");
-    dvField.setFloatValue(2f); // boost x2
-    iw.addDocument(doc);
-    field.setStringValue("jumps over lazy brown dog");
-    field2.setStringValue("jumps over lazy brown dog");
-    dvField.setFloatValue(4f); // boost x4
-    iw.addDocument(doc);
-    IndexReader ir = iw.getReader();
-    iw.close();
-
-    // no boosting
-    IndexSearcher searcher1 = newSearcher(ir, false);
-    final Similarity base = searcher1.getSimilarity(true);
-    // boosting
-    IndexSearcher searcher2 = newSearcher(ir, false);
-    searcher2.setSimilarity(new PerFieldSimilarityWrapper() {
-      final Similarity fooSim = new BoostingSimilarity(base, "foo_boost");
-
-      @Override
-      public Similarity get(String field) {
-        return "foo".equals(field) ? fooSim : base;
-      }
-    });
-
-    // in this case, we searched on field "foo". first document should have 2x the score.
-    TermQuery tq = new TermQuery(new Term("foo", "quick"));
-    QueryUtils.check(random(), tq, searcher1);
-    QueryUtils.check(random(), tq, searcher2);
-
-    TopDocs noboost = searcher1.search(tq, 10);
-    TopDocs boost = searcher2.search(tq, 10);
-    assertEquals(1, noboost.totalHits);
-    assertEquals(1, boost.totalHits);
-
-    //System.out.println(searcher2.explain(tq, boost.scoreDocs[0].doc));
-    assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*2f, SCORE_EPSILON);
-
-    // this query matches only the second document, which should have 4x the score.
-    tq = new TermQuery(new Term("foo", "jumps"));
-    QueryUtils.check(random(), tq, searcher1);
-    QueryUtils.check(random(), tq, searcher2);
-
-    noboost = searcher1.search(tq, 10);
-    boost = searcher2.search(tq, 10);
-    assertEquals(1, noboost.totalHits);
-    assertEquals(1, boost.totalHits);
-
-    assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score*4f, SCORE_EPSILON);
-
-    // search on on field bar just for kicks, nothing should happen, since we setup
-    // our sim provider to only use foo_boost for field foo.
-    tq = new TermQuery(new Term("bar", "quick"));
-    QueryUtils.check(random(), tq, searcher1);
-    QueryUtils.check(random(), tq, searcher2);
-
-    noboost = searcher1.search(tq, 10);
-    boost = searcher2.search(tq, 10);
-    assertEquals(1, noboost.totalHits);
-    assertEquals(1, boost.totalHits);
-
-    assertEquals(boost.scoreDocs[0].score, noboost.scoreDocs[0].score, SCORE_EPSILON);
-
-    ir.close();
-    dir.close();
-  }
-
-  /**
-   * Similarity that wraps another similarity and boosts the final score
-   * according to whats in a docvalues field.
-   *
-   * @lucene.experimental
-   */
-  static class BoostingSimilarity extends Similarity {
-    private final Similarity sim;
-    private final String boostField;
-
-    public BoostingSimilarity(Similarity sim, String boostField) {
-      this.sim = sim;
-      this.boostField = boostField;
-    }
-
-    @Override
-    public long computeNorm(FieldInvertState state) {
-      return sim.computeNorm(state);
-    }
-
-    @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return sim.computeWeight(boost, collectionStats, termStats);
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
-      final SimScorer sub = sim.simScorer(stats, context);
-      final NumericDocValues values = DocValues.getNumeric(context.reader(), boostField);
-
-      return new SimScorer() {
-
-        private float getValueForDoc(int doc) throws IOException {
-          int curDocID = values.docID();
-          if (doc < curDocID) {
-            throw new IllegalArgumentException("doc=" + doc + " is before curDocID=" + curDocID);
-          }
-          if (doc > curDocID) {
-            curDocID = values.advance(doc);
-          }
-          if (curDocID == doc) {
-            return Float.intBitsToFloat((int)values.longValue());
-          } else {
-            return 0f;
-          }
-        }
-
-        @Override
-        public float score(int doc, float freq) throws IOException {
-          return getValueForDoc(doc) * sub.score(doc, freq);
-        }
-
-        @Override
-        public float maxScore(float maxFreq) {
-          return Float.POSITIVE_INFINITY;
-        }
-
-        @Override
-        public Explanation explain(int doc, Explanation freq) throws IOException {
-          Explanation boostExplanation = Explanation.match(getValueForDoc(doc), "indexDocValue(" + boostField + ")");
-          Explanation simExplanation = sub.explain(doc, freq);
-          return Explanation.match(
-              boostExplanation.getValue().doubleValue() * simExplanation.getValue().doubleValue(),
-              "product of:", boostExplanation, simExplanation);
-        }
-      };
-    }
-  }
-}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java b/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java
index 924a1af0e87..30b03ac9f55 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestMinShouldMatch2.java
@@ -34,10 +34,9 @@ import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
-import org.apache.lucene.search.similarities.Similarity.SimWeight;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -310,7 +309,7 @@ public class TestMinShouldMatch2 extends LuceneTestCase {
     final int maxDoc;
 
     final Set<Long> ords = new HashSet<>();
-    final SimScorer[] sims;
+    final LeafSimScorer[] sims;
     final int minNrShouldMatch;
 
     double score = Float.NaN;
@@ -321,7 +320,7 @@ public class TestMinShouldMatch2 extends LuceneTestCase {
       this.maxDoc = reader.maxDoc();
       BooleanQuery bq = (BooleanQuery) weight.getQuery();
       this.minNrShouldMatch = bq.getMinimumNumberShouldMatch();
-      this.sims = new SimScorer[(int)dv.getValueCount()];
+      this.sims = new LeafSimScorer[(int)dv.getValueCount()];
       for (BooleanClause clause : bq.clauses()) {
         assert !clause.isProhibited();
         assert !clause.isRequired();
@@ -330,11 +329,11 @@ public class TestMinShouldMatch2 extends LuceneTestCase {
         if (ord >= 0) {
           boolean success = ords.add(ord);
           assert success; // no dups
-          TermContext context = TermContext.build(reader.getContext(), term);
-          SimWeight w = weight.similarity.computeWeight(1f,
+          TermStates context = TermStates.build(reader.getContext(), term, true);
+          SimScorer w = weight.similarity.scorer(1f,
                         searcher.collectionStatistics("field"),
                         searcher.termStatistics(term, context));
-          sims[(int)ord] = weight.similarity.simScorer(w, reader.getContext());
+          sims[(int)ord] = new LeafSimScorer(w, reader, true, 1);
         }
       }
     }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
index a6970f974ad..f360bedd31a 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java
@@ -17,15 +17,12 @@
 package org.apache.lucene.search;
 
-import java.io.IOException;
-
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiDocValues;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.RandomIndexWriter;
@@ -113,21 +110,11 @@ public class TestSimilarityProvider extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new SimWeight() {};
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return new SimScorer() {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimScorer(collectionStats.field()) {
         @Override
-        public float score(int doc, float freq) throws IOException {
-          return 1;
-        }
-
-        @Override
-        public float maxScore(float maxFreq) {
+        public float score(float freq, long norm) {
           return 1;
         }
       };
@@ -143,21 +130,10 @@ public class TestSimilarityProvider extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new SimWeight() {};
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return new SimScorer() {
-
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimScorer(collectionStats.field()) {
         @Override
-        public float score(int doc, float freq) throws IOException {
-          return 10;
-        }
-
-        @Override
-        public float maxScore(float maxFreq) {
+        public float score(float freq, long norm) {
           return 10;
         }
       };
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSubScorerFreqs.java b/lucene/core/src/test/org/apache/lucene/search/TestSubScorerFreqs.java
index 7278a3b2516..f45e304c2fc 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestSubScorerFreqs.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSubScorerFreqs.java
@@ -34,7 +34,6 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.Scorer.ChildScorer;
-import org.apache.lucene.search.similarities.BasicStats;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
@@ -231,22 +230,12 @@ public class TestSubScorerFreqs extends LuceneTestCase {
     }
 
     @Override
-    public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
-      return new BasicStats("", boost);
-    }
-
-    @Override
-    public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
-      return new SimScorer() {
+    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+      return new SimScorer(collectionStats.field()) {
         @Override
-        public float score(int doc, float freq) throws IOException {
+        public float score(float freq, long norm) {
           return freq;
         }
-
-        @Override
-        public float maxScore(float maxFreq) {
-          return maxFreq;
-        }
       };
     }
   }
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
index f65c54eac0f..dd85c62f663 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
@@ -29,7 +29,7 @@ import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.NoMergePolicy;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermStates;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
@@ -49,7 +49,7 @@ public class TestTermQuery extends LuceneTestCase {
         new TermQuery(new Term("foo", "baz")));
     QueryUtils.checkEqual(
         new TermQuery(new Term("foo", "bar")),
-        new TermQuery(new Term("foo", "bar"), TermContext.build(new MultiReader().getContext(), new Term("foo", "bar"))));
+        new TermQuery(new Term("foo", "bar"), TermStates.build(new MultiReader().getContext(), new Term("foo", "bar"), true)));
   }
 
   public void testCreateWeightDoesNotSeekIfScoresAreNotNeeded() throws IOException {
@@ -84,7 +84,7 @@ public class TestTermQuery extends LuceneTestCase {
     searcher.search(query, collector);
     assertEquals(1, collector.getTotalHits());
     TermQuery queryWithContext = new TermQuery(new Term("foo", "bar"),
-        TermContext.build(reader.getContext(), new Term("foo", "bar")));
+        TermStates.build(reader.getContext(), new Term("foo", "bar"), true));
     collector = new TotalHitCountCollector();
     searcher.search(queryWithContext, collector);
     assertEquals(1, collector.getTotalHits());
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
index a0fa0f371cb..eb7a590bc16 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestClassicSimilarity.java
@@ -36,7 +36,6 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.similarities.TFIDFSimilarity.IDFStats;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
@@ -157,7 +156,7 @@ public class TestClassicSimilarity extends BaseSimilarityTestCase {
 
   public void testSaneNormValues() throws IOException {
     ClassicSimilarity sim = new ClassicSimilarity();
-    TFIDFSimilarity.IDFStats stats = (IDFStats) sim.computeWeight(1f, indexSearcher.collectionStatistics("test"));
+    TFIDFSimilarity.TFIDFScorer stats = (TFIDFSimilarity.TFIDFScorer) sim.scorer(1f, indexSearcher.collectionStatistics("test"));
     for (int i = 0; i < 256; i++) {
       float boost = stats.normTable[i];
       assertFalse("negative boost: " + boost + ", byte=" + i, boost < 0.0f);
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
index 279e30ccc0b..b26358251f5 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@@ -37,6 +37,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermStatistics;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.similarities.SimilarityBase.BasicSimScorer;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -207,13 +208,13 @@ public class TestSimilarityBase extends LuceneTestCase {
    */
   private void unitTestCore(BasicStats stats, float freq, int docLen) {
     for (SimilarityBase sim : sims) {
-      BasicStats realStats = (BasicStats) sim.computeWeight(
+      BasicStats realStats = ((BasicSimScorer) sim.scorer(
           (float)stats.getBoost(),
           toCollectionStats(stats),
-          toTermStats(stats));
+          toTermStats(stats))).stats;
       float score = (float)sim.score(realStats, freq, docLen);
       float explScore = sim.explain(
-          realStats, 1, Explanation.match(freq, "freq"), docLen).getValue().floatValue();
+          realStats, Explanation.match(freq, "freq"), docLen).getValue().floatValue();
       assertFalse("Score infinite: " + sim.toString(), Float.isInfinite(score));
       assertFalse("Score NaN: " + sim.toString(), Float.isNaN(score));
       assertTrue("Score negative: " + sim.toString(), score >= 0);
@@ -489,10 +490,10 @@ public class TestSimilarityBase extends LuceneTestCase {
    */
   private void correctnessTestCore(SimilarityBase sim, float gold) {
     BasicStats stats = createStats();
-    BasicStats realStats = (BasicStats) sim.computeWeight(
+    BasicStats realStats = ((BasicSimScorer) sim.scorer(
         (float)stats.getBoost(),
         toCollectionStats(stats),
-        toTermStats(stats));
+        toTermStats(stats))).stats;
     float score = (float) sim.score(realStats, FREQ, DOC_LEN);
     assertEquals(
         sim.toString() + " score not correct.", gold, score, FLOAT_EPSILON);
diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java
index 8ed0462c3af..f72ea664b93 100644
--- a/lucene/core/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java
@@ -235,7 +235,7 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
 
   public void testSimple2() throws Exception {
     assumeTrue("Broken scoring: LUCENE-3723",
-        searcher.getSimilarity(true) instanceof TFIDFSimilarity);
+        searcher.getSimilarity() instanceof TFIDFSimilarity);
     SpanQuery q1 = new SpanTermQuery(new Term("gender", "female"));
     SpanQuery q2 = new SpanTermQuery(new Term("last", "smith"));
     SpanQuery q = new SpanNearQuery(new SpanQuery[]
@@ -291,7 +291,7 @@ public class TestFieldMaskingSpanQuery extends LuceneTestCase {
 
   public void testSpans2() throws Exception {
     assumeTrue("Broken scoring: LUCENE-3723",
-        searcher.getSimilarity(true) instanceof TFIDFSimilarity);
+        searcher.getSimilarity() instanceof TFIDFSimilarity);
     SpanQuery qA1 = new SpanTermQuery(new Term("gender", "female"));
     SpanQuery qA2 = new SpanTermQuery(new Term("first", "james"));
     SpanQuery qA = new SpanOrQuery(qA1, new FieldMaskingSpanQuery(qA2, "gender"));
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
index 004c06ed5a2..fbb59e3d9bf 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@@ -148,7 +148,7 @@ public class WeightedSpanTermExtractor {
       }
     } else if (query instanceof CommonTermsQuery) {
       // specialized since rewriting would change the result query
-      // this query is TermContext sensitive.
+      // this query is index sensitive.
       extractWeightedTerms(terms, query, boost);
     } else if (query instanceof DisjunctionMaxQuery) {
       for (Query clause : ((DisjunctionMaxQuery) query)) {
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
index cc9f3186304..a0e6d0a9662 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldHighlighter.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Comparator;
 import java.util.List;
 import java.util.PriorityQueue;
 
@@ -136,13 +137,15 @@ public class FieldHighlighter {
     BreakIterator breakIterator = this.breakIterator;
     final int contentLength = breakIterator.getText().getEndIndex();
 
+    //TODO consider moving this part to an aggregate OffsetsEnum subclass so we have one enum that already has its weight
     PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
     for (OffsetsEnum off : offsetsEnums) {
       off.setWeight(scorer.weight(contentLength, off.freq()));
-      off.nextPosition(); // go to first position
-      offsetsEnumQueue.add(off);
+      if (off.nextPosition()) {// go to first position
+        offsetsEnumQueue.add(off);
+      }
     }
-    offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY)); // a sentinel for termination
+    offsetsEnumQueue.add(new OffsetsEnum.OfPostings(new BytesRef(), EMPTY)); // a sentinel for termination
 
     PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
       if (left.getScore() < right.getScore()) {
@@ -203,10 +206,9 @@ public class FieldHighlighter {
         assert term != null;
         passage.addMatch(start, end, term);
         // see if there are multiple occurrences of this term in this passage. If so, add them.
-        if (!off.hasMorePositions()) {
+        if (!off.nextPosition()) {
           break; // No more in the entire text. Already removed from pq; move on
         }
-        off.nextPosition();
         start = off.startOffset();
         end = off.endOffset();
         if (start >= passage.getEndOffset() || end > contentLength) { // it's beyond this passage
@@ -222,7 +224,7 @@ public class FieldHighlighter {
       p.sort();
     }
     // sort in ascending order
-    Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
+    Arrays.sort(passages, Comparator.comparingInt(Passage::getStartOffset));
     return passages;
   }
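
OffsetsEnum's iteration contract changes with this: the enum now starts unpositioned, and nextPosition() itself returns false once positions run out, replacing the hasMorePositions()/nextPosition() pair. Callers follow the loop shape used above — roughly, with an illustrative variable name:

    while (offsetsEnum.nextPosition()) { // false once the positions are exhausted
      int start = offsetsEnum.startOffset();
      int end = offsetsEnum.endOffset();
      // ... consume the highlight region [start, end) ...
    }
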
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
index 155f0a76fb9..faef1062208 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
@@ -20,14 +20,12 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
-import java.util.Map;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -41,9 +39,9 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 public abstract class FieldOffsetStrategy {
 
   protected final String field;
-  protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
-  protected final BytesRef[] terms; // Query: free-standing terms
-  protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
+  protected final PhraseHelper phraseHelper; // Query: position-sensitive information
+  protected final BytesRef[] terms; // Query: all terms we extracted (some may be position sensitive)
+  protected final CharacterRunAutomaton[] automata; // Query: wildcards (i.e. multi-term query), not position sensitive
 
   public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
     this.field = field;
@@ -70,47 +68,50 @@ public abstract class FieldOffsetStrategy {
       return Collections.emptyList();
     }
 
-    // For strict positions, get a Map of term to Spans:
-    //    note: ScriptPhraseHelper.NONE does the right thing for these method calls
-    final Map<BytesRef, Spans> strictPhrasesTermToSpans =
-        phraseHelper.getTermToSpans(leafReader, doc);
-    // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
-    final List<BytesRef> sourceTerms =
-        phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
+    final List<OffsetsEnum> offsetsEnums = new ArrayList<>(terms.length + automata.length);
 
-    final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
+    // Handle position insensitive terms (a subset of this.terms field):
+    final BytesRef[] insensitiveTerms;
+    if (phraseHelper.hasPositionSensitivity()) {
+      insensitiveTerms = phraseHelper.getAllPositionInsensitiveTerms();
+      assert insensitiveTerms.length <= terms.length : "insensitive terms should be smaller set of all terms";
+    } else {
+      insensitiveTerms = terms;
+    }
+    if (insensitiveTerms.length > 0) {
+      createOffsetsEnumsForTerms(insensitiveTerms, termsIndex, doc, offsetsEnums);
+    }
 
-    // Handle sourceTerms:
-    if (!sourceTerms.isEmpty()) {
-      TermsEnum termsEnum = termsIndex.iterator();//does not return null
-      for (BytesRef term : sourceTerms) {
-        if (termsEnum.seekExact(term)) {
-          PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
-
-          if (postingsEnum == null) {
-            // no offsets or positions available
-            throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
-          }
-
-          if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
-            postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
-            if (postingsEnum != null) {
-              offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
-            }
-          }
-        }
-      }
+    // Handle spans
+    if (phraseHelper.hasPositionSensitivity()) {
+      phraseHelper.createOffsetsEnumsForSpans(leafReader, doc, offsetsEnums);
     }
 
     // Handle automata
     if (automata.length > 0) {
-      offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
+      createOffsetsEnumsForAutomata(termsIndex, doc, offsetsEnums);
     }
 
     return offsetsEnums;
   }
 
-  protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
+  protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
+    TermsEnum termsEnum = termsIndex.iterator();//does not return null
+    for (BytesRef term : sourceTerms) {
+      if (termsEnum.seekExact(term)) {
+        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
+        if (postingsEnum == null) {
+          // no offsets or positions available
+          throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
+        }
+        if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
+          results.add(new OffsetsEnum.OfPostings(term, postingsEnum));
+        }
+      }
+    }
+  }
+
+  protected void createOffsetsEnumsForAutomata(Terms termsIndex, int doc, List<OffsetsEnum> results) throws IOException {
     List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
     for (int i = 0; i < automata.length; i++) {
      automataPostings.add(new ArrayList<>());
@@ -118,6 +119,7 @@ public abstract class FieldOffsetStrategy {
 
     TermsEnum termsEnum = termsIndex.iterator();
     BytesRef term;
+    CharsRefBuilder refBuilder = new CharsRefBuilder();
     while ((term = termsEnum.next()) != null) {
       for (int i = 0; i < automata.length; i++) {
@@ -132,7 +134,6 @@ public abstract class FieldOffsetStrategy {
       }
     }
 
-    List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
     for (int i = 0; i < automata.length; i++) {
       CharacterRunAutomaton automaton = automata[i];
       List<PostingsEnum> postingsEnums = automataPostings.get(i);
@@ -140,14 +141,13 @@ public abstract class FieldOffsetStrategy {
       if (size > 0) { //only add if we have offsets
         BytesRef wildcardTerm = new BytesRef(automaton.toString());
         if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
-          offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
+          results.add(new OffsetsEnum.OfPostings(wildcardTerm, postingsEnums.get(0)));
         } else {
-          offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
+          results.add(new OffsetsEnum.OfPostings(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
         }
       }
     }
-    return offsetsEnums;
   }
 }
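Aside: both new helpers above share one access pattern: seek each query term with TermsEnum.seekExact, request postings with the PostingsEnum.OFFSETS flag, and advance to the target document. Condensed into a self-contained sketch (the class and method names here are illustrative, not from the patch):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.index.PostingsEnum;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    class TermOffsetsSketch {
      /** Returns a positioned postings enum per query term occurring in {@code doc}. */
      static List<PostingsEnum> postingsWithOffsets(Terms termsIndex, BytesRef[] queryTerms, int doc)
          throws IOException {
        List<PostingsEnum> results = new ArrayList<>();
        TermsEnum termsEnum = termsIndex.iterator(); // does not return null
        for (BytesRef term : queryTerms) { // sorted input keeps the seeks forward-only, which is typically cheaper
          if (termsEnum.seekExact(term)) {
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
            if (postings.advance(doc) == doc) { // on our doc, though not yet on a position
              results.add(postings);
            }
          }
        }
        return results;
      }
    }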
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index 708f5c33520..f0a46a5d838 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.lucene.search.uhighlight;
 
 import java.io.Closeable;
@@ -25,25 +26,19 @@ import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.util.BytesRef;
 
 /**
- * Holds the term ({@link BytesRef}), {@link PostingsEnum}, offset iteration tracking.
- * It is advanced with the underlying postings and is placed in a priority queue by
+ * An enumeration/iterator of a term and its offsets for use by {@link FieldHighlighter}.
+ * It is advanced and is placed in a priority queue by
  * {@link FieldHighlighter#highlightOffsetsEnums(List)} based on the start offset.
  *
  * @lucene.internal
  */
-public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
-  private final BytesRef term;
-  private final PostingsEnum postingsEnum; // with offsets
+public abstract class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
 
   private float weight; // set once in highlightOffsetsEnums
-  private int posCounter = 0; // the occurrence counter of this term within the text being highlighted.
-
-  public OffsetsEnum(BytesRef term, PostingsEnum postingsEnum) throws IOException {
-    this.term = term; // can be null
-    this.postingsEnum = Objects.requireNonNull(postingsEnum);
-  }
 
   // note: the ordering clearly changes as the postings enum advances
+  // note: would be neat to use some Comparator utilities with method
+  //  references but our methods throw IOException
   @Override
   public int compareTo(OffsetsEnum other) {
     try {
@@ -51,53 +46,41 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
       if (cmp != 0) {
         return cmp; // vast majority of the time we return here.
       }
-      if (this.term == null || other.term == null) {
-        if (this.term == null && other.term == null) {
+      final BytesRef thisTerm = this.getTerm();
+      final BytesRef otherTerm = other.getTerm();
+      if (thisTerm == null || otherTerm == null) {
+        if (thisTerm == null && otherTerm == null) {
           return 0;
-        } else if (this.term == null) {
+        } else if (thisTerm == null) {
           return 1; // put "this" (wildcard mtq enum) last
         } else {
           return -1;
         }
       }
-      return term.compareTo(other.term);
+      return thisTerm.compareTo(otherTerm);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
   }
 
-  /** The term at this position; usually always the same. This term is a reference that is safe to continue to refer to,
-   *  even after we move to next position. */
-  public BytesRef getTerm() throws IOException {
-    // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then remove this hack here
-    return term != null ? term : postingsEnum.getPayload(); // abusing payload like this is a total hack!
-  }
+  /**
+   * Advances to the next position and returns true, or if it can't then returns false.
+   * Note that the initial state of this class is not positioned.
+   */
+  public abstract boolean nextPosition() throws IOException;
 
-  public PostingsEnum getPostingsEnum() {
-    return postingsEnum;
-  }
+  /** An estimate of the number of occurrences of this term/OffsetsEnum. */
+  public abstract int freq() throws IOException;
 
-  public int freq() throws IOException {
-    return postingsEnum.freq();
-  }
+  /**
+   * The term at this position; usually always the same.
+   * This BytesRef is safe to continue to refer to, even after we move to the next position.
+   */
+  public abstract BytesRef getTerm() throws IOException;
 
-  public boolean hasMorePositions() throws IOException {
-    return posCounter < postingsEnum.freq();
-  }
+  public abstract int startOffset() throws IOException;
 
-  public void nextPosition() throws IOException {
-    assert hasMorePositions();
-    posCounter++;
-    postingsEnum.nextPosition();
-  }
-
-  public int startOffset() throws IOException {
-    return postingsEnum.startOffset();
-  }
-
-  public int endOffset() throws IOException {
-    return postingsEnum.endOffset();
-  }
+  public abstract int endOffset() throws IOException;
 
   public float getWeight() {
     return weight;
@@ -109,9 +92,66 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
 
   @Override
   public void close() throws IOException {
-    // TODO TokenStreamOffsetStrategy could override OffsetsEnum; then this base impl would be no-op.
-    if (postingsEnum instanceof Closeable) {
-      ((Closeable) postingsEnum).close();
+  }
+
+  @Override
+  public String toString() {
+    final String name = getClass().getSimpleName();
+    try {
+      return name + "(term:" + getTerm().utf8ToString() + ")";
+    } catch (Exception e) {
+      return name;
     }
   }
+
+  /**
+   * Based on a {@link PostingsEnum} -- the typical/standard OE impl.
+   */
+  public static class OfPostings extends OffsetsEnum {
+    private final BytesRef term;
+    private final PostingsEnum postingsEnum; // with offsets
+
+    private int posCounter = 0; // the occurrence counter of this term within the text being highlighted.
+
+    public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException {
+      this.term = Objects.requireNonNull(term);
+      this.postingsEnum = Objects.requireNonNull(postingsEnum);
+    }
+
+    public PostingsEnum getPostingsEnum() {
+      return postingsEnum;
+    }
+
+    @Override
+    public boolean nextPosition() throws IOException {
+      if (posCounter < postingsEnum.freq()) {
+        posCounter++;
+        postingsEnum.nextPosition(); // note: we don't need to save the position
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public int freq() throws IOException {
+      return postingsEnum.freq();
+    }
+
+    @Override
+    public BytesRef getTerm() throws IOException {
+      return term;
+    }
+
+    @Override
+    public int startOffset() throws IOException {
+      return postingsEnum.startOffset();
+    }
+
+    @Override
+    public int endOffset() throws IOException {
+      return postingsEnum.endOffset();
+    }
+
+  }
 }
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
index 3efb694f9e7..24b1015d104 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@@ -98,6 +98,24 @@ public class Passage {
     numMatches = 0;
   }
 
+  /** For debugging.  ex: Passage[0-22]{yin[0-3],yang[4-8],yin[10-13]}score=2.4964213 */
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    buf.append("Passage[").append(startOffset).append('-').append(endOffset).append(']');
+    buf.append('{');
+    for (int i = 0; i < numMatches; i++) {
+      if (i != 0) {
+        buf.append(',');
+      }
+      buf.append(matchTerms[i].utf8ToString());
+      buf.append('[').append(matchStarts[i] - startOffset).append('-').append(matchEnds[i] - startOffset).append(']');
+    }
+    buf.append('}');
+    buf.append("score=").append(score);
+    return buf.toString();
+  }
+
   /**
    * Start offset of this passage.
    *
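Aside: the refactor above turns OffsetsEnum into a small abstract contract: report a term plus a cursor over (startOffset, endOffset) pairs, starting unpositioned. A minimal illustrative subclass over precomputed arrays (not in the patch; OfPostings above is the real postings-backed impl):

    import java.io.IOException;

    import org.apache.lucene.search.uhighlight.OffsetsEnum;
    import org.apache.lucene.util.BytesRef;

    /** Illustrative only: replays fixed offsets, e.g. as a test fixture. */
    class PrecomputedOffsetsEnum extends OffsetsEnum {
      private final BytesRef term;
      private final int[] starts;
      private final int[] ends;
      private int idx = -1; // not positioned until the first nextPosition() call

      PrecomputedOffsetsEnum(BytesRef term, int[] starts, int[] ends) {
        this.term = term;
        this.starts = starts;
        this.ends = ends;
      }

      @Override public boolean nextPosition() throws IOException { return ++idx < starts.length; }
      @Override public int freq() throws IOException { return starts.length; }
      @Override public BytesRef getTerm() throws IOException { return term; }
      @Override public int startOffset() throws IOException { return starts[idx]; }
      @Override public int endOffset() throws IOException { return ends[idx]; }
    }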
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
index cfb65708df8..2edb19244c6 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
@@ -17,82 +17,58 @@ package org.apache.lucene.search.uhighlight;
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.function.Function;
 import java.util.function.Predicate;
 
-import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.FilterLeafReader;
 import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
-import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MultiTermQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.search.Weight;
 import org.apache.lucene.search.highlight.WeightedSpanTerm;
 import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
 import org.apache.lucene.search.spans.SpanCollector;
 import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.search.spans.SpanWeight;
+import org.apache.lucene.search.spans.SpanScorer;
 import org.apache.lucene.search.spans.Spans;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
 
 /**
- * Helps the {@link FieldOffsetStrategy} with strict position highlighting (e.g. highlight phrases correctly).
+ * Helps the {@link FieldOffsetStrategy} with position sensitive queries (e.g. highlight phrases correctly).
  * This is a stateful class holding information about the query, but it can be (and is) re-used across highlighting
- * documents.  Despite this state; it's immutable after construction.  The approach taken in this class is very similar
- * to the standard Highlighter's {@link WeightedSpanTermExtractor} which is in fact re-used here.  However, we ought to
- * completely rewrite it to use the SpanCollector interface to collect offsets directly. We'll get better
- * phrase accuracy.
+ * documents.  Despite this state, it's immutable after construction.
  *
  * @lucene.internal
 */
+// TODO rename to SpanHighlighting ?
 public class PhraseHelper {
 
   public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
       (s) -> false, spanQuery -> null, query -> null, true);
 
-  //TODO it seems this ought to be a general thing on Spans?
-  private static final Comparator<? super Spans> SPANS_COMPARATOR = (o1, o2) -> {
-    int cmp = Integer.compare(o1.docID(), o2.docID());
-    if (cmp != 0) {
-      return cmp;
-    }
-    if (o1.docID() == DocIdSetIterator.NO_MORE_DOCS) {
-      return 0; // don't ask for start/end position; not sure if we can even call those methods
-    }
-    cmp = Integer.compare(o1.startPosition(), o2.startPosition());
-    if (cmp != 0) {
-      return cmp;
-    } else {
-      return Integer.compare(o1.endPosition(), o2.endPosition());
-    }
-  };
-
   private final String fieldName;
-  private final Set<Term> positionInsensitiveTerms; // (TermQuery terms)
+  private final Set<BytesRef> positionInsensitiveTerms; // (TermQuery terms)
   private final Set<SpanQuery> spanQueries;
   private final boolean willRewrite;
   private final Predicate<String> fieldMatcher;
@@ -114,13 +90,27 @@ public class PhraseHelper {
     this.fieldName = field;
     this.fieldMatcher = fieldMatcher;
     // filter terms to those we want
-    positionInsensitiveTerms = new FieldFilteringTermSet();
+    positionInsensitiveTerms = new HashSet<>();
     spanQueries = new HashSet<>();
 
     // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls
 
     boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class
 
+    // When we call Weight.extractTerms, we do it on clauses that are NOT position sensitive.
+    // We only want to track a Set of bytes for the Term, not the Term class with the field part.
+    Set<Term> extractPosInsensitiveTermsTarget = new TreeSet<Term>() {
+      @Override
+      public boolean add(Term term) {
+        // don't call super.add; we don't actually use the superclass
+        if (fieldMatcher.test(term.field())) {
+          return positionInsensitiveTerms.add(term.bytes());
+        } else {
+          return false;
+        }
+      }
+    };
+
     // For TermQueries or other position insensitive queries, collect the Terms.
     // For other Query types, WSTE will convert to an equivalent SpanQuery.  NOT extracting position spans here.
     new WeightedSpanTermExtractor(field) {
@@ -155,13 +145,15 @@ public class PhraseHelper {
         return true; //TODO set to false and provide a hook to customize certain queries.
       }
 
+      // called on Query types that are NOT position sensitive, e.g. TermQuery
       @Override
       protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost)
           throws IOException {
         query.createWeight(UnifiedHighlighter.EMPTY_INDEXSEARCHER, ScoreMode.COMPLETE_NO_SCORES, boost)
-            .extractTerms(positionInsensitiveTerms);
+            .extractTerms(extractPosInsensitiveTermsTarget);
       }
 
+      // called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted beforehand
       @Override
       protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
           float boost) throws IOException {
@@ -174,7 +166,6 @@ public class PhraseHelper {
           }
         }
 
-        // TODO allow users to override the answer to mustRewriteQuery
         boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
         if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
           return;// ignore this query
@@ -194,14 +185,14 @@ public class PhraseHelper {
     willRewrite = mustRewriteHolder[0];
   }
 
-  Set<SpanQuery> getSpanQueries() {
+  public Set<SpanQuery> getSpanQueries() {
     return spanQueries;
   }
 
   /**
    * If there is no position sensitivity then use of the instance of this class can be ignored.
   */
-  boolean hasPositionSensitivity() {
+  public boolean hasPositionSensitivity() {
     return spanQueries.isEmpty() == false;
   }
 
@@ -210,335 +201,85 @@ public class PhraseHelper {
    * custom things.  When true, the resulting term list will probably be different than what it was known
    * to be initially.
   */
-  boolean willRewrite() {
+  public boolean willRewrite() {
     return willRewrite;
   }
 
-  /**
-   * Collect a list of pre-positioned {@link Spans} for each term, given a reader that has just one document.
-   * It returns no mapping for query terms that occur in a position insensitive way and which therefore don't
-   * need to be filtered.
-   */
-  Map<BytesRef, Spans> getTermToSpans(LeafReader leafReader, int doc)
-      throws IOException {
-    if (spanQueries.isEmpty()) {
-      return Collections.emptyMap();
-    }
-    final LeafReader filteredReader = new SingleFieldFilterLeafReader(leafReader, fieldName);
-    // for each SpanQuery, collect the member spans into a map.
-    Map<BytesRef, Spans> result = new HashMap<>();
-    for (SpanQuery spanQuery : spanQueries) {
-      getTermToSpans(spanQuery, filteredReader.getContext(), doc, result);
-    }
+  /** Returns the terms that are position-insensitive (sorted). */
+  public BytesRef[] getAllPositionInsensitiveTerms() {
+    BytesRef[] result = positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]);
+    Arrays.sort(result);
     return result;
   }
 
-  // code extracted & refactored from WSTE.extractWeightedSpanTerms()
-  private void getTermToSpans(SpanQuery spanQuery, LeafReaderContext readerContext,
-                              int doc, Map<BytesRef, Spans> result)
-      throws IOException {
-    // note: in WSTE there was some field specific looping that seemed pointless so that isn't here.
-    final IndexSearcher searcher = new IndexSearcher(readerContext.reader());
+  /** Given the internal SpanQueries, produce a number of OffsetsEnum into the {@code results} param. */
+  public void createOffsetsEnumsForSpans(LeafReader leafReader, int docId, List<OffsetsEnum> results) throws IOException {
+    leafReader = new SingleFieldWithOffsetsFilterLeafReader(leafReader, fieldName);
+    //TODO avoid searcher and do what it does to rewrite & get weight?
+    IndexSearcher searcher = new IndexSearcher(leafReader);
     searcher.setQueryCache(null);
-    if (willRewrite) {
-      spanQuery = (SpanQuery) searcher.rewrite(spanQuery); // searcher.rewrite loops till done
-    }
-
-    // Get the underlying query terms
-    TreeSet<Term> termSet = new FieldFilteringTermSet(); // sorted so we can loop over results in order shortly...
-    searcher.createWeight(spanQuery, ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(termSet);//needsScores==false
-
-    // Get Spans by running the query against the reader
-    // TODO it might make sense to re-use/cache the Spans instance, to advance forward between docs
-    SpanWeight spanWeight = (SpanWeight) searcher.createNormalizedWeight(spanQuery, ScoreMode.COMPLETE_NO_SCORES);
-    Spans spans = spanWeight.getSpans(readerContext, SpanWeight.Postings.POSITIONS);
-    if (spans == null) {
-      return;
-    }
-    TwoPhaseIterator twoPhaseIterator = spans.asTwoPhaseIterator();
-    if (twoPhaseIterator != null) {
-      if (twoPhaseIterator.approximation().advance(doc) != doc || !twoPhaseIterator.matches()) {
-        return;
-      }
-    } else if (spans.advance(doc) != doc) { // pre-position, and return doing nothing if we find none
-      return;
-    }
-
-    // Consume the Spans into a cache. This instance is used as a source for multiple cloned copies.
-    // It's important we do this and not re-use the same original Spans instance since these will be iterated
-    // independently later on; sometimes in ways that prevent sharing the original Spans.
-    CachedSpans cachedSpansSource = new CachedSpans(spans); // consumes spans for this doc only and caches
-    spans = null;// we don't use it below
-
-    // Map terms to a Spans instance (aggregate if necessary)
-    for (final Term queryTerm : termSet) {
-      // note: we expect that at least one query term will pass these filters. This is because the collected
-      //   spanQuery list was already filtered by these conditions.
-      if (positionInsensitiveTerms.contains(queryTerm)) {
-        continue;
-      }
-      // copy-constructor refers to same data (shallow) but has iteration state from the beginning
-      CachedSpans cachedSpans = new CachedSpans(cachedSpansSource);
-      // Add the span to whatever span may or may not exist
-      Spans existingSpans = result.get(queryTerm.bytes());
-      if (existingSpans != null) {
-        if (existingSpans instanceof MultiSpans) {
-          ((MultiSpans) existingSpans).addSpans(cachedSpans);
-        } else { // upgrade to MultiSpans
-          MultiSpans multiSpans = new MultiSpans();
-          multiSpans.addSpans(existingSpans);
-          multiSpans.addSpans(cachedSpans);
-          result.put(queryTerm.bytes(), multiSpans);
-        }
-      } else {
-        result.put(queryTerm.bytes(), cachedSpans);
-      }
-    }
-  }
-
-  /**
-   * Returns terms as a List, but expanded to any terms in phraseHelper's keySet if present. That can only
-   * happen if willRewrite() is true.
-   */
-  List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {
-    if (willRewrite()) {
-      Set<BytesRef> allTermSet = new LinkedHashSet<>(terms.length + strictPhrasesTermToSpans.size());
-      Collections.addAll(allTermSet, terms);//FYI already sorted; will keep order
-      if (allTermSet.addAll(strictPhrasesTermToSpans.keySet())) { // true if any were added
-        List<BytesRef> sourceTerms = Arrays.asList(allTermSet.toArray(new BytesRef[allTermSet.size()]));
-        sourceTerms.sort(Comparator.naturalOrder());
-        return sourceTerms;
-      }
-    }
-    return Arrays.asList(terms); // no rewrite; use original terms
-  }
-
-  /**
-   * Returns a filtered postings where the position must be in the given Spans.
-   * The Spans must be in a positioned state (not initial) and should not be shared between other terms.
-   * {@code postingsEnum} should be positioned at the
-   * document (the same one as the spans) but it hasn't iterated the positions yet.
-   * The Spans should be the result of a simple
-   * lookup from {@link #getTermToSpans(LeafReader, int)}, and so it could be null which could mean
-   * either it's completely filtered or that there should be no filtering; this class knows what to do.
-   * <p>
-   * Due to limitations in filtering, the {@link PostingsEnum#freq()} is un-changed even if some positions
-   * get filtered.  So when {@link PostingsEnum#nextPosition()} is called or {@code startOffset} or {@code
-   * endOffset} go beyond the "real" positions, these methods return {@link Integer#MAX_VALUE}.
-   * <p>
-   * This will return null if it's completely filtered out (i.e. effectively has no postings).
-   */
-  PostingsEnum filterPostings(BytesRef term, PostingsEnum postingsEnum, Spans spans)
-      throws IOException {
-    if (spans == null) {
-      if (hasPositionSensitivity() == false || positionInsensitiveTerms.contains(new Term(fieldName, term))) {
-        return postingsEnum; // no filtering
-      } else {
-        return null; // completely filtered out
-      }
-    }
-    if (postingsEnum.docID() != spans.docID()) {
-      throw new IllegalStateException("Spans & Postings doc ID misaligned or not positioned");
-    }
-
-    return new FilterLeafReader.FilterPostingsEnum(postingsEnum) {
-      // freq() is max times nextPosition can be called. We'll set this var to -1 when exhausted.
-      int remainingPositions = postingsEnum.freq();
+    // for each SpanQuery, grab its Spans and put it into a PriorityQueue
+    PriorityQueue<Spans> spansPriorityQueue = new PriorityQueue<Spans>(spanQueries.size()) {
       @Override
-      public String toString() {
-        String where;
-        try {
-          where = "[" + startOffset() + ":" + endOffset() + "]";
-        } catch (IOException e) {
-          where = "[" + e + "]";
-        }
-        return "'" + term.utf8ToString() + "'@" + where + " filtered by " + spans;
-      }
-
-      @Override
-      public int nextDoc() throws IOException {
-        throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
-      }
-
-      @Override
-      public int advance(int target) throws IOException {
-        throw new IllegalStateException("not expected"); // don't need to implement; just used on one doc
-      }
-
-      @Override
-      public int nextPosition() throws IOException {
-        // loop over posting positions...
-        NEXT_POS_LOOP:
-        while (remainingPositions > 0) {
-          final int thisPos = super.nextPosition();
-          remainingPositions--;
-
-          // loop spans forward (if necessary) while the span end is behind thisPos
-          while (spans.endPosition() <= thisPos) {
-            if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) { // advance
-              break NEXT_POS_LOOP;
-            }
-            assert spans.docID() == postingsEnum.docID();
-          }
-
-          // is this position within the span?
-          if (thisPos >= spans.startPosition()) {
-            assert thisPos < spans.endPosition(); // guaranteed by previous loop
-            return thisPos; // yay!
-          }
-          // else continue and try the next position
-        }
-        remainingPositions = -1; // signify done
-        return Integer.MAX_VALUE;
-      }
-
-      @Override
-      public int startOffset() throws IOException {
-        return remainingPositions >= 0 ? super.startOffset() : Integer.MAX_VALUE;
-      }
-
-      @Override
-      public int endOffset() throws IOException {
-        return remainingPositions >= 0 ? super.endOffset() : Integer.MAX_VALUE;
+      protected boolean lessThan(Spans a, Spans b) {
+        return a.startPosition() <= b.startPosition();
       }
     };
-  }
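Aside: note the import swap from java.util.PriorityQueue to org.apache.lucene.util.PriorityQueue above. The Lucene queue is sized up front, takes its ordering from a lessThan override, and exposes top()/updateTop()/pop(), so re-heapifying after the head's sort key changes is a single O(log n) sift rather than a remove-plus-add. The idiom standalone (toy payload):

    import org.apache.lucene.util.PriorityQueue;

    class QueueIdiomSketch {
      public static void main(String[] args) {
        PriorityQueue<int[]> pq = new PriorityQueue<int[]>(16) {
          @Override
          protected boolean lessThan(int[] a, int[] b) {
            return a[0] < b[0];
          }
        };
        pq.add(new int[]{3});
        pq.add(new int[]{1});
        int[] head = pq.top(); // peek without removing
        head[0] += 10;         // the head's sort key changed...
        pq.updateTop();        // ...so restore heap order in O(log n)
        System.out.println(pq.top()[0]); // prints 3
      }
    }

This is exactly the shape of the span loop that follows: collect from the top Spans, advance it, then pop() if exhausted or updateTop() otherwise.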
Even SpanScorer + // navigates the spans fully to compute a good freq (and thus score)! + OffsetSpanCollector spanCollector = new OffsetSpanCollector(); + while (spansPriorityQueue.size() > 0) { + Spans spans = spansPriorityQueue.top(); + //TODO limit to a capped endOffset length somehow so we can break this loop early + spans.collect(spanCollector); + + if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) { + spansPriorityQueue.pop(); } else { - return false; + spansPriorityQueue.updateTop(); } } + results.addAll(spanCollector.termToOffsetsEnums.values()); } - /** - * A single {@link Spans} view over multiple spans. At least one span is mandatory, but you should probably - * supply more than one. Furthermore, the given spans are expected to be positioned to a document already - * via a call to next or advance). - */ // TODO move to Lucene core as a Spans utility class? - static class MultiSpans extends Spans { - final PriorityQueue spansQueue = new PriorityQueue<>(SPANS_COMPARATOR); - long cost; - - void addSpans(Spans spans) { - if (spans.docID() < 0 || spans.docID() == NO_MORE_DOCS) { - throw new IllegalArgumentException("Expecting given spans to be in a positioned state."); - } - spansQueue.add(spans); - cost = Math.max(cost, spans.cost()); - } - - // DocIdSetIterator methods: - - @Override - public int nextDoc() throws IOException { - if (spansQueue.isEmpty()) { - return NO_MORE_DOCS; - } - return advance(spansQueue.peek().docID() + 1); - } - - @Override - public int advance(int target) throws IOException { - if (spansQueue.isEmpty()) { - return NO_MORE_DOCS; - } - while (true) { - Spans spans = spansQueue.peek(); - if (spans.docID() >= target) { - return spans.docID(); - } - spansQueue.remove(); // must remove before modify state - if (spans.advance(target) != NO_MORE_DOCS) { // ... otherwise it's not re-added - spansQueue.add(spans); - } else if (spansQueue.isEmpty()) { - return NO_MORE_DOCS; - } - } - } - - @Override - public int docID() { - if (spansQueue.isEmpty()) { - return NO_MORE_DOCS; - } - return spansQueue.peek().docID(); - } - - @Override - public long cost() { - return cost; - } - - // Spans methods: - - @Override - public int nextStartPosition() throws IOException { - // advance any spans at the initial position per document - boolean atDocStart = false; - while (spansQueue.peek().startPosition() == -1) { - atDocStart = true; - Spans headSpans = spansQueue.remove(); // remove because we will change state - headSpans.nextStartPosition(); - spansQueue.add(headSpans); - } - if (!atDocStart) { - Spans headSpans = spansQueue.remove(); // remove because we will change state - headSpans.nextStartPosition(); - spansQueue.add(headSpans); - } - return startPosition(); - } - - @Override - public int startPosition() { - return spansQueue.peek().startPosition(); - } - - @Override - public int endPosition() { - return spansQueue.peek().endPosition(); - } - - @Override - public int width() { - return spansQueue.peek().width(); - } - - @Override - public void collect(SpanCollector collector) throws IOException { - spansQueue.peek().collect(collector); - } - - @Override - public float positionsCost() { - return 100f;// no idea; and we can't delegate due to not allowing to call it dependent on TwoPhaseIterator - } - } - - //TODO move up; it's currently inbetween other inner classes that are related /** * Needed to support the ability to highlight a query irrespective of the field a query refers to * (aka requireFieldMatch=false). 
* This reader will just delegate every call to a single field in the wrapped * LeafReader. This way we ensure that all queries going through this reader target the same field. */ - static final class SingleFieldFilterLeafReader extends FilterLeafReader { + private static final class SingleFieldWithOffsetsFilterLeafReader extends FilterLeafReader { final String fieldName; - SingleFieldFilterLeafReader(LeafReader in, String fieldName) { + SingleFieldWithOffsetsFilterLeafReader(LeafReader in, String fieldName) { super(in); this.fieldName = fieldName; } @@ -550,22 +291,18 @@ public class PhraseHelper { @Override public Terms terms(String field) throws IOException { - return super.terms(fieldName); - } - - @Override - public NumericDocValues getNumericDocValues(String field) throws IOException { - return super.getNumericDocValues(fieldName); - } - - @Override - public BinaryDocValues getBinaryDocValues(String field) throws IOException { - return super.getBinaryDocValues(fieldName); - } - - @Override - public SortedDocValues getSortedDocValues(String field) throws IOException { - return super.getSortedDocValues(fieldName); + // ensure the underlying PostingsEnum returns offsets. It's sad we have to do this to use the SpanCollector. + return new FilterTerms(super.terms(fieldName)) { + @Override + public TermsEnum iterator() throws IOException { + return new FilterTermsEnum(in.iterator()) { + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + return super.postings(reuse, flags | PostingsEnum.OFFSETS); + } + }; + } + }; } @Override @@ -584,99 +321,102 @@ public class PhraseHelper { } } + private class OffsetSpanCollector implements SpanCollector { + Map termToOffsetsEnums = new HashMap<>(); - /** - * A Spans based on a list of cached spans for one doc. It is pre-positioned to this doc. - */ - private static class CachedSpans extends Spans { - - private static class CachedSpan { - final int start; - final int end; - - CachedSpan(int start, int end) { - this.start = start; - this.end = end; + @Override + public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException { + if (!fieldMatcher.test(term.field())) { + return; } - } - final int docId; - final ArrayList cachedSpanList; - int index = -1; - - CachedSpans(Spans spans) throws IOException { - this.docId = spans.docID(); - assert this.docId != -1; - // Consume the spans for this doc into a list. There's always at least one; the first/current one. - cachedSpanList = new ArrayList<>(); - while (spans.nextStartPosition() != NO_MORE_POSITIONS) { - cachedSpanList.add(new CachedSpan(spans.startPosition(), spans.endPosition())); + SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes()); + if (offsetsEnum == null) { + // If it's pos insensitive we handle it outside of PhraseHelper. term.field() is from the Query. + if (positionInsensitiveTerms.contains(term.bytes())) { + return; + } + offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq()); + termToOffsetsEnums.put(term.bytes(), offsetsEnum); } - assert !cachedSpanList.isEmpty(); // bad Span impl? - } - - /** - * Clone; reset iteration state. - */ - CachedSpans(CachedSpans cloneMe) { - docId = cloneMe.docId; - cachedSpanList = cloneMe.cachedSpanList; + offsetsEnum.add(postings.startOffset(), postings.endOffset()); } @Override - public int nextDoc() throws IOException { - throw new UnsupportedOperationException("Not expected"); + public void reset() { // called when at a new position. We don't care. 
+ } + } + + private static class SpanCollectedOffsetsEnum extends OffsetsEnum { + // TODO perhaps optionally collect (and expose) payloads? + private final BytesRef term; + private final int[] startOffsets; + private final int[] endOffsets; + private int numPairs = 0; + private int enumIdx = -1; + + private SpanCollectedOffsetsEnum(BytesRef term, int postingsFreq) { + this.term = term; + this.startOffsets = new int[postingsFreq]; // hopefully not wasteful? At least we needn't resize it. + this.endOffsets = new int[postingsFreq]; + } + + // called from collector before it's navigated + void add(int startOffset, int endOffset) { + assert enumIdx == -1 : "bad state"; + + // loop backwards since we expect a match at the end or close to it. We expect O(1) not O(N). + int pairIdx = numPairs - 1; + for (; pairIdx >= 0; pairIdx--) { + int iStartOffset = startOffsets[pairIdx]; + int iEndOffset = endOffsets[pairIdx]; + int cmp = Integer.compare(iStartOffset, startOffset); + if (cmp == 0) { + cmp = Integer.compare(iEndOffset, endOffset); + } + if (cmp == 0) { + return; // we already have this offset-pair for this term + } else if (cmp < 0) { + break; //we will insert offsetPair to the right of pairIdx + } + } + // pairIdx is now one position to the left of where we insert the new pair + // shift right any pairs by one to make room + final int shiftLen = numPairs - (pairIdx + 1); + if (shiftLen > 0) { + System.arraycopy(startOffsets, pairIdx + 2, startOffsets, pairIdx + 3, shiftLen); + System.arraycopy(endOffsets, pairIdx + 2, endOffsets, pairIdx + 3, shiftLen); + } + // now we can place the offset pair + startOffsets[pairIdx + 1] = startOffset; + endOffsets[pairIdx + 1] = endOffset; + numPairs++; } @Override - public int advance(int target) throws IOException { - throw new UnsupportedOperationException("Not expected"); + public boolean nextPosition() throws IOException { + return ++enumIdx < numPairs; } @Override - public int docID() { - return docId; + public int freq() throws IOException { + return numPairs; } @Override - public long cost() { - return 1; + public BytesRef getTerm() throws IOException { + return term; } @Override - public int nextStartPosition() throws IOException { - index++; - return startPosition(); + public int startOffset() throws IOException { + return startOffsets[enumIdx]; } @Override - public int startPosition() { - return index < 0 ? - -1 : index >= cachedSpanList.size() ? - NO_MORE_POSITIONS : cachedSpanList.get(index).start; + public int endOffset() throws IOException { + return endOffsets[enumIdx]; } + } - @Override - public int endPosition() { - return index < 0 ? - -1 : index >= cachedSpanList.size() ? 
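Aside: the add method above is an insertion-sort step over parallel arrays with duplicate suppression, scanning from the right because the span collector mostly delivers offsets in increasing order (so the common case is an O(1) append). The same routine in isolation, with arrays sized by the caller as in the patch (an illustrative restatement, not library code):

    class SortedPairInsertSketch {
      /** Inserts (start, end) into parallel sorted arrays; ignores exact duplicates.
       *  Returns the new pair count. The arrays must have spare capacity. */
      static int insertPair(int[] starts, int[] ends, int numPairs, int start, int end) {
        int i = numPairs - 1;
        for (; i >= 0; i--) {
          int cmp = Integer.compare(starts[i], start);
          if (cmp == 0) {
            cmp = Integer.compare(ends[i], end);
          }
          if (cmp == 0) {
            return numPairs; // duplicate pair; nothing to do
          } else if (cmp < 0) {
            break; // insert to the right of i
          }
        }
        int shiftLen = numPairs - (i + 1); // pairs at i+1..numPairs-1 move right one slot
        System.arraycopy(starts, i + 1, starts, i + 2, shiftLen);
        System.arraycopy(ends, i + 1, ends, i + 2, shiftLen);
        starts[i + 1] = start;
        ends[i + 1] = end;
        return numPairs + 1;
      }
    }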
 
+    @Override
+    public boolean nextPosition() throws IOException {
+      return ++enumIdx < numPairs;
+    }
 
     @Override
-    public int advance(int target) throws IOException {
-      throw new UnsupportedOperationException("Not expected");
+    public int freq() throws IOException {
+      return numPairs;
     }
 
     @Override
-    public int docID() {
-      return docId;
+    public BytesRef getTerm() throws IOException {
+      return term;
     }
 
     @Override
-    public long cost() {
-      return 1;
+    public int startOffset() throws IOException {
+      return startOffsets[enumIdx];
     }
 
     @Override
-    public int nextStartPosition() throws IOException {
-      index++;
-      return startPosition();
+    public int endOffset() throws IOException {
+      return endOffsets[enumIdx];
     }
+  }
 
-    @Override
-    public int startPosition() {
-      return index < 0 ?
-          -1 : index >= cachedSpanList.size() ?
-          NO_MORE_POSITIONS : cachedSpanList.get(index).start;
-    }
-
-    @Override
-    public int endPosition() {
-      return index < 0 ?
-          -1 : index >= cachedSpanList.size() ?
-          NO_MORE_POSITIONS : cachedSpanList.get(index).end;
-    }
-
-    @Override
-    public int width() {
-      return endPosition() - startPosition();
-    }
-
-    @Override
-    public void collect(SpanCollector collector) throws IOException {
-      throw new UnsupportedOperationException("Not expected");
-    }
-
-    @Override
-    public float positionsCost() {
-      return 1f;
-    }
-
-  } // class CachedSpans
 }
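Aside: the next file re-analyzes the document text and runs each token through the multi-term-query automata. For reference, this is how a CharacterRunAutomaton is built and matched against a term buffer; the regexp here is a made-up example:

    import org.apache.lucene.util.automaton.CharacterRunAutomaton;
    import org.apache.lucene.util.automaton.RegExp;

    class AutomatonMatchSketch {
      public static void main(String[] args) {
        CharacterRunAutomaton matcher =
            new CharacterRunAutomaton(new RegExp("high.*").toAutomaton());
        char[] token = "highlighter".toCharArray();
        // same call shape as matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())
        System.out.println(matcher.run(token, 0, token.length)); // prints true
      }
    }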
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
index 28eb6b1a613..5f47a5daac7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.search.uhighlight;
 
-import java.io.Closeable;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
@@ -26,7 +25,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.PostingsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -63,29 +61,20 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 
   @Override
   public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
-    TokenStream tokenStream = tokenStream(content);
-    PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
-    mtqPostingsEnum.advance(docId);
-    return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
+    return Collections.singletonList(new TokenStreamOffsetsEnum(tokenStream(content), automata));
   }
 
-  // See class javadocs.
-  // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?  See TODOs in OffsetsEnum.
-  private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
+  private static class TokenStreamOffsetsEnum extends OffsetsEnum {
     TokenStream stream; // becomes null when closed
     final CharacterRunAutomaton[] matchers;
     final CharTermAttribute charTermAtt;
     final OffsetAttribute offsetAtt;
 
-    int currentDoc = -1;
     int currentMatch = -1;
-    int currentStartOffset = -1;
-
-    int currentEndOffset = -1;
 
     final BytesRef matchDescriptions[];
 
-    TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
+    TokenStreamOffsetsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
       this.stream = ts;
       this.matchers = matchers;
       matchDescriptions = new BytesRef[matchers.length];
@@ -95,15 +84,13 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
     }
 
     @Override
-    public int nextPosition() throws IOException {
+    public boolean nextPosition() throws IOException {
       if (stream != null) {
         while (stream.incrementToken()) {
           for (int i = 0; i < matchers.length; i++) {
             if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
-              currentStartOffset = offsetAtt.startOffset();
-              currentEndOffset = offsetAtt.endOffset();
               currentMatch = i;
-              return 0;
+              return true;
             }
           }
         }
@@ -111,8 +98,7 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
         close();
       }
       // exhausted
-      currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
-      return Integer.MAX_VALUE;
+      return false;
     }
 
     @Override
@@ -122,45 +108,23 @@ public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
 
     @Override
     public int startOffset() throws IOException {
-      assert currentStartOffset >= 0;
-      return currentStartOffset;
+      return offsetAtt.startOffset();
     }
 
     @Override
     public int endOffset() throws IOException {
-      assert currentEndOffset >= 0;
-      return currentEndOffset;
+      return offsetAtt.endOffset();
    }
 
-    // TOTAL HACK; used in OffsetsEnum.getTerm()
     @Override
-    public BytesRef getPayload() throws IOException {
+    public BytesRef getTerm() throws IOException {
       if (matchDescriptions[currentMatch] == null) {
+        // these CharRunAutomata are subclassed so that toString() returns the query
         matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
       }
       return matchDescriptions[currentMatch];
     }
 
-    @Override
-    public int docID() {
-      return currentDoc;
-    }
-
-    @Override
-    public int nextDoc() throws IOException {
-      throw new UnsupportedOperationException();
-    }
-
-    @Override
-    public int advance(int target) throws IOException {
-      return currentDoc = target;
-    }
-
-    @Override
-    public long cost() {
-      return 0;
-    }
-
     @Override
     public void close() throws IOException {
       if (stream != null) {
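Aside: the new test hook below accepts an EnumSet of mandatory HighlightFlags; its body is cut off in this excerpt, but the natural shape of such a helper is to union the mandatory flags with a random subset of the remainder. A hypothetical sketch of just that flag selection (the names randomFlags/mandatory are mine, not the patch's):

    import java.util.EnumSet;
    import java.util.Random;

    import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;

    class FlagRandomizationSketch {
      static EnumSet<HighlightFlag> randomFlags(EnumSet<HighlightFlag> mandatory, Random random) {
        EnumSet<HighlightFlag> flags = EnumSet.copyOf(mandatory); // copyOf of an EnumSet works even when empty
        for (HighlightFlag flag : HighlightFlag.values()) {
          if (random.nextBoolean()) {
            flags.add(flag); // each optional flag is enabled with probability 1/2
          }
        }
        return flags;
      }
    }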
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
index 96ec15501ff..086d7a03cfb 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighter.java
@@ -23,11 +23,14 @@ import java.nio.charset.StandardCharsets;
 import java.text.BreakIterator;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.EnumSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.function.Predicate;
 
 import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
@@ -49,6 +52,7 @@ import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.uhighlight.UnifiedHighlighter.HighlightFlag;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.After;
@@ -81,6 +85,36 @@ public class TestUnifiedHighlighter extends LuceneTestCase {
     dir.close();
   }
 
+  static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer) {
+    return randomUnifiedHighlighter(searcher, indexAnalyzer, EnumSet.noneOf(HighlightFlag.class));
+  }
+
+  static UnifiedHighlighter randomUnifiedHighlighter(IndexSearcher searcher, Analyzer indexAnalyzer,
+                                                     EnumSet