Karl Wright 2018-01-23 07:35:14 -05:00
commit 653935bbdf
455 changed files with 10139 additions and 4076 deletions

View File

@ -145,6 +145,7 @@
(~$/(?i)\bno(n|)commit\b/$) : 'nocommit',
(~$/\bTOOD:/$) : 'TOOD instead TODO',
(~$/\t/$) : 'tabs instead spaces',
(~$/\Q/**\E((?:\s)|(?:\*))*\Q{@inheritDoc}\E((?:\s)|(?:\*))*\Q*/\E/$) : '{@inheritDoc} on its own is unnecessary',
(~$/\$$(?:LastChanged)?Date\b/$) : 'svn keyword',
(~$/\$$(?:(?:LastChanged)?Revision|Rev)\b/$) : 'svn keyword',
(~$/\$$(?:LastChangedBy|Author)\b/$) : 'svn keyword',
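For context, the new pattern flags Javadoc comments whose entire body is {@inheritDoc}: the javadoc tool inherits the overridden method's documentation automatically when no comment is present, so such a comment adds nothing. A minimal compilable sketch (hypothetical classes) of what the check targets:

import java.io.IOException;

class Base {
  /** Resets internal state. */
  void reset() throws IOException {}
}

class Derived extends Base {
  // A bare "/** {@inheritDoc} */" on this override would now be flagged:
  // omitting the comment entirely yields the same inherited documentation.
  @Override
  void reset() throws IOException {
    super.reset();
  }
}

Several analyzer changes later in this commit remove exactly such comments.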

View File

@ -67,6 +67,13 @@
</maintainer>
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
<release>
<Version>
<name>lucene-7.2.1</name>
<created>2018-01-15</created>
<revision>7.2.1</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-7.2.0</name>

View File

@ -67,6 +67,13 @@
</maintainer>
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
<release>
<Version>
<name>solr-7.2.1</name>
<created>2018-01-15</created>
<revision>7.2.1</revision>
</Version>
</release>
<release>
<Version>
<name>solr-7.2.0</name>

View File

@ -31,5 +31,6 @@
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="analysis-common" />
</component>
</module>

View File

@ -0,0 +1,215 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import subprocess
import sys
import urllib.error
import urllib.request
from textwrap import dedent
# Number of iterations per test suite
testIters = 5
usage = dedent('''\
Usage:\n
python3 -u %s URL\n
Must be run from a Lucene/Solr git workspace. Downloads the Jenkins
log pointed to by the given URL, parses it for Git revision and failed
Lucene/Solr tests, checks out the Git revision in the local workspace,
groups the failed tests by module, then runs
'ant test -Dtests.dups=%d -Dtests.class="*.test1[|*.test2[...]]" ...'
in each module of interest, failing at the end if any of the runs fails.
To control the maximum number of concurrent JVMs used for each module's
test run, set 'tests.jvms', e.g. in ~/lucene.build.properties
''' % (sys.argv[0], testIters))
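# Example invocation (a sketch; the build URL is the sample from the comment
# below, and '/consoleText' is appended automatically when the URL points at
# the build page rather than the log itself):
#
#   python3 -u reproduceJenkinsFailures.py https://jenkins.thetaphi.de/job/Lucene-Solr-master-Linux/21108/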
reHelpArg = re.compile(r'-{1,2}(?:\?|h(?:elp)?)')
# Example: Checking out Revision e441a99009a557f82ea17ee9f9c3e9b89c75cee6 (refs/remotes/origin/master)
reGitRev = re.compile(r'Checking out Revision (\S+)')
# Method example: NOTE: reproduce with: ant test -Dtestcase=ZkSolrClientTest -Dtests.method=testMultipleWatchesAsync -Dtests.seed=6EF5AB70F0032849 -Dtests.slow=true -Dtests.locale=he-IL -Dtests.timezone=NST -Dtests.asserts=true -Dtests.file.encoding=UTF-8
# Suite example: NOTE: reproduce with: ant test -Dtestcase=CloudSolrClientTest -Dtests.seed=DB2DF2D8228BAF27 -Dtests.multiplier=3 -Dtests.slow=true -Dtests.locale=es-AR -Dtests.timezone=America/Argentina/Cordoba -Dtests.asserts=true -Dtests.file.encoding=US-ASCII
reReproLine = re.compile(r'NOTE:\s+reproduce\s+with:(\s+ant\s+test\s+-Dtestcase=(\S+)\s+(?:-Dtests.method=\S+\s+)?(.*))')
# Example: https://jenkins.thetaphi.de/job/Lucene-Solr-master-Linux/21108/
reJenkinsURLWithoutConsoleText = re.compile(r'https?://.*/\d+/?\Z', re.IGNORECASE)
reJavaFile = re.compile(r'(.*)\.java\Z')
reModule = re.compile(r'\./(.*)/src/')
reTestOutputFile = re.compile(r'TEST-(.*\.([^-.]+))(?:-\d+)?\.xml\Z')
reErrorFailure = re.compile(r'(?:errors|failures)="[^0]')
# consoleText from Policeman Jenkins's Windows jobs fails to decode as UTF-8
encoding = 'iso-8859-1'
tests = {}
modules = {}
lastFailureCode = 0
gitCheckoutSucceeded = False
def runOutput(cmd):
print('[repro] %s' % cmd)
try:
return subprocess.check_output(cmd.split(' '), universal_newlines=True).strip()
except subprocess.CalledProcessError as e:
raise RuntimeError("ERROR: Cmd '%s' failed with exit code %d and the following output:\n%s"
% (cmd, e.returncode, e.output))
# Remembers non-zero exit code in lastFailureCode unless rememberFailure==False
def run(cmd, rememberFailure=True):
global lastFailureCode
print('[repro] %s' % cmd)
code = os.system(cmd)
if 0 != code and rememberFailure:
print('\n[repro] Setting last failure code to %d\n' % code)
lastFailureCode = code
return code
def fetchAndParseJenkinsLog(url):
global revision
revision = None
print('[repro] Jenkins log URL: %s\n' % url)
try:
with urllib.request.urlopen(url) as consoleText:
for rawLine in consoleText:
line = rawLine.decode(encoding)
match = reGitRev.match(line)
if match is not None:
revision = match.group(1)
print('[repro] Revision: %s\n' % revision)
else:
match = reReproLine.search(line)
if match is not None:
print('[repro] Repro line: %s\n' % match.group(1))
testcase = match.group(2)
reproLineWithoutMethod = match.group(3).strip()
tests[testcase] = reproLineWithoutMethod
except urllib.error.URLError as e:
raise RuntimeError('ERROR: fetching %s : %s' % (url, e))
if revision is None:
if reJenkinsURLWithoutConsoleText.match(url):
print('[repro] Not a Jenkins log. Appending "/consoleText" and retrying ...\n')
fetchAndParseJenkinsLog(url + '/consoleText')
else:
raise RuntimeError('ERROR: %s does not appear to be a Jenkins log.' % url)
if 0 == len(tests):
print('[repro] No "reproduce with" lines found; exiting.')
sys.exit(0)
def prepareWorkspace():
global gitCheckoutSucceeded
code = run('git checkout %s' % revision)
if 0 != code:
raise RuntimeError('ERROR: "git checkout %s" failed. See above. Maybe try "git pull"?' % revision)
gitCheckoutSucceeded = True
code = run('ant clean')
if 0 != code:
raise RuntimeError('ERROR: "ant clean" failed. See above.')
def groupTestsByModule():
for (dir, _, files) in os.walk('.'):
for file in files:
match = reJavaFile.search(file)
if match is not None:
test = match.group(1)
if test in tests:
match = reModule.match(dir)
module = match.group(1)
if module not in modules:
modules[module] = set()
modules[module].add(test)
print('[repro] Test suites by module:')
for module in modules:
print('[repro] %s' % module)
for test in modules[module]:
print('[repro] %s' % test)
def runTests():
global lastFailureCode
cwd = os.getcwd()
testCmdline = 'ant test-nocompile -Dtests.dups=%d -Dtests.maxfailures=%d -Dtests.class="%s" -Dtests.showOutput=onerror %s'
for module in modules:
moduleTests = list(modules[module])
testList = '|'.join(map(lambda t: '*.%s' % t, moduleTests))
numTests = len(moduleTests)
params = tests[moduleTests[0]] # Assumption: all tests in this module have the same cmdline params
os.chdir(module)
code = run('ant compile-test')
try:
if (0 != code):
raise RuntimeError("ERROR: Compile failed in %s/ with code %d. See above." % (module, code))
run(testCmdline % (testIters, testIters * numTests, testList, params))
finally:
os.chdir(cwd)
def printReport():
failures = {}
for start in ('lucene/build', 'solr/build'):
for (dir, _, files) in os.walk(start):
for file in files:
testOutputFileMatch = reTestOutputFile.search(file)
if testOutputFileMatch is not None:
testcase = testOutputFileMatch.group(1)
if testcase not in failures:
failures[testcase] = 0
with open(os.path.join(dir, file), encoding='UTF-8') as testOutputFile:
for line in testOutputFile:
errorFailureMatch = reErrorFailure.search(line)
if errorFailureMatch is not None:
failures[testcase] += 1
break
print("[repro] Failures:")
for testcase in sorted(failures):
print("[repro] %d/%d failed: %s" % (failures[testcase], testIters, testcase))
def rememberGitBranch():
global origGitBranch
origGitBranch = runOutput('git rev-parse --abbrev-ref HEAD')
if (origGitBranch == 'HEAD'): # In detached HEAD state
origGitBranch = runOutput('git rev-parse HEAD') # Use the SHA when not on a branch
print('[repro] Initial local git branch/revision: %s' % origGitBranch)
def main():
if 2 != len(sys.argv) or reHelpArg.match(sys.argv[1]):
print(usage)
sys.exit(0)
fetchAndParseJenkinsLog(sys.argv[1])
rememberGitBranch()
try:
prepareWorkspace()
groupTestsByModule()
runTests()
printReport()
except Exception as e:
print('[repro] %s' % e)
sys.exit(1)
finally:
if gitCheckoutSucceeded:
run('git checkout %s' % origGitBranch, rememberFailure=False) # Restore original git branch/sha
print('[repro] Exiting with code %d' % lastFailureCode)
sys.exit(lastFailureCode)
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
print('[repro] Keyboard interrupt...exiting')

View File

@ -32,6 +32,12 @@ API Changes
* LUCENE-8012: Explanation now takes Number rather than float (Alan Woodward,
Robert Muir)
* LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document
scoring factors. (Adrien Grand)
* LUCENE-8113: TermContext has been renamed to TermStates, and can now be
constructed lazily if term statistics are not required (Alan Woodward)
Changes in Runtime Behavior
* LUCENE-7837: Indices that were created before the previous major version
@ -46,6 +52,9 @@ Changes in Runtime Behavior
* LUCENE-7996: FunctionQuery and FunctionScoreQuery now return a score of 0
when the function produces a negative value. (Adrien Grand)
* LUCENE-8116: Similarities now score fields that omit norms as if the norm was
1. This might change score values on fields that omit norms. (Adrien Grand)
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
@ -110,16 +119,55 @@ Improvements
* LUCENE-8094: TermInSetQuery.toString now returns "field:(A B C)" (Mike McCandless)
* LUCENE-8121: UnifiedHighlighter passage relevancy is improved for terms that are
position sensitive (e.g. part of a phrase) by having an accurate freq.
(David Smiley)
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
(Ere Maijala)
Bug Fixes
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
(Xiaoshan Sun via Adrien Grand)
* SOLR-11758: Fixed FloatDocValues.boolVal to correctly return true for all values != 0.0F
(Munendra S N via hossman)
* LUCENE-8121: The UnifiedHighlighter would highlight some terms within some nested
SpanNearQueries at positions where it should not have. It's fixed in the UH by
switching to the SpanCollector API. The original Highlighter still has this
problem (LUCENE-2287, LUCENE-5455, LUCENE-6796). Some public but internal parts of
the UH were refactored. (David Smiley, Steve Davids)
* LUCENE-8120: Fix LatLonBoundingBox's toString() method (Martijn van Groningen, Adrien Grand)
* LUCENE-8130: Fix NullPointerException from TermStates.toString() (Mike McCandless)
* LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to handle correctly
hyphenation patterns with indicator >= 7. (Holger Bruch via Adrien Grand)
Other
* LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.
(Kai Chan via Adrien Grand)
* LUCENE-8122: Upgrade analysis/icu to ICU 60.2. (Robert Muir)
* LUCENE-8106: Add script (reproduceJenkinsFailures.py) to attempt to reproduce
failing tests from a Jenkins log. (Steve Rowe)
* LUCENE-8075: Removed unnecessary null check in IntersectTermsEnum.
(Pulak Ghosh via Adrien Grand)
======================= Lucene 7.2.1 =======================
Bug Fixes
* LUCENE-8117: Fix advanceExact on SortedNumericDocValues produced by Lucene54DocValues. (Jim Ferenczi).
======================= Lucene 7.2.0 =======================
API Changes

View File

@ -19,12 +19,14 @@ FunctionScoreQuery maps negative values to 0.
## CustomScoreQuery, BoostedQuery and BoostingQuery removed (LUCENE-8099) ##
Instead use FunctionScoreQuery and a DoubleValuesSource implementation. For example,
to replace the functionality of BoostedQuery, you could do the following, using
the lucene-expressions module:
Instead use FunctionScoreQuery and a DoubleValuesSource implementation. BoostedQuery
and BoostingQuery may be replaced by calls to FunctionScoreQuery.boostByValue() and
FunctionScoreQuery.boostByQuery(). To replace more complex calculations in
CustomScoreQuery, use the lucene-expressions module:
SimpleBindings bindings = new SimpleBindings();
bindings.add("score", DoubleValuesSource.SCORES);
bindings.add("boost", DoubleValuesSource.fromIntField("myboostfield"));
Expression expr = JavascriptCompiler.compile("score * boost");
bindings.add("boost1", DoubleValuesSource.fromIntField("myboostfield"));
bindings.add("boost2", DoubleValuesSource.fromIntField("myotherboostfield"));
Expression expr = JavascriptCompiler.compile("score * (boost1 + ln(boost2))");
FunctionScoreQuery q = new FunctionScoreQuery(inputQuery, expr.getDoubleValuesSource(bindings));
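For the simpler cases, a sketch of the one-line replacements named above (inputQuery and the field/term names are placeholders):

Query boosted = FunctionScoreQuery.boostByValue(
    inputQuery, DoubleValuesSource.fromIntField("myboostfield"));
Query contextBoosted = FunctionScoreQuery.boostByQuery(
    inputQuery, new TermQuery(new Term("category", "featured")), 2f);

boostByValue multiplies each matching document's score by a per-document value, mirroring BoostedQuery; boostByQuery multiplies the score by a constant factor for documents that also match a second query, mirroring BoostingQuery.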

View File

@ -123,9 +123,6 @@ public final class CommonGramsFilter extends TokenFilter {
return true;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();

View File

@ -62,9 +62,6 @@ public final class CommonGramsQueryFilter extends TokenFilter {
super(input);
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();

View File

@ -89,7 +89,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
StringBuilder buf = new StringBuilder();
byte v = vspace.get(k++);
while (v != 0) {
char c = (char) ((v >>> 4) - 1 + '0');
char c = (char) (((v & 0xf0) >>> 4) - 1 + '0');
buf.append(c);
c = (char) (v & 0x0f);
if (c == 0) {
@ -151,7 +151,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
StringBuilder buf = new StringBuilder();
byte v = vspace.get(k++);
while (v != 0) {
char c = (char) ((v >>> 4) - 1);
char c = (char) (((v & 0xf0) >>> 4) - 1);
buf.append(c);
c = (char) (v & 0x0f);
if (c == 0) {
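The added mask matters because Java bytes are signed: each nibble stores indicator + 1, so an indicator of 7 or more puts the high nibble at 8 or more, making the byte negative and sign-extending it to 32 bits before the shift. A standalone sketch of the difference:

byte v = (byte) 0x80;                // high nibble 8, i.e. stored indicator 7
int wrong = (v >>> 4) - 1;           // sign-extended: 0x0FFFFFF8 - 1, garbage
int right = ((v & 0xf0) >>> 4) - 1;  // masked first: 8 - 1 = 7, as intended

This is the LUCENE-8124 fix noted in CHANGES.txt below.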

View File

@ -204,9 +204,6 @@ public class FingerprintFilter extends TokenFilter {
}
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();

View File

@ -71,9 +71,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
super(in);
}
/**
* {@inheritDoc}
*/
@Override
public boolean incrementToken() throws IOException {
while (!exhausted && input.incrementToken()) {
@ -112,9 +109,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
return false;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();

View File

@ -43,9 +43,6 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
super(in);
}
/**
* {@inheritDoc}
*/
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
@ -71,9 +68,6 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
return false;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();

View File

@ -1,58 +1,58 @@
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.util;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was created using ICU4J v59.1.0.0
* <p>
* Unicode version: 9.0.0.0
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "9.0.0.0";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(
0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003,
0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
private static Bits createBits(final int... codepoints) {
final int len = codepoints[codepoints.length - 1] + 1;
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
for (int i : codepoints) bitset.set(i);
return new Bits() {
@Override
public boolean get(int index) {
return index < len && bitset.get(index);
}
@Override
public int length() {
return 0x10FFFF + 1;
}
};
}
}
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.util;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was created using ICU4J v60.2.0.0
* <p>
* Unicode version: 10.0.0.0
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "10.0.0.0";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(
0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003,
0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
private static Bits createBits(final int... codepoints) {
final int len = codepoints[codepoints.length - 1] + 1;
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
for (int i : codepoints) bitset.set(i);
return new Bits() {
@Override
public boolean get(int index) {
return index < len && bitset.get(index);
}
@Override
public int length() {
return 0x10FFFF + 1;
}
};
}
}

View File

@ -262,6 +262,21 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
}
}
public void testLucene8124() throws Exception {
InputSource is = new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
whitespaceMockTokenizer(
"Rindfleisch"),
hyphenator);
// TODO: Rindfleisch being returned twice is a separate issue in HyphenationCompoundWordTokenFilter
assertTokenStreamContents(tf, new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"});
}
public static interface MockRetainAttribute extends Attribute {
void setRetain(boolean attr);

View File

@ -0,0 +1,61 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
This file contains the hyphenation patterns for danish.
Adapted from dkhyph.tex, dkcommon.tex and dkspecial.tex
originally created by Frank Jensen (fj@iesd.auc.dk).
FOP adaptation by Carlos Villegas (cav@uniscope.co.jp)
-->
<hyphenation-info>
<hyphen-char value="-"/>
<hyphen-min before="2" after="2"/>
<classes>
aA
bB
cC
dD
eE
fF
gG
hH
iI
jJ
kK
lL
mM
nN
oO
pP
qQ
rR
sS
tT
uU
vV
wW
xX
yY
zZ
æÆ
øØ
åÅ
</classes>
<patterns>
d7f
</patterns>
</hyphenation-info>

View File

@ -14,16 +14,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
# This file is from ICU (with some small modifications to avoid the CJK dictionary break,
# and a related status code change)
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
# with additions for Emoji Sequences from https://goo.gl/cluFCn
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
@ -35,6 +40,7 @@
##############################################################################
!!chain;
!!quoted_literals_only;
#
@ -43,8 +49,9 @@
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Newline = [\p{Word_Break = Newline} ];
$Extend = [\p{Word_Break = Extend}];
$ZWJ = [\p{Word_Break = ZWJ}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
@ -57,6 +64,13 @@ $MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$E_Base = [\p{Word_Break = EB}];
$E_Modifier = [\p{Word_Break = EM}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
@ -83,21 +97,21 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
@ -108,12 +122,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
#
$CR $LF;
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ ($Extended_Pict | $EmojiNRK);
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
@ -123,6 +142,10 @@ $KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
$Extended_Pict ($Extend | $Format | $ZWJ)*;
#
# rule 5
# Do not break between most letters.
@ -170,9 +193,42 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rule 13c
# rule 14
# Do not break within emoji modifier sequences
$Regional_IndicatorEx $Regional_IndicatorEx;
($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
# rules 15 - 17
# Pairs of Regional Indicators stay together.
# With rule chaining disabled by ^, this rule will match exactly two of them.
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
# Rule 999
# Match a single code point if no other rule applies.
.;
## -------------------------------------------------
!!safe_reverse;
# rule 3
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
# rule 7b
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
# rule 13c
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;

View File

@ -73,12 +73,14 @@
0A4D>
0ABC>
0ACD>
0AFD..0AFF>
0B3C>
0B4D>
0BCD>
0C4D>
0CBC>
0CCD>
0D3B..0D3C>
0D4D>
0DCA>
0E47..0E4C>
@ -112,10 +114,10 @@
1CD0..1CE8>
1CED>
1CF4>
1CF8..1CF9>
1CF7..1CF9>
1D2C..1D6A>
1DC4..1DCF>
1DF5>
1DF5..1DF9>
1DFD..1DFF>
1FBD>
1FBF..1FC1>
@ -175,7 +177,12 @@ FFE3>
1163F>
116B6..116B7>
1172B>
11A34>
11A47>
11A99>
11C3F>
11D42>
11D44..11D45>
16AF0..16AF4>
16F8F..16F9F>
1D167..1D169>

View File

@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
11C57>0037 # BHAIKSUKI DIGIT SEVEN
11C58>0038 # BHAIKSUKI DIGIT EIGHT
11C59>0039 # BHAIKSUKI DIGIT NINE
11D50>0030 # MASARAM GONDI DIGIT ZERO
11D51>0031 # MASARAM GONDI DIGIT ONE
11D52>0032 # MASARAM GONDI DIGIT TWO
11D53>0033 # MASARAM GONDI DIGIT THREE
11D54>0034 # MASARAM GONDI DIGIT FOUR
11D55>0035 # MASARAM GONDI DIGIT FIVE
11D56>0036 # MASARAM GONDI DIGIT SIX
11D57>0037 # MASARAM GONDI DIGIT SEVEN
11D58>0038 # MASARAM GONDI DIGIT EIGHT
11D59>0039 # MASARAM GONDI DIGIT NINE
16A60>0030 # MRO DIGIT ZERO
16A61>0031 # MRO DIGIT ONE
16A62>0032 # MRO DIGIT TWO

View File

@ -1,3 +1,5 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
@ -7,7 +9,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 9.0.0
* Unicode 10.0.0
# Canonical_Combining_Class (ccc) values
0300..0314:230
@ -164,6 +166,7 @@
0C56:91
0CBC:7
0CCD:9
0D3B..0D3C:9
0D4D:9
0DCA:9
0E38..0E39:103
@ -234,6 +237,9 @@
1DCF:220
1DD0:202
1DD1..1DF5:230
1DF6:232
1DF7..1DF8:228
1DF9:220
1DFB:230
1DFC:233
1DFD:220
@ -322,7 +328,12 @@ FE2E..FE2F:230
116B6:9
116B7:7
1172B:9
11A34:9
11A47:9
11A99:9
11C3F:9
11D42:7
11D44..11D45:9
16AF0..16AF4:1
16B30..16B36:230
1BC9E:1

View File

@ -1,3 +1,5 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
@ -11,7 +13,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 9.0.0
* Unicode 10.0.0
00A0>0020
00A8>0020 0308

View File

@ -1,7 +1,7 @@
# Unicode Character Database
# Copyright (c) 1991-2016 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc_cf.txt
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 9.0.0
* Unicode 10.0.0
0041>0061
0042>0062

View File

@ -59,18 +59,34 @@ import com.ibm.icu.text.Normalizer2;
* All foldings, case folding, and normalization mappings are applied recursively
* to ensure a fully folded and normalized result.
* </p>
* <p>
* A normalizer with additional settings such as a filter that lists characters not
* to be normalized can be passed in the constructor.
* </p>
*/
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
private static final Normalizer2 normalizer = Normalizer2.getInstance(
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
"utr30", Normalizer2.Mode.COMPOSE);
/**
* A normalizer for search term folding to Unicode text,
* applying foldings from UTR#30 Character Foldings.
*/
public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
"utr30", Normalizer2.Mode.COMPOSE);
/**
* Create a new ICUFoldingFilter on the specified input
*/
public ICUFoldingFilter(TokenStream input) {
super(input, NORMALIZER);
}
/**
* Create a new ICUFoldingFilter on the specified input with the specified
* normalizer
*/
public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
super(input, normalizer);
}
}
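A hedged usage sketch of the new constructor (tokenizer stands for any TokenStream; the set syntax is ICU's UnicodeSet): wrapping the shared NORMALIZER in a FilteredNormalizer2 restricts folding to characters inside the set, here everything except 'ö':

UnicodeSet set = new UnicodeSet("[^ö]");
set.freeze();
Normalizer2 filtered = new FilteredNormalizer2(ICUFoldingFilter.NORMALIZER, set);
TokenStream stream = new ICUFoldingFilter(tokenizer, filtered);

This is the same wiring ICUFoldingFilterFactory performs below when a "filter" argument is supplied.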

View File

@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
/**
* Factory for {@link ICUFoldingFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100"&gt;
@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* @since 3.1.0
*/
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
private final Normalizer2 normalizer;
/** Creates a new ICUFoldingFilterFactory */
public ICUFoldingFilterFactory(Map<String,String> args) {
super(args);
Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
String filter = get(args, "filter");
if (filter != null) {
UnicodeSet set = new UnicodeSet(filter);
if (!set.isEmpty()) {
set.freeze();
normalizer = new FilteredNormalizer2(normalizer, set);
}
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
this.normalizer = normalizer;
}
@Override
public TokenStream create(TokenStream input) {
return new ICUFoldingFilter(input);
return new ICUFoldingFilter(input, normalizer);
}
@Override

View File

@ -16,152 +16,84 @@
*/
package org.apache.lucene.analysis.icu.segmentation;
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/**
* Contain all the issues surrounding BreakIterators in ICU in one place.
* Basically this boils down to the fact that they aren't very friendly to any
* sort of OO design.
* <p>
* http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
* BreakIterator from RuleBasedBreakIterator
* <p>
* DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
* doesn't actually behave as a subclass: it always returns 0 for
* getRuleStatus():
* http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
* tags
* Wraps RuleBasedBreakIterator, making object reuse convenient and
* emitting a rule status for emoji sequences.
* @lucene.experimental
*/
abstract class BreakIteratorWrapper {
protected final CharArrayIterator textIterator = new CharArrayIterator();
protected char text[];
protected int start;
protected int length;
final class BreakIteratorWrapper {
private final CharArrayIterator textIterator = new CharArrayIterator();
private final RuleBasedBreakIterator rbbi;
private char text[];
private int start;
private int status;
BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
this.rbbi = rbbi;
}
int current() {
return rbbi.current();
}
abstract int next();
abstract int current();
abstract int getRuleStatus();
abstract void setText(CharacterIterator text);
int getRuleStatus() {
return status;
}
int next() {
int current = rbbi.current();
int next = rbbi.next();
status = calcStatus(current, next);
return next;
}
/** Returns current rule status for the text between breaks. (determines token type) */
private int calcStatus(int current, int next) {
// to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
// https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
if (next != BreakIterator.DONE && isEmoji(current, next)) {
return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
} else {
return rbbi.getRuleStatus();
}
}
// See unicode doc L2/16-315 and also the RBBI rules for rationale.
// we don't include regional indicators here, because they aren't ambiguous for tagging,
// they need only be treated special for segmentation.
static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
/** Returns true if the current text represents emoji character or sequence */
private boolean isEmoji(int current, int next) {
int begin = start + current;
int end = start + next;
int codepoint = UTF16.charAt(text, 0, end, begin);
// TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
if (EMOJI_RK.contains(codepoint)) {
// if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
// an emoji presentation selector or keycap follows.
int trailer = begin + Character.charCount(codepoint);
return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
} else {
return true;
}
}
return false;
}
void setText(char text[], int start, int length) {
this.text = text;
this.start = start;
this.length = length;
textIterator.setText(text, start, length);
setText(textIterator);
}
/**
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
* any other BreakIterator, the rulestatus method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);
}
/**
* RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
* a DictionaryBasedBreakIterator) behaves correctly.
*/
static final class RBBIWrapper extends BreakIteratorWrapper {
private final RuleBasedBreakIterator rbbi;
RBBIWrapper(RuleBasedBreakIterator rbbi) {
this.rbbi = rbbi;
}
@Override
int current() {
return rbbi.current();
}
@Override
int getRuleStatus() {
return rbbi.getRuleStatus();
}
@Override
int next() {
return rbbi.next();
}
@Override
void setText(CharacterIterator text) {
rbbi.setText(text);
}
}
/**
* Generic BreakIterator wrapper: Either the rulestatus method is not
* available or always returns 0. Calculate a rulestatus here so it behaves
* like RuleBasedBreakIterator.
*
* Note: This is slower than RuleBasedBreakIterator.
*/
static final class BIWrapper extends BreakIteratorWrapper {
private final BreakIterator bi;
private int status;
BIWrapper(BreakIterator bi) {
this.bi = bi;
}
@Override
int current() {
return bi.current();
}
@Override
int getRuleStatus() {
return status;
}
@Override
int next() {
int current = bi.current();
int next = bi.next();
status = calcStatus(current, next);
return next;
}
private int calcStatus(int current, int next) {
if (current == BreakIterator.DONE || next == BreakIterator.DONE)
return RuleBasedBreakIterator.WORD_NONE;
int begin = start + current;
int end = start + next;
int codepoint;
for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
codepoint = UTF16.charAt(text, 0, end, begin);
if (UCharacter.isDigit(codepoint))
return RuleBasedBreakIterator.WORD_NUMBER;
else if (UCharacter.isLetter(codepoint)) {
// TODO: try to separately specify ideographic, kana?
// [currently all bundled as letter for this case]
return RuleBasedBreakIterator.WORD_LETTER;
}
}
return RuleBasedBreakIterator.WORD_NONE;
}
@Override
void setText(CharacterIterator text) {
bi.setText(text);
status = RuleBasedBreakIterator.WORD_NONE;
}
rbbi.setText(textIterator);
status = RuleBasedBreakIterator.WORD_NONE;
}
}

View File

@ -123,7 +123,7 @@ final class CompositeBreakIterator {
private BreakIteratorWrapper getBreakIterator(int scriptCode) {
if (wordBreakers[scriptCode] == null)
wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
return wordBreakers[scriptCode];
}
}

View File

@ -52,6 +52,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Token type for words that appear to be numbers */
public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/** Token type for words that appear to be emoji sequences */
public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
/*
* the default breakiterators in use. these can be expensive to
@ -65,9 +67,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
// the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator =
private static final RuleBasedBreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");
private static final BreakIterator myanmarSyllableIterator =
private static final RuleBasedBreakIterator myanmarSyllableIterator =
readBreakIterator("MyanmarSyllable.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@ -95,16 +97,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
}
@Override
public BreakIterator getBreakIterator(int script) {
public RuleBasedBreakIterator getBreakIterator(int script) {
switch(script) {
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
case UScript.MYANMAR:
if (myanmarAsWords) {
return (BreakIterator)defaultBreakIterator.clone();
return (RuleBasedBreakIterator)defaultBreakIterator.clone();
} else {
return (BreakIterator)myanmarSyllableIterator.clone();
return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
}
default: return (BreakIterator)defaultBreakIterator.clone();
default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
}
}
@ -119,6 +121,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
case RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER;
case EMOJI_SEQUENCE_STATUS:
return WORD_EMOJI;
default: /* some other custom code */
return "<OTHER>";
}

View File

@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
*/
private boolean incrementTokenBuffer() {
int start = breaker.current();
if (start == BreakIterator.DONE)
return false; // BreakIterator exhausted
assert start != BreakIterator.DONE;
// find the next set of boundaries, skipping over non-tokens (rule status 0)
int end = breaker.next();
while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
start = end;
end = breaker.next();
}
if (start == BreakIterator.DONE)
if (end == BreakIterator.DONE) {
return false; // BreakIterator exhausted
}
termAtt.copyBuffer(buffer, start, end - start);
offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));

View File

@ -16,8 +16,7 @@
*/
package org.apache.lucene.analysis.icu.segmentation;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
/**
* Class that allows for tailored Unicode Text Segmentation on
@ -25,14 +24,16 @@ import com.ibm.icu.text.BreakIterator;
* @lucene.experimental
*/
public abstract class ICUTokenizerConfig {
/** Rule status for emoji sequences */
public static final int EMOJI_SEQUENCE_STATUS = 299;
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
public ICUTokenizerConfig() {}
/** Return a breakiterator capable of processing a given script. */
public abstract BreakIterator getBreakIterator(int script);
public abstract RuleBasedBreakIterator getBreakIterator(int script);
/** Return a token type value for a given script and BreakIterator
* rule status. */
public abstract String getType(int script, int ruleStatus);

View File

@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
@Override
public BreakIterator getBreakIterator(int script) {
public RuleBasedBreakIterator getBreakIterator(int script) {
if (breakers[script] != null) {
return (BreakIterator) breakers[script].clone();
return (RuleBasedBreakIterator) breakers[script].clone();
} else {
return super.getBreakIterator(script);
}

View File

@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 8.0). However, some users who wish
the most recent version of Unicode (currently 10.0). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

View File

@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
/** basic tests for {@link ICUFoldingFilterFactory} */
public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
/** basic tests to ensure the folding is working */
public void test() throws Exception {
Reader reader = new StringReader("Résumé");
@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "resume" });
}
/** test to ensure the filter parameter is working */
public void testFilter() throws Exception {
HashMap<String,String> args = new HashMap<String,String>();
args.put("filter", "[^ö]");
ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
Reader reader = new StringReader("Résumé");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "resume" });
reader = new StringReader("Fönster");
stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] { "fönster" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {

View File

@ -16,13 +16,10 @@
*/
package org.apache.lucene.analysis.icu.segmentation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import com.ibm.icu.lang.UScript;
@ -76,8 +73,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
return new TokenStreamComponents(tokenizer);
}
};
}
@ -90,8 +86,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testArmenian() throws Exception {
assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
}
public void testAmharic() throws Exception {
@ -102,12 +98,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testArabic() throws Exception {
assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
"بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
}
public void testAramaic() throws Exception {
assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
}
@ -125,7 +121,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testGreek() throws Exception {
assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
}
@ -156,7 +152,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
*/
public void testChinese() throws Exception {
assertAnalyzesTo(a, "我是中国人。 ",
new String[] { "", "", "", "", "", "1234", "tests"});
new String[] { "", "", "", "", "", "", ""});
}
public void testHebrew() throws Exception {
@ -186,8 +182,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
/* Tests from StandardAnalyzer, just to show behavior is similar */
public void testAlphanumericSA() throws Exception {
// alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
assertAnalyzesTo(a, "2B", new String[]{"2b"});
assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
assertAnalyzesTo(a, "2B", new String[]{"2B"});
}
public void testDelimitersSA() throws Exception {
@ -199,34 +195,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testApostrophesSA() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
assertAnalyzesTo(a, "you're", new String[]{"you're"});
assertAnalyzesTo(a, "she's", new String[]{"she's"});
assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
assertAnalyzesTo(a, "don't", new String[]{"don't"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
}
public void testNumericSA() throws Exception {
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
public void testTextWithNumbersSA() throws Exception {
// numbers
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
}
public void testVariousTextSA() throws Exception {
// various
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
}
public void testKoreanSA() throws Exception {
@ -242,14 +238,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testOffsets() throws Exception {
assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"david", "has", "5000", "bones"},
new String[] {"David", "has", "5000", "bones"},
new int[] {0, 6, 10, 15},
new int[] {5, 9, 14, 20});
}
public void testTypes() throws Exception {
assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"david", "has", "5000", "bones"},
new String[] {"David", "has", "5000", "bones"},
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
@ -265,6 +261,61 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
}
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
new String[] { "💩", "💩", "💩" },
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
new String[] { "👩‍❤️‍👩" },
new String[] { "<EMOJI>" });
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
new String[] { "👨🏼‍⚕️" },
new String[] { "<EMOJI>" });
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
new String[] { "🇺🇸", "🇺🇸" },
new String[] { "<EMOJI>", "<EMOJI>" });
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
new String[] { "#️⃣" },
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3",
new String[] { "3",},
new String[] { "<EMOJI>" });
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
new String[] { "<EMOJI>" });
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
new String[] { "poo", "💩", "poo" },
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
new String[] { "💩", "", "", "💩" },
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
@ -78,6 +78,15 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
);
}
/**
* dictionary segmentation with emoji
*/
public void testSimpleJapaneseWithEmoji() throws Exception {
assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
new String[] { "それ", "", "まだ", "実験", "段階", "", "あり", "ます", "💩" }
);
}
public void testJapaneseTypes() throws Exception {
assertAnalyzesTo(a, "仮名遣い カタカナ",
new String[] { "仮名遣い", "カタカナ" },
@ -62,9 +62,9 @@ import java.util.regex.Pattern;
*/
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-58-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
= "http://source.icu-project.org/repos/icu/tags";
private static final String ICU_RELEASE_TAG = "release-60-2";
private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
private static final String NFKC_CF_TXT = "nfkc_cf.txt";
@ -166,9 +166,6 @@ public class JapaneseIterationMarkCharFilter extends CharFilter {
buffer.reset(input);
}
/**
* {@inheritDoc}
*/
@Override
public int read(char[] buffer, int offset, int length) throws IOException {
int read = 0;
@ -185,9 +182,6 @@ public class JapaneseIterationMarkCharFilter extends CharFilter {
return read == 0 ? -1 : read;
}
/**
* {@inheritDoc}
*/
@Override
public int read() throws IOException {
int ic = buffer.get(bufferPosition);
@ -293,7 +293,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
"7.1.0-cfs",
"7.1.0-nocfs",
"7.2.0-cfs",
"7.2.0-nocfs"
"7.2.0-nocfs",
"7.2.1-cfs",
"7.2.1-nocfs"
};
public static String[] getOldNames() {
@ -304,7 +306,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
"sorted.7.0.0",
"sorted.7.0.1",
"sorted.7.1.0",
"sorted.7.2.0"
"sorted.7.2.0",
"sorted.7.2.1"
};
public static String[] getOldSortedNames() {
@ -99,17 +99,11 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
this.query = query;
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
return assignClassNormalizedList(inputDocument).get(0);
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
@ -117,9 +111,6 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
@ -195,9 +195,6 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<Boolean> assignClass(String text)
throws IOException {
@ -220,18 +217,12 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
return new ClassificationResult<>(output >= bias, score);
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<Boolean>> getClasses(String text)
throws IOException {
return null;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<Boolean>> getClasses(String text, int max)
throws IOException {
@ -103,9 +103,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
TopDocs knnResults = knnSearch(text);
@ -121,9 +118,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
return assignedClass;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
TopDocs knnResults = knnSearch(text);
@ -132,9 +126,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
TopDocs knnResults = knnSearch(text);
@ -213,7 +204,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
", similarity=" + indexSearcher.getSimilarity(true) +
", similarity=" + indexSearcher.getSimilarity() +
'}';
}
}
@ -119,9 +119,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
return classifyFromTopDocs(knnSearch(text));
@ -143,9 +140,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
return assignedClass;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
TopDocs knnResults = knnSearch(text);
@ -154,9 +148,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
TopDocs knnResults = knnSearch(text);
@ -251,7 +242,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
", similarity=" + indexSearcher.getSimilarity(true) +
", similarity=" + indexSearcher.getSimilarity() +
'}';
}
}
@ -98,9 +98,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
this.query = query;
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(inputDocument);
@ -115,9 +112,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
return assignedClass;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
@ -125,9 +119,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
@ -72,17 +72,11 @@ public class KNearestNeighborDocumentClassifier extends KNearestNeighborClassifi
this.field2analyzer = field2analyzer;
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(Document document) throws IOException {
return classifyFromTopDocs(knnSearch(document));
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(Document document) throws IOException {
TopDocs knnResults = knnSearch(document);
@ -91,9 +85,6 @@ public class KNearestNeighborDocumentClassifier extends KNearestNeighborClassifi
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(Document document, int max) throws IOException {
TopDocs knnResults = knnSearch(document);
@ -71,9 +71,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
this.field2analyzer = field2analyzer;
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(Document document) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
@ -88,9 +85,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
return assignedClass;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(Document document) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
@ -98,9 +92,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(Document document, int max) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
@ -210,20 +210,20 @@ public class NearestFuzzyQuery extends Query {
}
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
// we build an artificial TermContext that will give an overall df and ttf
// we build an artificial TermStates that will give an overall df and ttf
// equal to 1
TermContext context = new TermContext(reader.getContext());
TermStates termStates = new TermStates(reader.getContext());
for (LeafReaderContext leafContext : reader.leaves()) {
Terms terms = leafContext.reader().terms(term.field());
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(term.bytes())) {
int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
context.register(termsEnum.termState(), leafContext.ord, freq, freq);
int freq = 1 - termStates.docFreq(); // we want the total df and ttf to be 1
termStates.register(termsEnum.termState(), leafContext.ord, freq, freq);
}
}
}
return new TermQuery(term, context);
return new TermQuery(term, termStates);
}
@Override
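A short worked trace of the artificial-statistics trick above, with the intermediate values spelled out (the leaf numbering is illustrative):

    // leaf 0: termStates.docFreq()=0 -> register freq = 1 - 0 = 1  (df becomes 1)
    // leaf 1: termStates.docFreq()=1 -> register freq = 1 - 1 = 0  (df stays 1)
    // so however many leaves match, the aggregate df and ttf end up pinned at exactly 1.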
@ -1309,7 +1309,8 @@ ant test "-Dtests.method=*esi*"
ant test -Dtests.seed=DEADBEEF
# Repeats _all_ tests of ClassName N times. Every test repetition
# will have a different seed.
# will have a different seed. NOTE: does not reinitialize
# between repetitions, use only for idempotent tests.
ant test -Dtests.iters=N -Dtestcase=ClassName
# Repeats _all_ tests of ClassName N times. Every test repetition
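As a concrete illustration of the tests.iters flag documented above (the suite name is made up):

    # hypothetical example: run one suite five times, each with a distinct seed
    ant test -Dtests.iters=5 -Dtestcase=TestICUTokenizer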
@ -54,6 +54,8 @@ public final class StandardTokenizer extends Tokenizer {
public static final int KATAKANA = 5;
/** Hangul token type */
public static final int HANGUL = 6;
/** Emoji token type. */
public static final int EMOJI = 7;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
@ -63,7 +65,8 @@ public final class StandardTokenizer extends Tokenizer {
"<IDEOGRAPHIC>",
"<HIRAGANA>",
"<KATAKANA>",
"<HANGUL>"
"<HANGUL>",
"<EMOJI>"
};
/** Absolute maximum sized token */
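A minimal sketch of how the new constant pairs with the table above; it relies only on the public API shown in this hunk:

    // Token type ids index into TOKEN_TYPES, so the new constant resolves as:
    String emojiLabel = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // "<EMOJI>"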
@ -103,11 +103,8 @@ final class IntersectTermsEnum extends TermsEnum {
arcs[arcIdx] = new FST.Arc<>();
}
if (fr.index == null) {
fstReader = null;
} else {
fstReader = fr.index.getBytesReader();
}
fstReader = fr.index.getBytesReader();
// TODO: if the automaton is "smallish" we really
// should use the terms index to seek at least to
@ -17,34 +17,37 @@
package org.apache.lucene.index;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.util.Arrays;
/**
* Maintains a {@link IndexReader} {@link TermState} view over
* {@link IndexReader} instances containing a single term. The
* {@link TermContext} doesn't track if the given {@link TermState}
* {@link TermStates} doesn't track if the given {@link TermState}
* objects are valid, neither if the {@link TermState} instances refer to the
* same terms in the associated readers.
*
* @lucene.experimental
*/
public final class TermContext {
public final class TermStates {
private static final TermState EMPTY_TERMSTATE = new TermState() {
@Override
public void copyFrom(TermState other) {
}
};
// Important: do NOT keep hard references to index readers
private final Object topReaderContextIdentity;
private final TermState[] states;
private final Term term; // null if stats are to be used
private int docFreq;
private long totalTermFreq;
//public static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
/**
* Creates an empty {@link TermContext} from a {@link IndexReaderContext}
*/
public TermContext(IndexReaderContext context) {
private TermStates(Term term, IndexReaderContext context) {
assert context != null && context.isTopLevel;
topReaderContextIdentity = context.identity;
docFreq = 0;
@ -56,10 +59,18 @@ public final class TermContext {
len = context.leaves().size();
}
states = new TermState[len];
this.term = term;
}
/**
* Expert: Return whether this {@link TermContext} was built for the given
* Creates an empty {@link TermStates} from a {@link IndexReaderContext}
*/
public TermStates(IndexReaderContext context) {
this(null, context);
}
/**
* Expert: Return whether this {@link TermStates} was built for the given
* {@link IndexReaderContext}. This is typically used for assertions.
* @lucene.internal
*/
@ -68,35 +79,35 @@ public final class TermContext {
}
/**
* Creates a {@link TermContext} with an initial {@link TermState},
* Creates a {@link TermStates} with an initial {@link TermState},
* {@link IndexReader} pair.
*/
public TermContext(IndexReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
this(context);
public TermStates(IndexReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
this(null, context);
register(state, ord, docFreq, totalTermFreq);
}
/**
* Creates a {@link TermContext} from a top-level {@link IndexReaderContext} and the
* Creates a {@link TermStates} from a top-level {@link IndexReaderContext} and the
* given {@link Term}. This method will lookup the given term in all context's leaf readers
* and register each of the readers containing the term in the returned {@link TermContext}
* and register each of the readers containing the term in the returned {@link TermStates}
* using the leaf reader's ordinal.
* <p>
* Note: the given context must be a top-level context.
*
* @param needsStats if {@code true} then all leaf contexts will be visited up-front to
* collect term statistics. Otherwise, the {@link TermState} objects
* will be built only when requested
*/
public static TermContext build(IndexReaderContext context, Term term)
public static TermStates build(IndexReaderContext context, Term term, boolean needsStats)
throws IOException {
assert context != null && context.isTopLevel;
final String field = term.field();
final BytesRef bytes = term.bytes();
final TermContext perReaderTermState = new TermContext(context);
//if (DEBUG) System.out.println("prts.build term=" + term);
for (final LeafReaderContext ctx : context.leaves()) {
//if (DEBUG) System.out.println(" r=" + leaves[i].reader);
final Terms terms = ctx.reader().terms(field);
if (terms != null) {
final TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(bytes)) {
final TermStates perReaderTermState = new TermStates(needsStats ? null : term, context);
if (needsStats) {
for (final LeafReaderContext ctx : context.leaves()) {
//if (DEBUG) System.out.println(" r=" + leaves[i].reader);
TermsEnum termsEnum = loadTermsEnum(ctx, term);
if (termsEnum != null) {
final TermState termState = termsEnum.termState();
//if (DEBUG) System.out.println(" found");
perReaderTermState.register(termState, ctx.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
@ -106,8 +117,19 @@ public final class TermContext {
return perReaderTermState;
}
private static TermsEnum loadTermsEnum(LeafReaderContext ctx, Term term) throws IOException {
final Terms terms = ctx.reader().terms(term.field());
if (terms != null) {
final TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(term.bytes())) {
return termsEnum;
}
}
return null;
}
/**
* Clears the {@link TermContext} internal state and removes all
* Clears the {@link TermStates} internal state and removes all
* registered {@link TermState}s
*/
public void clear() {
@ -149,17 +171,25 @@ public final class TermContext {
}
/**
* Returns the {@link TermState} for an leaf ordinal or <code>null</code> if no
* {@link TermState} for the ordinal was registered.
* Returns the {@link TermState} for a leaf reader context or <code>null</code> if no
* {@link TermState} for the context was registered.
*
* @param ord
* the readers leaf ordinal to get the {@link TermState} for.
* @param ctx
* the {@link LeafReaderContext} to get the {@link TermState} for.
* @return the {@link TermState} for the given reader's ordinal or <code>null</code> if no
* {@link TermState} for the reader was registered
*/
public TermState get(int ord) {
assert ord >= 0 && ord < states.length;
return states[ord];
public TermState get(LeafReaderContext ctx) throws IOException {
assert ctx.ord >= 0 && ctx.ord < states.length;
if (term == null)
return states[ctx.ord];
if (this.states[ctx.ord] == null) {
TermsEnum te = loadTermsEnum(ctx, term);
this.states[ctx.ord] = te == null ? EMPTY_TERMSTATE : te.termState();
}
if (this.states[ctx.ord] == EMPTY_TERMSTATE)
return null;
return this.states[ctx.ord];
}
/**
@ -169,6 +199,9 @@ public final class TermContext {
* instances passed to {@link #register(TermState, int, int, long)}.
*/
public int docFreq() {
if (term != null) {
throw new IllegalStateException("Cannot call docFreq() when needsStats=false");
}
return docFreq;
}
@ -179,19 +212,23 @@ public final class TermContext {
* instances passed to {@link #register(TermState, int, int, long)}.
*/
public long totalTermFreq() {
if (term != null) {
throw new IllegalStateException("Cannot call totalTermFreq() when needsStats=false");
}
return totalTermFreq;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("TermContext\n");
sb.append("TermStates\n");
for(TermState termState : states) {
sb.append(" state=");
sb.append(termState.toString());
sb.append(termState);
sb.append('\n');
}
return sb.toString();
}
}
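To make the new TermStates contract concrete, a hedged caller sketch; the reader and term variables are illustrative and not part of the patch:

    // Build once against the top-level context; needsStats=false defers per-leaf lookups.
    TermStates ts = TermStates.build(reader.getContext(), term, true);
    for (LeafReaderContext ctx : reader.leaves()) {
      TermState state = ts.get(ctx);   // loaded lazily when built with needsStats=false
      if (state == null) {
        continue;                      // term absent in this segment
      }
      // seek a TermsEnum to 'state' as before
    }
    int df = ts.docFreq();             // legal only when built with needsStats=true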
@ -25,7 +25,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.ArrayUtil;
@ -53,7 +53,7 @@ public final class BlendedTermQuery extends Query {
private int numTerms = 0;
private Term[] terms = new Term[0];
private float[] boosts = new float[0];
private TermContext[] contexts = new TermContext[0];
private TermStates[] contexts = new TermStates[0];
private RewriteMethod rewriteMethod = DISJUNCTION_MAX_REWRITE;
/** Sole constructor. */
@ -82,10 +82,10 @@ public final class BlendedTermQuery extends Query {
/**
* Expert: Add a {@link Term} with the provided boost and context.
* This method is useful if you already have a {@link TermContext}
* This method is useful if you already have a {@link TermStates}
* object constructed for the given term.
*/
public Builder add(Term term, float boost, TermContext context) {
public Builder add(Term term, float boost, TermStates context) {
if (numTerms >= BooleanQuery.getMaxClauseCount()) {
throw new BooleanQuery.TooManyClauses();
}
@ -184,10 +184,10 @@ public final class BlendedTermQuery extends Query {
private final Term[] terms;
private final float[] boosts;
private final TermContext[] contexts;
private final TermStates[] contexts;
private final RewriteMethod rewriteMethod;
private BlendedTermQuery(Term[] terms, float[] boosts, TermContext[] contexts,
private BlendedTermQuery(Term[] terms, float[] boosts, TermStates[] contexts,
RewriteMethod rewriteMethod) {
assert terms.length == boosts.length;
assert terms.length == contexts.length;
@ -205,7 +205,7 @@ public final class BlendedTermQuery extends Query {
terms[i] = terms[j];
terms[j] = tmpTerm;
TermContext tmpContext = contexts[i];
TermStates tmpContext = contexts[i];
contexts[i] = contexts[j];
contexts[j] = tmpContext;
@ -263,10 +263,10 @@ public final class BlendedTermQuery extends Query {
@Override
public final Query rewrite(IndexReader reader) throws IOException {
final TermContext[] contexts = Arrays.copyOf(this.contexts, this.contexts.length);
final TermStates[] contexts = Arrays.copyOf(this.contexts, this.contexts.length);
for (int i = 0; i < contexts.length; ++i) {
if (contexts[i] == null || contexts[i].wasBuiltFor(reader.getContext()) == false) {
contexts[i] = TermContext.build(reader.getContext(), terms[i]);
contexts[i] = TermStates.build(reader.getContext(), terms[i], true);
}
}
@ -275,7 +275,7 @@ public final class BlendedTermQuery extends Query {
// ttf will be the sum of all total term freqs
int df = 0;
long ttf = 0;
for (TermContext ctx : contexts) {
for (TermStates ctx : contexts) {
df = Math.max(df, ctx.docFreq());
ttf += ctx.totalTermFreq();
}
@ -294,8 +294,8 @@ public final class BlendedTermQuery extends Query {
return rewriteMethod.rewrite(termQueries);
}
private static TermContext adjustFrequencies(IndexReaderContext readerContext,
TermContext ctx, int artificialDf, long artificialTtf) {
private static TermStates adjustFrequencies(IndexReaderContext readerContext,
TermStates ctx, int artificialDf, long artificialTtf) throws IOException {
List<LeafReaderContext> leaves = readerContext.leaves();
final int len;
if (leaves == null) {
@ -303,9 +303,9 @@ public final class BlendedTermQuery extends Query {
} else {
len = leaves.size();
}
TermContext newCtx = new TermContext(readerContext);
TermStates newCtx = new TermStates(readerContext);
for (int i = 0; i < len; ++i) {
TermState termState = ctx.get(i);
TermState termState = ctx.get(leaves.get(i));
if (termState == null) {
continue;
}
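A hedged sketch of the renamed builder entry points; the field, terms, and prebuiltStates value are invented for illustration:

    BlendedTermQuery.Builder builder = new BlendedTermQuery.Builder();
    builder.add(new Term("body", "quick"), 1f);                  // states built lazily at rewrite
    builder.add(new Term("body", "fast"), 0.5f, prebuiltStates); // reuse an existing TermStates
    Query blended = builder.build();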
@ -48,7 +48,7 @@ final class BooleanWeight extends Weight {
super(query);
this.query = query;
this.scoreMode = scoreMode;
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
this.similarity = searcher.getSimilarity();
weights = new ArrayList<>();
for (BooleanClause c : query) {
Weight w = searcher.createWeight(c.getQuery(), c.isScoring() ? scoreMode : ScoreMode.COMPLETE_NO_SCORES, boost);
@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity;
final class ExactPhraseScorer extends Scorer {
@ -42,13 +41,13 @@ final class ExactPhraseScorer extends Scorer {
private int freq;
private final Similarity.SimScorer docScorer;
private final LeafSimScorer docScorer;
private final boolean needsScores, needsTotalHitCount;
private float matchCost;
private float minCompetitiveScore;
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity.SimScorer docScorer, ScoreMode scoreMode,
LeafSimScorer docScorer, ScoreMode scoreMode,
float matchCost) throws IOException {
super(weight);
this.docScorer = docScorer;
@ -123,7 +122,7 @@ final class ExactPhraseScorer extends Scorer {
@Override
public float maxScore() {
return docScorer.maxScore(Integer.MAX_VALUE);
return docScorer.maxScore();
}
/** Advance the given pos enum to the first doc on or after {@code target}.
@ -32,7 +32,6 @@ import java.util.concurrent.Future;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.IndexWriter;
@ -40,7 +39,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
@ -75,36 +74,6 @@ import org.apache.lucene.util.ThreadInterruptedException;
*/
public class IndexSearcher {
/** A search-time {@link Similarity} that does not make use of scoring factors
* and may be used when scores are not needed. */
private static final Similarity NON_SCORING_SIMILARITY = new Similarity() {
@Override
public long computeNorm(FieldInvertState state) {
throw new UnsupportedOperationException("This Similarity may only be used for searching, not indexing");
}
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new SimWeight() {};
}
@Override
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
return new SimScorer() {
@Override
public float score(int doc, float freq) {
return 0f;
}
@Override
public float maxScore(float maxFreq) {
return 0f;
}
};
}
};
private static QueryCache DEFAULT_QUERY_CACHE;
private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy();
static {
@ -136,7 +105,7 @@ public class IndexSearcher {
* Expert: returns a default Similarity instance.
* In general, this method is only called to initialize searchers and writers.
* User code and query implementations should respect
* {@link IndexSearcher#getSimilarity(boolean)}.
* {@link IndexSearcher#getSimilarity()}.
* @lucene.internal
*/
public static Similarity getDefaultSimilarity() {
@ -329,15 +298,11 @@ public class IndexSearcher {
this.similarity = similarity;
}
/** Expert: Get the {@link Similarity} to use to compute scores. When
* {@code needsScores} is {@code false}, this method will return a simple
* {@link Similarity} that does not leverage scoring factors such as norms.
* When {@code needsScores} is {@code true}, this returns the
/** Expert: Get the {@link Similarity} to use to compute scores. This returns the
* {@link Similarity} that has been set through {@link #setSimilarity(Similarity)}
* or the {@link #getDefaultSimilarity()} default {@link Similarity} if none
* has been set explicitly. */
public Similarity getSimilarity(boolean needsScores) {
return needsScores ? similarity : NON_SCORING_SIMILARITY;
* or the default {@link Similarity} if none has been set explicitly. */
public Similarity getSimilarity() {
return similarity;
}
/**
@ -774,7 +739,7 @@ public class IndexSearcher {
* across a distributed collection.
* @lucene.experimental
*/
public TermStatistics termStatistics(Term term, TermContext context) throws IOException {
public TermStatistics termStatistics(Term term, TermStates context) throws IOException {
if (context.docFreq() == 0) {
return null;
} else {
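For callers migrating off the removed boolean overload, a sketch of the replacement pattern this patch implies:

    // Before: Similarity sim = searcher.getSimilarity(scoreMode.needsScores());
    // After: always the configured similarity; scoring vs. non-scoring is now
    // expressed by the needsScores flag handed to LeafSimScorer instead.
    Similarity sim = searcher.getSimilarity();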
@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
* {@link SimScorer} on a specific {@link LeafReader}.
*/
public final class LeafSimScorer {
private final SimScorer scorer;
private final NumericDocValues norms;
private final float maxScore;
/**
* Sole constructor: Score documents of {@code reader} with {@code scorer}.
*/
public LeafSimScorer(SimScorer scorer, LeafReader reader, boolean needsScores, float maxFreq) throws IOException {
this.scorer = scorer;
norms = needsScores ? reader.getNormValues(scorer.getField()) : null;
maxScore = needsScores ? scorer.score(maxFreq, 1) : Float.MAX_VALUE;
}
private long getNormValue(int doc) throws IOException {
if (norms != null) {
boolean found = norms.advanceExact(doc);
assert found;
return norms.longValue();
} else {
return 1L; // default norm
}
}
/** Score the provided document assuming the given term document frequency.
* This method must be called on non-decreasing sequences of doc ids.
* @see SimScorer#score(float, long) */
public float score(int doc, float freq) throws IOException {
return scorer.score(freq, getNormValue(doc));
}
/** Explain the score for the provided document assuming the given term document frequency.
* This method must be called on non-decreasing sequences of doc ids.
* @see SimScorer#explain(Explanation, long) */
public Explanation explain(int doc, Explanation freqExpl) throws IOException {
return scorer.explain(freqExpl, getNormValue(doc));
}
/**
* Return an upper bound of the score.
*/
public float maxScore() {
return maxScore;
}
}
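A minimal usage sketch inside a hypothetical Weight.scorer(LeafReaderContext) body; simScorer, context, scoreMode, docId, and freq are assumed to be in scope:

    LeafSimScorer leafScorer =
        new LeafSimScorer(simScorer, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE);
    float score = leafScorer.score(docId, freq); // doc ids must be non-decreasing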
@ -18,19 +18,26 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@ -183,36 +190,38 @@ public class MultiPhraseQuery extends Query {
private class MultiPhraseWeight extends Weight {
private final Similarity similarity;
private final Similarity.SimWeight stats;
private final Map<Term,TermContext> termContexts = new HashMap<>();
private final Similarity.SimScorer stats;
private final Map<Term,TermStates> termStates = new HashMap<>();
private final ScoreMode scoreMode;
public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
super(MultiPhraseQuery.this);
this.scoreMode = scoreMode;
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
this.similarity = searcher.getSimilarity();
final IndexReaderContext context = searcher.getTopReaderContext();
// compute idf
ArrayList<TermStatistics> allTermStats = new ArrayList<>();
for(final Term[] terms: termArrays) {
for (Term term: terms) {
TermContext termContext = termContexts.get(term);
if (termContext == null) {
termContext = TermContext.build(context, term);
termContexts.put(term, termContext);
TermStates ts = termStates.get(term);
if (ts == null) {
ts = TermStates.build(context, term, scoreMode.needsScores());
termStates.put(term, ts);
}
TermStatistics termStatistics = searcher.termStatistics(term, termContext);
if (termStatistics != null) {
allTermStats.add(termStatistics);
if (scoreMode.needsScores()) {
TermStatistics termStatistics = searcher.termStatistics(term, ts);
if (termStatistics != null) {
allTermStats.add(termStatistics);
}
}
}
}
if (allTermStats.isEmpty()) {
stats = null; // none of the terms were found, we won't use sim at all
} else {
stats = similarity.computeWeight(
stats = similarity.scorer(
boost,
searcher.collectionStatistics(field),
allTermStats.toArray(new TermStatistics[allTermStats.size()]));
@ -253,7 +262,7 @@ public class MultiPhraseQuery extends Query {
List<PostingsEnum> postings = new ArrayList<>();
for (Term term : terms) {
TermState termState = termContexts.get(term).get(context.ord);
TermState termState = termStates.get(term).get(context);
if (termState != null) {
termsEnum.seekExact(term.bytes(), termState);
postings.add(termsEnum.postings(null, PostingsEnum.POSITIONS));
@ -282,11 +291,11 @@ public class MultiPhraseQuery extends Query {
if (slop == 0) {
return new ExactPhraseScorer(this, postingsFreqs,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
scoreMode, totalMatchCost);
} else {
return new SloppyPhraseScorer(this, postingsFreqs, slop,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE),
scoreMode.needsScores(), totalMatchCost);
}
}
@ -303,7 +312,7 @@ public class MultiPhraseQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
SimScorer docScorer = similarity.simScorer(stats, context);
LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
@ -24,7 +24,7 @@ import org.apache.lucene.index.FilteredTermsEnum; // javadocs
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SingleTermsEnum; // javadocs
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanQuery.Builder;
@ -166,7 +166,7 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermContext states) {
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermStates states) {
final TermQuery tq = new TermQuery(term, states);
topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD);
}
@ -218,7 +218,7 @@ public abstract class MultiTermQuery extends Query {
@Override
protected void addClause(BlendedTermQuery.Builder topLevel, Term term, int docCount,
float boost, TermContext states) {
float boost, TermStates states) {
topLevel.add(term, boost, states);
}
}
@ -262,7 +262,7 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermContext states) {
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermStates states) {
final Query q = new ConstantScoreQuery(new TermQuery(term, states));
topLevel.add(new BoostQuery(q, boost), BooleanClause.Occur.SHOULD);
}
@ -25,7 +25,7 @@ import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -148,9 +148,9 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
// build a boolean query
BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (TermAndState t : collectedTerms) {
final TermContext termContext = new TermContext(searcher.getTopReaderContext());
termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD);
final TermStates termStates = new TermStates(searcher.getTopReaderContext());
termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
bq.add(new TermQuery(new Term(query.field, t.term), termStates), Occur.SHOULD);
}
Query q = new ConstantScoreQuery(bq.build());
final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
@ -32,12 +32,11 @@ import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@ -352,9 +351,9 @@ public class PhraseQuery extends Query {
private class PhraseWeight extends Weight {
private final Similarity similarity;
private final Similarity.SimWeight stats;
private final Similarity.SimScorer stats;
private final ScoreMode scoreMode;
private transient TermContext states[];
private transient TermStates states[];
public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
@ -366,21 +365,23 @@ public class PhraseQuery extends Query {
throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
}
this.scoreMode = scoreMode;
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
this.similarity = searcher.getSimilarity();
final IndexReaderContext context = searcher.getTopReaderContext();
states = new TermContext[terms.length];
states = new TermStates[terms.length];
TermStatistics termStats[] = new TermStatistics[terms.length];
int termUpTo = 0;
for (int i = 0; i < terms.length; i++) {
final Term term = terms[i];
states[i] = TermContext.build(context, term);
TermStatistics termStatistics = searcher.termStatistics(term, states[i]);
if (termStatistics != null) {
termStats[termUpTo++] = termStatistics;
states[i] = TermStates.build(context, term, scoreMode.needsScores());
if (scoreMode.needsScores()) {
TermStatistics termStatistics = searcher.termStatistics(term, states[i]);
if (termStatistics != null) {
termStats[termUpTo++] = termStatistics;
}
}
}
if (termUpTo > 0) {
stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
stats = similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
} else {
stats = null; // no terms at all, we won't use similarity
}
@ -415,7 +416,7 @@ public class PhraseQuery extends Query {
for (int i = 0; i < terms.length; i++) {
final Term t = terms[i];
final TermState state = states[i].get(context.ord);
final TermState state = states[i].get(context);
if (state == null) { /* term doesnt exist in this segment */
assert termNotInReader(reader, t): "no termstate found but term exists in reader";
return null;
@ -433,11 +434,11 @@ public class PhraseQuery extends Query {
if (slop == 0) { // optimize exact case
return new ExactPhraseScorer(this, postingsFreqs,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
scoreMode, totalMatchCost);
} else {
return new SloppyPhraseScorer(this, postingsFreqs, slop,
similarity.simScorer(stats, context),
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE),
scoreMode.needsScores(), totalMatchCost);
}
}
@ -459,7 +460,7 @@ public class PhraseQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
SimScorer docScorer = similarity.simScorer(stats, context);
LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
@ -20,7 +20,7 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
@ -64,7 +64,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
@Override
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount,
float boost, TermContext states) {
float boost, TermStates states) {
final TermQuery tq = new TermQuery(term, states);
topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD);
}
@ -109,7 +109,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
if (size > 0) {
final int sort[] = col.terms.sort();
final float[] boost = col.array.boost;
final TermContext[] termStates = col.array.termState;
final TermStates[] termStates = col.array.termState;
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
@ -146,7 +146,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
} else {
// new entry: we populate the entry initially
array.boost[e] = boostAtt.getBoost();
array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
array.termState[e] = new TermStates(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
ScoringRewrite.this.checkMaxClauseCount(terms.size());
}
return true;
@ -156,7 +156,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
float[] boost;
TermContext[] termState;
TermStates[] termState;
public TermFreqBoostByteStart(int initSize) {
super(initSize);
@ -166,7 +166,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
public int[] init() {
final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, Float.BYTES)];
termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
termState = new TermStates[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length && boost.length >= ord.length;
return ord;
}
@ -176,7 +176,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
final int[] ord = super.grow();
boost = ArrayUtil.grow(boost, ord.length);
if (termState.length < ord.length) {
TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
TermStates[] tmpTermState = new TermStates[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}
@ -26,7 +26,6 @@ import java.util.HashSet;
import java.util.LinkedHashMap;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.FixedBitSet;
final class SloppyPhraseScorer extends Scorer {
@ -36,7 +35,7 @@ final class SloppyPhraseScorer extends Scorer {
private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq().
private final Similarity.SimScorer docScorer;
private final LeafSimScorer docScorer;
private final int slop;
private final int numPostings;
@ -55,7 +54,7 @@ final class SloppyPhraseScorer extends Scorer {
private final float matchCost;
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
int slop, Similarity.SimScorer docScorer, boolean needsScores,
int slop, LeafSimScorer docScorer, boolean needsScores,
float matchCost) {
super(weight);
this.docScorer = docScorer;
@ -558,7 +557,7 @@ final class SloppyPhraseScorer extends Scorer {
@Override
public float maxScore() {
return docScorer.maxScore(Float.POSITIVE_INFINITY);
return docScorer.maxScore();
}
@Override
@ -31,11 +31,10 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.BytesRef;
/**
@ -127,28 +126,28 @@ public final class SynonymQuery extends Query {
}
class SynonymWeight extends Weight {
private final TermContext termContexts[];
private final TermStates termStates[];
private final Similarity similarity;
private final Similarity.SimWeight simWeight;
private final Similarity.SimScorer simWeight;
SynonymWeight(Query query, IndexSearcher searcher, float boost) throws IOException {
super(query);
CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].field());
long docFreq = 0;
long totalTermFreq = 0;
termContexts = new TermContext[terms.length];
for (int i = 0; i < termContexts.length; i++) {
termContexts[i] = TermContext.build(searcher.getTopReaderContext(), terms[i]);
TermStatistics termStats = searcher.termStatistics(terms[i], termContexts[i]);
termStates = new TermStates[terms.length];
for (int i = 0; i < termStates.length; i++) {
termStates[i] = TermStates.build(searcher.getTopReaderContext(), terms[i], true);
TermStatistics termStats = searcher.termStatistics(terms[i], termStates[i]);
if (termStats != null) {
docFreq = Math.max(termStats.docFreq(), docFreq);
totalTermFreq += termStats.totalTermFreq();
}
}
this.similarity = searcher.getSimilarity(true);
this.similarity = searcher.getSimilarity();
if (docFreq > 0) {
TermStatistics pseudoStats = new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
this.simWeight = similarity.computeWeight(boost, collectionStats, pseudoStats);
this.simWeight = similarity.scorer(boost, collectionStats, pseudoStats);
} else {
this.simWeight = null; // no terms exist at all, we won't use similarity
}
@ -175,7 +174,7 @@ public final class SynonymQuery extends Query {
assert scorer instanceof TermScorer;
freq = ((TermScorer)scorer).freq();
}
SimScorer docScorer = similarity.simScorer(simWeight, context);
LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), true, Float.MAX_VALUE);
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
@ -190,7 +189,6 @@ public final class SynonymQuery extends Query {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
Similarity.SimScorer simScorer = null;
IndexOptions indexOptions = IndexOptions.NONE;
if (terms.length > 0) {
FieldInfo info = context.reader()
@ -202,21 +200,17 @@ public final class SynonymQuery extends Query {
}
// we use termscorers + disjunction as an impl detail
List<Scorer> subScorers = new ArrayList<>();
long maxFreq = 0;
long totalMaxFreq = 0;
for (int i = 0; i < terms.length; i++) {
TermState state = termContexts[i].get(context.ord);
TermState state = termStates[i].get(context);
if (state != null) {
TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
termsEnum.seekExact(terms[i].bytes(), state);
maxFreq += getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
long termMaxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
totalMaxFreq += termMaxFreq;
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
// lazy init sim, in case no terms exist
if (simScorer == null) {
simScorer = similarity.simScorer(simWeight, context);
}
subScorers.add(new TermScorer(this, postings, simScorer, Float.POSITIVE_INFINITY));
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, termMaxFreq);
subScorers.add(new TermScorer(this, postings, simScorer));
}
}
if (subScorers.isEmpty()) {
@ -225,7 +219,8 @@ public final class SynonymQuery extends Query {
// we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs
return subScorers.get(0);
} else {
return new SynonymScorer(simScorer, this, subScorers, maxFreq);
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, totalMaxFreq);
return new SynonymScorer(simScorer, this, subScorers);
}
}
@ -248,13 +243,11 @@ public final class SynonymQuery extends Query {
}
static class SynonymScorer extends DisjunctionScorer {
private final Similarity.SimScorer similarity;
private final float maxFreq;
private final LeafSimScorer similarity;
SynonymScorer(Similarity.SimScorer similarity, Weight weight, List<Scorer> subScorers, float maxFreq) {
SynonymScorer(LeafSimScorer similarity, Weight weight, List<Scorer> subScorers) {
super(weight, subScorers, true);
this.similarity = similarity;
this.maxFreq = maxFreq;
}
@Override
@ -264,7 +257,7 @@ public final class SynonymQuery extends Query {
@Override
public float maxScore() {
return similarity.maxScore(maxFreq);
return similarity.maxScore();
}
/** combines TF of all subs. */
@ -23,7 +23,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.AttributeSource;
@ -43,7 +43,7 @@ abstract class TermCollectingRewrite<B> extends MultiTermQuery.RewriteMethod {
addClause(topLevel, term, docCount, boost, null);
}
protected abstract void addClause(B topLevel, Term term, int docCount, float boost, TermContext states) throws IOException;
protected abstract void addClause(B topLevel, Term term, int docCount, float boost, TermStates states) throws IOException;
final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {

View File

@ -33,7 +33,7 @@ import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -268,9 +268,9 @@ public class TermInSetQuery extends Query implements Accountable {
assert builder == null;
BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (TermAndState t : matchingTerms) {
final TermContext termContext = new TermContext(searcher.getTopReaderContext());
termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
final TermStates termStates = new TermStates(searcher.getTopReaderContext());
termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
bq.add(new TermQuery(new Term(t.field, t.term), termStates), Occur.SHOULD);
}
Query q = new ConstantScoreQuery(bq.build());
final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
@ -28,12 +28,10 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
* A Query that matches documents containing a term. This may be combined with
@ -42,23 +40,23 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
public class TermQuery extends Query {
private final Term term;
private final TermContext perReaderTermState;
private final TermStates perReaderTermState;
final class TermWeight extends Weight {
private final Similarity similarity;
private final Similarity.SimWeight stats;
private final TermContext termStates;
private final Similarity.SimScorer simScorer;
private final TermStates termStates;
private final boolean needsScores;
public TermWeight(IndexSearcher searcher, boolean needsScores,
float boost, TermContext termStates) throws IOException {
float boost, TermStates termStates) throws IOException {
super(TermQuery.this);
if (needsScores && termStates == null) {
throw new IllegalStateException("termStates are required when scores are needed");
}
this.needsScores = needsScores;
this.termStates = termStates;
this.similarity = searcher.getSimilarity(needsScores);
this.similarity = searcher.getSimilarity();
final CollectionStatistics collectionStats;
final TermStatistics termStats;
@ -72,9 +70,9 @@ public class TermQuery extends Query {
}
if (termStats == null) {
this.stats = null; // term doesn't exist in any segment, we won't use similarity at all
this.simScorer = null; // term doesn't exist in any segment, we won't use similarity at all
} else {
this.stats = similarity.computeWeight(boost, collectionStats, termStats);
this.simScorer = similarity.scorer(boost, collectionStats, termStats);
}
}
@ -101,8 +99,8 @@ public class TermQuery extends Query {
.getIndexOptions();
PostingsEnum docs = termsEnum.postings(null, needsScores ? PostingsEnum.FREQS : PostingsEnum.NONE);
assert docs != null;
return new TermScorer(this, docs, similarity.simScorer(stats, context),
getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq()));
float maxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
return new TermScorer(this, docs, new LeafSimScorer(simScorer, context.reader(), needsScores, maxFreq));
}
private long getMaxFreq(IndexOptions indexOptions, long ttf, long df) {
@ -126,30 +124,17 @@ public class TermQuery extends Query {
* the term does not exist in the given context
*/
private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException {
if (termStates != null) {
// TermQuery either used as a Query or the term states have been provided at construction time
assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
final TermState state = termStates.get(context.ord);
if (state == null) { // term is not present in that reader
assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
return null;
}
final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
termsEnum.seekExact(term.bytes(), state);
return termsEnum;
} else {
// TermQuery used as a filter, so the term states have not been built up front
Terms terms = context.reader().terms(term.field());
if (terms == null) {
return null;
}
final TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(term.bytes())) {
return termsEnum;
} else {
return null;
}
assert termStates != null;
assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) :
"The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context) + ")";
final TermState state = termStates.get(context);
if (state == null) { // term is not present in that reader
assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
return null;
}
final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
termsEnum.seekExact(term.bytes(), state);
return termsEnum;
}
private boolean termNotInReader(LeafReader reader, Term term) throws IOException {
@ -166,7 +151,7 @@ public class TermQuery extends Query {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SimScorer docScorer = similarity.simScorer(stats, context);
LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Integer.MAX_VALUE);
Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document");
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
@ -190,7 +175,7 @@ public class TermQuery extends Query {
* Expert: constructs a TermQuery that will use the provided docFreq instead
* of looking up the docFreq against the searcher.
*/
public TermQuery(Term t, TermContext states) {
public TermQuery(Term t, TermStates states) {
assert states != null;
term = Objects.requireNonNull(t);
perReaderTermState = Objects.requireNonNull(states);
@ -204,18 +189,10 @@ public class TermQuery extends Query {
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final IndexReaderContext context = searcher.getTopReaderContext();
final TermContext termState;
final TermStates termState;
if (perReaderTermState == null
|| perReaderTermState.wasBuiltFor(context) == false) {
if (scoreMode.needsScores()) {
// make TermQuery single-pass if we don't have a PRTS or if the context
// differs!
termState = TermContext.build(context, term);
} else {
// do not compute the term state, this will help save seeks in the terms
// dict on segments that have a cache entry for this query
termState = null;
}
termState = TermStates.build(context, term, scoreMode.needsScores());
} else {
// PRTS was pre-build for this IS
termState = this.perReaderTermState;
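
Net effect of this file's change: TermStates replaces TermContext, and the term state is now always built up front in a single pass, with a flag saying whether collection statistics are also needed for scoring. A minimal sketch of the precomputed-states path, using only the constructors and methods shown above (the field and term values, and the in-scope searcher, are assumptions):

    // Build per-reader term states once, then hand them to the query.
    IndexReaderContext topContext = searcher.getTopReaderContext();
    Term term = new Term("body", "lucene");
    TermStates states = TermStates.build(topContext, term, true); // true: also gather scoring stats
    TermQuery query = new TermQuery(term, states);
    TopDocs hits = searcher.search(query, 10);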

View File

@ -20,14 +20,12 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity;
/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
*/
final class TermScorer extends Scorer {
private final PostingsEnum postingsEnum;
private final Similarity.SimScorer docScorer;
private final float maxFreq;
private final LeafSimScorer docScorer;
/**
* Construct a <code>TermScorer</code>.
@ -39,14 +37,11 @@ final class TermScorer extends Scorer {
* @param docScorer
* The <code>LeafSimScorer</code> implementation
* to be used for score computations.
* @param maxFreq
* An upper bound of the term frequency of the searched term in any document.
*/
TermScorer(Weight weight, PostingsEnum td, Similarity.SimScorer docScorer, float maxFreq) {
TermScorer(Weight weight, PostingsEnum td, LeafSimScorer docScorer) {
super(weight);
this.docScorer = docScorer;
this.postingsEnum = td;
this.maxFreq = maxFreq;
}
@Override
@ -71,7 +66,7 @@ final class TermScorer extends Scorer {
@Override
public float maxScore() {
return docScorer.maxScore(maxFreq);
return docScorer.maxScore();
}
/** Returns a string representation of this <code>TermScorer</code>. */
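
Taken together with the TermQuery change above, per-segment norm lookup now lives in LeafSimScorer rather than in each Scorer. A hedged sketch of the wrapping step, mirroring TermWeight.scorer() earlier in this commit (simScorer, weight, postingsEnum, needsScores and maxFreq are assumed to be in scope):

    LeafSimScorer leafScorer = new LeafSimScorer(simScorer, context.reader(), needsScores, maxFreq);
    TermScorer scorer = new TermScorer(weight, postingsEnum, leafScorer);
    float bound = leafScorer.maxScore(); // no maxFreq argument: the bound is precomputed per leaf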

View File

@ -25,7 +25,7 @@ import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
@ -82,7 +82,7 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
// lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null)
st = new ScoreTerm(new TermContext(topReaderContext));
st = new ScoreTerm(new TermStates(topReaderContext));
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@ -139,7 +139,7 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
visitedTerms.remove(st.bytes.get());
st.termState.clear(); // reset the termstate!
} else {
st = new ScoreTerm(new TermContext(topReaderContext));
st = new ScoreTerm(new TermStates(topReaderContext));
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
@ -193,8 +193,8 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
static final class ScoreTerm implements Comparable<ScoreTerm> {
public final BytesRefBuilder bytes = new BytesRefBuilder();
public float boost;
public final TermContext termState;
public ScoreTerm(TermContext termState) {
public final TermStates termState;
public ScoreTerm(TermStates termState) {
this.termState = termState;
}

View File

@ -378,7 +378,7 @@
* scored the way it was.
* Typically a weight such as TermWeight
* that scores via a {@link org.apache.lucene.search.similarities.Similarity Similarity} will make use of the Similarity's implementation:
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(int, Explanation) SimScorer#explain(int doc, Explanation freq)}.
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(Explanation, long) SimScorer#explain(Explanation freq, long norm)}.
* </li>
* </ol>
* <a name="scorerClass"></a>
@ -402,7 +402,7 @@
* {@link org.apache.lucene.search.Scorer#score score()} &mdash; Return the score of the
* current document. This value can be determined in any appropriate way for an application. For instance, the
* {@link org.apache.lucene.search.TermScorer TermScorer} simply defers to the configured Similarity:
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(int, float) SimScorer.score(int doc, float freq)}.
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(float, long) SimScorer.score(float freq, long norm)}.
* </li>
* <li>
* {@link org.apache.lucene.search.Scorer#getChildren getChildren()} &mdash; Returns any child subscorers

View File

@ -112,18 +112,12 @@ public abstract class Axiomatic extends SimilarityBase {
return Math.max(0, score);
}
@Override
protected double maxScore(BasicStats stats, double maxFreq) {
// TODO: can we compute a better upper bound on the produced scores
return Double.POSITIVE_INFINITY;
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
double f = freq.getValue().doubleValue();
explain(subs, stats, doc, f, docLen);
explain(subs, stats, f, docLen);
double score = tf(stats, f, docLen)
* ln(stats, f, docLen)
@ -132,7 +126,7 @@ public abstract class Axiomatic extends SimilarityBase {
- gamma(stats, f, docLen);
Explanation explanation = Explanation.match((float) score,
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
"score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:",
subs);
if (stats.boost != 1f) {
explanation = Explanation.match((float) (score * stats.boost), "Boosted score, computed as (score * boost) from:",
@ -148,7 +142,7 @@ public abstract class Axiomatic extends SimilarityBase {
}
@Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
protected void explain(List<Explanation> subs, BasicStats stats,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(),
@ -165,7 +159,7 @@ public abstract class Axiomatic extends SimilarityBase {
subs.add(tflnExplain(stats, freq, docLen));
subs.add(idfExplain(stats, freq, docLen));
subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma"));
super.explain(subs, stats, doc, freq, docLen);
super.explain(subs, stats, freq, docLen);
}
/**

View File

@ -17,13 +17,10 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -176,7 +173,7 @@ public class BM25Similarity extends Similarity {
}
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
float avgdl = avgFieldLength(collectionStats);
@ -184,100 +181,17 @@ public class BM25Similarity extends Similarity {
for (int i = 0; i < cache.length; i++) {
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
}
return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
BM25Stats bm25stats = (BM25Stats) stats;
return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
}
private class BM25DocScorer extends SimScorer {
private final BM25Stats stats;
private final float weightValue; // boost * idf * (k1 + 1)
private final NumericDocValues norms;
/** precomputed cache for all length values */
private final float[] lengthCache;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float[] cache;
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
this.stats = stats;
this.weightValue = stats.weight;
this.norms = norms;
lengthCache = LENGTH_TABLE;
cache = stats.cache;
}
@Override
public float score(int doc, float freq) throws IOException {
// if there are no norms, we act as if b=0
double norm;
if (norms == null) {
norm = k1;
} else {
boolean found = norms.advanceExact(doc);
assert found;
norm = cache[((byte) norms.longValue()) & 0xFF];
}
return weightValue * (float) (freq / (freq + norm));
}
@Override
public float maxScore(float maxFreq) {
// TODO: leverage maxFreq and the min norm from the cache
return weightValue;
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
List<Explanation> subs = new ArrayList<>();
subs.addAll(stats.explain());
Explanation tfExpl = explainTF(doc, freq);
subs.add(tfExpl);
return Explanation.match(stats.weight * tfExpl.getValue().floatValue(),
"score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs);
}
private Explanation explainTF(int doc, Explanation freq) throws IOException {
List<Explanation> subs = new ArrayList<>();
subs.add(freq);
subs.add(Explanation.match(k1, "k1, term saturation parameter"));
if (norms == null) {
subs.add(Explanation.match(0, "b, field omits length norms"));
return Explanation.match(
(float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) k1)),
"tf, computed as freq / (freq + k1) from:", subs);
} else {
boolean found = norms.advanceExact(doc);
assert found;
byte norm = (byte) norms.longValue();
float doclen = lengthCache[norm & 0xff];
subs.add(Explanation.match(b, "b, length normalization parameter"));
if ((norm & 0xFF) > 39) {
subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
} else {
subs.add(Explanation.match(doclen, "dl, length of field"));
}
subs.add(Explanation.match(stats.avgdl, "avgdl, average length of field"));
float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl);
return Explanation.match(
(float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)),
"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
}
}
return new BM25Scorer(collectionStats.field(), boost, k1, b, idf, avgdl, cache);
}
/** Collection statistics for the BM25 model. */
private static class BM25Stats extends SimWeight {
/** field name, for pulling norms */
private final String field;
private static class BM25Scorer extends SimScorer {
/** query boost */
private final float boost;
/** k1 value for scale factor */
private final float k1;
/** b value for length normalization impact */
private final float b;
/** BM25's idf */
private final Explanation idf;
/** The average document length. */
@ -287,17 +201,51 @@ public class BM25Similarity extends Similarity {
/** weight (idf * boost) */
private final float weight;
BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) {
this.field = field;
BM25Scorer(String field, float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) {
super(field);
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
this.k1 = k1;
this.b = b;
this.cache = cache;
this.weight = (k1 + 1) * boost * idf.getValue().floatValue();
}
private List<Explanation> explain() {
@Override
public float score(float freq, long encodedNorm) {
double norm = cache[((byte) encodedNorm) & 0xFF];
return weight * (float) (freq / (freq + norm));
}
@Override
public Explanation explain(Explanation freq, long encodedNorm) {
List<Explanation> subs = new ArrayList<>(explainConstantFactors());
Explanation tfExpl = explainTF(freq, encodedNorm);
subs.add(tfExpl);
return Explanation.match(weight * tfExpl.getValue().floatValue(),
"score(freq="+freq.getValue()+"), product of:", subs);
}
private Explanation explainTF(Explanation freq, long norm) {
List<Explanation> subs = new ArrayList<>();
subs.add(freq);
subs.add(Explanation.match(k1, "k1, term saturation parameter"));
float doclen = LENGTH_TABLE[((byte) norm) & 0xff];
subs.add(Explanation.match(b, "b, length normalization parameter"));
if ((norm & 0xFF) > 39) {
subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
} else {
subs.add(Explanation.match(doclen, "dl, length of field"));
}
subs.add(Explanation.match(avgdl, "avgdl, average length of field"));
float normValue = k1 * ((1 - b) + b * doclen / avgdl);
return Explanation.match(
(float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)),
"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
}
private List<Explanation> explainConstantFactors() {
List<Explanation> subs = new ArrayList<>();
// scale factor
subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1"));
@ -311,7 +259,6 @@ public class BM25Similarity extends Similarity {
}
}
@Override
public String toString() {
return "BM25(k1=" + k1 + ",b=" + b + ")";

View File

@ -23,7 +23,7 @@ import org.apache.lucene.index.Terms;
* Stores all statistics commonly used ranking methods.
* @lucene.experimental
*/
public class BasicStats extends Similarity.SimWeight {
public class BasicStats {
final String field;
/** The number of documents. */
protected long numberOfDocuments;

View File

@ -16,10 +16,7 @@
*/
package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -47,44 +44,31 @@ public class BooleanSimilarity extends Similarity {
}
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new BooleanWeight(boost);
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new BooleanWeight(collectionStats.field(), boost);
}
private static class BooleanWeight extends SimWeight {
private static class BooleanWeight extends SimScorer {
final float boost;
BooleanWeight(float boost) {
BooleanWeight(String field, float boost) {
super(field);
this.boost = boost;
}
@Override
public float score(float freq, long norm) {
return boost;
}
@Override
public Explanation explain(Explanation freq, long norm) {
Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
return Explanation.match(
queryBoostExpl.getValue(),
"score(" + getClass().getSimpleName() + "), computed from:",
queryBoostExpl);
}
}
@Override
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
final float boost = ((BooleanWeight) weight).boost;
return new SimScorer() {
@Override
public float score(int doc, float freq) throws IOException {
return boost;
}
@Override
public float maxScore(float maxFreq) {
return boost;
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
return Explanation.match(
queryBoostExpl.getValue(),
"score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
queryBoostExpl);
}
};
}
}
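
BooleanSimilarity thus collapses to a single flat scorer class under the new API. A minimal usage sketch (the analyzer, writer and searcher setup are assumed); the same Similarity should be set at both index and search time:

    IndexWriterConfig config = new IndexWriterConfig(analyzer)
        .setSimilarity(new BooleanSimilarity());
    // ... index documents ...
    searcher.setSimilarity(new BooleanSimilarity());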

View File

@ -64,12 +64,6 @@ public class DFISimilarity extends SimilarityBase {
return stats.getBoost() * log2(measure + 1);
}
@Override
protected double maxScore(BasicStats stats, double maxFreq) {
// TODO: can we compute a better upper bound on the produced scores
return Double.POSITIVE_INFINITY;
}
/**
* Returns the measure of independence
*/
@ -79,12 +73,12 @@ public class DFISimilarity extends SimilarityBase {
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
final double expected = (stats.getTotalTermFreq() + 1) * docLen /
(stats.getNumberOfFieldTokens() + 1);
if (freq.getValue().doubleValue() <= expected){
return Explanation.match((float) 0, "score(" +
getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), equals to 0");
}
Explanation explExpected = Explanation.match((float) expected,
@ -103,7 +97,7 @@ public class DFISimilarity extends SimilarityBase {
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
"score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * log2(measure + 1) from:",
Explanation.match( (float)stats.getBoost(), "boost, query boost"),
explMeasure);

View File

@ -113,15 +113,9 @@ public class DFRSimilarity extends SimilarityBase {
return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
}
@Override
protected double maxScore(BasicStats stats, double maxFreq) {
// TODO: can we compute a better upper bound on the produced scores
return Double.POSITIVE_INFINITY;
}
@Override
protected void explain(List<Explanation> subs,
BasicStats stats, int doc, double freq, double docLen) {
BasicStats stats, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost"));
}
@ -136,13 +130,13 @@ public class DFRSimilarity extends SimilarityBase {
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
"score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:",
subs);

View File

@ -104,15 +104,9 @@ public class IBSimilarity extends SimilarityBase {
lambda.lambda(stats));
}
@Override
protected double maxScore(BasicStats stats, double maxFreq) {
// TODO: can we compute a better upper bound on the produced scores
return Double.POSITIVE_INFINITY;
}
@Override
protected void explain(
List<Explanation> subs, BasicStats stats, int doc, double freq, double docLen) {
List<Explanation> subs, BasicStats stats, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float)stats.getBoost(), "boost, query boost"));
}
@ -125,13 +119,13 @@ public class IBSimilarity extends SimilarityBase {
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
"score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"distribution.score(stats, normalization.tfn(stats, freq," +
" docLen), lambda.lambda(stats)) from:",

View File

@ -78,13 +78,7 @@ public class LMDirichletSimilarity extends LMSimilarity {
}
@Override
protected double maxScore(BasicStats stats, double maxFreq) {
// TODO: can we compute a better upper bound on the produced scores
return Double.POSITIVE_INFINITY;
}
@Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
protected void explain(List<Explanation> subs, BasicStats stats,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "query boost"));
@ -107,18 +101,18 @@ public class LMDirichletSimilarity extends LMSimilarity {
(float)Math.log(mu / (docLen + mu)),
"document norm, computed as log(mu / (dl + mu))"));
subs.add(Explanation.match((float) docLen,"dl, length of field"));
super.explain(subs, stats, doc, freq, docLen);
super.explain(subs, stats, freq, docLen);
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
"score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"(term weight + document norm) from:",
subs);

View File

@ -68,13 +68,7 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
}
@Override
protected double maxScore(BasicStats stats, double maxFreq) {
// TODO: can we compute a better upper bound on the produced scores
return Double.POSITIVE_INFINITY;
}
@Override
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
protected void explain(List<Explanation> subs, BasicStats stats,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
@ -88,18 +82,18 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
"freq, number of occurrences of term in the document");
subs.add(explFreq);
subs.add(Explanation.match((float) docLen,"dl, length of field"));
super.explain(subs, stats, doc, freq, docLen);
super.explain(subs, stats, freq, docLen);
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
explain(subs, stats, freq.getValue().doubleValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().doubleValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
"score(" + getClass().getSimpleName() + ", freq=" +
freq.getValue() +"), computed as boost * " +
"log(1 + ((1 - lambda) * freq / dl) /(lambda * P)) from:",
subs);

View File

@ -70,7 +70,7 @@ public abstract class LMSimilarity extends SimilarityBase {
}
@Override
protected void explain(List<Explanation> subExpls, BasicStats stats, int doc,
protected void explain(List<Explanation> subExpls, BasicStats stats,
double freq, double docLen) {
subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats),
"collection probability"));

View File

@ -17,12 +17,10 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -49,64 +47,39 @@ public class MultiSimilarity extends Similarity {
}
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
SimWeight subStats[] = new SimWeight[sims.length];
for (int i = 0; i < subStats.length; i++) {
subStats[i] = sims[i].computeWeight(boost, collectionStats, termStats);
}
return new MultiStats(subStats);
}
@Override
public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
SimScorer subScorers[] = new SimScorer[sims.length];
for (int i = 0; i < subScorers.length; i++) {
subScorers[i] = sims[i].simScorer(((MultiStats)stats).subStats[i], context);
subScorers[i] = sims[i].scorer(boost, collectionStats, termStats);
}
return new MultiSimScorer(subScorers);
return new MultiSimScorer(collectionStats.field(), subScorers);
}
static class MultiSimScorer extends SimScorer {
private final SimScorer subScorers[];
MultiSimScorer(SimScorer subScorers[]) {
MultiSimScorer(String field, SimScorer subScorers[]) {
super(field);
this.subScorers = subScorers;
}
@Override
public float score(int doc, float freq) throws IOException {
public float score(float freq, long norm) {
float sum = 0.0f;
for (SimScorer subScorer : subScorers) {
sum += subScorer.score(doc, freq);
sum += subScorer.score(freq, norm);
}
return sum;
}
@Override
public float maxScore(float freq) {
float sumMaxScore = 0;
for (SimScorer subScorer : subScorers) {
sumMaxScore += subScorer.maxScore(freq);
}
return sumMaxScore;
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
public Explanation explain(Explanation freq, long norm) {
List<Explanation> subs = new ArrayList<>();
for (SimScorer subScorer : subScorers) {
subs.add(subScorer.explain(doc, freq));
subs.add(subScorer.explain(freq, norm));
}
return Explanation.match(score(doc, freq.getValue().floatValue()), "sum of:", subs);
return Explanation.match(score(freq.getValue().floatValue(), norm), "sum of:", subs);
}
}
static class MultiStats extends SimWeight {
final SimWeight subStats[];
MultiStats(SimWeight subStats[]) {
this.subStats = subStats;
}
}
}
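
A short usage sketch of MultiSimilarity under the single-class API (searcher assumed in scope); as score() above shows, the resulting score is the sum of the sub-similarities' scores:

    Similarity sum = new MultiSimilarity(new Similarity[] {
        new BM25Similarity(), new BooleanSimilarity() });
    searcher.setSimilarity(sum);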

View File

@ -17,9 +17,6 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
@ -46,26 +43,13 @@ public abstract class PerFieldSimilarityWrapper extends Similarity {
}
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
PerFieldSimWeight weight = new PerFieldSimWeight();
weight.delegate = get(collectionStats.field());
weight.delegateWeight = weight.delegate.computeWeight(boost, collectionStats, termStats);
return weight;
}
@Override
public final SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
return perFieldWeight.delegate.simScorer(perFieldWeight.delegateWeight, context);
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return get(collectionStats.field()).scorer(boost, collectionStats, termStats);
}
/**
* Returns a {@link Similarity} for scoring a field.
*/
public abstract Similarity get(String name);
static class PerFieldSimWeight extends SimWeight {
Similarity delegate;
SimWeight delegateWeight;
}
}
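
With the delegate weight class gone, a per-field wrapper reduces to one method. A hedged sketch (the field names are assumptions):

    Similarity perField = new PerFieldSimilarityWrapper() {
      final Similarity forTitle = new BooleanSimilarity();
      final Similarity fallback = new BM25Similarity();
      @Override
      public Similarity get(String field) {
        return "title".equals(field) ? forTitle : fallback;
      }
    };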

View File

@ -17,18 +17,15 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.Collections;
import java.util.Objects;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.util.SmallFloat;
/**
@ -38,9 +35,9 @@ import org.apache.lucene.util.SmallFloat;
* <p>
* This is a low-level API, you should only extend this API if you want to implement
* an information retrieval <i>model</i>. If you are instead looking for a convenient way
* to alter Lucene's scoring, consider extending a higher-level implementation
* such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
* just tweaking the default implementation: {@link BM25Similarity}.
* to alter Lucene's scoring, consider just tweaking the default implementation:
* {@link BM25Similarity} or extending {@link SimilarityBase}, which makes it easy to compute
* a score from index statistics.
* <p>
* Similarity determines how Lucene weights terms, and Lucene interacts with
* this class at both <a href="#indextime">index-time</a> and
@ -49,23 +46,22 @@ import org.apache.lucene.util.SmallFloat;
* <a name="indextime">Indexing Time</a>
* At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
* the Similarity implementation to set a per-document value for the field that will
* be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption
* about what is in this norm, but it is most useful for encoding length normalization
* information.
* be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}.
* Lucene makes no assumption about what is in this norm, but it is most useful for
* encoding length normalization information.
* <p>
* Implementations should carefully consider how the normalization is encoded: while
* Lucene's {@link BM25Similarity} encodes a combination of index-time boost
* and length normalization information with {@link SmallFloat} into a single byte, this
* might not be suitable for all purposes.
* Lucene's {@link BM25Similarity} encodes length normalization information with
* {@link SmallFloat} into a single byte, this might not be suitable for all purposes.
* <p>
* Many formulas require the use of average document length, which can be computed via a
* combination of {@link CollectionStatistics#sumTotalTermFreq()} and
* {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()},
* depending upon whether the average should reflect field sparsity.
* {@link CollectionStatistics#docCount()}.
* <p>
* Additional scoring factors can be stored in named
* <code>NumericDocValuesField</code>s and accessed
* at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
* Additional scoring factors can be stored in named {@link NumericDocValuesField}s and
* accessed at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
* However, this should not be done in the {@link Similarity} but externally, for instance
* by using <tt>FunctionScoreQuery</tt>.
* <p>
* Finally, using index-time boosts (either via folding into the normalization byte or
* via DocValues), is an inefficient way to boost the scores of different fields if the
@ -76,14 +72,13 @@ import org.apache.lucene.util.SmallFloat;
* <a name="querytime">Query time</a>
* At query-time, Queries interact with the Similarity via these steps:
* <ol>
* <li>The {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} method is called a single time,
* <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a single time,
* allowing the implementation to compute any statistics (such as IDF, average document length, etc)
* across <i>the entire collection</i>. The {@link TermStatistics} and {@link CollectionStatistics} passed in
* already contain all of the raw statistics involved, so a Similarity can freely use any combination
* of statistics without causing any additional I/O. Lucene makes no assumption about what is
* stored in the returned {@link Similarity.SimWeight} object.
* <li>For each segment in the index, the Query creates a {@link #simScorer(SimWeight, org.apache.lucene.index.LeafReaderContext)}
* The score() method is called for each matching document.
* stored in the returned {@link Similarity.SimScorer} object.
* <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute its score.
* </ol>
* <p>
* <a name="explaintime">Explanations</a>
@ -110,7 +105,17 @@ public abstract class Similarity {
* <p>Matches in longer fields are less precise, so implementations of this
* method usually set smaller values when <code>state.getLength()</code> is large,
* and larger values when <code>state.getLength()</code> is small.
*
*
* <p>Note that for a given term-document frequency, greater unsigned norms
* must produce scores that are lower or equal, ie. for two encoded norms
* {@code n1} and {@code n2} so that
* {@code Long.compareUnsigned(n1, n2) &gt; 0} then
* {@code SimScorer.score(freq, n1) &lt;= SimScorer.score(freq, n2)}
* for any legal {@code freq}.
*
* <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces
* the highest scores.
*
* @lucene.experimental
*
* @param state current processing state for this field
@ -126,71 +131,68 @@ public abstract class Similarity {
* @param termStats term-level statistics, such as the document frequency of a term across the collection.
* @return SimScorer object with the information this Similarity needs to score a query.
*/
public abstract SimWeight computeWeight(float boost,
public abstract SimScorer scorer(float boost,
CollectionStatistics collectionStats, TermStatistics... termStats);
/**
* Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index.
* @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
* @param context segment of the inverted index to be scored.
* @return SloppySimScorer for scoring documents across <code>context</code>
* @throws IOException if there is a low-level I/O error
*/
public abstract SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException;
/**
* API for scoring "sloppy" queries such as {@link TermQuery},
* {@link SpanQuery}, and {@link PhraseQuery}.
*/
public static abstract class SimScorer {
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
public SimScorer() {}
/**
* Score a single document
* @param doc document id within the inverted index segment
* @param freq sloppy term frequency
* @return document's score
*/
public abstract float score(int doc, float freq) throws IOException;
/**
* Return the maximum score that this scorer may produce for freqs in {@code ]0, maxFreq]}.
* {@code Float.POSITIVE_INFINITY} is a fine return value if scores are not bounded.
* @param maxFreq the maximum frequency
*/
public abstract float maxScore(float maxFreq);
/**
* Explain the score for a single document
* @param doc document id within the inverted index segment
* @param freq Explanation of how the sloppy term frequency was computed
* @return document's score
*/
public Explanation explain(int doc, Explanation freq) throws IOException {
return Explanation.match(
score(doc, freq.getValue().floatValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:",
Collections.singleton(freq));
}
}
/** Stores the weight for a query across the indexed collection. This abstract
* implementation is empty; descendants of {@code Similarity} should
* subclass {@code SimWeight} and define the statistics they require in the
* subclass. Examples include idf, average field length, etc.
*/
public static abstract class SimWeight {
public static abstract class SimScorer {
private final String field;
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
* constructors.)
*/
public SimWeight() {}
public SimScorer(String field) {
this.field = Objects.requireNonNull(field);
}
/** Return the field that this {@link SimScorer} operates on. */
public final String getField() {
return field;
}
/**
* Score a single document. {@code freq} is the document-term sloppy
* frequency and must be finite and positive. {@code norm} is the
* encoded normalization factor as computed by
* {@link Similarity#computeNorm(FieldInvertState)} at index time, or
* {@code 1} if norms are disabled. {@code norm} is never {@code 0}.
* <p>
* Score must not decrease when {@code freq} increases, ie. if
* {@code freq1 &gt; freq2}, then {@code score(freq1, norm) &gt;=
* score(freq2, norm)} for any value of {@code norm} that may be produced
* by {@link Similarity#computeNorm(FieldInvertState)}.
* <p>
* Score must not increase when the unsigned {@code norm} increases, ie. if
* {@code Long.compareUnsigned(norm1, norm2) &gt; 0} then
* {@code score(freq, norm1) &lt;= score(freq, norm2)} for any legal
* {@code freq}.
* <p>
* As a consequence, the maximum score that this scorer can produce is bound
* by {@code score(Float.MAX_VALUE, 1)}.
* @param freq sloppy term frequency, must be finite and positive
* @param norm encoded normalization factor or {@code 1} if norms are disabled
* @return document's score
*/
public abstract float score(float freq, long norm);
/**
* Explain the score for a single document
* @param freq Explanation of how the sloppy term frequency was computed
* @param norm encoded normalization factor, as returned by {@link Similarity#computeNorm}, or {@code 1} if norms are disabled
* @return document's score
*/
public Explanation explain(Explanation freq, long norm) {
return Explanation.match(
score(freq.getValue().floatValue(), norm),
"score(freq=" + freq.getValue() +"), with freq of:",
Collections.singleton(freq));
}
}
}
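
A hedged end-to-end sketch of the collapsed API (a toy model, not part of this commit): computeNorm() encodes the field length with SmallFloat, and score(freq, norm) honors the documented contract, growing with freq and shrinking as the unsigned norm grows:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.search.CollectionStatistics;
    import org.apache.lucene.search.TermStatistics;
    import org.apache.lucene.util.SmallFloat;

    public class LengthDampedSimilarity extends Similarity {
      @Override
      public long computeNorm(FieldInvertState state) {
        return SmallFloat.intToByte4(state.getLength()); // 1..255 for positive lengths, never 0
      }
      @Override
      public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
        return new SimScorer(collectionStats.field()) {
          @Override
          public float score(float freq, long norm) {
            int docLen = SmallFloat.byte4ToInt((byte) norm); // decode the field length
            return boost * freq / (freq + docLen);           // monotonic in freq, anti-monotonic in norm
          }
        };
      }
    }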

View File

@ -17,13 +17,10 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@ -33,7 +30,7 @@ import org.apache.lucene.util.SmallFloat;
* A subclass of {@code Similarity} that provides a simplified API for its
* descendants. Subclasses are only required to implement the {@link #score}
* and {@link #toString()} methods. Implementing
* {@link #explain(List, BasicStats, int, double, double)} is optional,
* {@link #explain(List, BasicStats, double, double)} is optional,
* inasmuch as SimilarityBase already provides a basic explanation of the score
* and the term frequency. However, implementers of a subclass are encouraged to
* include as much detail about the scoring method as possible.
@ -82,13 +79,18 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
BasicStats stats[] = new BasicStats[termStats.length];
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
SimScorer weights[] = new SimScorer[termStats.length];
for (int i = 0; i < termStats.length; i++) {
stats[i] = newStats(collectionStats.field(), boost);
fillBasicStats(stats[i], collectionStats, termStats[i]);
BasicStats stats = newStats(collectionStats.field(), boost);
fillBasicStats(stats, collectionStats, termStats[i]);
weights[i] = new BasicSimScorer(stats);
}
if (weights.length == 1) {
return weights[0];
} else {
return new MultiSimilarity.MultiSimScorer(collectionStats.field(), weights);
}
return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
}
/** Factory method to return a custom stats object */
@ -121,13 +123,6 @@ public abstract class SimilarityBase extends Similarity {
*/
protected abstract double score(BasicStats stats, double freq, double docLen);
/**
* Return the maximum value that may be returned by {@link #score(BasicStats, double, double)}
* for the given stats.
* @see org.apache.lucene.search.similarities.Similarity.SimScorer#maxScore(float)
*/
protected abstract double maxScore(BasicStats stats, double maxFreq);
/**
* Subclasses should implement this method to explain the score. {@code expl}
* already contains the score and the name of the class, as well
@ -137,12 +132,11 @@ public abstract class SimilarityBase extends Similarity {
*
* @param subExpls the list of details of the explanation to extend
* @param stats the corpus level statistics.
* @param doc the document id.
* @param freq the term frequency.
* @param docLen the document length.
*/
protected void explain(
List<Explanation> subExpls, BasicStats stats, int doc, double freq, double docLen) {}
List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {}
/**
* Explains the score. The implementation here provides a basic explanation
@ -151,43 +145,24 @@ public abstract class SimilarityBase extends Similarity {
* attaches the score (computed via the {@link #score(BasicStats, double, double)}
* method) and the explanation for the term frequency. Subclasses content with
* this format may add additional details in
* {@link #explain(List, BasicStats, int, double, double)}.
* {@link #explain(List, BasicStats, double, double)}.
*
* @param stats the corpus level statistics.
* @param doc the document id.
* @param freq the term frequency and its explanation.
* @param docLen the document length.
* @return the explanation.
*/
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
BasicStats stats, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue().floatValue(), docLen);
explain(subs, stats, freq.getValue().floatValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue().floatValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
"score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:",
subs);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
if (stats instanceof MultiSimilarity.MultiStats) {
// a multi term query (e.g. phrase). return the summation,
// scoring almost as if it were boolean query
SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
SimScorer subScorers[] = new SimScorer[subStats.length];
for (int i = 0; i < subScorers.length; i++) {
BasicStats basicstats = (BasicStats) subStats[i];
subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
}
return new MultiSimilarity.MultiSimScorer(subScorers);
} else {
BasicStats basicstats = (BasicStats) stats;
return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
}
}
/**
* Subclasses must override this method to return the name of the Similarity
* and preferably the values of parameters (if any) as well.
@ -227,43 +202,32 @@ public abstract class SimilarityBase extends Similarity {
// --------------------------------- Classes ---------------------------------
/** Delegates the {@link #score(int, float)} and
* {@link #explain(int, Explanation)} methods to
/** Delegates the {@link #score(float, long)} and
* {@link #explain(Explanation, long)} methods to
* {@link SimilarityBase#score(BasicStats, double, double)} and
* {@link SimilarityBase#explain(BasicStats, int, Explanation, double)},
* {@link SimilarityBase#explain(BasicStats, Explanation, double)},
* respectively.
*/
final class BasicSimScorer extends SimScorer {
private final BasicStats stats;
private final NumericDocValues norms;
final BasicStats stats;
BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
BasicSimScorer(BasicStats stats) {
super(stats.field);
this.stats = stats;
this.norms = norms;
}
double getLengthValue(int doc) throws IOException {
if (norms == null) {
return 1D;
}
boolean found = norms.advanceExact(doc);
assert found;
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())];
double getLengthValue(long norm) {
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
}
@Override
public float score(int doc, float freq) throws IOException {
return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc));
public float score(float freq, long norm) {
return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm));
}
@Override
public float maxScore(float maxFreq) {
return (float) SimilarityBase.this.maxScore(stats, maxFreq);
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
public Explanation explain(Explanation freq, long norm) {
return SimilarityBase.this.explain(stats, freq, getLengthValue(norm));
}
}
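
Under this simplification, a SimilarityBase subclass only has to supply score() and toString(), as the class javadoc above now states. A hedged toy example (not a real ranking model):

    public class RawTFSimilarity extends SimilarityBase {
      @Override
      protected double score(BasicStats stats, double freq, double docLen) {
        // grows with freq, shrinks with docLen, so the SimScorer contract holds
        return stats.getBoost() * freq / (freq + docLen);
      }
      @Override
      public String toString() {
        return "RawTF";
      }
    }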

View File

@ -17,13 +17,10 @@
package org.apache.lucene.search.similarities;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@ -511,7 +508,7 @@ public abstract class TFIDFSimilarity extends Similarity {
}
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
@ -522,110 +519,59 @@ public abstract class TFIDFSimilarity extends Similarity {
normTable[i] = norm;
}
normTable[0] = 1f / normTable[255];
return new IDFStats(collectionStats.field(), boost, idf, normTable);
return new TFIDFScorer(collectionStats.field(), boost, idf, normTable);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
IDFStats idfstats = (IDFStats) stats;
// the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
final float[] normTable = idfstats.normTable;
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
}
private final class TFIDFSimScorer extends SimScorer {
private final IDFStats stats;
private final float weightValue;
private final NumericDocValues norms;
private final float[] normTable;
TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
this.stats = stats;
this.weightValue = stats.queryWeight;
this.norms = norms;
this.normTable = normTable;
}
@Override
public float score(int doc, float freq) throws IOException {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
if (norms == null) {
return raw;
} else {
boolean found = norms.advanceExact(doc);
assert found;
float normValue = normTable[(int) (norms.longValue() & 0xFF)];
return raw * normValue; // normalize for field
}
}
@Override
public float maxScore(float maxFreq) {
final float raw = tf(maxFreq) * weightValue;
if (norms == null) {
return raw;
} else {
float maxNormValue = Float.NEGATIVE_INFINITY;
for (float norm : normTable) {
maxNormValue = Math.max(maxNormValue, norm);
}
return raw * maxNormValue;
}
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return explainScore(doc, freq, stats, norms, normTable);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
static class IDFStats extends SimWeight {
private final String field;
class TFIDFScorer extends SimScorer {
/** The idf and its explanation */
private final Explanation idf;
private final float boost;
private final float queryWeight;
final float[] normTable;
public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
public TFIDFScorer(String field, float boost, Explanation idf, float[] normTable) {
super(field);
// TODO: Validate?
this.field = field;
this.idf = idf;
this.boost = boost;
this.queryWeight = boost * idf.getValue().floatValue();
this.normTable = normTable;
}
@Override
public float score(float freq, long norm) {
final float raw = tf(freq) * queryWeight; // compute tf(f)*weight
float normValue = normTable[(int) (norm & 0xFF)];
return raw * normValue; // normalize for field
}
@Override
public Explanation explain(Explanation freq, long norm) {
return explainScore(freq, norm, normTable);
}
private Explanation explainScore(Explanation freq, long encodedNorm, float[] normTable) {
List<Explanation> subs = new ArrayList<Explanation>();
if (boost != 1F) {
subs.add(Explanation.match(boost, "boost"));
}
subs.add(idf);
Explanation tf = Explanation.match(tf(freq.getValue().floatValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
subs.add(tf);
float norm = normTable[(int) (encodedNorm & 0xFF)];
Explanation fieldNorm = Explanation.match(norm, "fieldNorm");
subs.add(fieldNorm);
return Explanation.match(
queryWeight * tf.getValue().floatValue() * norm,
"score(freq="+freq.getValue()+"), product of:",
subs);
}
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
List<Explanation> subs = new ArrayList<Explanation>();
if (stats.boost != 1F) {
subs.add(Explanation.match(stats.boost, "boost"));
}
subs.add(stats.idf);
Explanation tf = Explanation.match(tf(freq.getValue().floatValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
subs.add(tf);
float norm;
if (norms == null) {
norm = 1f;
} else {
boolean found = norms.advanceExact(doc);
assert found;
norm = normTable[(int) (norms.longValue() & 0xFF)];
}
Explanation fieldNorm = Explanation.match(
norm,
"fieldNorm(doc=" + doc + ")");
subs.add(fieldNorm);
return Explanation.match(
stats.queryWeight * tf.getValue().floatValue() * norm,
"score(doc="+doc+",freq="+freq.getValue()+"), product of:",
subs);
}
}
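
TFIDFScorer thus resolves everything from the 256-entry norm table at scoring time, with no per-document NumericDocValues lookup in the Similarity itself. ClassicSimilarity is the TFIDFSimilarity subclass Lucene ships; a one-line usage sketch (searcher assumed in scope):

    searcher.setSimilarity(new ClassicSimilarity());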

View File

@ -20,7 +20,7 @@ package org.apache.lucene.search.spans;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -61,7 +61,7 @@ abstract class SpanContainQuery extends SpanQuery implements Cloneable {
final SpanWeight bigWeight;
final SpanWeight littleWeight;
public SpanContainWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
public SpanContainWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
super(SpanContainQuery.this, searcher, terms, boost);
this.bigWeight = bigWeight;
@ -93,9 +93,9 @@ abstract class SpanContainQuery extends SpanQuery implements Cloneable {
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
bigWeight.extractTermContexts(contexts);
littleWeight.extractTermContexts(contexts);
public void extractTermStates(Map<Term, TermStates> contexts) {
bigWeight.extractTermStates(contexts);
littleWeight.extractTermStates(contexts);
}
}

View File

@ -23,7 +23,7 @@ import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
@ -45,15 +45,15 @@ public final class SpanContainingQuery extends SpanContainQuery {
@Override
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
SpanWeight bigWeight = big.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
SpanWeight littleWeight = little.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
return new SpanContainingWeight(searcher, scoreMode.needsScores() ? getTermContexts(bigWeight, littleWeight) : null,
SpanWeight bigWeight = big.createWeight(searcher, scoreMode, boost);
SpanWeight littleWeight = little.createWeight(searcher, scoreMode, boost);
return new SpanContainingWeight(searcher, scoreMode.needsScores() ? getTermStates(bigWeight, littleWeight) : null,
bigWeight, littleWeight, boost);
}
public class SpanContainingWeight extends SpanContainWeight {
public SpanContainingWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
public SpanContainingWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
super(searcher, terms, bigWeight, littleWeight, boost);
}

View File

@ -24,7 +24,7 @@ import java.util.Objects;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiTermQuery;
@ -163,7 +163,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
protected void addClause(List<SpanQuery> topLevel, Term term, int docCount, float boost, TermContext states) {
protected void addClause(List<SpanQuery> topLevel, Term term, int docCount, float boost, TermStates states) {
final SpanTermQuery q = new SpanTermQuery(term, states);
topLevel.add(q);
}
@ -211,7 +211,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
protected void addClause(List<SpanQuery> topLevel, Term term, int docFreq, float boost, TermContext states) {
protected void addClause(List<SpanQuery> topLevel, Term term, int docFreq, float boost, TermStates states) {
final SpanTermQuery q = new SpanTermQuery(term, states);
topLevel.add(q);
}

View File

@ -29,7 +29,7 @@ import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -181,24 +181,24 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
List<SpanWeight> subWeights = new ArrayList<>();
for (SpanQuery q : clauses) {
subWeights.add(q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost));
subWeights.add(q.createWeight(searcher, scoreMode, boost));
}
return new SpanNearWeight(subWeights, searcher, scoreMode.needsScores() ? getTermContexts(subWeights) : null, boost);
return new SpanNearWeight(subWeights, searcher, scoreMode.needsScores() ? getTermStates(subWeights) : null, boost);
}
public class SpanNearWeight extends SpanWeight {
final List<SpanWeight> subWeights;
public SpanNearWeight(List<SpanWeight> subWeights, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
public SpanNearWeight(List<SpanWeight> subWeights, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
super(SpanNearQuery.this, searcher, terms, boost);
this.subWeights = subWeights;
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
public void extractTermStates(Map<Term, TermStates> contexts) {
for (SpanWeight w : subWeights) {
w.extractTermContexts(contexts);
w.extractTermStates(contexts);
}
}
@ -318,7 +318,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
public void extractTermStates(Map<Term, TermStates> contexts) {
}

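SpanNearQuery applies the same change across N clauses. A sketch of the caller's view, again assuming an open IndexSearcher; the builder arguments and class name are illustrative:

import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;

class NearWeightSketch {
  static SpanWeight nearWeight(IndexSearcher searcher) throws IOException {
    SpanNearQuery near = new SpanNearQuery.Builder("body", true) // ordered
        .addClause(new SpanTermQuery(new Term("body", "quick")))
        .addClause(new SpanTermQuery(new Term("body", "fox")))
        .setSlop(2)
        .build();
    // With ScoreMode.COMPLETE, needsScores() is true, so SpanNearWeight
    // carries the Map<Term,TermStates> gathered from its sub-weights.
    return near.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
  }
}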
View File

@ -25,7 +25,7 @@ import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
@ -99,9 +99,9 @@ public final class SpanNotQuery extends SpanQuery {
@Override
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
SpanWeight includeWeight = include.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
SpanWeight includeWeight = include.createWeight(searcher, scoreMode, boost);
SpanWeight excludeWeight = exclude.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
return new SpanNotWeight(searcher, scoreMode.needsScores() ? getTermContexts(includeWeight, excludeWeight) : null,
return new SpanNotWeight(searcher, scoreMode.needsScores() ? getTermStates(includeWeight) : null,
includeWeight, excludeWeight, boost);
}
@ -110,7 +110,7 @@ public final class SpanNotQuery extends SpanQuery {
final SpanWeight includeWeight;
final SpanWeight excludeWeight;
public SpanNotWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
public SpanNotWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
SpanWeight includeWeight, SpanWeight excludeWeight, float boost) throws IOException {
super(SpanNotQuery.this, searcher, terms, boost);
this.includeWeight = includeWeight;
@ -118,8 +118,8 @@ public final class SpanNotQuery extends SpanQuery {
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
includeWeight.extractTermContexts(contexts);
public void extractTermStates(Map<Term, TermStates> contexts) {
includeWeight.extractTermStates(contexts);
}
@Override

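Note the asymmetry this hunk keeps: the include clause now receives the caller's ScoreMode, while the exclude clause stays at COMPLETE_NO_SCORES and its terms are dropped from the TermStates map, because excluded spans only veto matches and never contribute to the score. A small sketch; field, terms, and class name are illustrative:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

class SpanNotSketch {
  // "quick" spans are scored; "brown" spans only filter them out, so
  // "brown" never enters the weight's Map<Term,TermStates>.
  static SpanNotQuery quickNotBrown() {
    return new SpanNotQuery(
        new SpanTermQuery(new Term("body", "quick")),   // include: scored
        new SpanTermQuery(new Term("body", "brown")));  // exclude: filter only
  }
}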
View File

@ -27,7 +27,7 @@ import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.DisiPriorityQueue;
import org.apache.lucene.search.DisiWrapper;
import org.apache.lucene.search.DisjunctionDISIApproximation;
@ -119,16 +119,16 @@ public final class SpanOrQuery extends SpanQuery {
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
List<SpanWeight> subWeights = new ArrayList<>(clauses.size());
for (SpanQuery q : clauses) {
subWeights.add(q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost));
subWeights.add(q.createWeight(searcher, scoreMode, boost));
}
return new SpanOrWeight(searcher, scoreMode.needsScores() ? getTermContexts(subWeights) : null, subWeights, boost);
return new SpanOrWeight(searcher, scoreMode.needsScores() ? getTermStates(subWeights) : null, subWeights, boost);
}
public class SpanOrWeight extends SpanWeight {
final List<SpanWeight> subWeights;
public SpanOrWeight(IndexSearcher searcher, Map<Term, TermContext> terms, List<SpanWeight> subWeights, float boost) throws IOException {
public SpanOrWeight(IndexSearcher searcher, Map<Term, TermStates> terms, List<SpanWeight> subWeights, float boost) throws IOException {
super(SpanOrQuery.this, searcher, terms, boost);
this.subWeights = subWeights;
}
@ -150,9 +150,9 @@ public final class SpanOrQuery extends SpanQuery {
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
public void extractTermStates(Map<Term, TermStates> contexts) {
for (SpanWeight w : subWeights) {
w.extractTermContexts(contexts);
w.extractTermStates(contexts);
}
}

View File

@ -25,7 +25,7 @@ import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
@ -69,15 +69,15 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
@Override
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
SpanWeight matchWeight = match.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
return new SpanPositionCheckWeight(matchWeight, searcher, scoreMode.needsScores() ? getTermContexts(matchWeight) : null, boost);
SpanWeight matchWeight = match.createWeight(searcher, scoreMode, boost);
return new SpanPositionCheckWeight(matchWeight, searcher, scoreMode.needsScores() ? getTermStates(matchWeight) : null, boost);
}
public class SpanPositionCheckWeight extends SpanWeight {
final SpanWeight matchWeight;
public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
super(SpanPositionCheckQuery.this, searcher, terms, boost);
this.matchWeight = matchWeight;
}
@ -93,8 +93,8 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
matchWeight.extractTermContexts(contexts);
public void extractTermStates(Map<Term, TermStates> contexts) {
matchWeight.extractTermStates(contexts);
}
@Override

View File

@ -23,7 +23,7 @@ import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
@ -40,25 +40,25 @@ public abstract class SpanQuery extends Query {
public abstract SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException;
/**
* Build a map of terms to termcontexts, for use in constructing SpanWeights
* Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
* @lucene.internal
*/
public static Map<Term, TermContext> getTermContexts(SpanWeight... weights) {
Map<Term, TermContext> terms = new TreeMap<>();
public static Map<Term, TermStates> getTermStates(SpanWeight... weights) {
Map<Term, TermStates> terms = new TreeMap<>();
for (SpanWeight w : weights) {
w.extractTermContexts(terms);
w.extractTermStates(terms);
}
return terms;
}
/**
* Build a map of terms to termcontexts, for use in constructing SpanWeights
* Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
* @lucene.internal
*/
public static Map<Term, TermContext> getTermContexts(Collection<SpanWeight> weights) {
Map<Term, TermContext> terms = new TreeMap<>();
public static Map<Term, TermStates> getTermStates(Collection<SpanWeight> weights) {
Map<Term, TermStates> terms = new TreeMap<>();
for (SpanWeight w : weights) {
w.extractTermContexts(terms);
w.extractTermStates(terms);
}
return terms;
}

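These two helpers are what the composite queries above call from createWeight. A sketch of the aggregation they perform, assuming two already-created sub-weights:

import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanWeight;

class TermStatesAggregationSketch {
  // Mirrors SpanContainingQuery/SpanWithinQuery: each sub-weight's
  // extractTermStates() adds its term -> TermStates entries to one TreeMap,
  // which is then handed to the SpanWeight constructor.
  static Map<Term, TermStates> gather(SpanWeight big, SpanWeight little) {
    return SpanQuery.getTermStates(big, little);
  }
}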
View File

@ -21,9 +21,9 @@ import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.similarities.Similarity;
/**
* A basic {@link Scorer} over {@link Spans}.
@ -32,7 +32,7 @@ import org.apache.lucene.search.similarities.Similarity;
public class SpanScorer extends Scorer {
protected final Spans spans;
protected final Similarity.SimScorer docScorer;
protected final LeafSimScorer docScorer;
/** accumulated sloppy freq (computed in setFreqCurrentDoc) */
private float freq;
@ -41,7 +41,7 @@ public class SpanScorer extends Scorer {
private int lastScoredDoc = -1; // last doc we called setFreqCurrentDoc() for
/** Sole constructor. */
public SpanScorer(SpanWeight weight, Spans spans, Similarity.SimScorer docScorer) {
public SpanScorer(SpanWeight weight, Spans spans, LeafSimScorer docScorer) {
super(weight);
this.spans = Objects.requireNonNull(spans);
this.docScorer = docScorer;

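With this change, SpanScorer's per-document score goes through the leaf-bound LeafSimScorer rather than a Similarity.SimScorer. A sketch of the call it reduces to once setFreqCurrentDoc() has accumulated the sloppy frequency; the helper method is illustrative and the score(int, float) signature is assumed from the LeafSimScorer this commit introduces:

import java.io.IOException;
import org.apache.lucene.search.LeafSimScorer;

class SpanScoringSketch {
  static float scoreCurrentDoc(LeafSimScorer docScorer, int doc, float sloppyFreq)
      throws IOException {
    // docScorer is already bound to one leaf's norms, so scoring needs
    // only the doc id and the accumulated frequency.
    return docScorer.score(doc, sloppyFreq);
  }
}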
View File

@ -28,7 +28,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -41,21 +41,21 @@ import org.apache.lucene.search.ScoreMode;
public class SpanTermQuery extends SpanQuery {
protected final Term term;
protected final TermContext termContext;
protected final TermStates termStates;
/** Construct a SpanTermQuery matching the named term's spans. */
public SpanTermQuery(Term term) {
this.term = Objects.requireNonNull(term);
this.termContext = null;
this.termStates = null;
}
/**
* Expert: Construct a SpanTermQuery matching the named term's spans, using
* the provided TermContext
* the provided TermStates
*/
public SpanTermQuery(Term term, TermContext context) {
public SpanTermQuery(Term term, TermStates termStates) {
this.term = Objects.requireNonNull(term);
this.termContext = context;
this.termStates = termStates;
}
/** Return the term whose spans are matched. */
@ -66,25 +66,25 @@ public class SpanTermQuery extends SpanQuery {
@Override
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final TermContext context;
final TermStates context;
final IndexReaderContext topContext = searcher.getTopReaderContext();
if (termContext == null || termContext.wasBuiltFor(topContext) == false) {
context = TermContext.build(topContext, term);
if (termStates == null || termStates.wasBuiltFor(topContext) == false) {
context = TermStates.build(topContext, term, scoreMode.needsScores());
}
else {
context = termContext;
context = termStates;
}
return new SpanTermWeight(context, searcher, scoreMode.needsScores() ? Collections.singletonMap(term, context) : null, boost);
}
public class SpanTermWeight extends SpanWeight {
final TermContext termContext;
final TermStates termStates;
public SpanTermWeight(TermContext termContext, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
public SpanTermWeight(TermStates termStates, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
super(SpanTermQuery.this, searcher, terms, boost);
this.termContext = termContext;
assert termContext != null : "TermContext must not be null";
this.termStates = termStates;
assert termStates != null : "TermStates must not be null";
}
@Override
@ -98,16 +98,16 @@ public class SpanTermQuery extends SpanQuery {
}
@Override
public void extractTermContexts(Map<Term, TermContext> contexts) {
contexts.put(term, termContext);
public void extractTermStates(Map<Term, TermStates> contexts) {
contexts.put(term, termStates);
}
@Override
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
assert termContext.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
final TermState state = termContext.get(context.ord);
final TermState state = termStates.get(context);
if (state == null) { // term is not present in that reader
assert context.reader().docFreq(term) == 0 : "no termstate found but term exists in reader term=" + term;
return null;

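The expert constructor above lets a caller build term states once and reuse them across queries, as long as the searcher's top reader does not change. A sketch mirroring the createWeight logic in this hunk; the class name is illustrative:

import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spans.SpanTermQuery;

class CachedTermStatesSketch {
  static SpanTermQuery cached(IndexSearcher searcher, Term term) throws IOException {
    // needsStats=true gathers docFreq/totalTermFreq for scoring, matching
    // TermStates.build(topContext, term, scoreMode.needsScores()) above.
    TermStates states = TermStates.build(searcher.getTopReaderContext(), term, true);
    // createWeight() reuses these states while wasBuiltFor() still matches
    // the searcher's top reader context; otherwise it rebuilds them.
    return new SpanTermQuery(term, states);
  }
}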
View File

@ -24,14 +24,14 @@ import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
/**
* Expert-only. Public for use by other weight implementations
@ -72,48 +72,48 @@ public abstract class SpanWeight extends Weight {
}
protected final Similarity similarity;
protected final Similarity.SimWeight simWeight;
protected final Similarity.SimScorer simScorer;
protected final String field;
/**
* Create a new SpanWeight
* @param query the parent query
* @param searcher the IndexSearcher to query against
* @param termContexts a map of terms to termcontexts for use in building the similarity. May
* @param termStates a map of terms to {@link TermStates} for use in building the similarity. May
* be null if scores are not required
* @throws IOException on error
*/
public SpanWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermContext> termContexts, float boost) throws IOException {
public SpanWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
super(query);
this.field = query.getField();
this.similarity = searcher.getSimilarity(termContexts != null);
this.simWeight = buildSimWeight(query, searcher, termContexts, boost);
this.similarity = searcher.getSimilarity();
this.simScorer = buildSimWeight(query, searcher, termStates, boost);
}
private Similarity.SimWeight buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermContext> termContexts, float boost) throws IOException {
if (termContexts == null || termContexts.size() == 0 || query.getField() == null)
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
if (termStates == null || termStates.size() == 0 || query.getField() == null)
return null;
TermStatistics[] termStats = new TermStatistics[termContexts.size()];
TermStatistics[] termStats = new TermStatistics[termStates.size()];
int termUpTo = 0;
for (Term term : termContexts.keySet()) {
TermStatistics termStatistics = searcher.termStatistics(term, termContexts.get(term));
for (Term term : termStates.keySet()) {
TermStatistics termStatistics = searcher.termStatistics(term, termStates.get(term));
if (termStatistics != null) {
termStats[termUpTo++] = termStatistics;
}
}
CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
if (termUpTo > 0) {
return similarity.computeWeight(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
return similarity.scorer(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
} else {
return null; // no terms at all exist, we won't use similarity
}
}
/**
* Collect all TermContexts used by this Weight
* @param contexts a map to add the TermContexts to
* Collect all TermStates used by this Weight
* @param contexts a map to add the TermStates to
*/
public abstract void extractTermContexts(Map<Term, TermContext> contexts);
public abstract void extractTermStates(Map<Term, TermStates> contexts);
/**
* Expert: Return a Spans object iterating over matches from this Weight
@ -129,18 +129,18 @@ public abstract class SpanWeight extends Weight {
if (spans == null) {
return null;
}
final Similarity.SimScorer docScorer = getSimScorer(context);
final LeafSimScorer docScorer = getSimScorer(context);
return new SpanScorer(this, spans, docScorer);
}
/**
* Return a SimScorer for this context
* Return a LeafSimScorer for this context
* @param context the LeafReaderContext
* @return a LeafSimScorer
* @throws IOException on error
*/
public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException {
return simWeight == null ? null : similarity.simScorer(simWeight, context);
public LeafSimScorer getSimScorer(LeafReaderContext context) throws IOException {
return simScorer == null ? null : new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
}
@Override
@ -150,7 +150,7 @@ public abstract class SpanWeight extends Weight {
int newDoc = scorer.iterator().advance(doc);
if (newDoc == doc) {
float freq = scorer.sloppyFreq();
SimScorer docScorer = similarity.simScorer(simWeight, context);
LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(scoreExplanation.getValue(),

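The index-level/leaf-level split is the point of this hunk: similarity.scorer(...) produces one index-level SimScorer when the weight is built, and each leaf wraps it in a LeafSimScorer that resolves per-document norms. A sketch of the per-leaf step, mirroring the constructor call used in getSimScorer() and explain() above:

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.LeafSimScorer;
import org.apache.lucene.search.similarities.Similarity;

class LeafScorerSketch {
  static LeafSimScorer forLeaf(Similarity.SimScorer simScorer,
                               LeafReaderContext context) throws IOException {
    // true requests norms for scoring; Float.MAX_VALUE leaves the sloppy
    // frequency uncapped, as in the SpanWeight code above.
    return new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
  }
}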
View File

@ -23,7 +23,7 @@ import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
@ -46,15 +46,15 @@ public final class SpanWithinQuery extends SpanContainQuery {
@Override
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
SpanWeight bigWeight = big.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
SpanWeight littleWeight = little.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
return new SpanWithinWeight(searcher, scoreMode.needsScores() ? getTermContexts(bigWeight, littleWeight) : null,
SpanWeight bigWeight = big.createWeight(searcher, scoreMode, boost);
SpanWeight littleWeight = little.createWeight(searcher, scoreMode, boost);
return new SpanWithinWeight(searcher, scoreMode.needsScores() ? getTermStates(bigWeight, littleWeight) : null,
bigWeight, littleWeight, boost);
}
public class SpanWithinWeight extends SpanContainWeight {
public SpanWithinWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
public SpanWithinWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
super(searcher, terms, bigWeight, littleWeight, boost);
}

Some files were not shown because too many files have changed in this diff.