mirror of https://github.com/apache/lucene.git
Merge branch 'master' of https://git-wip-us.apache.org/repos/asf/lucene-solr
This commit is contained in:
commit
653935bbdf
|
@ -145,6 +145,7 @@
|
|||
(~$/(?i)\bno(n|)commit\b/$) : 'nocommit',
|
||||
(~$/\bTOOD:/$) : 'TOOD instead TODO',
|
||||
(~$/\t/$) : 'tabs instead spaces',
|
||||
(~$/\Q/**\E((?:\s)|(?:\*))*\Q{@inheritDoc}\E((?:\s)|(?:\*))*\Q*/\E/$) : '{@inheritDoc} on its own is unnecessary',
|
||||
(~$/\$$(?:LastChanged)?Date\b/$) : 'svn keyword',
|
||||
(~$/\$$(?:(?:LastChanged)?Revision|Rev)\b/$) : 'svn keyword',
|
||||
(~$/\$$(?:LastChangedBy|Author)\b/$) : 'svn keyword',
|
||||
|
|
|
@ -67,6 +67,13 @@
|
|||
</maintainer>
|
||||
|
||||
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
|
||||
<release>
|
||||
<Version>
|
||||
<name>lucene-7.2.1</name>
|
||||
<created>2018-01-15</created>
|
||||
<revision>7.2.1</revision>
|
||||
</Version>
|
||||
</release>
|
||||
<release>
|
||||
<Version>
|
||||
<name>lucene-7.2.0</name>
|
||||
|
|
|
@ -67,6 +67,13 @@
|
|||
</maintainer>
|
||||
|
||||
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
|
||||
<release>
|
||||
<Version>
|
||||
<name>solr-7.2.1</name>
|
||||
<created>2018-01-15</created>
|
||||
<revision>7.2.1</revision>
|
||||
</Version>
|
||||
</release>
|
||||
<release>
|
||||
<Version>
|
||||
<name>solr-7.2.0</name>
|
||||
|
|
|
@ -31,5 +31,6 @@
|
|||
<orderEntry type="module" module-name="lucene-core" />
|
||||
<orderEntry type="module" module-name="solr-core" />
|
||||
<orderEntry type="module" module-name="solrj" />
|
||||
<orderEntry type="module" module-name="analysis-common" />
|
||||
</component>
|
||||
</module>
|
||||
|
|
|
@ -0,0 +1,215 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from textwrap import dedent
|
||||
|
||||
# Number of iterations per test suite
|
||||
testIters = 5
|
||||
|
||||
usage = dedent('''\
|
||||
Usage:\n
|
||||
python3 -u %s URL\n
|
||||
Must be run from a Lucene/Solr git workspace. Downloads the Jenkins
|
||||
log pointed to by the given URL, parses it for Git revision and failed
|
||||
Lucene/Solr tests, checks out the Git revision in the local workspace,
|
||||
groups the failed tests by module, then runs
|
||||
'ant test -Dtest.dups=%d -Dtests.class="*.test1[|*.test2[...]]" ...'
|
||||
in each module of interest, failing at the end if any of the runs fails.
|
||||
To control the maximum number of concurrent JVMs used for each module's
|
||||
test run, set 'tests.jvms', e.g. in ~/lucene.build.properties
|
||||
''' % (sys.argv[0], testIters))
|
||||
|
||||
reHelpArg = re.compile(r'-{1,2}(?:\?|h(?:elp)?)')
|
||||
|
||||
# Example: Checking out Revision e441a99009a557f82ea17ee9f9c3e9b89c75cee6 (refs/remotes/origin/master)
|
||||
reGitRev = re.compile(r'Checking out Revision (\S+)')
|
||||
|
||||
# Method example: NOTE: reproduce with: ant test -Dtestcase=ZkSolrClientTest -Dtests.method=testMultipleWatchesAsync -Dtests.seed=6EF5AB70F0032849 -Dtests.slow=true -Dtests.locale=he-IL -Dtests.timezone=NST -Dtests.asserts=true -Dtests.file.encoding=UTF-8
|
||||
# Suite example: NOTE: reproduce with: ant test -Dtestcase=CloudSolrClientTest -Dtests.seed=DB2DF2D8228BAF27 -Dtests.multiplier=3 -Dtests.slow=true -Dtests.locale=es-AR -Dtests.timezone=America/Argentina/Cordoba -Dtests.asserts=true -Dtests.file.encoding=US-ASCII
|
||||
reReproLine = re.compile(r'NOTE:\s+reproduce\s+with:(\s+ant\s+test\s+-Dtestcase=(\S+)\s+(?:-Dtests.method=\S+\s+)?(.*))')
|
||||
|
||||
# Example: https://jenkins.thetaphi.de/job/Lucene-Solr-master-Linux/21108/
|
||||
reJenkinsURLWithoutConsoleText = re.compile(r'https?://.*/\d+/?\Z', re.IGNORECASE)
|
||||
|
||||
reJavaFile = re.compile(r'(.*)\.java\Z')
|
||||
reModule = re.compile(r'\./(.*)/src/')
|
||||
reTestOutputFile = re.compile(r'TEST-(.*\.([^-.]+))(?:-\d+)?\.xml\Z')
|
||||
reErrorFailure = re.compile(r'(?:errors|failures)="[^0]')
|
||||
|
||||
# consoleText from Policeman Jenkins's Windows jobs fails to decode as UTF-8
|
||||
encoding = 'iso-8859-1'
|
||||
|
||||
tests = {}
|
||||
modules = {}
|
||||
|
||||
lastFailureCode = 0
|
||||
gitCheckoutSucceeded = False
|
||||
|
||||
def runOutput(cmd):
  """Run *cmd* (split on single spaces) and return its stripped stdout.

  Raises RuntimeError (wrapping the exit code and captured output) when the
  command exits non-zero.
  """
  print('[repro] %s' % cmd)
  try:
    return subprocess.check_output(cmd.split(' '), universal_newlines=True).strip()
  except subprocess.CalledProcessError as e:
    # Fixed: was bare 'CalledProcessError', a NameError since only
    # 'import subprocess' is in scope — any failing command crashed here.
    raise RuntimeError("ERROR: Cmd '%s' failed with exit code %d and the following output:\n%s"
                       % (cmd, e.returncode, e.output))
|
||||
|
||||
# Remembers non-zero exit code in lastFailureCode unless rememberFailure==False
|
||||
# Remembers non-zero exit code in lastFailureCode unless rememberFailure==False
def run(cmd, rememberFailure=True):
  """Run *cmd* via the shell and return its exit status.

  A non-zero status is recorded in the module-level lastFailureCode
  unless rememberFailure is False.
  """
  global lastFailureCode
  print('[repro] %s' % cmd)
  exitStatus = os.system(cmd)
  failed = exitStatus != 0
  if failed and rememberFailure:
    print('\n[repro] Setting last failure code to %d\n' % exitStatus)
    lastFailureCode = exitStatus
  return exitStatus
|
||||
|
||||
def fetchAndParseJenkinsLog(url):
  """Download the Jenkins console log at *url* and harvest repro info.

  Sets the module-level 'revision' to the Git SHA named on the log's
  'Checking out Revision' line, and fills the module-level 'tests' dict
  mapping test-case name -> reproduction cmdline params (method stripped).

  If no revision is found and *url* looks like a bare Jenkins job URL,
  retries once with '/consoleText' appended; otherwise raises RuntimeError.
  Exits the process (status 0) when no 'reproduce with' lines were found.
  """
  global revision
  revision = None
  print('[repro] Jenkins log URL: %s\n' % url)
  try:
    with urllib.request.urlopen(url) as consoleText:
      for rawLine in consoleText:
        # Policeman Jenkins Windows logs are not valid UTF-8 (see 'encoding')
        line = rawLine.decode(encoding)
        match = reGitRev.match(line)
        if match is not None:
          revision = match.group(1)
          print('[repro] Revision: %s\n' % revision)
        else:
          match = reReproLine.search(line)
          if match is not None:
            print('[repro] Repro line: %s\n' % match.group(1))
            testcase = match.group(2)
            reproLineWithoutMethod = match.group(3).strip()
            tests[testcase] = reproLineWithoutMethod
  except urllib.error.URLError as e:
    raise RuntimeError('ERROR: fetching %s : %s' % (url, e))

  if revision is None:  # fixed idiom: was 'revision == None'
    if reJenkinsURLWithoutConsoleText.match(url):
      print('[repro] Not a Jenkins log. Appending "/consoleText" and retrying ...\n')
      fetchAndParseJenkinsLog(url + '/consoleText')
    else:
      raise RuntimeError('ERROR: %s does not appear to be a Jenkins log.' % url)
  if 0 == len(tests):
    print('[repro] No "reproduce with" lines found; exiting.')
    sys.exit(0)
|
||||
|
||||
def prepareWorkspace():
  """Check out the parsed Git revision and clean the build.

  Marks gitCheckoutSucceeded so the caller knows the original branch must
  be restored. Raises RuntimeError when either command fails.
  """
  global gitCheckoutSucceeded
  checkoutCmd = 'git checkout %s' % revision
  if run(checkoutCmd) != 0:
    raise RuntimeError('ERROR: "git checkout %s" failed. See above. Maybe try "git pull"?' % revision)
  gitCheckoutSucceeded = True
  if run('ant clean') != 0:
    raise RuntimeError('ERROR: "ant clean" failed. See above.')
|
||||
|
||||
def groupTestsByModule():
  """Walk the workspace and group failed test suites by their source module.

  Fills the module-level 'modules' dict (module path -> set of suite names)
  from the module-level 'tests' dict, then prints the grouping.
  """
  for (dirpath, _, filenames) in os.walk('.'):
    for filename in filenames:
      javaMatch = reJavaFile.search(filename)
      if javaMatch is None:
        continue
      suite = javaMatch.group(1)
      if suite not in tests:
        continue
      module = reModule.match(dirpath).group(1)
      modules.setdefault(module, set()).add(suite)
  print('[repro] Test suites by module:')
  for module in modules:
    print('[repro] %s' % module)
    for suite in modules[module]:
      print('[repro] %s' % suite)
|
||||
|
||||
def runTests():
  """For each module with failed suites, compile its tests and rerun them.

  Runs each suite testIters times via 'ant test-nocompile'; run() records any
  non-zero exit in lastFailureCode. Raises RuntimeError on compile failure.
  """
  # Fixed: removed redundant 'global lastFailureCode' — this function never
  # assigns it; run() owns the bookkeeping.
  cwd = os.getcwd()
  testCmdline = 'ant test-nocompile -Dtests.dups=%d -Dtests.maxfailures=%d -Dtests.class="%s" -Dtests.showOutput=onerror %s'
  for module in modules:
    moduleTests = list(modules[module])
    testList = '|'.join('*.%s' % t for t in moduleTests)
    numTests = len(moduleTests)
    params = tests[moduleTests[0]] # Assumption: all tests in this module have the same cmdline params
    os.chdir(module)
    code = run('ant compile-test')
    try:
      if 0 != code:
        raise RuntimeError("ERROR: Compile failed in %s/ with code %d. See above." % (module, code))
      run(testCmdline % (testIters, testIters * numTests, testList, params))
    finally:
      # Always return to the workspace root, even when this module failed
      os.chdir(cwd)
|
||||
|
||||
def printReport():
  """Scan JUnit XML output under the build dirs and print failure counts.

  A suite counts as failed once per output file whose XML carries a
  non-zero errors= or failures= attribute.
  """
  failures = {}
  for buildRoot in ('lucene/build', 'solr/build'):
    for (dirpath, _, filenames) in os.walk(buildRoot):
      for filename in filenames:
        nameMatch = reTestOutputFile.search(filename)
        if nameMatch is None:
          continue
        testcase = nameMatch.group(1)
        failures.setdefault(testcase, 0)
        with open(os.path.join(dirpath, filename), encoding='UTF-8') as outputFile:
          # any() stops at the first matching line, like the original break
          if any(reErrorFailure.search(line) for line in outputFile):
            failures[testcase] += 1
  print("[repro] Failures:")
  for testcase in sorted(failures):
    print("[repro] %d/%d failed: %s" % (failures[testcase], testIters, testcase))
|
||||
|
||||
def rememberGitBranch():
  """Record the current git branch (or SHA when detached) in origGitBranch."""
  global origGitBranch
  branch = runOutput('git rev-parse --abbrev-ref HEAD')
  # 'HEAD' means detached HEAD state: remember the SHA instead of a branch name
  origGitBranch = runOutput('git rev-parse HEAD') if branch == 'HEAD' else branch
  print('[repro] Initial local git branch/revision: %s' % origGitBranch)
|
||||
|
||||
def main():
  """Entry point: parse args, fetch the Jenkins log, and rerun failed tests.

  Exits 0 on usage/help, 1 on any error during the repro run, and otherwise
  with the last non-zero test exit code (0 when everything reproduced clean).
  """
  if len(sys.argv) != 2 or reHelpArg.match(sys.argv[1]):
    print(usage)
    sys.exit(0)
  fetchAndParseJenkinsLog(sys.argv[1])
  rememberGitBranch()

  try:
    prepareWorkspace()
    groupTestsByModule()
    runTests()
    printReport()
  except Exception as e:
    print('[repro] %s' % e)
    sys.exit(1)
  finally:
    if gitCheckoutSucceeded:
      run('git checkout %s' % origGitBranch, rememberFailure=False) # Restore original git branch/sha

  print('[repro] Exiting with code %d' % lastFailureCode)
  sys.exit(lastFailureCode)
|
||||
|
||||
if __name__ == '__main__':
  # Let Ctrl-C end the run with a short message instead of a traceback
  try:
    main()
  except KeyboardInterrupt:
    print('[repro] Keyboard interrupt...exiting')
|
|
@ -32,6 +32,12 @@ API Changes
|
|||
* LUCENE-8012: Explanation now takes Number rather than float (Alan Woodward,
|
||||
Robert Muir)
|
||||
|
||||
* LUCENE-8116: SimScorer now only takes a frequency and a norm as per-document
|
||||
scoring factors. (Adrien Grand)
|
||||
|
||||
* LUCENE-8113: TermContext has been renamed to TermStates, and can now be
|
||||
constructed lazily if term statistics are not required (Alan Woodward)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-7837: Indices that were created before the previous major version
|
||||
|
@ -46,6 +52,9 @@ Changes in Runtime Behavior
|
|||
* LUCENE-7996: FunctionQuery and FunctionScoreQuery now return a score of 0
|
||||
when the function produces a negative value. (Adrien Grand)
|
||||
|
||||
* LUCENE-8116: Similarities now score fields that omit norms as if the norm was
|
||||
1. This might change score values on fields that omit norms. (Adrien Grand)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
|
||||
|
@ -110,16 +119,55 @@ Improvements
|
|||
|
||||
* LUCENE-8094: TermInSetQuery.toString now returns "field:(A B C)" (Mike McCandless)
|
||||
|
||||
* LUCENE-8121: UnifiedHighlighter passage relevancy is improved for terms that are
|
||||
position sensitive (e.g. part of a phrase) by having an accurate freq.
|
||||
(David Smiley)
|
||||
|
||||
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
|
||||
|
||||
* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
|
||||
(Ere Maijala)
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-8077: Fixed bug in how CheckIndex verifies doc-value iterators.
|
||||
(Xiaoshan Sun via Adrien Grand)
|
||||
|
||||
* SOLR-11758: Fixed FloatDocValues.boolVal to correctly return true for all values != 0.0F
|
||||
(Munendra S N via hossman)
|
||||
|
||||
* LUCENE-8121: The UnifiedHighlighter would highlight some terms within some nested
|
||||
SpanNearQueries at positions where it should not have. It's fixed in the UH by
|
||||
switching to the SpanCollector API. The original Highlighter still has this
|
||||
problem (LUCENE-2287, LUCENE-5455, LUCENE-6796). Some public but internal parts of
|
||||
the UH were refactored. (David Smiley, Steve Davids)
|
||||
|
||||
* LUCENE-8120: Fix LatLonBoundingBox's toString() method (Martijn van Groningen, Adrien Grand)
|
||||
|
||||
* LUCENE-8130: Fix NullPointerException from TermStates.toString() (Mike McCandless)
|
||||
|
||||
* LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to handle correctly
|
||||
hyphenation patterns with indicator >= 7. (Holger Bruch via Adrien Grand)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.
|
||||
(Kai Chan via Adrien Grand)
|
||||
|
||||
* LUCENE-8122: Upgrade analysis/icu to ICU 60.2. (Robert Muir)
|
||||
|
||||
* LUCENE-8106: Add script (reproduceJenkinsFailures.py) to attempt to reproduce
|
||||
failing tests from a Jenkins log. (Steve Rowe)
|
||||
|
||||
* LUCENE-8075: Removed unnecessary null check in IntersectTermsEnum.
|
||||
(Pulak Ghosh via Adrien Grand)
|
||||
|
||||
======================= Lucene 7.2.1 =======================
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-8117: Fix advanceExact on SortedNumericDocValues produced by Lucene54DocValues. (Jim Ferenczi).
|
||||
|
||||
======================= Lucene 7.2.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -19,12 +19,14 @@ FunctionScoreQuery maps negative values to 0.
|
|||
|
||||
## CustomScoreQuery, BoostedQuery and BoostingQuery removed (LUCENE-8099) ##
|
||||
|
||||
Instead use FunctionScoreQuery and a DoubleValuesSource implementation. For example,
|
||||
to replace the functionality of BoostedQuery, you could do the following, using
|
||||
the lucene-expressions module:
|
||||
Instead use FunctionScoreQuery and a DoubleValuesSource implementation. BoostedQuery
|
||||
and BoostingQuery may be replaced by calls to FunctionScoreQuery.boostByValue() and
|
||||
FunctionScoreQuery.boostByQuery(). To replace more complex calculations in
|
||||
CustomScoreQuery, use the lucene-expressions module:
|
||||
|
||||
SimpleBindings bindings = new SimpleBindings();
|
||||
bindings.add("score", DoubleValuesSource.SCORES);
|
||||
bindings.add("boost", DoubleValuesSource.fromIntField("myboostfield"));
|
||||
Expression expr = JavascriptCompiler.compile("score * boost");
|
||||
bindings.add("boost1", DoubleValuesSource.fromIntField("myboostfield"));
|
||||
bindings.add("boost2", DoubleValuesSource.fromIntField("myotherboostfield"));
|
||||
Expression expr = JavascriptCompiler.compile("score * (boost1 + ln(boost2))");
|
||||
FunctionScoreQuery q = new FunctionScoreQuery(inputQuery, expr.getDoubleValuesSource(bindings));
|
||||
|
|
|
@ -123,9 +123,6 @@ public final class CommonGramsFilter extends TokenFilter {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
|
|
|
@ -62,9 +62,6 @@ public final class CommonGramsQueryFilter extends TokenFilter {
|
|||
super(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
|
|
|
@ -89,7 +89,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
|
|||
StringBuilder buf = new StringBuilder();
|
||||
byte v = vspace.get(k++);
|
||||
while (v != 0) {
|
||||
char c = (char) ((v >>> 4) - 1 + '0');
|
||||
char c = (char) (((v & 0xf0 )>>> 4) - 1 + '0');
|
||||
buf.append(c);
|
||||
c = (char) (v & 0x0f);
|
||||
if (c == 0) {
|
||||
|
@ -151,7 +151,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
|
|||
StringBuilder buf = new StringBuilder();
|
||||
byte v = vspace.get(k++);
|
||||
while (v != 0) {
|
||||
char c = (char) ((v >>> 4) - 1);
|
||||
char c = (char) (((v & 0xf0 )>>> 4) - 1);
|
||||
buf.append(c);
|
||||
c = (char) (v & 0x0f);
|
||||
if (c == 0) {
|
||||
|
|
|
@ -204,9 +204,6 @@ public class FingerprintFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
|
|
|
@ -71,9 +71,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (!exhausted && input.incrementToken()) {
|
||||
|
@ -112,9 +109,6 @@ public final class HyphenatedWordsFilter extends TokenFilter {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
|
|
|
@ -43,9 +43,6 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
|
|||
super(in);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
while (input.incrementToken()) {
|
||||
|
@ -71,9 +68,6 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
|
|
|
@ -1,58 +1,58 @@
|
|||
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SparseFixedBitSet;
|
||||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s.
|
||||
* The data was created using ICU4J v59.1.0.0
|
||||
* <p>
|
||||
* Unicode version: 9.0.0.0
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
||||
/** Unicode version that was used to generate this file: {@value} */
|
||||
public static final String UNICODE_VERSION = "9.0.0.0";
|
||||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE = createBits(
|
||||
0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003,
|
||||
0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
|
||||
|
||||
private static Bits createBits(final int... codepoints) {
|
||||
final int len = codepoints[codepoints.length - 1] + 1;
|
||||
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
|
||||
for (int i : codepoints) bitset.set(i);
|
||||
return new Bits() {
|
||||
@Override
|
||||
public boolean get(int index) {
|
||||
return index < len && bitset.get(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return 0x10FFFF + 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SparseFixedBitSet;
|
||||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s.
|
||||
* The data was created using ICU4J v60.2.0.0
|
||||
* <p>
|
||||
* Unicode version: 10.0.0.0
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
||||
/** Unicode version that was used to generate this file: {@value} */
|
||||
public static final String UNICODE_VERSION = "10.0.0.0";
|
||||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE = createBits(
|
||||
0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003,
|
||||
0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000);
|
||||
|
||||
private static Bits createBits(final int... codepoints) {
|
||||
final int len = codepoints[codepoints.length - 1] + 1;
|
||||
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
|
||||
for (int i : codepoints) bitset.set(i);
|
||||
return new Bits() {
|
||||
@Override
|
||||
public boolean get(int index) {
|
||||
return index < len && bitset.get(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return 0x10FFFF + 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
@ -262,6 +262,21 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
public void testLucene8124() throws Exception {
|
||||
InputSource is = new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm());
|
||||
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
|
||||
.getHyphenationTree(is);
|
||||
|
||||
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
|
||||
whitespaceMockTokenizer(
|
||||
"Rindfleisch"),
|
||||
hyphenator);
|
||||
|
||||
// TODO Rindfleisch returned twice is another issue of the HyphenationCompoundTokenFilter
|
||||
assertTokenStreamContents(tf, new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"});
|
||||
}
|
||||
|
||||
|
||||
public static interface MockRetainAttribute extends Attribute {
|
||||
void setRetain(boolean attr);
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
|
||||
<!--
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<!--
|
||||
This file contains the hyphenation patterns for danish.
|
||||
Adapted from dkhyph.tex, dkcommon.tex and dkspecial.tex
|
||||
originally created by Frank Jensen (fj@iesd.auc.dk).
|
||||
FOP adaptation by Carlos Villegas (cav@uniscope.co.jp)
|
||||
-->
|
||||
<hyphenation-info>
|
||||
|
||||
<hyphen-char value="-"/>
|
||||
<hyphen-min before="2" after="2"/>
|
||||
|
||||
<classes>
|
||||
aA
|
||||
bB
|
||||
cC
|
||||
dD
|
||||
eE
|
||||
fF
|
||||
gG
|
||||
hH
|
||||
iI
|
||||
jJ
|
||||
kK
|
||||
lL
|
||||
mM
|
||||
nN
|
||||
oO
|
||||
pP
|
||||
qQ
|
||||
rR
|
||||
sS
|
||||
tT
|
||||
uU
|
||||
vV
|
||||
wW
|
||||
xX
|
||||
yY
|
||||
zZ
|
||||
æÆ
|
||||
øØ
|
||||
åÅ
|
||||
</classes>
|
||||
<patterns>
|
||||
d7f
|
||||
</patterns>
|
||||
</hyphenation-info>
|
|
@ -14,16 +14,21 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
|
||||
# This file is from ICU (with some small modifications, to avoid CJK dictionary break,
|
||||
# and status code change related to that)
|
||||
#
|
||||
# Copyright (C) 2002-2013, International Business Machines Corporation
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
@ -35,6 +40,7 @@
|
|||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
#
|
||||
|
@ -43,8 +49,9 @@
|
|||
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Newline = [\p{Word_Break = Newline} ];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
|
@ -57,6 +64,13 @@ $MidLetter = [\p{Word_Break = MidLetter}];
|
|||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$E_Base = [\p{Word_Break = EB}];
|
||||
$E_Modifier = [\p{Word_Break = EM}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
$EBG = [\p{Word_Break = EBG}];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
|
||||
|
||||
$Han = [:Han:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
|
@ -83,21 +97,21 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
|||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
||||
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
|
@ -108,12 +122,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
|
|||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ ($Extended_Pict | $EmojiNRK);
|
||||
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s), or is not a CJK dictionary character.
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
|
@ -123,6 +142,10 @@ $KatakanaEx {300}; # note: these status values override those from rule 5
|
|||
$HiraganaEx {300}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
$E_Base ($Extend | $Format | $ZWJ)*;
|
||||
$E_Modifier ($Extend | $Format | $ZWJ)*;
|
||||
$Extended_Pict ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
|
@ -170,9 +193,42 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
|
|||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
||||
|
||||
# rule 13c
|
||||
# rule 14
|
||||
# Do not break within emoji modifier sequences
|
||||
|
||||
$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
|
||||
|
||||
# rule 7b
|
||||
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
|
||||
# rule 13c
|
||||
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
|
|
@ -73,12 +73,14 @@
|
|||
0A4D>
|
||||
0ABC>
|
||||
0ACD>
|
||||
0AFD..0AFF>
|
||||
0B3C>
|
||||
0B4D>
|
||||
0BCD>
|
||||
0C4D>
|
||||
0CBC>
|
||||
0CCD>
|
||||
0D3B..0D3C>
|
||||
0D4D>
|
||||
0DCA>
|
||||
0E47..0E4C>
|
||||
|
@ -112,10 +114,10 @@
|
|||
1CD0..1CE8>
|
||||
1CED>
|
||||
1CF4>
|
||||
1CF8..1CF9>
|
||||
1CF7..1CF9>
|
||||
1D2C..1D6A>
|
||||
1DC4..1DCF>
|
||||
1DF5>
|
||||
1DF5..1DF9>
|
||||
1DFD..1DFF>
|
||||
1FBD>
|
||||
1FBF..1FC1>
|
||||
|
@ -175,7 +177,12 @@ FFE3>
|
|||
1163F>
|
||||
116B6..116B7>
|
||||
1172B>
|
||||
11A34>
|
||||
11A47>
|
||||
11A99>
|
||||
11C3F>
|
||||
11D42>
|
||||
11D44..11D45>
|
||||
16AF0..16AF4>
|
||||
16F8F..16F9F>
|
||||
1D167..1D169>
|
||||
|
|
|
@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
11C57>0037 # BHAIKSUKI DIGIT SEVEN
|
||||
11C58>0038 # BHAIKSUKI DIGIT EIGHT
|
||||
11C59>0039 # BHAIKSUKI DIGIT NINE
|
||||
11D50>0030 # MASARAM GONDI DIGIT ZERO
|
||||
11D51>0031 # MASARAM GONDI DIGIT ONE
|
||||
11D52>0032 # MASARAM GONDI DIGIT TWO
|
||||
11D53>0033 # MASARAM GONDI DIGIT THREE
|
||||
11D54>0034 # MASARAM GONDI DIGIT FOUR
|
||||
11D55>0035 # MASARAM GONDI DIGIT FIVE
|
||||
11D56>0036 # MASARAM GONDI DIGIT SIX
|
||||
11D57>0037 # MASARAM GONDI DIGIT SEVEN
|
||||
11D58>0038 # MASARAM GONDI DIGIT EIGHT
|
||||
11D59>0039 # MASARAM GONDI DIGIT NINE
|
||||
16A60>0030 # MRO DIGIT ZERO
|
||||
16A61>0031 # MRO DIGIT ONE
|
||||
16A62>0032 # MRO DIGIT TWO
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 1999-2016, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
|
@ -7,7 +9,7 @@
|
|||
#
|
||||
# Complete data for Unicode NFC normalization.
|
||||
|
||||
* Unicode 9.0.0
|
||||
* Unicode 10.0.0
|
||||
|
||||
# Canonical_Combining_Class (ccc) values
|
||||
0300..0314:230
|
||||
|
@ -164,6 +166,7 @@
|
|||
0C56:91
|
||||
0CBC:7
|
||||
0CCD:9
|
||||
0D3B..0D3C:9
|
||||
0D4D:9
|
||||
0DCA:9
|
||||
0E38..0E39:103
|
||||
|
@ -234,6 +237,9 @@
|
|||
1DCF:220
|
||||
1DD0:202
|
||||
1DD1..1DF5:230
|
||||
1DF6:232
|
||||
1DF7..1DF8:228
|
||||
1DF9:220
|
||||
1DFB:230
|
||||
1DFC:233
|
||||
1DFD:220
|
||||
|
@ -322,7 +328,12 @@ FE2E..FE2F:230
|
|||
116B6:9
|
||||
116B7:7
|
||||
1172B:9
|
||||
11A34:9
|
||||
11A47:9
|
||||
11A99:9
|
||||
11C3F:9
|
||||
11D42:7
|
||||
11D44..11D45:9
|
||||
16AF0..16AF4:1
|
||||
16B30..16B36:230
|
||||
1BC9E:1
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 1999-2016, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
|
@ -11,7 +13,7 @@
|
|||
# to NFKC one-way mappings.
|
||||
# Use this file as the second gennorm2 input file after nfc.txt.
|
||||
|
||||
* Unicode 9.0.0
|
||||
* Unicode 10.0.0
|
||||
|
||||
00A0>0020
|
||||
00A8>0020 0308
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2016 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 1999-2016, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfkc_cf.txt
|
||||
#
|
||||
|
@ -12,7 +12,7 @@
|
|||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
* Unicode 9.0.0
|
||||
* Unicode 10.0.0
|
||||
|
||||
0041>0061
|
||||
0042>0062
|
||||
|
|
|
@ -59,18 +59,34 @@ import com.ibm.icu.text.Normalizer2;
|
|||
* All foldings, case folding, and normalization mappings are applied recursively
|
||||
* to ensure a fully folded and normalized result.
|
||||
* </p>
|
||||
* <p>
|
||||
* A normalizer with additional settings such as a filter that lists characters not
|
||||
* to be normalized can be passed in the constructor.
|
||||
* </p>
|
||||
*/
|
||||
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
|
||||
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
|
||||
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
||||
private static final Normalizer2 normalizer = Normalizer2.getInstance(
|
||||
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
|
||||
"utr30", Normalizer2.Mode.COMPOSE);
|
||||
|
||||
/**
|
||||
* A normalizer for search term folding to Unicode text,
|
||||
* applying foldings from UTR#30 Character Foldings.
|
||||
*/
|
||||
public static final Normalizer2 NORMALIZER = Normalizer2.getInstance(
|
||||
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
|
||||
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
||||
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
|
||||
"utr30", Normalizer2.Mode.COMPOSE);
|
||||
|
||||
/**
|
||||
* Create a new ICUFoldingFilter on the specified input
|
||||
*/
|
||||
public ICUFoldingFilter(TokenStream input) {
|
||||
super(input, NORMALIZER);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new ICUFoldingFilter on the specified input with the specified
|
||||
* normalizer
|
||||
*/
|
||||
public ICUFoldingFilter(TokenStream input, Normalizer2 normalizer) {
|
||||
super(input, normalizer);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,7 +25,11 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory; // javadocs
|
|||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
import com.ibm.icu.text.FilteredNormalizer2;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Factory for {@link ICUFoldingFilter}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_folded" class="solr.TextField" positionIncrementGap="100">
|
||||
|
@ -37,18 +41,30 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* @since 3.1.0
|
||||
*/
|
||||
public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
|
||||
private final Normalizer2 normalizer;
|
||||
|
||||
/** Creates a new ICUFoldingFilterFactory */
|
||||
public ICUFoldingFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
|
||||
Normalizer2 normalizer = ICUFoldingFilter.NORMALIZER;
|
||||
String filter = get(args, "filter");
|
||||
if (filter != null) {
|
||||
UnicodeSet set = new UnicodeSet(filter);
|
||||
if (!set.isEmpty()) {
|
||||
set.freeze();
|
||||
normalizer = new FilteredNormalizer2(normalizer, set);
|
||||
}
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
this.normalizer = normalizer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new ICUFoldingFilter(input);
|
||||
return new ICUFoldingFilter(input, normalizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -16,152 +16,84 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Contain all the issues surrounding BreakIterators in ICU in one place.
|
||||
* Basically this boils down to the fact that they aren't very friendly to any
|
||||
* sort of OO design.
|
||||
* <p>
|
||||
* http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
|
||||
* BreakIterator from RuleBasedBreakIterator
|
||||
* <p>
|
||||
* DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
|
||||
* doesn't actually behave as a subclass: it always returns 0 for
|
||||
* getRuleStatus():
|
||||
* http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
|
||||
* tags
|
||||
* Wraps RuleBasedBreakIterator, making object reuse convenient and
|
||||
* emitting a rule status for emoji sequences.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
abstract class BreakIteratorWrapper {
|
||||
protected final CharArrayIterator textIterator = new CharArrayIterator();
|
||||
protected char text[];
|
||||
protected int start;
|
||||
protected int length;
|
||||
final class BreakIteratorWrapper {
|
||||
private final CharArrayIterator textIterator = new CharArrayIterator();
|
||||
private final RuleBasedBreakIterator rbbi;
|
||||
private char text[];
|
||||
private int start;
|
||||
private int status;
|
||||
|
||||
BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
|
||||
this.rbbi = rbbi;
|
||||
}
|
||||
|
||||
int current() {
|
||||
return rbbi.current();
|
||||
}
|
||||
|
||||
abstract int next();
|
||||
abstract int current();
|
||||
abstract int getRuleStatus();
|
||||
abstract void setText(CharacterIterator text);
|
||||
int getRuleStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
int next() {
|
||||
int current = rbbi.current();
|
||||
int next = rbbi.next();
|
||||
status = calcStatus(current, next);
|
||||
return next;
|
||||
}
|
||||
|
||||
/** Returns current rule status for the text between breaks. (determines token type) */
|
||||
private int calcStatus(int current, int next) {
|
||||
// to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
|
||||
// https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
|
||||
if (next != BreakIterator.DONE && isEmoji(current, next)) {
|
||||
return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
|
||||
} else {
|
||||
return rbbi.getRuleStatus();
|
||||
}
|
||||
}
|
||||
|
||||
// See unicode doc L2/16-315 and also the RBBI rules for rationale.
|
||||
// we don't include regional indicators here, because they aren't ambiguous for tagging,
|
||||
// they need only be treated special for segmentation.
|
||||
static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
|
||||
|
||||
/** Returns true if the current text represents emoji character or sequence */
|
||||
private boolean isEmoji(int current, int next) {
|
||||
int begin = start + current;
|
||||
int end = start + next;
|
||||
int codepoint = UTF16.charAt(text, 0, end, begin);
|
||||
// TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
|
||||
if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
|
||||
if (EMOJI_RK.contains(codepoint)) {
|
||||
// if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
|
||||
// an emoji presentation selector or keycap follows.
|
||||
int trailer = begin + Character.charCount(codepoint);
|
||||
return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void setText(char text[], int start, int length) {
|
||||
this.text = text;
|
||||
this.start = start;
|
||||
this.length = length;
|
||||
textIterator.setText(text, start, length);
|
||||
setText(textIterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
|
||||
* any other BreakIterator, the rulestatus method is not available, so treat
|
||||
* it like a generic BreakIterator.
|
||||
*/
|
||||
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
|
||||
if (breakIterator instanceof RuleBasedBreakIterator)
|
||||
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
|
||||
else
|
||||
return new BIWrapper(breakIterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
|
||||
* a DictionaryBasedBreakIterator) behaves correctly.
|
||||
*/
|
||||
static final class RBBIWrapper extends BreakIteratorWrapper {
|
||||
private final RuleBasedBreakIterator rbbi;
|
||||
|
||||
RBBIWrapper(RuleBasedBreakIterator rbbi) {
|
||||
this.rbbi = rbbi;
|
||||
}
|
||||
|
||||
@Override
|
||||
int current() {
|
||||
return rbbi.current();
|
||||
}
|
||||
|
||||
@Override
|
||||
int getRuleStatus() {
|
||||
return rbbi.getRuleStatus();
|
||||
}
|
||||
|
||||
@Override
|
||||
int next() {
|
||||
return rbbi.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
void setText(CharacterIterator text) {
|
||||
rbbi.setText(text);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic BreakIterator wrapper: Either the rulestatus method is not
|
||||
* available or always returns 0. Calculate a rulestatus here so it behaves
|
||||
* like RuleBasedBreakIterator.
|
||||
*
|
||||
* Note: This is slower than RuleBasedBreakIterator.
|
||||
*/
|
||||
static final class BIWrapper extends BreakIteratorWrapper {
|
||||
private final BreakIterator bi;
|
||||
private int status;
|
||||
|
||||
BIWrapper(BreakIterator bi) {
|
||||
this.bi = bi;
|
||||
}
|
||||
|
||||
@Override
|
||||
int current() {
|
||||
return bi.current();
|
||||
}
|
||||
|
||||
@Override
|
||||
int getRuleStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
@Override
|
||||
int next() {
|
||||
int current = bi.current();
|
||||
int next = bi.next();
|
||||
status = calcStatus(current, next);
|
||||
return next;
|
||||
}
|
||||
|
||||
private int calcStatus(int current, int next) {
|
||||
if (current == BreakIterator.DONE || next == BreakIterator.DONE)
|
||||
return RuleBasedBreakIterator.WORD_NONE;
|
||||
|
||||
int begin = start + current;
|
||||
int end = start + next;
|
||||
|
||||
int codepoint;
|
||||
for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
|
||||
codepoint = UTF16.charAt(text, 0, end, begin);
|
||||
|
||||
if (UCharacter.isDigit(codepoint))
|
||||
return RuleBasedBreakIterator.WORD_NUMBER;
|
||||
else if (UCharacter.isLetter(codepoint)) {
|
||||
// TODO: try to separately specify ideographic, kana?
|
||||
// [currently all bundled as letter for this case]
|
||||
return RuleBasedBreakIterator.WORD_LETTER;
|
||||
}
|
||||
}
|
||||
|
||||
return RuleBasedBreakIterator.WORD_NONE;
|
||||
}
|
||||
|
||||
@Override
|
||||
void setText(CharacterIterator text) {
|
||||
bi.setText(text);
|
||||
status = RuleBasedBreakIterator.WORD_NONE;
|
||||
}
|
||||
rbbi.setText(textIterator);
|
||||
status = RuleBasedBreakIterator.WORD_NONE;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -123,7 +123,7 @@ final class CompositeBreakIterator {
|
|||
|
||||
private BreakIteratorWrapper getBreakIterator(int scriptCode) {
|
||||
if (wordBreakers[scriptCode] == null)
|
||||
wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
|
||||
wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
|
||||
return wordBreakers[scriptCode];
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,6 +52,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
|
||||
/** Token type for words that appear to be numbers */
|
||||
public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
|
||||
/** Token type for words that appear to be emoji sequences */
|
||||
public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
|
||||
|
||||
/*
|
||||
* the default breakiterators in use. these can be expensive to
|
||||
|
@ -65,9 +67,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
||||
|
||||
// the same as ROOT, except no dictionary segmentation for cjk
|
||||
private static final BreakIterator defaultBreakIterator =
|
||||
private static final RuleBasedBreakIterator defaultBreakIterator =
|
||||
readBreakIterator("Default.brk");
|
||||
private static final BreakIterator myanmarSyllableIterator =
|
||||
private static final RuleBasedBreakIterator myanmarSyllableIterator =
|
||||
readBreakIterator("MyanmarSyllable.brk");
|
||||
|
||||
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
||||
|
@ -95,16 +97,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
}
|
||||
|
||||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
public RuleBasedBreakIterator getBreakIterator(int script) {
|
||||
switch(script) {
|
||||
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
||||
case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
|
||||
case UScript.MYANMAR:
|
||||
if (myanmarAsWords) {
|
||||
return (BreakIterator)defaultBreakIterator.clone();
|
||||
return (RuleBasedBreakIterator)defaultBreakIterator.clone();
|
||||
} else {
|
||||
return (BreakIterator)myanmarSyllableIterator.clone();
|
||||
return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
|
||||
}
|
||||
default: return (BreakIterator)defaultBreakIterator.clone();
|
||||
default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -119,6 +121,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
|
||||
case RuleBasedBreakIterator.WORD_NUMBER:
|
||||
return WORD_NUMBER;
|
||||
case EMOJI_SEQUENCE_STATUS:
|
||||
return WORD_EMOJI;
|
||||
default: /* some other custom code */
|
||||
return "<OTHER>";
|
||||
}
|
||||
|
|
|
@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
|
|||
*/
|
||||
private boolean incrementTokenBuffer() {
|
||||
int start = breaker.current();
|
||||
if (start == BreakIterator.DONE)
|
||||
return false; // BreakIterator exhausted
|
||||
assert start != BreakIterator.DONE;
|
||||
|
||||
// find the next set of boundaries, skipping over non-tokens (rule status 0)
|
||||
int end = breaker.next();
|
||||
while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
|
||||
while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
|
||||
start = end;
|
||||
end = breaker.next();
|
||||
}
|
||||
|
||||
if (start == BreakIterator.DONE)
|
||||
if (end == BreakIterator.DONE) {
|
||||
return false; // BreakIterator exhausted
|
||||
}
|
||||
|
||||
termAtt.copyBuffer(buffer, start, end - start);
|
||||
offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
|
||||
|
|
|
@ -16,8 +16,7 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
|
||||
/**
|
||||
* Class that allows for tailored Unicode Text Segmentation on
|
||||
|
@ -25,14 +24,16 @@ import com.ibm.icu.text.BreakIterator;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class ICUTokenizerConfig {
|
||||
|
||||
/** Rule status for emoji sequences */
|
||||
public static final int EMOJI_SEQUENCE_STATUS = 299;
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass
|
||||
* constructors, typically implicit.)
|
||||
*/
|
||||
public ICUTokenizerConfig() {}
|
||||
/** Return a breakiterator capable of processing a given script. */
|
||||
public abstract BreakIterator getBreakIterator(int script);
|
||||
public abstract RuleBasedBreakIterator getBreakIterator(int script);
|
||||
/** Return a token type value for a given script and BreakIterator
|
||||
* rule status. */
|
||||
public abstract String getType(int script, int ruleStatus);
|
||||
|
|
|
@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
|
||||
|
||||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
public RuleBasedBreakIterator getBreakIterator(int script) {
|
||||
if (breakers[script] != null) {
|
||||
return (BreakIterator) breakers[script].clone();
|
||||
return (RuleBasedBreakIterator) breakers[script].clone();
|
||||
} else {
|
||||
return super.getBreakIterator(script);
|
||||
}
|
||||
|
|
|
@ -353,7 +353,7 @@ and
|
|||
<h1><a name="backcompat">Backwards Compatibility</a></h1>
|
||||
<p>
|
||||
This module exists to provide up-to-date Unicode functionality that supports
|
||||
the most recent version of Unicode (currently 8.0). However, some users who wish
|
||||
the most recent version of Unicode (currently 10.0). However, some users who wish
|
||||
for stronger backwards compatibility can restrict
|
||||
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
|
||||
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
|
||||
/** basic tests for {@link ICUFoldingFilterFactory} */
|
||||
public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
|
||||
|
||||
|
||||
/** basic tests to ensure the folding is working */
|
||||
public void test() throws Exception {
|
||||
Reader reader = new StringReader("Résumé");
|
||||
|
@ -35,7 +35,24 @@ public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase {
|
|||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "resume" });
|
||||
}
|
||||
|
||||
|
||||
/** test to ensure the filter parameter is working */
|
||||
public void testFilter() throws Exception {
|
||||
HashMap<String,String> args = new HashMap<String,String>();
|
||||
args.put("filter", "[^ö]");
|
||||
ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(args);
|
||||
|
||||
Reader reader = new StringReader("Résumé");
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "resume" });
|
||||
|
||||
reader = new StringReader("Fönster");
|
||||
stream = whitespaceMockTokenizer(reader);
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] { "fönster" });
|
||||
}
|
||||
|
||||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
|
||||
|
|
|
@ -16,13 +16,10 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
|
||||
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
|
||||
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
@ -76,8 +73,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
|
||||
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
return new TokenStreamComponents(tokenizer);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -90,8 +86,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testArmenian() throws Exception {
|
||||
assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
||||
new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
|
||||
new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
|
||||
}
|
||||
|
||||
public void testAmharic() throws Exception {
|
||||
|
@ -102,12 +98,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
public void testArabic() throws Exception {
|
||||
assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
|
||||
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
|
||||
"بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
|
||||
"بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
|
||||
}
|
||||
|
||||
public void testAramaic() throws Exception {
|
||||
assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
|
||||
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
|
||||
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
|
||||
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
|
||||
}
|
||||
|
||||
|
@ -125,7 +121,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testGreek() throws Exception {
|
||||
assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
|
||||
new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
|
||||
new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
|
||||
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
||||
}
|
||||
|
||||
|
@ -156,7 +152,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
*/
|
||||
public void testChinese() throws Exception {
|
||||
assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
|
||||
new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
|
||||
new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
|
||||
}
|
||||
|
||||
public void testHebrew() throws Exception {
|
||||
|
@ -186,8 +182,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
/* Tests from StandardAnalyzer, just to show behavior is similar */
|
||||
public void testAlphanumericSA() throws Exception {
|
||||
// alphanumeric tokens
|
||||
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
|
||||
assertAnalyzesTo(a, "2B", new String[]{"2b"});
|
||||
assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
|
||||
assertAnalyzesTo(a, "2B", new String[]{"2B"});
|
||||
}
|
||||
|
||||
public void testDelimitersSA() throws Exception {
|
||||
|
@ -199,34 +195,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testApostrophesSA() throws Exception {
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
|
||||
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
assertAnalyzesTo(a, "she's", new String[]{"she's"});
|
||||
assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
|
||||
assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
|
||||
assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
|
||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
|
||||
}
|
||||
|
||||
public void testNumericSA() throws Exception {
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
// every other segment must have at least one digit
|
||||
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
|
||||
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
|
||||
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
}
|
||||
|
||||
public void testTextWithNumbersSA() throws Exception {
|
||||
// numbers
|
||||
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
|
||||
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
|
||||
}
|
||||
|
||||
public void testVariousTextSA() throws Exception {
|
||||
// various
|
||||
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
|
||||
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
|
||||
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
|
||||
}
|
||||
|
||||
public void testKoreanSA() throws Exception {
|
||||
|
@ -242,14 +238,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testOffsets() throws Exception {
|
||||
assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"david", "has", "5000", "bones"},
|
||||
new String[] {"David", "has", "5000", "bones"},
|
||||
new int[] {0, 6, 10, 15},
|
||||
new int[] {5, 9, 14, 20});
|
||||
}
|
||||
|
||||
public void testTypes() throws Exception {
|
||||
assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"david", "has", "5000", "bones"},
|
||||
new String[] {"David", "has", "5000", "bones"},
|
||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
|
@ -265,6 +261,61 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
|
||||
}
|
||||
|
||||
/** simple emoji */
|
||||
public void testEmoji() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
|
||||
new String[] { "💩", "💩", "💩" },
|
||||
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
|
||||
}
|
||||
|
||||
/** emoji zwj sequence */
|
||||
public void testEmojiSequence() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
|
||||
new String[] { "👩❤️👩" },
|
||||
new String[] { "<EMOJI>" });
|
||||
}
|
||||
|
||||
/** emoji zwj sequence with fitzpatrick modifier */
|
||||
public void testEmojiSequenceWithModifier() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
|
||||
new String[] { "👨🏼⚕️" },
|
||||
new String[] { "<EMOJI>" });
|
||||
}
|
||||
|
||||
/** regional indicator */
|
||||
public void testEmojiRegionalIndicator() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
|
||||
new String[] { "🇺🇸", "🇺🇸" },
|
||||
new String[] { "<EMOJI>", "<EMOJI>" });
|
||||
}
|
||||
|
||||
/** variation sequence */
|
||||
public void testEmojiVariationSequence() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
|
||||
new String[] { "#️⃣" },
|
||||
new String[] { "<EMOJI>" });
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
|
||||
new String[] { "3️⃣",},
|
||||
new String[] { "<EMOJI>" });
|
||||
}
|
||||
|
||||
public void testEmojiTagSequence() throws Exception {
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
|
||||
new String[] { "🏴" },
|
||||
new String[] { "<EMOJI>" });
|
||||
}
|
||||
|
||||
public void testEmojiTokenization() throws Exception {
|
||||
// simple emoji around latin
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
|
||||
new String[] { "poo", "💩", "poo" },
|
||||
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
|
||||
// simple emoji around non-latin
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
|
||||
new String[] { "💩", "中", "國", "💩" },
|
||||
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||
|
|
|
@ -78,6 +78,15 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
|
|||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* dictionary segmentation with emoji
|
||||
*/
|
||||
public void testSimpleJapaneseWithEmoji() throws Exception {
|
||||
assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
|
||||
new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩" }
|
||||
);
|
||||
}
|
||||
|
||||
public void testJapaneseTypes() throws Exception {
|
||||
assertAnalyzesTo(a, "仮名遣い カタカナ",
|
||||
new String[] { "仮名遣い", "カタカナ" },
|
||||
|
|
|
@ -62,9 +62,9 @@ import java.util.regex.Pattern;
|
|||
*/
|
||||
public class GenerateUTR30DataFiles {
|
||||
private static final String ICU_SVN_TAG_URL
|
||||
= "http://source.icu-project.org/repos/icu/icu/tags";
|
||||
private static final String ICU_RELEASE_TAG = "release-58-1";
|
||||
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
|
||||
= "http://source.icu-project.org/repos/icu/tags";
|
||||
private static final String ICU_RELEASE_TAG = "release-60-2";
|
||||
private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
|
||||
private static final String NFC_TXT = "nfc.txt";
|
||||
private static final String NFKC_TXT = "nfkc.txt";
|
||||
private static final String NFKC_CF_TXT = "nfkc_cf.txt";
|
||||
|
|
|
@ -166,9 +166,6 @@ public class JapaneseIterationMarkCharFilter extends CharFilter {
|
|||
buffer.reset(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public int read(char[] buffer, int offset, int length) throws IOException {
|
||||
int read = 0;
|
||||
|
@ -185,9 +182,6 @@ public class JapaneseIterationMarkCharFilter extends CharFilter {
|
|||
return read == 0 ? -1 : read;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public int read() throws IOException {
|
||||
int ic = buffer.get(bufferPosition);
|
||||
|
|
|
@ -293,7 +293,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
"7.1.0-cfs",
|
||||
"7.1.0-nocfs",
|
||||
"7.2.0-cfs",
|
||||
"7.2.0-nocfs"
|
||||
"7.2.0-nocfs",
|
||||
"7.2.1-cfs",
|
||||
"7.2.1-nocfs"
|
||||
};
|
||||
|
||||
public static String[] getOldNames() {
|
||||
|
@ -304,7 +306,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
|||
"sorted.7.0.0",
|
||||
"sorted.7.0.1",
|
||||
"sorted.7.1.0",
|
||||
"sorted.7.2.0"
|
||||
"sorted.7.2.0",
|
||||
"sorted.7.2.1"
|
||||
};
|
||||
|
||||
public static String[] getOldSortedNames() {
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -99,17 +99,11 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
|
|||
this.query = query;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
|
||||
return assignClassNormalizedList(inputDocument).get(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
|
||||
|
@ -117,9 +111,6 @@ public class BM25NBClassifier implements Classifier<BytesRef> {
|
|||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
|
||||
|
|
|
@ -195,9 +195,6 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<Boolean> assignClass(String text)
|
||||
throws IOException {
|
||||
|
@ -220,18 +217,12 @@ public class BooleanPerceptronClassifier implements Classifier<Boolean> {
|
|||
return new ClassificationResult<>(output >= bias, score);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<Boolean>> getClasses(String text)
|
||||
throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<Boolean>> getClasses(String text, int max)
|
||||
throws IOException {
|
||||
|
|
|
@ -103,9 +103,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
|
@ -121,9 +118,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
return assignedClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
|
@ -132,9 +126,6 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
|
@ -213,7 +204,7 @@ public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
|||
", classFieldName='" + classFieldName + '\'' +
|
||||
", k=" + k +
|
||||
", query=" + query +
|
||||
", similarity=" + indexSearcher.getSimilarity(true) +
|
||||
", similarity=" + indexSearcher.getSimilarity() +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
@ -119,9 +119,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
|
||||
return classifyFromTopDocs(knnSearch(text));
|
||||
|
@ -143,9 +140,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
return assignedClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
|
@ -154,9 +148,6 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
|
@ -251,7 +242,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
", classFieldName='" + classFieldName + '\'' +
|
||||
", k=" + k +
|
||||
", query=" + query +
|
||||
", similarity=" + indexSearcher.getSimilarity(true) +
|
||||
", similarity=" + indexSearcher.getSimilarity() +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
|
|
@ -98,9 +98,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
this.query = query;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(inputDocument);
|
||||
|
@ -115,9 +112,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
return assignedClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
|
||||
|
@ -125,9 +119,6 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
|
|||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
|
||||
|
|
|
@ -72,17 +72,11 @@ public class KNearestNeighborDocumentClassifier extends KNearestNeighborClassifi
|
|||
this.field2analyzer = field2analyzer;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(Document document) throws IOException {
|
||||
return classifyFromTopDocs(knnSearch(document));
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(Document document) throws IOException {
|
||||
TopDocs knnResults = knnSearch(document);
|
||||
|
@ -91,9 +85,6 @@ public class KNearestNeighborDocumentClassifier extends KNearestNeighborClassifi
|
|||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(Document document, int max) throws IOException {
|
||||
TopDocs knnResults = knnSearch(document);
|
||||
|
|
|
@ -71,9 +71,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
|
|||
this.field2analyzer = field2analyzer;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(Document document) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
|
||||
|
@ -88,9 +85,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
|
|||
return assignedClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(Document document) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
|
||||
|
@ -98,9 +92,6 @@ public class SimpleNaiveBayesDocumentClassifier extends SimpleNaiveBayesClassifi
|
|||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(Document document, int max) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignNormClasses(document);
|
||||
|
|
|
@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
|
@ -210,20 +210,20 @@ public class NearestFuzzyQuery extends Query {
|
|||
}
|
||||
|
||||
private Query newTermQuery(IndexReader reader, Term term) throws IOException {
|
||||
// we build an artificial TermContext that will give an overall df and ttf
|
||||
// we build an artificial TermStates that will give an overall df and ttf
|
||||
// equal to 1
|
||||
TermContext context = new TermContext(reader.getContext());
|
||||
TermStates termStates = new TermStates(reader.getContext());
|
||||
for (LeafReaderContext leafContext : reader.leaves()) {
|
||||
Terms terms = leafContext.reader().terms(term.field());
|
||||
if (terms != null) {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
if (termsEnum.seekExact(term.bytes())) {
|
||||
int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
|
||||
context.register(termsEnum.termState(), leafContext.ord, freq, freq);
|
||||
int freq = 1 - termStates.docFreq(); // we want the total df and ttf to be 1
|
||||
termStates.register(termsEnum.termState(), leafContext.ord, freq, freq);
|
||||
}
|
||||
}
|
||||
}
|
||||
return new TermQuery(term, context);
|
||||
return new TermQuery(term, termStates);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1309,7 +1309,8 @@ ant test "-Dtests.method=*esi*"
|
|||
ant test -Dtests.seed=DEADBEEF
|
||||
|
||||
# Repeats _all_ tests of ClassName N times. Every test repetition
|
||||
# will have a different seed.
|
||||
# will have a different seed. NOTE: does not reinitialize
|
||||
# between repetitions, use only for idempotent tests.
|
||||
ant test -Dtests.iters=N -Dtestcase=ClassName
|
||||
|
||||
# Repeats _all_ tests of ClassName N times. Every test repetition
|
||||
|
|
|
@ -54,6 +54,8 @@ public final class StandardTokenizer extends Tokenizer {
|
|||
public static final int KATAKANA = 5;
|
||||
/** Hangul token type */
|
||||
public static final int HANGUL = 6;
|
||||
/** Emoji token type. */
|
||||
public static final int EMOJI = 7;
|
||||
|
||||
/** String token types that correspond to token type int constants */
|
||||
public static final String [] TOKEN_TYPES = new String [] {
|
||||
|
@ -63,7 +65,8 @@ public final class StandardTokenizer extends Tokenizer {
|
|||
"<IDEOGRAPHIC>",
|
||||
"<HIRAGANA>",
|
||||
"<KATAKANA>",
|
||||
"<HANGUL>"
|
||||
"<HANGUL>",
|
||||
"<EMOJI>"
|
||||
};
|
||||
|
||||
/** Absolute maximum sized token */
|
||||
|
|
|
@ -103,11 +103,8 @@ final class IntersectTermsEnum extends TermsEnum {
|
|||
arcs[arcIdx] = new FST.Arc<>();
|
||||
}
|
||||
|
||||
if (fr.index == null) {
|
||||
fstReader = null;
|
||||
} else {
|
||||
fstReader = fr.index.getBytesReader();
|
||||
}
|
||||
|
||||
fstReader = fr.index.getBytesReader();
|
||||
|
||||
// TODO: if the automaton is "smallish" we really
|
||||
// should use the terms index to seek at least to
|
||||
|
|
|
@ -17,34 +17,37 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Maintains a {@link IndexReader} {@link TermState} view over
|
||||
* {@link IndexReader} instances containing a single term. The
|
||||
* {@link TermContext} doesn't track if the given {@link TermState}
|
||||
* {@link TermStates} doesn't track if the given {@link TermState}
|
||||
* objects are valid, neither if the {@link TermState} instances refer to the
|
||||
* same terms in the associated readers.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class TermContext {
|
||||
public final class TermStates {
|
||||
|
||||
private static final TermState EMPTY_TERMSTATE = new TermState() {
|
||||
@Override
|
||||
public void copyFrom(TermState other) {
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
// Important: do NOT keep hard references to index readers
|
||||
private final Object topReaderContextIdentity;
|
||||
private final TermState[] states;
|
||||
private final Term term; // null if stats are to be used
|
||||
private int docFreq;
|
||||
private long totalTermFreq;
|
||||
|
||||
//public static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
|
||||
|
||||
/**
|
||||
* Creates an empty {@link TermContext} from a {@link IndexReaderContext}
|
||||
*/
|
||||
public TermContext(IndexReaderContext context) {
|
||||
private TermStates(Term term, IndexReaderContext context) {
|
||||
assert context != null && context.isTopLevel;
|
||||
topReaderContextIdentity = context.identity;
|
||||
docFreq = 0;
|
||||
|
@ -56,10 +59,18 @@ public final class TermContext {
|
|||
len = context.leaves().size();
|
||||
}
|
||||
states = new TermState[len];
|
||||
this.term = term;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Return whether this {@link TermContext} was built for the given
|
||||
* Creates an empty {@link TermStates} from a {@link IndexReaderContext}
|
||||
*/
|
||||
public TermStates(IndexReaderContext context) {
|
||||
this(null, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Return whether this {@link TermStates} was built for the given
|
||||
* {@link IndexReaderContext}. This is typically used for assertions.
|
||||
* @lucene.internal
|
||||
*/
|
||||
|
@ -68,35 +79,35 @@ public final class TermContext {
|
|||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TermContext} with an initial {@link TermState},
|
||||
* Creates a {@link TermStates} with an initial {@link TermState},
|
||||
* {@link IndexReader} pair.
|
||||
*/
|
||||
public TermContext(IndexReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
|
||||
this(context);
|
||||
public TermStates(IndexReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
|
||||
this(null, context);
|
||||
register(state, ord, docFreq, totalTermFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TermContext} from a top-level {@link IndexReaderContext} and the
|
||||
* Creates a {@link TermStates} from a top-level {@link IndexReaderContext} and the
|
||||
* given {@link Term}. This method will lookup the given term in all context's leaf readers
|
||||
* and register each of the readers containing the term in the returned {@link TermContext}
|
||||
* and register each of the readers containing the term in the returned {@link TermStates}
|
||||
* using the leaf reader's ordinal.
|
||||
* <p>
|
||||
* Note: the given context must be a top-level context.
|
||||
*
|
||||
* @param needsStats if {@code true} then all leaf contexts will be visited up-front to
|
||||
* collect term statistics. Otherwise, the {@link TermState} objects
|
||||
* will be built only when requested
|
||||
*/
|
||||
public static TermContext build(IndexReaderContext context, Term term)
|
||||
public static TermStates build(IndexReaderContext context, Term term, boolean needsStats)
|
||||
throws IOException {
|
||||
assert context != null && context.isTopLevel;
|
||||
final String field = term.field();
|
||||
final BytesRef bytes = term.bytes();
|
||||
final TermContext perReaderTermState = new TermContext(context);
|
||||
//if (DEBUG) System.out.println("prts.build term=" + term);
|
||||
for (final LeafReaderContext ctx : context.leaves()) {
|
||||
//if (DEBUG) System.out.println(" r=" + leaves[i].reader);
|
||||
final Terms terms = ctx.reader().terms(field);
|
||||
if (terms != null) {
|
||||
final TermsEnum termsEnum = terms.iterator();
|
||||
if (termsEnum.seekExact(bytes)) {
|
||||
final TermStates perReaderTermState = new TermStates(needsStats ? null : term, context);
|
||||
if (needsStats) {
|
||||
for (final LeafReaderContext ctx : context.leaves()) {
|
||||
//if (DEBUG) System.out.println(" r=" + leaves[i].reader);
|
||||
TermsEnum termsEnum = loadTermsEnum(ctx, term);
|
||||
if (termsEnum != null) {
|
||||
final TermState termState = termsEnum.termState();
|
||||
//if (DEBUG) System.out.println(" found");
|
||||
perReaderTermState.register(termState, ctx.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
|
@ -106,8 +117,19 @@ public final class TermContext {
|
|||
return perReaderTermState;
|
||||
}
|
||||
|
||||
private static TermsEnum loadTermsEnum(LeafReaderContext ctx, Term term) throws IOException {
|
||||
final Terms terms = ctx.reader().terms(term.field());
|
||||
if (terms != null) {
|
||||
final TermsEnum termsEnum = terms.iterator();
|
||||
if (termsEnum.seekExact(term.bytes())) {
|
||||
return termsEnum;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears the {@link TermContext} internal state and removes all
|
||||
* Clears the {@link TermStates} internal state and removes all
|
||||
* registered {@link TermState}s
|
||||
*/
|
||||
public void clear() {
|
||||
|
@ -149,17 +171,25 @@ public final class TermContext {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the {@link TermState} for an leaf ordinal or <code>null</code> if no
|
||||
* {@link TermState} for the ordinal was registered.
|
||||
* Returns the {@link TermState} for a leaf reader context or <code>null</code> if no
|
||||
* {@link TermState} for the context was registered.
|
||||
*
|
||||
* @param ord
|
||||
* the readers leaf ordinal to get the {@link TermState} for.
|
||||
* @param ctx
|
||||
* the {@link LeafReaderContext} to get the {@link TermState} for.
|
||||
* @return the {@link TermState} for the given readers ord or <code>null</code> if no
|
||||
* {@link TermState} for the reader was registered
|
||||
*/
|
||||
public TermState get(int ord) {
|
||||
assert ord >= 0 && ord < states.length;
|
||||
return states[ord];
|
||||
public TermState get(LeafReaderContext ctx) throws IOException {
|
||||
assert ctx.ord >= 0 && ctx.ord < states.length;
|
||||
if (term == null)
|
||||
return states[ctx.ord];
|
||||
if (this.states[ctx.ord] == null) {
|
||||
TermsEnum te = loadTermsEnum(ctx, term);
|
||||
this.states[ctx.ord] = te == null ? EMPTY_TERMSTATE : te.termState();
|
||||
}
|
||||
if (this.states[ctx.ord] == EMPTY_TERMSTATE)
|
||||
return null;
|
||||
return this.states[ctx.ord];
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -169,6 +199,9 @@ public final class TermContext {
|
|||
* instances passed to {@link #register(TermState, int, int, long)}.
|
||||
*/
|
||||
public int docFreq() {
|
||||
if (term != null) {
|
||||
throw new IllegalStateException("Cannot call docFreq() when needsStats=false");
|
||||
}
|
||||
return docFreq;
|
||||
}
|
||||
|
||||
|
@ -179,19 +212,23 @@ public final class TermContext {
|
|||
* instances passed to {@link #register(TermState, int, int, long)}.
|
||||
*/
|
||||
public long totalTermFreq() {
|
||||
if (term != null) {
|
||||
throw new IllegalStateException("Cannot call totalTermFreq() when needsStats=false");
|
||||
}
|
||||
return totalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("TermContext\n");
|
||||
sb.append("TermStates\n");
|
||||
for(TermState termState : states) {
|
||||
sb.append(" state=");
|
||||
sb.append(termState.toString());
|
||||
sb.append(termState);
|
||||
sb.append('\n');
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -25,7 +25,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -53,7 +53,7 @@ public final class BlendedTermQuery extends Query {
|
|||
private int numTerms = 0;
|
||||
private Term[] terms = new Term[0];
|
||||
private float[] boosts = new float[0];
|
||||
private TermContext[] contexts = new TermContext[0];
|
||||
private TermStates[] contexts = new TermStates[0];
|
||||
private RewriteMethod rewriteMethod = DISJUNCTION_MAX_REWRITE;
|
||||
|
||||
/** Sole constructor. */
|
||||
|
@ -82,10 +82,10 @@ public final class BlendedTermQuery extends Query {
|
|||
|
||||
/**
|
||||
* Expert: Add a {@link Term} with the provided boost and context.
|
||||
* This method is useful if you already have a {@link TermContext}
|
||||
* This method is useful if you already have a {@link TermStates}
|
||||
* object constructed for the given term.
|
||||
*/
|
||||
public Builder add(Term term, float boost, TermContext context) {
|
||||
public Builder add(Term term, float boost, TermStates context) {
|
||||
if (numTerms >= BooleanQuery.getMaxClauseCount()) {
|
||||
throw new BooleanQuery.TooManyClauses();
|
||||
}
|
||||
|
@ -184,10 +184,10 @@ public final class BlendedTermQuery extends Query {
|
|||
|
||||
private final Term[] terms;
|
||||
private final float[] boosts;
|
||||
private final TermContext[] contexts;
|
||||
private final TermStates[] contexts;
|
||||
private final RewriteMethod rewriteMethod;
|
||||
|
||||
private BlendedTermQuery(Term[] terms, float[] boosts, TermContext[] contexts,
|
||||
private BlendedTermQuery(Term[] terms, float[] boosts, TermStates[] contexts,
|
||||
RewriteMethod rewriteMethod) {
|
||||
assert terms.length == boosts.length;
|
||||
assert terms.length == contexts.length;
|
||||
|
@ -205,7 +205,7 @@ public final class BlendedTermQuery extends Query {
|
|||
terms[i] = terms[j];
|
||||
terms[j] = tmpTerm;
|
||||
|
||||
TermContext tmpContext = contexts[i];
|
||||
TermStates tmpContext = contexts[i];
|
||||
contexts[i] = contexts[j];
|
||||
contexts[j] = tmpContext;
|
||||
|
||||
|
@ -263,10 +263,10 @@ public final class BlendedTermQuery extends Query {
|
|||
|
||||
@Override
|
||||
public final Query rewrite(IndexReader reader) throws IOException {
|
||||
final TermContext[] contexts = Arrays.copyOf(this.contexts, this.contexts.length);
|
||||
final TermStates[] contexts = Arrays.copyOf(this.contexts, this.contexts.length);
|
||||
for (int i = 0; i < contexts.length; ++i) {
|
||||
if (contexts[i] == null || contexts[i].wasBuiltFor(reader.getContext()) == false) {
|
||||
contexts[i] = TermContext.build(reader.getContext(), terms[i]);
|
||||
contexts[i] = TermStates.build(reader.getContext(), terms[i], true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -275,7 +275,7 @@ public final class BlendedTermQuery extends Query {
|
|||
// ttf will be the sum of all total term freqs
|
||||
int df = 0;
|
||||
long ttf = 0;
|
||||
for (TermContext ctx : contexts) {
|
||||
for (TermStates ctx : contexts) {
|
||||
df = Math.max(df, ctx.docFreq());
|
||||
ttf += ctx.totalTermFreq();
|
||||
}
|
||||
|
@ -294,8 +294,8 @@ public final class BlendedTermQuery extends Query {
|
|||
return rewriteMethod.rewrite(termQueries);
|
||||
}
|
||||
|
||||
private static TermContext adjustFrequencies(IndexReaderContext readerContext,
|
||||
TermContext ctx, int artificialDf, long artificialTtf) {
|
||||
private static TermStates adjustFrequencies(IndexReaderContext readerContext,
|
||||
TermStates ctx, int artificialDf, long artificialTtf) throws IOException {
|
||||
List<LeafReaderContext> leaves = readerContext.leaves();
|
||||
final int len;
|
||||
if (leaves == null) {
|
||||
|
@ -303,9 +303,9 @@ public final class BlendedTermQuery extends Query {
|
|||
} else {
|
||||
len = leaves.size();
|
||||
}
|
||||
TermContext newCtx = new TermContext(readerContext);
|
||||
TermStates newCtx = new TermStates(readerContext);
|
||||
for (int i = 0; i < len; ++i) {
|
||||
TermState termState = ctx.get(i);
|
||||
TermState termState = ctx.get(leaves.get(i));
|
||||
if (termState == null) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -48,7 +48,7 @@ final class BooleanWeight extends Weight {
|
|||
super(query);
|
||||
this.query = query;
|
||||
this.scoreMode = scoreMode;
|
||||
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
|
||||
this.similarity = searcher.getSimilarity();
|
||||
weights = new ArrayList<>();
|
||||
for (BooleanClause c : query) {
|
||||
Weight w = searcher.createWeight(c.getQuery(), c.isScoring() ? scoreMode : ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
final class ExactPhraseScorer extends Scorer {
|
||||
|
||||
|
@ -42,13 +41,13 @@ final class ExactPhraseScorer extends Scorer {
|
|||
|
||||
private int freq;
|
||||
|
||||
private final Similarity.SimScorer docScorer;
|
||||
private final LeafSimScorer docScorer;
|
||||
private final boolean needsScores, needsTotalHitCount;
|
||||
private float matchCost;
|
||||
private float minCompetitiveScore;
|
||||
|
||||
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
|
||||
Similarity.SimScorer docScorer, ScoreMode scoreMode,
|
||||
LeafSimScorer docScorer, ScoreMode scoreMode,
|
||||
float matchCost) throws IOException {
|
||||
super(weight);
|
||||
this.docScorer = docScorer;
|
||||
|
@ -123,7 +122,7 @@ final class ExactPhraseScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public float maxScore() {
|
||||
return docScorer.maxScore(Integer.MAX_VALUE);
|
||||
return docScorer.maxScore();
|
||||
}
|
||||
|
||||
/** Advance the given pos enum to the first doc on or after {@code target}.
|
||||
|
|
|
@ -32,7 +32,6 @@ import java.util.concurrent.Future;
|
|||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
@ -40,7 +39,7 @@ import org.apache.lucene.index.LeafReaderContext;
|
|||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.StoredFieldVisitor;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
@ -75,36 +74,6 @@ import org.apache.lucene.util.ThreadInterruptedException;
|
|||
*/
|
||||
public class IndexSearcher {
|
||||
|
||||
/** A search-time {@link Similarity} that does not make use of scoring factors
|
||||
* and may be used when scores are not needed. */
|
||||
private static final Similarity NON_SCORING_SIMILARITY = new Similarity() {
|
||||
|
||||
@Override
|
||||
public long computeNorm(FieldInvertState state) {
|
||||
throw new UnsupportedOperationException("This Similarity may only be used for searching, not indexing");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
return new SimWeight() {};
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
|
||||
return new SimScorer() {
|
||||
@Override
|
||||
public float score(int doc, float freq) {
|
||||
return 0f;
|
||||
}
|
||||
@Override
|
||||
public float maxScore(float maxFreq) {
|
||||
return 0f;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
private static QueryCache DEFAULT_QUERY_CACHE;
|
||||
private static QueryCachingPolicy DEFAULT_CACHING_POLICY = new UsageTrackingQueryCachingPolicy();
|
||||
static {
|
||||
|
@ -136,7 +105,7 @@ public class IndexSearcher {
|
|||
* Expert: returns a default Similarity instance.
|
||||
* In general, this method is only called to initialize searchers and writers.
|
||||
* User code and query implementations should respect
|
||||
* {@link IndexSearcher#getSimilarity(boolean)}.
|
||||
* {@link IndexSearcher#getSimilarity()}.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static Similarity getDefaultSimilarity() {
|
||||
|
@ -329,15 +298,11 @@ public class IndexSearcher {
|
|||
this.similarity = similarity;
|
||||
}
|
||||
|
||||
/** Expert: Get the {@link Similarity} to use to compute scores. When
|
||||
* {@code needsScores} is {@code false}, this method will return a simple
|
||||
* {@link Similarity} that does not leverage scoring factors such as norms.
|
||||
* When {@code needsScores} is {@code true}, this returns the
|
||||
/** Expert: Get the {@link Similarity} to use to compute scores. This returns the
|
||||
* {@link Similarity} that has been set through {@link #setSimilarity(Similarity)}
|
||||
* or the {@link #getDefaultSimilarity()} default {@link Similarity} if none
|
||||
* has been set explicitly. */
|
||||
public Similarity getSimilarity(boolean needsScores) {
|
||||
return needsScores ? similarity : NON_SCORING_SIMILARITY;
|
||||
* or the default {@link Similarity} if none has been set explicitly. */
|
||||
public Similarity getSimilarity() {
|
||||
return similarity;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -774,7 +739,7 @@ public class IndexSearcher {
|
|||
* across a distributed collection.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public TermStatistics termStatistics(Term term, TermContext context) throws IOException {
|
||||
public TermStatistics termStatistics(Term term, TermStates context) throws IOException {
|
||||
if (context.docFreq() == 0) {
|
||||
return null;
|
||||
} else {
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
/**
|
||||
* {@link SimScorer} on a specific {@link LeafReader}.
|
||||
*/
|
||||
public final class LeafSimScorer {
|
||||
|
||||
private final SimScorer scorer;
|
||||
private final NumericDocValues norms;
|
||||
private final float maxScore;
|
||||
|
||||
/**
|
||||
* Sole constructor: Score documents of {@code reader} with {@code scorer}.
|
||||
*/
|
||||
public LeafSimScorer(SimScorer scorer, LeafReader reader, boolean needsScores, float maxFreq) throws IOException {
|
||||
this.scorer = scorer;
|
||||
norms = needsScores ? reader.getNormValues(scorer.getField()) : null;
|
||||
maxScore = needsScores ? scorer.score(maxFreq, 1) : Float.MAX_VALUE;
|
||||
}
|
||||
|
||||
private long getNormValue(int doc) throws IOException {
|
||||
if (norms != null) {
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
return norms.longValue();
|
||||
} else {
|
||||
return 1L; // default norm
|
||||
}
|
||||
}
|
||||
|
||||
/** Score the provided document assuming the given term document frequency.
|
||||
* This method must be called on non-decreasing sequences of doc ids.
|
||||
* @see SimScorer#score(float, long) */
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
return scorer.score(freq, getNormValue(doc));
|
||||
}
|
||||
|
||||
/** Explain the score for the provided document assuming the given term document frequency.
|
||||
* This method must be called on non-decreasing sequences of doc ids.
|
||||
* @see SimScorer#explain(Explanation, long) */
|
||||
public Explanation explain(int doc, Explanation freqExpl) throws IOException {
|
||||
return scorer.explain(freqExpl, getNormValue(doc));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an upper bound of the score.
|
||||
*/
|
||||
public float maxScore() {
|
||||
return maxScore;
|
||||
}
|
||||
}
|
|
@ -18,19 +18,26 @@ package org.apache.lucene.search;
|
|||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -183,36 +190,38 @@ public class MultiPhraseQuery extends Query {
|
|||
|
||||
private class MultiPhraseWeight extends Weight {
|
||||
private final Similarity similarity;
|
||||
private final Similarity.SimWeight stats;
|
||||
private final Map<Term,TermContext> termContexts = new HashMap<>();
|
||||
private final Similarity.SimScorer stats;
|
||||
private final Map<Term,TermStates> termStates = new HashMap<>();
|
||||
private final ScoreMode scoreMode;
|
||||
|
||||
public MultiPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
super(MultiPhraseQuery.this);
|
||||
this.scoreMode = scoreMode;
|
||||
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
|
||||
this.similarity = searcher.getSimilarity();
|
||||
final IndexReaderContext context = searcher.getTopReaderContext();
|
||||
|
||||
// compute idf
|
||||
ArrayList<TermStatistics> allTermStats = new ArrayList<>();
|
||||
for(final Term[] terms: termArrays) {
|
||||
for (Term term: terms) {
|
||||
TermContext termContext = termContexts.get(term);
|
||||
if (termContext == null) {
|
||||
termContext = TermContext.build(context, term);
|
||||
termContexts.put(term, termContext);
|
||||
TermStates ts = termStates.get(term);
|
||||
if (ts == null) {
|
||||
ts = TermStates.build(context, term, scoreMode.needsScores());
|
||||
termStates.put(term, ts);
|
||||
}
|
||||
TermStatistics termStatistics = searcher.termStatistics(term, termContext);
|
||||
if (termStatistics != null) {
|
||||
allTermStats.add(termStatistics);
|
||||
if (scoreMode.needsScores()) {
|
||||
TermStatistics termStatistics = searcher.termStatistics(term, ts);
|
||||
if (termStatistics != null) {
|
||||
allTermStats.add(termStatistics);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (allTermStats.isEmpty()) {
|
||||
stats = null; // none of the terms were found, we won't use sim at all
|
||||
} else {
|
||||
stats = similarity.computeWeight(
|
||||
stats = similarity.scorer(
|
||||
boost,
|
||||
searcher.collectionStatistics(field),
|
||||
allTermStats.toArray(new TermStatistics[allTermStats.size()]));
|
||||
|
@ -253,7 +262,7 @@ public class MultiPhraseQuery extends Query {
|
|||
List<PostingsEnum> postings = new ArrayList<>();
|
||||
|
||||
for (Term term : terms) {
|
||||
TermState termState = termContexts.get(term).get(context.ord);
|
||||
TermState termState = termStates.get(term).get(context);
|
||||
if (termState != null) {
|
||||
termsEnum.seekExact(term.bytes(), termState);
|
||||
postings.add(termsEnum.postings(null, PostingsEnum.POSITIONS));
|
||||
|
@ -282,11 +291,11 @@ public class MultiPhraseQuery extends Query {
|
|||
|
||||
if (slop == 0) {
|
||||
return new ExactPhraseScorer(this, postingsFreqs,
|
||||
similarity.simScorer(stats, context),
|
||||
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
|
||||
scoreMode, totalMatchCost);
|
||||
} else {
|
||||
return new SloppyPhraseScorer(this, postingsFreqs, slop,
|
||||
similarity.simScorer(stats, context),
|
||||
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE),
|
||||
scoreMode.needsScores(), totalMatchCost);
|
||||
}
|
||||
}
|
||||
|
@ -303,7 +312,7 @@ public class MultiPhraseQuery extends Query {
|
|||
int newDoc = scorer.iterator().advance(doc);
|
||||
if (newDoc == doc) {
|
||||
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
|
||||
SimScorer docScorer = similarity.simScorer(stats, context);
|
||||
LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE);
|
||||
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
return Explanation.match(
|
||||
|
|
|
@ -24,7 +24,7 @@ import org.apache.lucene.index.FilteredTermsEnum; // javadocs
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.SingleTermsEnum; // javadocs
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.BooleanQuery.Builder;
|
||||
|
@ -166,7 +166,7 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermContext states) {
|
||||
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount, float boost, TermStates states) {
|
||||
final TermQuery tq = new TermQuery(term, states);
|
||||
topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
@ -218,7 +218,7 @@ public abstract class MultiTermQuery extends Query {
|
|||
|
||||
@Override
|
||||
protected void addClause(BlendedTermQuery.Builder topLevel, Term term, int docCount,
|
||||
float boost, TermContext states) {
|
||||
float boost, TermStates states) {
|
||||
topLevel.add(term, boost, states);
|
||||
}
|
||||
}
|
||||
|
@ -262,7 +262,7 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermContext states) {
|
||||
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docFreq, float boost, TermStates states) {
|
||||
final Query q = new ConstantScoreQuery(new TermQuery(term, states));
|
||||
topLevel.add(new BoostQuery(q, boost), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ import java.util.Objects;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
|
@ -148,9 +148,9 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
|
|||
// build a boolean query
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
for (TermAndState t : collectedTerms) {
|
||||
final TermContext termContext = new TermContext(searcher.getTopReaderContext());
|
||||
termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
|
||||
bq.add(new TermQuery(new Term(query.field, t.term), termContext), Occur.SHOULD);
|
||||
final TermStates termStates = new TermStates(searcher.getTopReaderContext());
|
||||
termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
|
||||
bq.add(new TermQuery(new Term(query.field, t.term), termStates), Occur.SHOULD);
|
||||
}
|
||||
Query q = new ConstantScoreQuery(bq.build());
|
||||
final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
|
||||
|
|
|
@ -32,12 +32,11 @@ import org.apache.lucene.index.LeafReader;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
|
@ -352,9 +351,9 @@ public class PhraseQuery extends Query {
|
|||
|
||||
private class PhraseWeight extends Weight {
|
||||
private final Similarity similarity;
|
||||
private final Similarity.SimWeight stats;
|
||||
private final Similarity.SimScorer stats;
|
||||
private final ScoreMode scoreMode;
|
||||
private transient TermContext states[];
|
||||
private transient TermStates states[];
|
||||
|
||||
public PhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
|
@ -366,21 +365,23 @@ public class PhraseQuery extends Query {
|
|||
throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
|
||||
}
|
||||
this.scoreMode = scoreMode;
|
||||
this.similarity = searcher.getSimilarity(scoreMode.needsScores());
|
||||
this.similarity = searcher.getSimilarity();
|
||||
final IndexReaderContext context = searcher.getTopReaderContext();
|
||||
states = new TermContext[terms.length];
|
||||
states = new TermStates[terms.length];
|
||||
TermStatistics termStats[] = new TermStatistics[terms.length];
|
||||
int termUpTo = 0;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
final Term term = terms[i];
|
||||
states[i] = TermContext.build(context, term);
|
||||
TermStatistics termStatistics = searcher.termStatistics(term, states[i]);
|
||||
if (termStatistics != null) {
|
||||
termStats[termUpTo++] = termStatistics;
|
||||
states[i] = TermStates.build(context, term, scoreMode.needsScores());
|
||||
if (scoreMode.needsScores()) {
|
||||
TermStatistics termStatistics = searcher.termStatistics(term, states[i]);
|
||||
if (termStatistics != null) {
|
||||
termStats[termUpTo++] = termStatistics;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (termUpTo > 0) {
|
||||
stats = similarity.computeWeight(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
|
||||
stats = similarity.scorer(boost, searcher.collectionStatistics(field), Arrays.copyOf(termStats, termUpTo));
|
||||
} else {
|
||||
stats = null; // no terms at all, we won't use similarity
|
||||
}
|
||||
|
@ -415,7 +416,7 @@ public class PhraseQuery extends Query {
|
|||
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
final Term t = terms[i];
|
||||
final TermState state = states[i].get(context.ord);
|
||||
final TermState state = states[i].get(context);
|
||||
if (state == null) { /* term doesnt exist in this segment */
|
||||
assert termNotInReader(reader, t): "no termstate found but term exists in reader";
|
||||
return null;
|
||||
|
@ -433,11 +434,11 @@ public class PhraseQuery extends Query {
|
|||
|
||||
if (slop == 0) { // optimize exact case
|
||||
return new ExactPhraseScorer(this, postingsFreqs,
|
||||
similarity.simScorer(stats, context),
|
||||
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Integer.MAX_VALUE),
|
||||
scoreMode, totalMatchCost);
|
||||
} else {
|
||||
return new SloppyPhraseScorer(this, postingsFreqs, slop,
|
||||
similarity.simScorer(stats, context),
|
||||
new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE),
|
||||
scoreMode.needsScores(), totalMatchCost);
|
||||
}
|
||||
}
|
||||
|
@ -459,7 +460,7 @@ public class PhraseQuery extends Query {
|
|||
int newDoc = scorer.iterator().advance(doc);
|
||||
if (newDoc == doc) {
|
||||
float freq = slop == 0 ? ((ExactPhraseScorer)scorer).freq() : ((SloppyPhraseScorer)scorer).sloppyFreq();
|
||||
SimScorer docScorer = similarity.simScorer(stats, context);
|
||||
LeafSimScorer docScorer = new LeafSimScorer(stats, context.reader(), scoreMode.needsScores(), Float.MAX_VALUE);
|
||||
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
return Explanation.match(
|
||||
|
|
|
@ -20,7 +20,7 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
|
||||
|
@ -64,7 +64,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
|||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery.Builder topLevel, Term term, int docCount,
|
||||
float boost, TermContext states) {
|
||||
float boost, TermStates states) {
|
||||
final TermQuery tq = new TermQuery(term, states);
|
||||
topLevel.add(new BoostQuery(tq, boost), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
@ -109,7 +109,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
|||
if (size > 0) {
|
||||
final int sort[] = col.terms.sort();
|
||||
final float[] boost = col.array.boost;
|
||||
final TermContext[] termStates = col.array.termState;
|
||||
final TermStates[] termStates = col.array.termState;
|
||||
for (int i = 0; i < size; i++) {
|
||||
final int pos = sort[i];
|
||||
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
|
||||
|
@ -146,7 +146,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
|||
} else {
|
||||
// new entry: we populate the entry initially
|
||||
array.boost[e] = boostAtt.getBoost();
|
||||
array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
array.termState[e] = new TermStates(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
ScoringRewrite.this.checkMaxClauseCount(terms.size());
|
||||
}
|
||||
return true;
|
||||
|
@ -156,7 +156,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
|||
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
|
||||
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
|
||||
float[] boost;
|
||||
TermContext[] termState;
|
||||
TermStates[] termState;
|
||||
|
||||
public TermFreqBoostByteStart(int initSize) {
|
||||
super(initSize);
|
||||
|
@ -166,7 +166,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
|||
public int[] init() {
|
||||
final int[] ord = super.init();
|
||||
boost = new float[ArrayUtil.oversize(ord.length, Float.BYTES)];
|
||||
termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
termState = new TermStates[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
assert termState.length >= ord.length && boost.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
@ -176,7 +176,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
|||
final int[] ord = super.grow();
|
||||
boost = ArrayUtil.grow(boost, ord.length);
|
||||
if (termState.length < ord.length) {
|
||||
TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
TermStates[] tmpTermState = new TermStates[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
|
||||
termState = tmpTermState;
|
||||
}
|
||||
|
|
|
@ -26,7 +26,6 @@ import java.util.HashSet;
|
|||
import java.util.LinkedHashMap;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
final class SloppyPhraseScorer extends Scorer {
|
||||
|
@ -36,7 +35,7 @@ final class SloppyPhraseScorer extends Scorer {
|
|||
|
||||
private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq().
|
||||
|
||||
private final Similarity.SimScorer docScorer;
|
||||
private final LeafSimScorer docScorer;
|
||||
|
||||
private final int slop;
|
||||
private final int numPostings;
|
||||
|
@ -55,7 +54,7 @@ final class SloppyPhraseScorer extends Scorer {
|
|||
private final float matchCost;
|
||||
|
||||
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
|
||||
int slop, Similarity.SimScorer docScorer, boolean needsScores,
|
||||
int slop, LeafSimScorer docScorer, boolean needsScores,
|
||||
float matchCost) {
|
||||
super(weight);
|
||||
this.docScorer = docScorer;
|
||||
|
@ -558,7 +557,7 @@ final class SloppyPhraseScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public float maxScore() {
|
||||
return docScorer.maxScore(Float.POSITIVE_INFINITY);
|
||||
return docScorer.maxScore();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -31,11 +31,10 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
|
@ -127,28 +126,28 @@ public final class SynonymQuery extends Query {
|
|||
}
|
||||
|
||||
class SynonymWeight extends Weight {
|
||||
private final TermContext termContexts[];
|
||||
private final TermStates termStates[];
|
||||
private final Similarity similarity;
|
||||
private final Similarity.SimWeight simWeight;
|
||||
private final Similarity.SimScorer simWeight;
|
||||
|
||||
SynonymWeight(Query query, IndexSearcher searcher, float boost) throws IOException {
|
||||
super(query);
|
||||
CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].field());
|
||||
long docFreq = 0;
|
||||
long totalTermFreq = 0;
|
||||
termContexts = new TermContext[terms.length];
|
||||
for (int i = 0; i < termContexts.length; i++) {
|
||||
termContexts[i] = TermContext.build(searcher.getTopReaderContext(), terms[i]);
|
||||
TermStatistics termStats = searcher.termStatistics(terms[i], termContexts[i]);
|
||||
termStates = new TermStates[terms.length];
|
||||
for (int i = 0; i < termStates.length; i++) {
|
||||
termStates[i] = TermStates.build(searcher.getTopReaderContext(), terms[i], true);
|
||||
TermStatistics termStats = searcher.termStatistics(terms[i], termStates[i]);
|
||||
if (termStats != null) {
|
||||
docFreq = Math.max(termStats.docFreq(), docFreq);
|
||||
totalTermFreq += termStats.totalTermFreq();
|
||||
}
|
||||
}
|
||||
this.similarity = searcher.getSimilarity(true);
|
||||
this.similarity = searcher.getSimilarity();
|
||||
if (docFreq > 0) {
|
||||
TermStatistics pseudoStats = new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
|
||||
this.simWeight = similarity.computeWeight(boost, collectionStats, pseudoStats);
|
||||
this.simWeight = similarity.scorer(boost, collectionStats, pseudoStats);
|
||||
} else {
|
||||
this.simWeight = null; // no terms exist at all, we won't use similarity
|
||||
}
|
||||
|
@ -175,7 +174,7 @@ public final class SynonymQuery extends Query {
|
|||
assert scorer instanceof TermScorer;
|
||||
freq = ((TermScorer)scorer).freq();
|
||||
}
|
||||
SimScorer docScorer = similarity.simScorer(simWeight, context);
|
||||
LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), true, Float.MAX_VALUE);
|
||||
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
return Explanation.match(
|
||||
|
@ -190,7 +189,6 @@ public final class SynonymQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||
Similarity.SimScorer simScorer = null;
|
||||
IndexOptions indexOptions = IndexOptions.NONE;
|
||||
if (terms.length > 0) {
|
||||
FieldInfo info = context.reader()
|
||||
|
@ -202,21 +200,17 @@ public final class SynonymQuery extends Query {
|
|||
}
|
||||
// we use termscorers + disjunction as an impl detail
|
||||
List<Scorer> subScorers = new ArrayList<>();
|
||||
long maxFreq = 0;
|
||||
long totalMaxFreq = 0;
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
TermState state = termContexts[i].get(context.ord);
|
||||
TermState state = termStates[i].get(context);
|
||||
if (state != null) {
|
||||
TermsEnum termsEnum = context.reader().terms(terms[i].field()).iterator();
|
||||
termsEnum.seekExact(terms[i].bytes(), state);
|
||||
|
||||
maxFreq += getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
|
||||
|
||||
long termMaxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
|
||||
totalMaxFreq += termMaxFreq;
|
||||
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
|
||||
// lazy init sim, in case no terms exist
|
||||
if (simScorer == null) {
|
||||
simScorer = similarity.simScorer(simWeight, context);
|
||||
}
|
||||
subScorers.add(new TermScorer(this, postings, simScorer, Float.POSITIVE_INFINITY));
|
||||
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, termMaxFreq);
|
||||
subScorers.add(new TermScorer(this, postings, simScorer));
|
||||
}
|
||||
}
|
||||
if (subScorers.isEmpty()) {
|
||||
|
@ -225,7 +219,8 @@ public final class SynonymQuery extends Query {
|
|||
// we must optimize this case (term not in segment), disjunctionscorer requires >= 2 subs
|
||||
return subScorers.get(0);
|
||||
} else {
|
||||
return new SynonymScorer(simScorer, this, subScorers, maxFreq);
|
||||
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), true, totalMaxFreq);
|
||||
return new SynonymScorer(simScorer, this, subScorers);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -248,13 +243,11 @@ public final class SynonymQuery extends Query {
|
|||
}
|
||||
|
||||
static class SynonymScorer extends DisjunctionScorer {
|
||||
private final Similarity.SimScorer similarity;
|
||||
private final float maxFreq;
|
||||
private final LeafSimScorer similarity;
|
||||
|
||||
SynonymScorer(Similarity.SimScorer similarity, Weight weight, List<Scorer> subScorers, float maxFreq) {
|
||||
SynonymScorer(LeafSimScorer similarity, Weight weight, List<Scorer> subScorers) {
|
||||
super(weight, subScorers, true);
|
||||
this.similarity = similarity;
|
||||
this.maxFreq = maxFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -264,7 +257,7 @@ public final class SynonymQuery extends Query {
|
|||
|
||||
@Override
|
||||
public float maxScore() {
|
||||
return similarity.maxScore(maxFreq);
|
||||
return similarity.maxScore();
|
||||
}
|
||||
|
||||
/** combines TF of all subs. */
|
||||
|
|
|
@ -23,7 +23,7 @@ import org.apache.lucene.index.LeafReaderContext;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
|
@ -43,7 +43,7 @@ abstract class TermCollectingRewrite<B> extends MultiTermQuery.RewriteMethod {
|
|||
addClause(topLevel, term, docCount, boost, null);
|
||||
}
|
||||
|
||||
protected abstract void addClause(B topLevel, Term term, int docCount, float boost, TermContext states) throws IOException;
|
||||
protected abstract void addClause(B topLevel, Term term, int docCount, float boost, TermStates states) throws IOException;
|
||||
|
||||
|
||||
final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
|
||||
|
|
|
@ -33,7 +33,7 @@ import org.apache.lucene.index.PostingsEnum;
|
|||
import org.apache.lucene.index.PrefixCodedTerms;
|
||||
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
|
@ -268,9 +268,9 @@ public class TermInSetQuery extends Query implements Accountable {
|
|||
assert builder == null;
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
for (TermAndState t : matchingTerms) {
|
||||
final TermContext termContext = new TermContext(searcher.getTopReaderContext());
|
||||
termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
|
||||
bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
|
||||
final TermStates termStates = new TermStates(searcher.getTopReaderContext());
|
||||
termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
|
||||
bq.add(new TermQuery(new Term(t.field, t.term), termStates), Occur.SHOULD);
|
||||
}
|
||||
Query q = new ConstantScoreQuery(bq.build());
|
||||
final Weight weight = searcher.rewrite(q).createWeight(searcher, scoreMode, score());
|
||||
|
|
|
@ -28,12 +28,10 @@ import org.apache.lucene.index.LeafReaderContext;
|
|||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
/**
|
||||
* A Query that matches documents containing a term. This may be combined with
|
||||
|
@ -42,23 +40,23 @@ import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
|||
public class TermQuery extends Query {
|
||||
|
||||
private final Term term;
|
||||
private final TermContext perReaderTermState;
|
||||
private final TermStates perReaderTermState;
|
||||
|
||||
final class TermWeight extends Weight {
|
||||
private final Similarity similarity;
|
||||
private final Similarity.SimWeight stats;
|
||||
private final TermContext termStates;
|
||||
private final Similarity.SimScorer simScorer;
|
||||
private final TermStates termStates;
|
||||
private final boolean needsScores;
|
||||
|
||||
public TermWeight(IndexSearcher searcher, boolean needsScores,
|
||||
float boost, TermContext termStates) throws IOException {
|
||||
float boost, TermStates termStates) throws IOException {
|
||||
super(TermQuery.this);
|
||||
if (needsScores && termStates == null) {
|
||||
throw new IllegalStateException("termStates are required when scores are needed");
|
||||
}
|
||||
this.needsScores = needsScores;
|
||||
this.termStates = termStates;
|
||||
this.similarity = searcher.getSimilarity(needsScores);
|
||||
this.similarity = searcher.getSimilarity();
|
||||
|
||||
final CollectionStatistics collectionStats;
|
||||
final TermStatistics termStats;
|
||||
|
@ -72,9 +70,9 @@ public class TermQuery extends Query {
|
|||
}
|
||||
|
||||
if (termStats == null) {
|
||||
this.stats = null; // term doesn't exist in any segment, we won't use similarity at all
|
||||
this.simScorer = null; // term doesn't exist in any segment, we won't use similarity at all
|
||||
} else {
|
||||
this.stats = similarity.computeWeight(boost, collectionStats, termStats);
|
||||
this.simScorer = similarity.scorer(boost, collectionStats, termStats);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -101,8 +99,8 @@ public class TermQuery extends Query {
|
|||
.getIndexOptions();
|
||||
PostingsEnum docs = termsEnum.postings(null, needsScores ? PostingsEnum.FREQS : PostingsEnum.NONE);
|
||||
assert docs != null;
|
||||
return new TermScorer(this, docs, similarity.simScorer(stats, context),
|
||||
getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq()));
|
||||
float maxFreq = getMaxFreq(indexOptions, termsEnum.totalTermFreq(), termsEnum.docFreq());
|
||||
return new TermScorer(this, docs, new LeafSimScorer(simScorer, context.reader(), needsScores, maxFreq));
|
||||
}
|
||||
|
||||
private long getMaxFreq(IndexOptions indexOptions, long ttf, long df) {
|
||||
|
@ -126,30 +124,17 @@ public class TermQuery extends Query {
|
|||
* the term does not exist in the given context
|
||||
*/
|
||||
private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException {
|
||||
if (termStates != null) {
|
||||
// TermQuery either used as a Query or the term states have been provided at construction time
|
||||
assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
|
||||
final TermState state = termStates.get(context.ord);
|
||||
if (state == null) { // term is not present in that reader
|
||||
assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
|
||||
return null;
|
||||
}
|
||||
final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
|
||||
termsEnum.seekExact(term.bytes(), state);
|
||||
return termsEnum;
|
||||
} else {
|
||||
// TermQuery used as a filter, so the term states have not been built up front
|
||||
Terms terms = context.reader().terms(term.field());
|
||||
if (terms == null) {
|
||||
return null;
|
||||
}
|
||||
final TermsEnum termsEnum = terms.iterator();
|
||||
if (termsEnum.seekExact(term.bytes())) {
|
||||
return termsEnum;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
assert termStates != null;
|
||||
assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) :
|
||||
"The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
|
||||
final TermState state = termStates.get(context);
|
||||
if (state == null) { // term is not present in that reader
|
||||
assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
|
||||
return null;
|
||||
}
|
||||
final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
|
||||
termsEnum.seekExact(term.bytes(), state);
|
||||
return termsEnum;
|
||||
}
|
||||
|
||||
private boolean termNotInReader(LeafReader reader, Term term) throws IOException {
|
||||
|
@ -166,7 +151,7 @@ public class TermQuery extends Query {
|
|||
int newDoc = scorer.iterator().advance(doc);
|
||||
if (newDoc == doc) {
|
||||
float freq = scorer.freq();
|
||||
SimScorer docScorer = similarity.simScorer(stats, context);
|
||||
LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Integer.MAX_VALUE);
|
||||
Explanation freqExplanation = Explanation.match(freq, "freq, occurrences of term within document");
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
return Explanation.match(
|
||||
|
@ -190,7 +175,7 @@ public class TermQuery extends Query {
|
|||
* Expert: constructs a TermQuery that will use the provided docFreq instead
|
||||
* of looking up the docFreq against the searcher.
|
||||
*/
|
||||
public TermQuery(Term t, TermContext states) {
|
||||
public TermQuery(Term t, TermStates states) {
|
||||
assert states != null;
|
||||
term = Objects.requireNonNull(t);
|
||||
perReaderTermState = Objects.requireNonNull(states);
|
||||
|
@ -204,18 +189,10 @@ public class TermQuery extends Query {
|
|||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
final IndexReaderContext context = searcher.getTopReaderContext();
|
||||
final TermContext termState;
|
||||
final TermStates termState;
|
||||
if (perReaderTermState == null
|
||||
|| perReaderTermState.wasBuiltFor(context) == false) {
|
||||
if (scoreMode.needsScores()) {
|
||||
// make TermQuery single-pass if we don't have a PRTS or if the context
|
||||
// differs!
|
||||
termState = TermContext.build(context, term);
|
||||
} else {
|
||||
// do not compute the term state, this will help save seeks in the terms
|
||||
// dict on segments that have a cache entry for this query
|
||||
termState = null;
|
||||
}
|
||||
termState = TermStates.build(context, term, scoreMode.needsScores());
|
||||
} else {
|
||||
// PRTS was pre-build for this IS
|
||||
termState = this.perReaderTermState;
|
||||
|
|
|
@ -20,14 +20,12 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
|
||||
*/
|
||||
final class TermScorer extends Scorer {
|
||||
private final PostingsEnum postingsEnum;
|
||||
private final Similarity.SimScorer docScorer;
|
||||
private final float maxFreq;
|
||||
private final LeafSimScorer docScorer;
|
||||
|
||||
/**
|
||||
* Construct a <code>TermScorer</code>.
|
||||
|
@ -39,14 +37,11 @@ final class TermScorer extends Scorer {
|
|||
* @param docScorer
|
||||
* The <code>Similarity.SimScorer</code> implementation
|
||||
* to be used for score computations.
|
||||
* @param maxFreq
|
||||
* An upper bound of the term frequency of the searched term in any document.
|
||||
*/
|
||||
TermScorer(Weight weight, PostingsEnum td, Similarity.SimScorer docScorer, float maxFreq) {
|
||||
TermScorer(Weight weight, PostingsEnum td, LeafSimScorer docScorer) {
|
||||
super(weight);
|
||||
this.docScorer = docScorer;
|
||||
this.postingsEnum = td;
|
||||
this.maxFreq = maxFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -71,7 +66,7 @@ final class TermScorer extends Scorer {
|
|||
|
||||
@Override
|
||||
public float maxScore() {
|
||||
return docScorer.maxScore(maxFreq);
|
||||
return docScorer.maxScore();
|
||||
}
|
||||
|
||||
/** Returns a string representation of this <code>TermScorer</code>. */
|
||||
|
|
|
@ -25,7 +25,7 @@ import java.util.PriorityQueue;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -82,7 +82,7 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
|
|||
|
||||
// lazy init the initial ScoreTerm because comparator is not known on ctor:
|
||||
if (st == null)
|
||||
st = new ScoreTerm(new TermContext(topReaderContext));
|
||||
st = new ScoreTerm(new TermStates(topReaderContext));
|
||||
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
|
@ -139,7 +139,7 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
|
|||
visitedTerms.remove(st.bytes.get());
|
||||
st.termState.clear(); // reset the termstate!
|
||||
} else {
|
||||
st = new ScoreTerm(new TermContext(topReaderContext));
|
||||
st = new ScoreTerm(new TermStates(topReaderContext));
|
||||
}
|
||||
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
|
||||
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
|
||||
|
@ -193,8 +193,8 @@ public abstract class TopTermsRewrite<B> extends TermCollectingRewrite<B> {
|
|||
static final class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public final BytesRefBuilder bytes = new BytesRefBuilder();
|
||||
public float boost;
|
||||
public final TermContext termState;
|
||||
public ScoreTerm(TermContext termState) {
|
||||
public final TermStates termState;
|
||||
public ScoreTerm(TermStates termState) {
|
||||
this.termState = termState;
|
||||
}
|
||||
|
||||
|
|
|
@ -378,7 +378,7 @@
|
|||
* scored the way it was.
|
||||
* Typically a weight such as TermWeight
|
||||
* that scores via a {@link org.apache.lucene.search.similarities.Similarity Similarity} will make use of the Similarity's implementation:
|
||||
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(int, Explanation) SimScorer#explain(int doc, Explanation freq)}.
|
||||
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#explain(Explanation, long) SimScorer#explain(Explanation freq, long norm)}.
|
||||
* </li>
|
||||
* </ol>
|
||||
* <a name="scorerClass"></a>
|
||||
|
@ -402,7 +402,7 @@
|
|||
* {@link org.apache.lucene.search.Scorer#score score()} — Return the score of the
|
||||
* current document. This value can be determined in any appropriate way for an application. For instance, the
|
||||
* {@link org.apache.lucene.search.TermScorer TermScorer} simply defers to the configured Similarity:
|
||||
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(int, float) SimScorer.score(int doc, float freq)}.
|
||||
* {@link org.apache.lucene.search.similarities.Similarity.SimScorer#score(float, long) SimScorer.score(float freq, long norm)}.
|
||||
* </li>
|
||||
* <li>
|
||||
* {@link org.apache.lucene.search.Scorer#getChildren getChildren()} — Returns any child subscorers
|
||||
|
|
|
@ -112,18 +112,12 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
return Math.max(0, score);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double maxScore(BasicStats stats, double maxFreq) {
|
||||
// TODO: can we compute a better upper bound on the produced scores
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
double f = freq.getValue().doubleValue();
|
||||
explain(subs, stats, doc, f, docLen);
|
||||
explain(subs, stats, f, docLen);
|
||||
|
||||
double score = tf(stats, f, docLen)
|
||||
* ln(stats, f, docLen)
|
||||
|
@ -132,7 +126,7 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
- gamma(stats, f, docLen);
|
||||
|
||||
Explanation explanation = Explanation.match((float) score,
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
|
||||
"score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:",
|
||||
subs);
|
||||
if (stats.boost != 1f) {
|
||||
explanation = Explanation.match((float) (score * stats.boost), "Boosted score, computed as (score * boost) from:",
|
||||
|
@ -148,7 +142,7 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
|
||||
protected void explain(List<Explanation> subs, BasicStats stats,
|
||||
double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float) stats.getBoost(),
|
||||
|
@ -165,7 +159,7 @@ public abstract class Axiomatic extends SimilarityBase {
|
|||
subs.add(tflnExplain(stats, freq, docLen));
|
||||
subs.add(idfExplain(stats, freq, docLen));
|
||||
subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma"));
|
||||
super.explain(subs, stats, doc, freq, docLen);
|
||||
super.explain(subs, stats, freq, docLen);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -17,13 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -176,7 +173,7 @@ public class BM25Similarity extends Similarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
|
||||
float avgdl = avgFieldLength(collectionStats);
|
||||
|
||||
|
@ -184,100 +181,17 @@ public class BM25Similarity extends Similarity {
|
|||
for (int i = 0; i < cache.length; i++) {
|
||||
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
|
||||
}
|
||||
return new BM25Stats(collectionStats.field(), boost, k1, idf, avgdl, cache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
BM25Stats bm25stats = (BM25Stats) stats;
|
||||
return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
|
||||
}
|
||||
|
||||
private class BM25DocScorer extends SimScorer {
|
||||
private final BM25Stats stats;
|
||||
private final float weightValue; // boost * idf * (k1 + 1)
|
||||
private final NumericDocValues norms;
|
||||
/** precomputed cache for all length values */
|
||||
private final float[] lengthCache;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
|
||||
private final float[] cache;
|
||||
|
||||
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight;
|
||||
this.norms = norms;
|
||||
lengthCache = LENGTH_TABLE;
|
||||
cache = stats.cache;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
// if there are no norms, we act as if b=0
|
||||
double norm;
|
||||
if (norms == null) {
|
||||
norm = k1;
|
||||
} else {
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
norm = cache[((byte) norms.longValue()) & 0xFF];
|
||||
}
|
||||
return weightValue * (float) (freq / (freq + norm));
|
||||
}
|
||||
|
||||
@Override
|
||||
public float maxScore(float maxFreq) {
|
||||
// TODO: leverage maxFreq and the min norm from the cache
|
||||
return weightValue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.addAll(stats.explain());
|
||||
Explanation tfExpl = explainTF(doc, freq);
|
||||
subs.add(tfExpl);
|
||||
return Explanation.match(stats.weight * tfExpl.getValue().floatValue(),
|
||||
"score(doc="+doc+",freq="+freq.getValue()+"), product of:", subs);
|
||||
}
|
||||
|
||||
private Explanation explainTF(int doc, Explanation freq) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.add(freq);
|
||||
subs.add(Explanation.match(k1, "k1, term saturation parameter"));
|
||||
if (norms == null) {
|
||||
subs.add(Explanation.match(0, "b, field omits length norms"));
|
||||
return Explanation.match(
|
||||
(float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) k1)),
|
||||
"tf, computed as freq / (freq + k1) from:", subs);
|
||||
} else {
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
byte norm = (byte) norms.longValue();
|
||||
float doclen = lengthCache[norm & 0xff];
|
||||
subs.add(Explanation.match(b, "b, length normalization parameter"));
|
||||
if ((norm & 0xFF) > 39) {
|
||||
subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
|
||||
} else {
|
||||
subs.add(Explanation.match(doclen, "dl, length of field"));
|
||||
}
|
||||
subs.add(Explanation.match(stats.avgdl, "avgdl, average length of field"));
|
||||
float normValue = k1 * ((1 - b) + b * doclen / stats.avgdl);
|
||||
return Explanation.match(
|
||||
(float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)),
|
||||
"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
|
||||
}
|
||||
}
|
||||
|
||||
return new BM25Scorer(collectionStats.field(), boost, k1, b, idf, avgdl, cache);
|
||||
}
|
||||
|
||||
/** Collection statistics for the BM25 model. */
|
||||
private static class BM25Stats extends SimWeight {
|
||||
/** field name, for pulling norms */
|
||||
private final String field;
|
||||
private static class BM25Scorer extends SimScorer {
|
||||
/** query boost */
|
||||
private final float boost;
|
||||
/** k1 value for scale factor */
|
||||
private final float k1;
|
||||
/** b value for length normalization impact */
|
||||
private final float b;
|
||||
/** BM25's idf */
|
||||
private final Explanation idf;
|
||||
/** The average document length. */
|
||||
|
@ -287,17 +201,51 @@ public class BM25Similarity extends Similarity {
|
|||
/** weight (idf * boost) */
|
||||
private final float weight;
|
||||
|
||||
BM25Stats(String field, float boost, float k1, Explanation idf, float avgdl, float[] cache) {
|
||||
this.field = field;
|
||||
BM25Scorer(String field, float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) {
|
||||
super(field);
|
||||
this.boost = boost;
|
||||
this.idf = idf;
|
||||
this.avgdl = avgdl;
|
||||
this.k1 = k1;
|
||||
this.b = b;
|
||||
this.cache = cache;
|
||||
this.weight = (k1 + 1) * boost * idf.getValue().floatValue();
|
||||
}
|
||||
|
||||
private List<Explanation> explain() {
|
||||
@Override
|
||||
public float score(float freq, long encodedNorm) {
|
||||
double norm = cache[((byte) encodedNorm) & 0xFF];
|
||||
return weight * (float) (freq / (freq + norm));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(Explanation freq, long encodedNorm) {
|
||||
List<Explanation> subs = new ArrayList<>(explainConstantFactors());
|
||||
Explanation tfExpl = explainTF(freq, encodedNorm);
|
||||
subs.add(tfExpl);
|
||||
return Explanation.match(weight * tfExpl.getValue().floatValue(),
|
||||
"score(freq="+freq.getValue()+"), product of:", subs);
|
||||
}
|
||||
|
||||
private Explanation explainTF(Explanation freq, long norm) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.add(freq);
|
||||
subs.add(Explanation.match(k1, "k1, term saturation parameter"));
|
||||
float doclen = LENGTH_TABLE[((byte) norm) & 0xff];
|
||||
subs.add(Explanation.match(b, "b, length normalization parameter"));
|
||||
if ((norm & 0xFF) > 39) {
|
||||
subs.add(Explanation.match(doclen, "dl, length of field (approximate)"));
|
||||
} else {
|
||||
subs.add(Explanation.match(doclen, "dl, length of field"));
|
||||
}
|
||||
subs.add(Explanation.match(avgdl, "avgdl, average length of field"));
|
||||
float normValue = k1 * ((1 - b) + b * doclen / avgdl);
|
||||
return Explanation.match(
|
||||
(float) (freq.getValue().floatValue() / (freq.getValue().floatValue() + (double) normValue)),
|
||||
"tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs);
|
||||
}
|
||||
|
||||
private List<Explanation> explainConstantFactors() {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
// scale factor
|
||||
subs.add(Explanation.match(k1 + 1, "scaling factor, k1 + 1"));
|
||||
|
@ -311,7 +259,6 @@ public class BM25Similarity extends Similarity {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "BM25(k1=" + k1 + ",b=" + b + ")";
|
||||
|
|
|
@ -23,7 +23,7 @@ import org.apache.lucene.index.Terms;
|
|||
* Stores all statistics commonly used ranking methods.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BasicStats extends Similarity.SimWeight {
|
||||
public class BasicStats {
|
||||
final String field;
|
||||
/** The number of documents. */
|
||||
protected long numberOfDocuments;
|
||||
|
|
|
@ -16,10 +16,7 @@
|
|||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -47,44 +44,31 @@ public class BooleanSimilarity extends Similarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
return new BooleanWeight(boost);
|
||||
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
return new BooleanWeight(collectionStats.field(), boost);
|
||||
}
|
||||
|
||||
private static class BooleanWeight extends SimWeight {
|
||||
private static class BooleanWeight extends SimScorer {
|
||||
final float boost;
|
||||
|
||||
BooleanWeight(float boost) {
|
||||
BooleanWeight(String field, float boost) {
|
||||
super(field);
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(float freq, long norm) {
|
||||
return boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(Explanation freq, long norm) {
|
||||
Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
|
||||
return Explanation.match(
|
||||
queryBoostExpl.getValue(),
|
||||
"score(" + getClass().getSimpleName() + "), computed from:",
|
||||
queryBoostExpl);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
|
||||
final float boost = ((BooleanWeight) weight).boost;
|
||||
|
||||
return new SimScorer() {
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
return boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float maxScore(float maxFreq) {
|
||||
return boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
|
||||
return Explanation.match(
|
||||
queryBoostExpl.getValue(),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",
|
||||
queryBoostExpl);
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -64,12 +64,6 @@ public class DFISimilarity extends SimilarityBase {
|
|||
return stats.getBoost() * log2(measure + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double maxScore(BasicStats stats, double maxFreq) {
|
||||
// TODO: can we compute a better upper bound on the produced scores
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the measure of independence
|
||||
*/
|
||||
|
@ -79,12 +73,12 @@ public class DFISimilarity extends SimilarityBase {
|
|||
|
||||
@Override
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
final double expected = (stats.getTotalTermFreq() + 1) * docLen /
|
||||
(stats.getNumberOfFieldTokens() + 1);
|
||||
if (freq.getValue().doubleValue() <= expected){
|
||||
return Explanation.match((float) 0, "score(" +
|
||||
getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
|
||||
getClass().getSimpleName() + ", freq=" +
|
||||
freq.getValue() +"), equals to 0");
|
||||
}
|
||||
Explanation explExpected = Explanation.match((float) expected,
|
||||
|
@ -103,7 +97,7 @@ public class DFISimilarity extends SimilarityBase {
|
|||
|
||||
return Explanation.match(
|
||||
(float) score(stats, freq.getValue().doubleValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
|
||||
"score(" + getClass().getSimpleName() + ", freq=" +
|
||||
freq.getValue() +"), computed as boost * log2(measure + 1) from:",
|
||||
Explanation.match( (float)stats.getBoost(), "boost, query boost"),
|
||||
explMeasure);
|
||||
|
|
|
@ -113,15 +113,9 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double maxScore(BasicStats stats, double maxFreq) {
|
||||
// TODO: can we compute a better upper bound on the produced scores
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs,
|
||||
BasicStats stats, int doc, double freq, double docLen) {
|
||||
BasicStats stats, double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost"));
|
||||
}
|
||||
|
@ -136,13 +130,13 @@ public class DFRSimilarity extends SimilarityBase {
|
|||
|
||||
@Override
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
|
||||
explain(subs, stats, freq.getValue().doubleValue(), docLen);
|
||||
|
||||
return Explanation.match(
|
||||
(float) score(stats, freq.getValue().doubleValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
|
||||
"score(" + getClass().getSimpleName() + ", freq=" +
|
||||
freq.getValue() +"), computed as boost * " +
|
||||
"basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:",
|
||||
subs);
|
||||
|
|
|
@ -104,15 +104,9 @@ public class IBSimilarity extends SimilarityBase {
|
|||
lambda.lambda(stats));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double maxScore(BasicStats stats, double maxFreq) {
|
||||
// TODO: can we compute a better upper bound on the produced scores
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(
|
||||
List<Explanation> subs, BasicStats stats, int doc, double freq, double docLen) {
|
||||
List<Explanation> subs, BasicStats stats, double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float)stats.getBoost(), "boost, query boost"));
|
||||
}
|
||||
|
@ -125,13 +119,13 @@ public class IBSimilarity extends SimilarityBase {
|
|||
|
||||
@Override
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
|
||||
explain(subs, stats, freq.getValue().doubleValue(), docLen);
|
||||
|
||||
return Explanation.match(
|
||||
(float) score(stats, freq.getValue().doubleValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
|
||||
"score(" + getClass().getSimpleName() + ", freq=" +
|
||||
freq.getValue() +"), computed as boost * " +
|
||||
"distribution.score(stats, normalization.tfn(stats, freq," +
|
||||
" docLen), lambda.lambda(stats)) from:",
|
||||
|
|
|
@ -78,13 +78,7 @@ public class LMDirichletSimilarity extends LMSimilarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected double maxScore(BasicStats stats, double maxFreq) {
|
||||
// TODO: can we compute a better upper bound on the produced scores
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
|
||||
protected void explain(List<Explanation> subs, BasicStats stats,
|
||||
double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float) stats.getBoost(), "query boost"));
|
||||
|
@ -107,18 +101,18 @@ public class LMDirichletSimilarity extends LMSimilarity {
|
|||
(float)Math.log(mu / (docLen + mu)),
|
||||
"document norm, computed as log(mu / (dl + mu))"));
|
||||
subs.add(Explanation.match((float) docLen,"dl, length of field"));
|
||||
super.explain(subs, stats, doc, freq, docLen);
|
||||
super.explain(subs, stats, freq, docLen);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
|
||||
explain(subs, stats, freq.getValue().doubleValue(), docLen);
|
||||
|
||||
return Explanation.match(
|
||||
(float) score(stats, freq.getValue().doubleValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
|
||||
"score(" + getClass().getSimpleName() + ", freq=" +
|
||||
freq.getValue() +"), computed as boost * " +
|
||||
"(term weight + document norm) from:",
|
||||
subs);
|
||||
|
|
|
@ -68,13 +68,7 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected double maxScore(BasicStats stats, double maxFreq) {
|
||||
// TODO: can we compute a better upper bound on the produced scores
|
||||
return Double.POSITIVE_INFINITY;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
|
||||
protected void explain(List<Explanation> subs, BasicStats stats,
|
||||
double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0d) {
|
||||
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
|
||||
|
@ -88,18 +82,18 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
|
|||
"freq, number of occurrences of term in the document");
|
||||
subs.add(explFreq);
|
||||
subs.add(Explanation.match((float) docLen,"dl, length of field"));
|
||||
super.explain(subs, stats, doc, freq, docLen);
|
||||
super.explain(subs, stats, freq, docLen);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
explain(subs, stats, doc, freq.getValue().doubleValue(), docLen);
|
||||
explain(subs, stats, freq.getValue().doubleValue(), docLen);
|
||||
|
||||
return Explanation.match(
|
||||
(float) score(stats, freq.getValue().doubleValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
|
||||
"score(" + getClass().getSimpleName() + ", freq=" +
|
||||
freq.getValue() +"), computed as boost * " +
|
||||
"log(1 + ((1 - lambda) * freq / dl) /(lambda * P)) from:",
|
||||
subs);
|
||||
|
|
|
@ -70,7 +70,7 @@ public abstract class LMSimilarity extends SimilarityBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subExpls, BasicStats stats, int doc,
|
||||
protected void explain(List<Explanation> subExpls, BasicStats stats,
|
||||
double freq, double docLen) {
|
||||
subExpls.add(Explanation.match((float) collectionModel.computeProbability(stats),
|
||||
"collection probability"));
|
||||
|
|
|
@ -17,12 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -49,64 +47,39 @@ public class MultiSimilarity extends Similarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
SimWeight subStats[] = new SimWeight[sims.length];
|
||||
for (int i = 0; i < subStats.length; i++) {
|
||||
subStats[i] = sims[i].computeWeight(boost, collectionStats, termStats);
|
||||
}
|
||||
return new MultiStats(subStats);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
SimScorer subScorers[] = new SimScorer[sims.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
subScorers[i] = sims[i].simScorer(((MultiStats)stats).subStats[i], context);
|
||||
subScorers[i] = sims[i].scorer(boost, collectionStats, termStats);
|
||||
}
|
||||
return new MultiSimScorer(subScorers);
|
||||
return new MultiSimScorer(collectionStats.field(), subScorers);
|
||||
}
|
||||
|
||||
static class MultiSimScorer extends SimScorer {
|
||||
private final SimScorer subScorers[];
|
||||
|
||||
MultiSimScorer(SimScorer subScorers[]) {
|
||||
MultiSimScorer(String field, SimScorer subScorers[]) {
|
||||
super(field);
|
||||
this.subScorers = subScorers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
public float score(float freq, long norm) {
|
||||
float sum = 0.0f;
|
||||
for (SimScorer subScorer : subScorers) {
|
||||
sum += subScorer.score(doc, freq);
|
||||
sum += subScorer.score(freq, norm);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float maxScore(float freq) {
|
||||
float sumMaxScore = 0;
|
||||
for (SimScorer subScorer : subScorers) {
|
||||
sumMaxScore += subScorer.maxScore(freq);
|
||||
}
|
||||
return sumMaxScore;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
public Explanation explain(Explanation freq, long norm) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
for (SimScorer subScorer : subScorers) {
|
||||
subs.add(subScorer.explain(doc, freq));
|
||||
subs.add(subScorer.explain(freq, norm));
|
||||
}
|
||||
return Explanation.match(score(doc, freq.getValue().floatValue()), "sum of:", subs);
|
||||
return Explanation.match(score(freq.getValue().floatValue(), norm), "sum of:", subs);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static class MultiStats extends SimWeight {
|
||||
final SimWeight subStats[];
|
||||
|
||||
MultiStats(SimWeight subStats[]) {
|
||||
this.subStats = subStats;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,9 +17,6 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -46,26 +43,13 @@ public abstract class PerFieldSimilarityWrapper extends Similarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
PerFieldSimWeight weight = new PerFieldSimWeight();
|
||||
weight.delegate = get(collectionStats.field());
|
||||
weight.delegateWeight = weight.delegate.computeWeight(boost, collectionStats, termStats);
|
||||
return weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
|
||||
PerFieldSimWeight perFieldWeight = (PerFieldSimWeight) weight;
|
||||
return perFieldWeight.delegate.simScorer(perFieldWeight.delegateWeight, context);
|
||||
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
return get(collectionStats.field()).scorer(boost, collectionStats, termStats);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a {@link Similarity} for scoring a field.
|
||||
*/
|
||||
public abstract Similarity get(String name);
|
||||
|
||||
static class PerFieldSimWeight extends SimWeight {
|
||||
Similarity delegate;
|
||||
SimWeight delegateWeight;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,18 +17,15 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
/**
|
||||
|
@ -38,9 +35,9 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* <p>
|
||||
* This is a low-level API, you should only extend this API if you want to implement
|
||||
* an information retrieval <i>model</i>. If you are instead looking for a convenient way
|
||||
* to alter Lucene's scoring, consider extending a higher-level implementation
|
||||
* such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
|
||||
* just tweaking the default implementation: {@link BM25Similarity}.
|
||||
* to alter Lucene's scoring, consider just tweaking the default implementation:
|
||||
* {@link BM25Similarity} or extend {@link SimilarityBase}, which makes it easy to compute
|
||||
* a score from index statistics.
|
||||
* <p>
|
||||
* Similarity determines how Lucene weights terms, and Lucene interacts with
|
||||
* this class at both <a href="#indextime">index-time</a> and
|
||||
|
@ -49,23 +46,22 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* <a name="indextime">Indexing Time</a>
|
||||
* At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
|
||||
* the Similarity implementation to set a per-document value for the field that will
|
||||
* be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}. Lucene makes no assumption
|
||||
* about what is in this norm, but it is most useful for encoding length normalization
|
||||
* information.
|
||||
* be later accessible via {@link org.apache.lucene.index.LeafReader#getNormValues(String)}.
|
||||
* Lucene makes no assumption about what is in this norm, but it is most useful for
|
||||
* encoding length normalization information.
|
||||
* <p>
|
||||
* Implementations should carefully consider how the normalization is encoded: while
|
||||
* Lucene's {@link BM25Similarity} encodes a combination of index-time boost
|
||||
* and length normalization information with {@link SmallFloat} into a single byte, this
|
||||
* might not be suitable for all purposes.
|
||||
* Lucene's {@link BM25Similarity} encodes length normalization information with
|
||||
* {@link SmallFloat} into a single byte, this might not be suitable for all purposes.
|
||||
* <p>
|
||||
* Many formulas require the use of average document length, which can be computed via a
|
||||
* combination of {@link CollectionStatistics#sumTotalTermFreq()} and
|
||||
* {@link CollectionStatistics#maxDoc()} or {@link CollectionStatistics#docCount()},
|
||||
* depending upon whether the average should reflect field sparsity.
|
||||
* {@link CollectionStatistics#docCount()}.
|
||||
* <p>
|
||||
* Additional scoring factors can be stored in named
|
||||
* <code>NumericDocValuesField</code>s and accessed
|
||||
* at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
|
||||
* Additional scoring factors can be stored in named {@link NumericDocValuesField}s and
|
||||
* accessed at query-time with {@link org.apache.lucene.index.LeafReader#getNumericDocValues(String)}.
|
||||
* However this should not be done in the {@link Similarity} but externally, for instance
|
||||
* by using <tt>FunctionScoreQuery</tt>.
|
||||
* <p>
|
||||
* Finally, using index-time boosts (either via folding into the normalization byte or
|
||||
* via DocValues), is an inefficient way to boost the scores of different fields if the
|
||||
|
@ -76,14 +72,13 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* <a name="querytime">Query time</a>
|
||||
* At query-time, Queries interact with the Similarity via these steps:
|
||||
* <ol>
|
||||
* <li>The {@link #computeWeight(float, CollectionStatistics, TermStatistics...)} method is called a single time,
|
||||
* <li>The {@link #scorer(float, CollectionStatistics, TermStatistics...)} method is called a single time,
|
||||
* allowing the implementation to compute any statistics (such as IDF, average document length, etc)
|
||||
* across <i>the entire collection</i>. The {@link TermStatistics} and {@link CollectionStatistics} passed in
|
||||
* already contain all of the raw statistics involved, so a Similarity can freely use any combination
|
||||
* of statistics without causing any additional I/O. Lucene makes no assumption about what is
|
||||
* stored in the returned {@link Similarity.SimWeight} object.
|
||||
* <li>For each segment in the index, the Query creates a {@link #simScorer(SimWeight, org.apache.lucene.index.LeafReaderContext)}
|
||||
* The score() method is called for each matching document.
|
||||
* stored in the returned {@link Similarity.SimScorer} object.
|
||||
* <li>Then {@link SimScorer#score(float, long)} is called for every matching document to compute its score.
|
||||
* </ol>
|
||||
* <p>
|
||||
* <a name="explaintime">Explanations</a>
|
||||
|
@ -110,7 +105,17 @@ public abstract class Similarity {
|
|||
* <p>Matches in longer fields are less precise, so implementations of this
|
||||
* method usually set smaller values when <code>state.getLength()</code> is large,
|
||||
* and larger values when <code>state.getLength()</code> is small.
|
||||
*
|
||||
*
|
||||
* <p>Note that for a given term-document frequency, greater unsigned norms
|
||||
* must produce scores that are lower or equal, ie. for two encoded norms
|
||||
* {@code n1} and {@code n2} so that
|
||||
* {@code Long.compareUnsigned(n1, n2) > 0} then
|
||||
* {@code SimScorer.score(freq, n1) <= SimScorer.score(freq, n2)}
|
||||
* for any legal {@code freq}.
|
||||
*
|
||||
* <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces
|
||||
* the highest scores.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @param state current processing state for this field
|
||||
|
@ -126,71 +131,68 @@ public abstract class Similarity {
|
|||
* @param termStats term-level statistics, such as the document frequency of a term across the collection.
|
||||
* @return SimWeight object with the information this Similarity needs to score a query.
|
||||
*/
|
||||
public abstract SimWeight computeWeight(float boost,
|
||||
public abstract SimScorer scorer(float boost,
|
||||
CollectionStatistics collectionStats, TermStatistics... termStats);
|
||||
|
||||
/**
|
||||
* Creates a new {@link Similarity.SimScorer} to score matching documents from a segment of the inverted index.
|
||||
* @param weight collection information from {@link #computeWeight(float, CollectionStatistics, TermStatistics...)}
|
||||
* @param context segment of the inverted index to be scored.
|
||||
* @return SloppySimScorer for scoring documents across <code>context</code>
|
||||
* @throws IOException if there is a low-level I/O error
|
||||
*/
|
||||
public abstract SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException;
|
||||
|
||||
/**
|
||||
* API for scoring "sloppy" queries such as {@link TermQuery},
|
||||
* {@link SpanQuery}, and {@link PhraseQuery}.
|
||||
*/
|
||||
public static abstract class SimScorer {
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass
|
||||
* constructors, typically implicit.)
|
||||
*/
|
||||
public SimScorer() {}
|
||||
|
||||
/**
|
||||
* Score a single document
|
||||
* @param doc document id within the inverted index segment
|
||||
* @param freq sloppy term frequency
|
||||
* @return document's score
|
||||
*/
|
||||
public abstract float score(int doc, float freq) throws IOException;
|
||||
|
||||
/**
|
||||
* Return the maximum score that this scorer may produce for freqs in {@code ]0, maxFreq]}.
|
||||
* {@code Float.POSITIVE_INFINITY} is a fine return value if scores are not bounded.
|
||||
* @param maxFreq the maximum frequency
|
||||
*/
|
||||
public abstract float maxScore(float maxFreq);
|
||||
|
||||
/**
|
||||
* Explain the score for a single document
|
||||
* @param doc document id within the inverted index segment
|
||||
* @param freq Explanation of how the sloppy term frequency was computed
|
||||
* @return document's score
|
||||
*/
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return Explanation.match(
|
||||
score(doc, freq.getValue().floatValue()),
|
||||
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:",
|
||||
Collections.singleton(freq));
|
||||
}
|
||||
}
|
||||
|
||||
/** Stores the weight for a query across the indexed collection. This abstract
|
||||
* implementation is empty; descendants of {@code Similarity} should
|
||||
* subclass {@code SimWeight} and define the statistics they require in the
|
||||
* subclass. Examples include idf, average field length, etc.
|
||||
*/
|
||||
public static abstract class SimWeight {
|
||||
|
||||
public static abstract class SimScorer {
|
||||
|
||||
private final String field;
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass
|
||||
* constructors, typically implicit.)
|
||||
* constructors.)
|
||||
*/
|
||||
public SimWeight() {}
|
||||
public SimScorer(String field) {
|
||||
this.field = Objects.requireNonNull(field);
|
||||
}
|
||||
|
||||
/** Return the field that this {@link SimScorer} operates on. */
|
||||
public final String getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
/**
|
||||
* Score a single document. {@code freq} is the document-term sloppy
|
||||
* frequency and must be finite and positive. {@code norm} is the
|
||||
* encoded normalization factor as computed by
|
||||
* {@link Similarity#computeNorm(FieldInvertState)} at index time, or
|
||||
* {@code 1} if norms are disabled. {@code norm} is never {@code 0}.
|
||||
* <p>
|
||||
* Score must not decrease when {@code freq} increases, ie. if
|
||||
* {@code freq1 > freq2}, then {@code score(freq1, norm) >=
|
||||
* score(freq2, norm)} for any value of {@code norm} that may be produced
|
||||
* by {@link Similarity#computeNorm(FieldInvertState)}.
|
||||
* <p>
|
||||
* Score must not increase when the unsigned {@code norm} increases, ie. if
|
||||
* {@code Long.compareUnsigned(norm1, norm2) > 0} then
|
||||
* {@code score(freq, norm1) <= score(freq, norm2)} for any legal
|
||||
* {@code freq}.
|
||||
* <p>
|
||||
* As a consequence, the maximum score that this scorer can produce is bound
|
||||
* by {@code score(Float.MAX_VALUE, 1)}.
|
||||
* @param freq sloppy term frequency, must be finite and positive
|
||||
* @param norm encoded normalization factor or {@code 1} if norms are disabled
|
||||
* @return document's score
|
||||
*/
|
||||
public abstract float score(float freq, long norm);
|
||||
|
||||
/**
|
||||
* Explain the score for a single document
|
||||
* @param freq Explanation of how the sloppy term frequency was computed
|
||||
* @param norm encoded normalization factor, as returned by {@link Similarity#computeNorm}, or {@code 1} if norms are disabled
|
||||
* @return document's score
|
||||
*/
|
||||
public Explanation explain(Explanation freq, long norm) {
|
||||
return Explanation.match(
|
||||
score(freq.getValue().floatValue(), norm),
|
||||
"score(freq=" + freq.getValue() +"), with freq of:",
|
||||
Collections.singleton(freq));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,13 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
|
@ -33,7 +30,7 @@ import org.apache.lucene.util.SmallFloat;
|
|||
* A subclass of {@code Similarity} that provides a simplified API for its
|
||||
* descendants. Subclasses are only required to implement the {@link #score}
|
||||
* and {@link #toString()} methods. Implementing
|
||||
* {@link #explain(List, BasicStats, int, double, double)} is optional,
|
||||
* {@link #explain(List, BasicStats, double, double)} is optional,
|
||||
* inasmuch as SimilarityBase already provides a basic explanation of the score
|
||||
* and the term frequency. However, implementers of a subclass are encouraged to
|
||||
* include as much detail about the scoring method as possible.
|
||||
|
@ -82,13 +79,18 @@ public abstract class SimilarityBase extends Similarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
BasicStats stats[] = new BasicStats[termStats.length];
|
||||
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
SimScorer weights[] = new SimScorer[termStats.length];
|
||||
for (int i = 0; i < termStats.length; i++) {
|
||||
stats[i] = newStats(collectionStats.field(), boost);
|
||||
fillBasicStats(stats[i], collectionStats, termStats[i]);
|
||||
BasicStats stats = newStats(collectionStats.field(), boost);
|
||||
fillBasicStats(stats, collectionStats, termStats[i]);
|
||||
weights[i] = new BasicSimScorer(stats);
|
||||
}
|
||||
if (weights.length == 1) {
|
||||
return weights[0];
|
||||
} else {
|
||||
return new MultiSimilarity.MultiSimScorer(collectionStats.field(), weights);
|
||||
}
|
||||
return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
|
||||
}
|
||||
|
||||
/** Factory method to return a custom stats object */
|
||||
|
@ -121,13 +123,6 @@ public abstract class SimilarityBase extends Similarity {
|
|||
*/
|
||||
protected abstract double score(BasicStats stats, double freq, double docLen);
|
||||
|
||||
/**
|
||||
* Return the maximum value that may be returned by {@link #score(BasicStats, double, double)}
|
||||
* for the given stats.
|
||||
* @see org.apache.lucene.search.similarities.Similarity.SimScorer#maxScore(float)
|
||||
*/
|
||||
protected abstract double maxScore(BasicStats stats, double maxFreq);
|
||||
|
||||
/**
|
||||
* Subclasses should implement this method to explain the score. {@code expl}
|
||||
* already contains the score, the name of the class and the doc id, as well
|
||||
|
@ -137,12 +132,11 @@ public abstract class SimilarityBase extends Similarity {
|
|||
*
|
||||
* @param subExpls the list of details of the explanation to extend
|
||||
* @param stats the corpus level statistics.
|
||||
* @param doc the document id.
|
||||
* @param freq the term frequency.
|
||||
* @param docLen the document length.
|
||||
*/
|
||||
protected void explain(
|
||||
List<Explanation> subExpls, BasicStats stats, int doc, double freq, double docLen) {}
|
||||
List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {}
|
||||
|
||||
/**
|
||||
* Explains the score. The implementation here provides a basic explanation
|
||||
|
@ -151,43 +145,24 @@ public abstract class SimilarityBase extends Similarity {
|
|||
* attaches the score (computed via the {@link #score(BasicStats, double, double)}
|
||||
* method) and the explanation for the term frequency. Subclasses content with
|
||||
* this format may add additional details in
|
||||
* {@link #explain(List, BasicStats, int, double, double)}.
|
||||
* {@link #explain(List, BasicStats, double, double)}.
|
||||
*
|
||||
* @param stats the corpus level statistics.
|
||||
* @param doc the document id.
|
||||
* @param freq the term frequency and its explanation.
|
||||
* @param docLen the document length.
|
||||
* @return the explanation.
|
||||
*/
|
||||
protected Explanation explain(
|
||||
BasicStats stats, int doc, Explanation freq, double docLen) {
|
||||
BasicStats stats, Explanation freq, double docLen) {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
explain(subs, stats, doc, freq.getValue().floatValue(), docLen);
|
||||
explain(subs, stats, freq.getValue().floatValue(), docLen);
|
||||
|
||||
return Explanation.match(
|
||||
(float) score(stats, freq.getValue().floatValue(), docLen),
|
||||
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:",
|
||||
"score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed from:",
|
||||
subs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
if (stats instanceof MultiSimilarity.MultiStats) {
|
||||
// a multi term query (e.g. phrase). return the summation,
|
||||
// scoring almost as if it were boolean query
|
||||
SimWeight subStats[] = ((MultiSimilarity.MultiStats) stats).subStats;
|
||||
SimScorer subScorers[] = new SimScorer[subStats.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
BasicStats basicstats = (BasicStats) subStats[i];
|
||||
subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
|
||||
}
|
||||
return new MultiSimilarity.MultiSimScorer(subScorers);
|
||||
} else {
|
||||
BasicStats basicstats = (BasicStats) stats;
|
||||
return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must override this method to return the name of the Similarity
|
||||
* and preferably the values of parameters (if any) as well.
|
||||
|
@ -227,43 +202,32 @@ public abstract class SimilarityBase extends Similarity {
|
|||
|
||||
// --------------------------------- Classes ---------------------------------
|
||||
|
||||
/** Delegates the {@link #score(int, float)} and
|
||||
* {@link #explain(int, Explanation)} methods to
|
||||
/** Delegates the {@link #score(float, long)} and
|
||||
* {@link #explain(Explanation, long)} methods to
|
||||
* {@link SimilarityBase#score(BasicStats, double, double)} and
|
||||
* {@link SimilarityBase#explain(BasicStats, int, Explanation, double)},
|
||||
* {@link SimilarityBase#explain(BasicStats, Explanation, double)},
|
||||
* respectively.
|
||||
*/
|
||||
final class BasicSimScorer extends SimScorer {
|
||||
private final BasicStats stats;
|
||||
private final NumericDocValues norms;
|
||||
final BasicStats stats;
|
||||
|
||||
BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
|
||||
BasicSimScorer(BasicStats stats) {
|
||||
super(stats.field);
|
||||
this.stats = stats;
|
||||
this.norms = norms;
|
||||
}
|
||||
|
||||
double getLengthValue(int doc) throws IOException {
|
||||
if (norms == null) {
|
||||
return 1D;
|
||||
}
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norms.longValue())];
|
||||
double getLengthValue(long norm) {
|
||||
return LENGTH_TABLE[Byte.toUnsignedInt((byte) norm)];
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
return (float) SimilarityBase.this.score(stats, freq, getLengthValue(doc));
|
||||
public float score(float freq, long norm) {
|
||||
return (float) SimilarityBase.this.score(stats, freq, getLengthValue(norm));
|
||||
}
|
||||
|
||||
@Override
|
||||
public float maxScore(float maxFreq) {
|
||||
return (float) SimilarityBase.this.maxScore(stats, maxFreq);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
|
||||
public Explanation explain(Explanation freq, long norm) {
|
||||
return SimilarityBase.this.explain(stats, freq, getLengthValue(norm));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,13 +17,10 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -511,7 +508,7 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
}
|
||||
|
||||
@Override
|
||||
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
final Explanation idf = termStats.length == 1
|
||||
? idfExplain(collectionStats, termStats[0])
|
||||
: idfExplain(collectionStats, termStats);
|
||||
|
@ -522,110 +519,59 @@ public abstract class TFIDFSimilarity extends Similarity {
|
|||
normTable[i] = norm;
|
||||
}
|
||||
normTable[0] = 1f / normTable[255];
|
||||
return new IDFStats(collectionStats.field(), boost, idf, normTable);
|
||||
return new TFIDFScorer(collectionStats.field(), boost, idf, normTable);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
IDFStats idfstats = (IDFStats) stats;
|
||||
// the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
|
||||
final float[] normTable = idfstats.normTable;
|
||||
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
|
||||
}
|
||||
|
||||
private final class TFIDFSimScorer extends SimScorer {
|
||||
private final IDFStats stats;
|
||||
private final float weightValue;
|
||||
private final NumericDocValues norms;
|
||||
private final float[] normTable;
|
||||
|
||||
TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.queryWeight;
|
||||
this.norms = norms;
|
||||
this.normTable = normTable;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
|
||||
|
||||
if (norms == null) {
|
||||
return raw;
|
||||
} else {
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
float normValue = normTable[(int) (norms.longValue() & 0xFF)];
|
||||
return raw * normValue; // normalize for field
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public float maxScore(float maxFreq) {
|
||||
final float raw = tf(maxFreq) * weightValue;
|
||||
if (norms == null) {
|
||||
return raw;
|
||||
} else {
|
||||
float maxNormValue = Float.NEGATIVE_INFINITY;
|
||||
for (float norm : normTable) {
|
||||
maxNormValue = Math.max(maxNormValue, norm);
|
||||
}
|
||||
return raw * maxNormValue;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return explainScore(doc, freq, stats, norms, normTable);
|
||||
}
|
||||
}
|
||||
|
||||
/** Collection statistics for the TF-IDF model. The only statistic of interest
|
||||
* to this model is idf. */
|
||||
static class IDFStats extends SimWeight {
|
||||
private final String field;
|
||||
class TFIDFScorer extends SimScorer {
|
||||
/** The idf and its explanation */
|
||||
private final Explanation idf;
|
||||
private final float boost;
|
||||
private final float queryWeight;
|
||||
final float[] normTable;
|
||||
|
||||
public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
|
||||
public TFIDFScorer(String field, float boost, Explanation idf, float[] normTable) {
|
||||
super(field);
|
||||
// TODO: Validate?
|
||||
this.field = field;
|
||||
this.idf = idf;
|
||||
this.boost = boost;
|
||||
this.queryWeight = boost * idf.getValue().floatValue();
|
||||
this.normTable = normTable;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(float freq, long norm) {
|
||||
final float raw = tf(freq) * queryWeight; // compute tf(f)*weight
|
||||
float normValue = normTable[(int) (norm & 0xFF)];
|
||||
return raw * normValue; // normalize for field
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(Explanation freq, long norm) {
|
||||
return explainScore(freq, norm, normTable);
|
||||
}
|
||||
|
||||
private Explanation explainScore(Explanation freq, long encodedNorm, float[] normTable) {
|
||||
List<Explanation> subs = new ArrayList<Explanation>();
|
||||
if (boost != 1F) {
|
||||
subs.add(Explanation.match(boost, "boost"));
|
||||
}
|
||||
subs.add(idf);
|
||||
Explanation tf = Explanation.match(tf(freq.getValue().floatValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
|
||||
subs.add(tf);
|
||||
|
||||
float norm = normTable[(int) (encodedNorm & 0xFF)];
|
||||
|
||||
Explanation fieldNorm = Explanation.match(norm, "fieldNorm");
|
||||
subs.add(fieldNorm);
|
||||
|
||||
return Explanation.match(
|
||||
queryWeight * tf.getValue().floatValue() * norm,
|
||||
"score(freq="+freq.getValue()+"), product of:",
|
||||
subs);
|
||||
}
|
||||
}
|
||||
|
||||
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<Explanation>();
|
||||
if (stats.boost != 1F) {
|
||||
subs.add(Explanation.match(stats.boost, "boost"));
|
||||
}
|
||||
subs.add(stats.idf);
|
||||
Explanation tf = Explanation.match(tf(freq.getValue().floatValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
|
||||
subs.add(tf);
|
||||
|
||||
float norm;
|
||||
if (norms == null) {
|
||||
norm = 1f;
|
||||
} else {
|
||||
boolean found = norms.advanceExact(doc);
|
||||
assert found;
|
||||
norm = normTable[(int) (norms.longValue() & 0xFF)];
|
||||
}
|
||||
|
||||
Explanation fieldNorm = Explanation.match(
|
||||
norm,
|
||||
"fieldNorm(doc=" + doc + ")");
|
||||
subs.add(fieldNorm);
|
||||
|
||||
return Explanation.match(
|
||||
stats.queryWeight * tf.getValue().floatValue() * norm,
|
||||
"score(doc="+doc+",freq="+freq.getValue()+"), product of:",
|
||||
subs);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@ package org.apache.lucene.search.spans;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
||||
|
@ -61,7 +61,7 @@ abstract class SpanContainQuery extends SpanQuery implements Cloneable {
|
|||
final SpanWeight bigWeight;
|
||||
final SpanWeight littleWeight;
|
||||
|
||||
public SpanContainWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
|
||||
public SpanContainWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
|
||||
SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
|
||||
super(SpanContainQuery.this, searcher, terms, boost);
|
||||
this.bigWeight = bigWeight;
|
||||
|
@ -93,9 +93,9 @@ abstract class SpanContainQuery extends SpanQuery implements Cloneable {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
bigWeight.extractTermContexts(contexts);
|
||||
littleWeight.extractTermContexts(contexts);
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
bigWeight.extractTermStates(contexts);
|
||||
littleWeight.extractTermStates(contexts);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,7 +23,7 @@ import java.util.Map;
|
|||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
|
||||
|
@ -45,15 +45,15 @@ public final class SpanContainingQuery extends SpanContainQuery {
|
|||
|
||||
@Override
|
||||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
SpanWeight bigWeight = big.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
SpanWeight littleWeight = little.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
return new SpanContainingWeight(searcher, scoreMode.needsScores() ? getTermContexts(bigWeight, littleWeight) : null,
|
||||
SpanWeight bigWeight = big.createWeight(searcher, scoreMode, boost);
|
||||
SpanWeight littleWeight = little.createWeight(searcher, scoreMode, boost);
|
||||
return new SpanContainingWeight(searcher, scoreMode.needsScores() ? getTermStates(bigWeight, littleWeight) : null,
|
||||
bigWeight, littleWeight, boost);
|
||||
}
|
||||
|
||||
public class SpanContainingWeight extends SpanContainWeight {
|
||||
|
||||
public SpanContainingWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
|
||||
public SpanContainingWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
|
||||
SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
|
||||
super(searcher, terms, bigWeight, littleWeight, boost);
|
||||
}
|
||||
|
|
|
@ -24,7 +24,7 @@ import java.util.Objects;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MultiTermQuery;
|
||||
|
@ -163,7 +163,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(List<SpanQuery> topLevel, Term term, int docCount, float boost, TermContext states) {
|
||||
protected void addClause(List<SpanQuery> topLevel, Term term, int docCount, float boost, TermStates states) {
|
||||
final SpanTermQuery q = new SpanTermQuery(term, states);
|
||||
topLevel.add(q);
|
||||
}
|
||||
|
@ -211,7 +211,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(List<SpanQuery> topLevel, Term term, int docFreq, float boost, TermContext states) {
|
||||
protected void addClause(List<SpanQuery> topLevel, Term term, int docFreq, float boost, TermStates states) {
|
||||
final SpanTermQuery q = new SpanTermQuery(term, states);
|
||||
topLevel.add(q);
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
@ -181,24 +181,24 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
|
|||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
List<SpanWeight> subWeights = new ArrayList<>();
|
||||
for (SpanQuery q : clauses) {
|
||||
subWeights.add(q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost));
|
||||
subWeights.add(q.createWeight(searcher, scoreMode, boost));
|
||||
}
|
||||
return new SpanNearWeight(subWeights, searcher, scoreMode.needsScores() ? getTermContexts(subWeights) : null, boost);
|
||||
return new SpanNearWeight(subWeights, searcher, scoreMode.needsScores() ? getTermStates(subWeights) : null, boost);
|
||||
}
|
||||
|
||||
public class SpanNearWeight extends SpanWeight {
|
||||
|
||||
final List<SpanWeight> subWeights;
|
||||
|
||||
public SpanNearWeight(List<SpanWeight> subWeights, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
|
||||
public SpanNearWeight(List<SpanWeight> subWeights, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
|
||||
super(SpanNearQuery.this, searcher, terms, boost);
|
||||
this.subWeights = subWeights;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
for (SpanWeight w : subWeights) {
|
||||
w.extractTermContexts(contexts);
|
||||
w.extractTermStates(contexts);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -318,7 +318,7 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
@ -99,9 +99,9 @@ public final class SpanNotQuery extends SpanQuery {
|
|||
|
||||
@Override
|
||||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
SpanWeight includeWeight = include.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
SpanWeight includeWeight = include.createWeight(searcher, scoreMode, boost);
|
||||
SpanWeight excludeWeight = exclude.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
return new SpanNotWeight(searcher, scoreMode.needsScores() ? getTermContexts(includeWeight, excludeWeight) : null,
|
||||
return new SpanNotWeight(searcher, scoreMode.needsScores() ? getTermStates(includeWeight) : null,
|
||||
includeWeight, excludeWeight, boost);
|
||||
}
|
||||
|
||||
|
@ -110,7 +110,7 @@ public final class SpanNotQuery extends SpanQuery {
|
|||
final SpanWeight includeWeight;
|
||||
final SpanWeight excludeWeight;
|
||||
|
||||
public SpanNotWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
|
||||
public SpanNotWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
|
||||
SpanWeight includeWeight, SpanWeight excludeWeight, float boost) throws IOException {
|
||||
super(SpanNotQuery.this, searcher, terms, boost);
|
||||
this.includeWeight = includeWeight;
|
||||
|
@ -118,8 +118,8 @@ public final class SpanNotQuery extends SpanQuery {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
includeWeight.extractTermContexts(contexts);
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
includeWeight.extractTermStates(contexts);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -27,7 +27,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.DisiPriorityQueue;
|
||||
import org.apache.lucene.search.DisiWrapper;
|
||||
import org.apache.lucene.search.DisjunctionDISIApproximation;
|
||||
|
@ -119,16 +119,16 @@ public final class SpanOrQuery extends SpanQuery {
|
|||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
List<SpanWeight> subWeights = new ArrayList<>(clauses.size());
|
||||
for (SpanQuery q : clauses) {
|
||||
subWeights.add(q.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost));
|
||||
subWeights.add(q.createWeight(searcher, scoreMode, boost));
|
||||
}
|
||||
return new SpanOrWeight(searcher, scoreMode.needsScores() ? getTermContexts(subWeights) : null, subWeights, boost);
|
||||
return new SpanOrWeight(searcher, scoreMode.needsScores() ? getTermStates(subWeights) : null, subWeights, boost);
|
||||
}
|
||||
|
||||
public class SpanOrWeight extends SpanWeight {
|
||||
|
||||
final List<SpanWeight> subWeights;
|
||||
|
||||
public SpanOrWeight(IndexSearcher searcher, Map<Term, TermContext> terms, List<SpanWeight> subWeights, float boost) throws IOException {
|
||||
public SpanOrWeight(IndexSearcher searcher, Map<Term, TermStates> terms, List<SpanWeight> subWeights, float boost) throws IOException {
|
||||
super(SpanOrQuery.this, searcher, terms, boost);
|
||||
this.subWeights = subWeights;
|
||||
}
|
||||
|
@ -150,9 +150,9 @@ public final class SpanOrQuery extends SpanQuery {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
for (SpanWeight w : subWeights) {
|
||||
w.extractTermContexts(contexts);
|
||||
w.extractTermStates(contexts);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
|
@ -69,15 +69,15 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
|
|||
|
||||
@Override
|
||||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
SpanWeight matchWeight = match.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
return new SpanPositionCheckWeight(matchWeight, searcher, scoreMode.needsScores() ? getTermContexts(matchWeight) : null, boost);
|
||||
SpanWeight matchWeight = match.createWeight(searcher, scoreMode, boost);
|
||||
return new SpanPositionCheckWeight(matchWeight, searcher, scoreMode.needsScores() ? getTermStates(matchWeight) : null, boost);
|
||||
}
|
||||
|
||||
public class SpanPositionCheckWeight extends SpanWeight {
|
||||
|
||||
final SpanWeight matchWeight;
|
||||
|
||||
public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
|
||||
public SpanPositionCheckWeight(SpanWeight matchWeight, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
|
||||
super(SpanPositionCheckQuery.this, searcher, terms, boost);
|
||||
this.matchWeight = matchWeight;
|
||||
}
|
||||
|
@ -93,8 +93,8 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
|
|||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
matchWeight.extractTermContexts(contexts);
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
matchWeight.extractTermStates(contexts);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -23,7 +23,7 @@ import java.util.Map;
|
|||
import java.util.TreeMap;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
|
@ -40,25 +40,25 @@ public abstract class SpanQuery extends Query {
|
|||
public abstract SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException;
|
||||
|
||||
/**
|
||||
* Build a map of terms to termcontexts, for use in constructing SpanWeights
|
||||
* Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static Map<Term, TermContext> getTermContexts(SpanWeight... weights) {
|
||||
Map<Term, TermContext> terms = new TreeMap<>();
|
||||
public static Map<Term, TermStates> getTermStates(SpanWeight... weights) {
|
||||
Map<Term, TermStates> terms = new TreeMap<>();
|
||||
for (SpanWeight w : weights) {
|
||||
w.extractTermContexts(terms);
|
||||
w.extractTermStates(terms);
|
||||
}
|
||||
return terms;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build a map of terms to termcontexts, for use in constructing SpanWeights
|
||||
* Build a map of terms to {@link TermStates}, for use in constructing SpanWeights
|
||||
* @lucene.internal
|
||||
*/
|
||||
public static Map<Term, TermContext> getTermContexts(Collection<SpanWeight> weights) {
|
||||
Map<Term, TermContext> terms = new TreeMap<>();
|
||||
public static Map<Term, TermStates> getTermStates(Collection<SpanWeight> weights) {
|
||||
Map<Term, TermStates> terms = new TreeMap<>();
|
||||
for (SpanWeight w : weights) {
|
||||
w.extractTermContexts(terms);
|
||||
w.extractTermStates(terms);
|
||||
}
|
||||
return terms;
|
||||
}
|
||||
|
|
|
@ -21,9 +21,9 @@ import java.io.IOException;
|
|||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.LeafSimScorer;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.TwoPhaseIterator;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
/**
|
||||
* A basic {@link Scorer} over {@link Spans}.
|
||||
|
@ -32,7 +32,7 @@ import org.apache.lucene.search.similarities.Similarity;
|
|||
public class SpanScorer extends Scorer {
|
||||
|
||||
protected final Spans spans;
|
||||
protected final Similarity.SimScorer docScorer;
|
||||
protected final LeafSimScorer docScorer;
|
||||
|
||||
/** accumulated sloppy freq (computed in setFreqCurrentDoc) */
|
||||
private float freq;
|
||||
|
@ -41,7 +41,7 @@ public class SpanScorer extends Scorer {
|
|||
private int lastScoredDoc = -1; // last doc we called setFreqCurrentDoc() for
|
||||
|
||||
/** Sole constructor. */
|
||||
public SpanScorer(SpanWeight weight, Spans spans, Similarity.SimScorer docScorer) {
|
||||
public SpanScorer(SpanWeight weight, Spans spans, LeafSimScorer docScorer) {
|
||||
super(weight);
|
||||
this.spans = Objects.requireNonNull(spans);
|
||||
this.docScorer = docScorer;
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.index.LeafReaderContext;
|
|||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
|
@ -41,21 +41,21 @@ import org.apache.lucene.search.ScoreMode;
|
|||
public class SpanTermQuery extends SpanQuery {
|
||||
|
||||
protected final Term term;
|
||||
protected final TermContext termContext;
|
||||
protected final TermStates termStates;
|
||||
|
||||
/** Construct a SpanTermQuery matching the named term's spans. */
|
||||
public SpanTermQuery(Term term) {
|
||||
this.term = Objects.requireNonNull(term);
|
||||
this.termContext = null;
|
||||
this.termStates = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Construct a SpanTermQuery matching the named term's spans, using
|
||||
* the provided TermContext
|
||||
* the provided TermStates
|
||||
*/
|
||||
public SpanTermQuery(Term term, TermContext context) {
|
||||
public SpanTermQuery(Term term, TermStates termStates) {
|
||||
this.term = Objects.requireNonNull(term);
|
||||
this.termContext = context;
|
||||
this.termStates = termStates;
|
||||
}
|
||||
|
||||
/** Return the term whose spans are matched. */
|
||||
|
@ -66,25 +66,25 @@ public class SpanTermQuery extends SpanQuery {
|
|||
|
||||
@Override
|
||||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
final TermContext context;
|
||||
final TermStates context;
|
||||
final IndexReaderContext topContext = searcher.getTopReaderContext();
|
||||
if (termContext == null || termContext.wasBuiltFor(topContext) == false) {
|
||||
context = TermContext.build(topContext, term);
|
||||
if (termStates == null || termStates.wasBuiltFor(topContext) == false) {
|
||||
context = TermStates.build(topContext, term, scoreMode.needsScores());
|
||||
}
|
||||
else {
|
||||
context = termContext;
|
||||
context = termStates;
|
||||
}
|
||||
return new SpanTermWeight(context, searcher, scoreMode.needsScores() ? Collections.singletonMap(term, context) : null, boost);
|
||||
}
|
||||
|
||||
public class SpanTermWeight extends SpanWeight {
|
||||
|
||||
final TermContext termContext;
|
||||
final TermStates termStates;
|
||||
|
||||
public SpanTermWeight(TermContext termContext, IndexSearcher searcher, Map<Term, TermContext> terms, float boost) throws IOException {
|
||||
public SpanTermWeight(TermStates termStates, IndexSearcher searcher, Map<Term, TermStates> terms, float boost) throws IOException {
|
||||
super(SpanTermQuery.this, searcher, terms, boost);
|
||||
this.termContext = termContext;
|
||||
assert termContext != null : "TermContext must not be null";
|
||||
this.termStates = termStates;
|
||||
assert termStates != null : "TermStates must not be null";
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -98,16 +98,16 @@ public class SpanTermQuery extends SpanQuery {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void extractTermContexts(Map<Term, TermContext> contexts) {
|
||||
contexts.put(term, termContext);
|
||||
public void extractTermStates(Map<Term, TermStates> contexts) {
|
||||
contexts.put(term, termStates);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException {
|
||||
|
||||
assert termContext.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
|
||||
assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
|
||||
|
||||
final TermState state = termContext.get(context.ord);
|
||||
final TermState state = termStates.get(context);
|
||||
if (state == null) { // term is not present in that reader
|
||||
assert context.reader().docFreq(term) == 0 : "no termstate found but term exists in reader term=" + term;
|
||||
return null;
|
||||
|
|
|
@ -24,14 +24,14 @@ import java.util.Map;
|
|||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.LeafSimScorer;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||
|
||||
/**
|
||||
* Expert-only. Public for use by other weight implementations
|
||||
|
@ -72,48 +72,48 @@ public abstract class SpanWeight extends Weight {
|
|||
}
|
||||
|
||||
protected final Similarity similarity;
|
||||
protected final Similarity.SimWeight simWeight;
|
||||
protected final Similarity.SimScorer simScorer;
|
||||
protected final String field;
|
||||
|
||||
/**
|
||||
* Create a new SpanWeight
|
||||
* @param query the parent query
|
||||
* @param searcher the IndexSearcher to query against
|
||||
* @param termContexts a map of terms to termcontexts for use in building the similarity. May
|
||||
* @param termStates a map of terms to {@link TermStates} for use in building the similarity. May
|
||||
* be null if scores are not required
|
||||
* @throws IOException on error
|
||||
*/
|
||||
public SpanWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermContext> termContexts, float boost) throws IOException {
|
||||
public SpanWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
|
||||
super(query);
|
||||
this.field = query.getField();
|
||||
this.similarity = searcher.getSimilarity(termContexts != null);
|
||||
this.simWeight = buildSimWeight(query, searcher, termContexts, boost);
|
||||
this.similarity = searcher.getSimilarity();
|
||||
this.simScorer = buildSimWeight(query, searcher, termStates, boost);
|
||||
}
|
||||
|
||||
private Similarity.SimWeight buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermContext> termContexts, float boost) throws IOException {
|
||||
if (termContexts == null || termContexts.size() == 0 || query.getField() == null)
|
||||
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
|
||||
if (termStates == null || termStates.size() == 0 || query.getField() == null)
|
||||
return null;
|
||||
TermStatistics[] termStats = new TermStatistics[termContexts.size()];
|
||||
TermStatistics[] termStats = new TermStatistics[termStates.size()];
|
||||
int termUpTo = 0;
|
||||
for (Term term : termContexts.keySet()) {
|
||||
TermStatistics termStatistics = searcher.termStatistics(term, termContexts.get(term));
|
||||
for (Term term : termStates.keySet()) {
|
||||
TermStatistics termStatistics = searcher.termStatistics(term, termStates.get(term));
|
||||
if (termStatistics != null) {
|
||||
termStats[termUpTo++] = termStatistics;
|
||||
}
|
||||
}
|
||||
CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
|
||||
if (termUpTo > 0) {
|
||||
return similarity.computeWeight(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
|
||||
return similarity.scorer(boost, collectionStats, Arrays.copyOf(termStats, termUpTo));
|
||||
} else {
|
||||
return null; // no terms at all exist, we won't use similarity
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect all TermContexts used by this Weight
|
||||
* @param contexts a map to add the TermContexts to
|
||||
* Collect all TermStates used by this Weight
|
||||
* @param contexts a map to add the TermStates to
|
||||
*/
|
||||
public abstract void extractTermContexts(Map<Term, TermContext> contexts);
|
||||
public abstract void extractTermStates(Map<Term, TermStates> contexts);
|
||||
|
||||
/**
|
||||
* Expert: Return a Spans object iterating over matches from this Weight
|
||||
|
@ -129,18 +129,18 @@ public abstract class SpanWeight extends Weight {
|
|||
if (spans == null) {
|
||||
return null;
|
||||
}
|
||||
final Similarity.SimScorer docScorer = getSimScorer(context);
|
||||
final LeafSimScorer docScorer = getSimScorer(context);
|
||||
return new SpanScorer(this, spans, docScorer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a SimScorer for this context
|
||||
* Return a LeafSimScorer for this context
|
||||
* @param context the LeafReaderContext
|
||||
* @return a SimWeight
|
||||
* @throws IOException on error
|
||||
*/
|
||||
public Similarity.SimScorer getSimScorer(LeafReaderContext context) throws IOException {
|
||||
return simWeight == null ? null : similarity.simScorer(simWeight, context);
|
||||
public LeafSimScorer getSimScorer(LeafReaderContext context) throws IOException {
|
||||
return simScorer == null ? null : new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -150,7 +150,7 @@ public abstract class SpanWeight extends Weight {
|
|||
int newDoc = scorer.iterator().advance(doc);
|
||||
if (newDoc == doc) {
|
||||
float freq = scorer.sloppyFreq();
|
||||
SimScorer docScorer = similarity.simScorer(simWeight, context);
|
||||
LeafSimScorer docScorer = new LeafSimScorer(simScorer, context.reader(), true, Float.MAX_VALUE);
|
||||
Explanation freqExplanation = Explanation.match(freq, "phraseFreq=" + freq);
|
||||
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
|
||||
return Explanation.match(scoreExplanation.getValue(),
|
||||
|
|
|
@ -23,7 +23,7 @@ import java.util.Map;
|
|||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.TermStates;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
|
||||
|
@ -46,15 +46,15 @@ public final class SpanWithinQuery extends SpanContainQuery {
|
|||
|
||||
@Override
|
||||
public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
|
||||
SpanWeight bigWeight = big.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
SpanWeight littleWeight = little.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, boost);
|
||||
return new SpanWithinWeight(searcher, scoreMode.needsScores() ? getTermContexts(bigWeight, littleWeight) : null,
|
||||
SpanWeight bigWeight = big.createWeight(searcher, scoreMode, boost);
|
||||
SpanWeight littleWeight = little.createWeight(searcher, scoreMode, boost);
|
||||
return new SpanWithinWeight(searcher, scoreMode.needsScores() ? getTermStates(bigWeight, littleWeight) : null,
|
||||
bigWeight, littleWeight, boost);
|
||||
}
|
||||
|
||||
public class SpanWithinWeight extends SpanContainWeight {
|
||||
|
||||
public SpanWithinWeight(IndexSearcher searcher, Map<Term, TermContext> terms,
|
||||
public SpanWithinWeight(IndexSearcher searcher, Map<Term, TermStates> terms,
|
||||
SpanWeight bigWeight, SpanWeight littleWeight, float boost) throws IOException {
|
||||
super(searcher, terms, bigWeight, littleWeight, boost);
|
||||
}
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue