Mirror of https://github.com/apache/lucene.git, synced 2025-02-23 18:55:50 +00:00

Commit 6a8768e395: Merge branch 'master' into feature/autoscaling

.gitignore (vendored), 41 changed lines
@ -1,20 +1,19 @@
# .
/eclipse-build
/classes
**/build
build
/idea-build
**/dist
**/lib
**/test-lib
dist
lib
test-lib
/*~
/velocity.log
/build.properties
/.idea
lucene/**/*.iml
solr/**/*.iml
parent.iml
**/*.ipr
**/*.iws
*.ipr
*.iws
/.project
/.classpath
/.settings
@ -22,33 +21,7 @@ parent.iml
/prj.el
/bin
/bin.*
**/pom.xml
pom.xml
/nbproject
/nb-build
.pydevproject

/solr/package

# can this be minimized?
/solr/example/start.jar
/solr/example/webapps/*
/solr/example/logs/*.log
/solr/example/**/data
/solr/example/solr/lib
/solr/example/solr/logs
/solr/example/solr/zoo_data
/solr/example/work/*
/solr/example/exampledocs/post.jar

/solr/example/example-DIH/**/data
/solr/example/example-DIH/**/dataimport.properties
/solr/example/example-DIH/solr/mail/lib/*.jar

solr/contrib/dataimporthandler/test-lib/

solr/core/test-lib/

solr/server/logs/
solr/server/solr/zoo_data/
solr/server/solr-webapp
solr/server/start.jar
@ -66,6 +66,13 @@
</foaf:Person>
</maintainer>

<release>
<Version>
<name>lucene-6.5.1</name>
<created>2017-04-27</created>
<revision>6.5.1</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-6.5.0</name>

@ -66,6 +66,13 @@
</foaf:Person>
</maintainer>

<release>
<Version>
<name>solr-6.5.1</name>
<created>2017-04-27</created>
<revision>6.5.1</revision>
</Version>
</release>
<release>
<Version>
<name>solr-6.5.0</name>
dev-tools/idea/.idea/libraries/HSQLDB.xml (generated), 2 changed lines
@ -1,7 +1,7 @@
<component name="libraryTable">
<library name="HSQLDB">
<CLASSES>
<root url="jar://$PROJECT_DIR$/solr/example/example-DIH/solr/db/lib/hsqldb-1.8.0.10.jar!/" />
<root url="jar://$PROJECT_DIR$/solr/example/example-DIH/solr/db/lib/hsqldb-2.4.0.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />
@ -16,8 +16,9 @@
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="queries" />
<orderEntry type="module" scope="TEST" module-name="analysis-common" />
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module" module-name="grouping" />
<orderEntry type="module" module-name="misc" />
<orderEntry type="module" module-name="sandbox" />
</component>
</module>

@ -32,5 +32,6 @@
<orderEntry type="module" module-name="join" />
<orderEntry type="module" module-name="sandbox" />
<orderEntry type="module" module-name="backward-codecs" />
<orderEntry type="module" module-name="codecs" />
</component>
</module>
@ -97,8 +97,8 @@ def prepare(root, version, gpgKeyID, gpgPassword):
print(' Check DOAP files')
checkDOAPfiles(version)

print(' ant clean test')
run('ant clean test')
print(' ant clean test validate documentation-lint')
run('ant clean test validate documentation-lint')

open('rev.txt', mode='wb').write(rev.encode('UTF-8'))

@ -296,7 +296,7 @@ def checkSummary(fullPath):
print()
print(fullPath)
printed = True
print(' missing: %s' % unescapeHTML(lastHREF))
print(' missing description: %s' % unescapeHTML(lastHREF))
anyMissing = True
elif lineLower.find('licensed to the apache software foundation') != -1 or lineLower.find('copyright 2004 the apache software foundation') != -1:
if not printed:

@ -266,7 +266,10 @@ def checkAll(dirName):
if __name__ == '__main__':
if checkAll(sys.argv[1]):
print()
print('Broken javadocs links were found!')
print('Broken javadocs links were found! Common root causes:')
# please feel free to add to this list
print('* A typo of some sort for manually created links.')
print('* Public methods referencing non-public classes in their signature.')
sys.exit(1)
sys.exit(0)

@ -707,8 +707,10 @@ def verifyUnpacked(java, project, artifact, unpackPath, gitRevision, version, te
print(' %s' % line.strip())
raise RuntimeError('source release has WARs...')

print(' run "ant validate"')
java.run_java8('ant validate', '%s/validate.log' % unpackPath)
# Can't run documentation-lint in lucene src, because dev-tools is missing
validateCmd = 'ant validate' if project == 'lucene' else 'ant validate documentation-lint';
print(' run "%s"' % validateCmd)
java.run_java8(validateCmd, '%s/validate.log' % unpackPath)

if project == 'lucene':
print(" run tests w/ Java 8 and testArgs='%s'..." % testArgs)
@ -50,16 +50,31 @@ API Changes
* LUCENE-7701: Grouping collectors have been refactored, such that groups are
now defined by a GroupSelector implementation. (Alan Woodward)

* LUCENE-7741: DoubleValuesSource now has an explain() method (Alan Woodward,
Adrien Grand)

* LUCENE-7815: Removed the PostingsHighlighter; you should use the UnifiedHighlighter
instead, which derived from the UH. WholeBreakIterator and
CustomSeparatorBreakIterator were moved to UH's package. (David Smiley)

* LUCENE-7850: Removed support for legacy numerics. (Adrien Grand)

Bug Fixes

* LUCENE-7626: IndexWriter will no longer accept broken token offsets
(Mike McCandless)

* LUCENE-7859: Spatial-extras PackedQuadPrefixTree bug that only revealed itself
with the new pointsOnly optimizations in LUCENE-7845. (David Smiley)

Improvements

* LUCENE-7489: Better storage of sparse doc-values fields with the default
codec. (Adrien Grand)

* LUCENE-7730: More accurate encoding of the length normalization factor
thanks to the removal of index-time boosts. (Adrien Grand)

Optimizations

* LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
@ -78,6 +93,10 @@ Optimizations
values using different numbers of bits per value if this proves to save
storage. (Adrien Grand)

* LUCENE-7845: Enhance spatial-extras RecursivePrefixTreeStrategy queries when the
query is a point (for 2D) or a is a simple date interval (e.g. 1 month). When
the strategy is marked as pointsOnly, the results is a TermQuery. (David Smiley)

Other

* LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize)
@ -89,14 +108,76 @@ Other
* LUCENE-7753: Make fields static when possible.
(Daniel Jelinski via Adrien Grand)

* LUCENE-7540: Upgrade ICU to 59.1 (Mike McCandless, Jim Ferenczi)

* LUCENE-7852: Correct copyright year(s) in lucene/LICENSE.txt file.
(Christine Poerschke, Steve Rowe)

======================= Lucene 6.7.0 =======================

Other

* LUCENE-7800: Remove code that potentially rethrows checked exceptions
from methods that don't declare them ("sneaky throw" hack). (Robert Muir,
Uwe Schindler, Dawid Weiss)

Improvements

* LUCENE-7841: Normalize ґ to г in Ukrainian analyzer. (Andriy Rysin via Dawid Weiss)

======================= Lucene 6.6.0 =======================

New Features

* LUCENE-7811: Add a concurrent SortedSet facets implementation.
(Mike McCandless)

Bug Fixes

* LUCENE-7777: ByteBlockPool.readBytes sometimes throws
ArrayIndexOutOfBoundsException when byte blocks larger than 32 KB
were added (Mike McCandless)

* LUCENE-7797: The static FSDirectory.listAll(Path) method was always
returning an empty array. (Atkins Chang via Mike McCandless)

* LUCENE-7481: Fixed missing rewrite methods for SpanPayloadCheckQuery
and PayloadScoreQuery. (Erik Hatcher)

* LUCENE-7808: Fixed PayloadScoreQuery and SpanPayloadCheckQuery
.equals and .hashCode methods. (Erik Hatcher)

* LUCENE-7798: Add .equals and .hashCode to ToParentBlockJoinSortField
(Mikhail Khludnev)

* LUCENE-7814: DateRangePrefixTree (in spatial-extras) had edge-case bugs for
years >= 292,000,000. (David Smiley)

* LUCENE-5365, LUCENE-7818: Fix incorrect condition in queryparser's
QueryNodeOperation#logicalAnd(). (Olivier Binda, Amrit Sarkar,
AppChecker via Uwe Schindler)

* LUCENE-7821: The classic and flexible query parsers, as well as Solr's
"lucene"/standard query parser, should require " TO " in range queries,
and accept "TO" as endpoints in range queries. (hossman, Steve Rowe)

* LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (eg. new york, new york city).
(Jim Ferenczi)

* LUCENE-7817: Pass cached query to onQueryCache instead of null.
(Christoph Kaser via Adrien Grand)

* LUCENE-7831: CodecUtil should not seek to negative offsets. (Adrien Grand)

* LUCENE-7833: ToParentBlockJoinQuery computed the min score instead of the max
score with ScoreMode.MAX. (Adrien Grand)

* LUCENE-7847: Fixed all-docs-match optimization of range queries on range
fields. (Adrien Grand)

* LUCENE-7810: Fix equals() and hashCode() methods of several join queries.
(Hossman, Adrien Grand, Martijn van Groningen)

Improvements

* LUCENE-7782: OfflineSorter now passes the total number of items it
@ -105,6 +186,16 @@ Improvements
* LUCENE-7785: Move dictionary for Ukrainian analyzer to external dependency.
(Andriy Rysin via Steve Rowe, Dawid Weiss)

* LUCENE-7801: SortedSetDocValuesReaderState now implements
Accountable so you can see how much RAM it's using (Robert Muir,
Mike McCandless)

* LUCENE-7792: OfflineSorter can now run concurrently if you pass it
an optional ExecutorService (Dawid Weiss, Mike McCandless)

* LUCENE-7811: Sorted set facets now use sparse storage when
collecting hits, when appropriate. (Mike McCandless)

Optimizations

* LUCENE-7787: spatial-extras HeatmapFacetCounter will now short-circuit it's
@ -112,6 +203,12 @@ Optimizations

Other

* LUCENE-7796: Make IOUtils.reThrow idiom declare Error return type so
callers may use it in a way that compiler knows subsequent code is
unreachable. reThrow is now deprecated in favor of IOUtils.rethrowAlways
with a slightly different semantics (see javadoc). (Hossman, Robert Muir,
Dawid Weiss)

* LUCENE-7754: Inner classes should be static whenever possible.
(Daniel Jelinski via Adrien Grand)

@ -74,3 +74,9 @@ collecting TopDocs for each group, but instead takes a GroupReducer that will
perform any type of reduction on the top groups collected on a first-pass. To
reproduce the old behaviour of SecondPassGroupingCollector, you should instead
use TopGroupsCollector.

## Removed legacy numerics (LUCENE-7850)

Support for legacy numerics has been removed since legacy numerics had been
deprecated since Lucene 6.0. Points should be used instead, see
org.apache.lucene.index.PointValues for an introduction.
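As a companion to the MIGRATE note above, a minimal sketch of indexing and querying with the points API; the field name "price" and the values are illustrative:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.Query;

public class LegacyNumericsToPoints {
  public static void main(String[] args) {
    // Index an int as a point (replaces the removed legacy numeric fields).
    Document doc = new Document();
    doc.add(new IntPoint("price", 42));      // indexed for exact/range queries, not stored
    doc.add(new StoredField("price", 42));   // store separately if the value must be retrievable

    // Query the point field (replaces legacy numeric range queries).
    Query exact = IntPoint.newExactQuery("price", 42);
    Query range = IntPoint.newRangeQuery("price", 10, 100);
    System.out.println(exact + " / " + range);
  }
}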
@ -1,5 +1,5 @@
Apache Lucene
Copyright 2014 The Apache Software Foundation
Copyright 2001-2017 The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
@ -18,13 +18,13 @@ Some data files (under analysis/icu/src/data) are derived from Unicode data such
as the Unicode Character Database. See http://unicode.org/copyright.html for more
details.

Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/

The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
automatically generated with the moman/finenight FSA library, created by
Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
see http://sites.google.com/site/rrettesite/moman and
see http://sites.google.com/site/rrettesite/moman and
http://bitbucket.org/jpbarrette/moman/overview/

The class org.apache.lucene.util.WeakIdentityMap was derived from
@ -78,7 +78,7 @@ analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.ja
analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java

The Stempel analyzer (stempel) includes BSD-licensed software developed
The Stempel analyzer (stempel) includes BSD-licensed software developed
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
and Edmond Nolan.

@ -90,8 +90,8 @@ See http://project.carrot2.org/license.html.
The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.

WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
See http://unicode.org/copyright.html for more details.

The Morfologik analyzer (morfologik) includes BSD-licensed software
@ -24,6 +24,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;

/**
* Emits the entire input as a single token.
*/
@ -41,16 +43,16 @@ public final class KeywordTokenizer extends Tokenizer {
}

public KeywordTokenizer(int bufferSize) {
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
}
termAtt.resizeBuffer(bufferSize);
}

public KeywordTokenizer(AttributeFactory factory, int bufferSize) {
super(factory);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
}
termAtt.resizeBuffer(bufferSize);
}
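A small usage sketch of the constructor shown above; the buffer size and input are arbitrary. With this change, a bufferSize outside the range 1..MAX_TOKEN_LENGTH_LIMIT fails fast with IllegalArgumentException rather than only rejecting non-positive values:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordTokenizerDemo {
  public static void main(String[] args) throws IOException {
    KeywordTokenizer tokenizer = new KeywordTokenizer(256); // initial term buffer of 256 chars
    tokenizer.setReader(new StringReader("the whole input becomes one token"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // prints the entire input once
    }
    tokenizer.end();
    tokenizer.close();
  }
}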
@ -16,26 +16,39 @@
*/
package org.apache.lucene.analysis.core;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.util.Map;

import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;

/**
* Factory for {@link KeywordTokenizer}.
* <pre class="prettyprint">
* <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.KeywordTokenizerFactory"/>
* <tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="256"/>
* </analyzer>
* </fieldType></pre>
*
* Options:
* <ul>
* <li>maxTokenLen: max token length, should be greater than 0 and less than
* MAX_TOKEN_LENGTH_LIMIT (1024*1024). It is rare to need to change this
* else {@link KeywordTokenizer}::DEFAULT_BUFFER_SIZE</li>
* </ul>
*/
public class KeywordTokenizerFactory extends TokenizerFactory {
private final int maxTokenLen;

/** Creates a new KeywordTokenizerFactory */
public KeywordTokenizerFactory(Map<String,String> args) {
super(args);
maxTokenLen = getInt(args, "maxTokenLen", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -43,6 +56,6 @@ public class KeywordTokenizerFactory extends TokenizerFactory {

@Override
public KeywordTokenizer create(AttributeFactory factory) {
return new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new KeywordTokenizer(factory, maxTokenLen);
}
}
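If the factory is assembled programmatically rather than through a Solr schema, the new option can be passed as a factory parameter; a sketch using CustomAnalyzer, where the filter chain and the value 256 are illustrative:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class KeywordMaxTokenLenDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer(KeywordTokenizerFactory.class, "maxTokenLen", "256")
        .addTokenFilter(LowerCaseFilterFactory.class)
        .build();
    // analyzer.tokenStream(...) would now emit single keyword tokens buffered at 256 chars
    analyzer.close();
  }
}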
@ -50,6 +50,20 @@ public class LetterTokenizer extends CharTokenizer {
super(factory);
}

/**
* Construct a new LetterTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.

*/
public LetterTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}

/** Collects only characters which satisfy
* {@link Character#isLetter(int)}.*/
@Override
@ -17,25 +17,40 @@
|
||||
package org.apache.lucene.analysis.core;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
|
||||
|
||||
/**
|
||||
* Factory for {@link LetterTokenizer}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.LetterTokenizerFactory"/>
|
||||
* <tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="256"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* Options:
|
||||
* <ul>
|
||||
* <li>maxTokenLen: max token length, must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
|
||||
* It is rare to need to change this
|
||||
* else {@link CharTokenizer}::DEFAULT_MAX_TOKEN_LEN</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class LetterTokenizerFactory extends TokenizerFactory {
|
||||
private final int maxTokenLen;
|
||||
|
||||
/** Creates a new LetterTokenizerFactory */
|
||||
public LetterTokenizerFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
|
||||
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
|
||||
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
@ -43,6 +58,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {
|
||||
|
||||
@Override
|
||||
public LetterTokenizer create(AttributeFactory factory) {
|
||||
return new LetterTokenizer(factory);
|
||||
return new LetterTokenizer(factory, maxTokenLen);
|
||||
}
|
||||
}
|
||||
|
@ -50,6 +50,19 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
super(factory);
}

/**
* Construct a new LowerCaseTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}

/** Converts char to lower case
* {@link Character#toLowerCase(int)}.*/
@Override
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
|
||||
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
@ -25,20 +26,36 @@ import org.apache.lucene.util.AttributeFactory;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
|
||||
|
||||
/**
|
||||
* Factory for {@link LowerCaseTokenizer}.
|
||||
* Factory for {@link LowerCaseTokenizer}.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.LowerCaseTokenizerFactory"/>
|
||||
* </analyzer>
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
* <p>
|
||||
* Options:
|
||||
* <ul>
|
||||
* <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
|
||||
* It is rare to need to change this
|
||||
* else {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
|
||||
* </ul>
|
||||
*/
|
||||
public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent {
|
||||
|
||||
/** Creates a new LowerCaseTokenizerFactory */
|
||||
public LowerCaseTokenizerFactory(Map<String,String> args) {
|
||||
private final int maxTokenLen;
|
||||
|
||||
/**
|
||||
* Creates a new LowerCaseTokenizerFactory
|
||||
*/
|
||||
public LowerCaseTokenizerFactory(Map<String, String> args) {
|
||||
super(args);
|
||||
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
|
||||
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
|
||||
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
|
||||
}
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
@ -46,11 +63,13 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
|
||||
|
||||
@Override
|
||||
public LowerCaseTokenizer create(AttributeFactory factory) {
|
||||
return new LowerCaseTokenizer(factory);
|
||||
return new LowerCaseTokenizer(factory, maxTokenLen);
|
||||
}
|
||||
|
||||
@Override
|
||||
public AbstractAnalysisFactory getMultiTermComponent() {
|
||||
return new LowerCaseFilterFactory(new HashMap<>(getOriginalArgs()));
|
||||
Map map = new HashMap<>(getOriginalArgs());
|
||||
map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init
|
||||
return new LowerCaseFilterFactory(map);
|
||||
}
|
||||
}
|
||||
|
@ -58,7 +58,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* <ul>
* <li><code>wordset</code> - This is the default format, which supports one word per
* line (including any intra-word whitespace) and allows whole line comments
* begining with the "#" character. Blank lines are ignored. See
* beginning with the "#" character. Blank lines are ignored. See
* {@link WordlistLoader#getLines WordlistLoader.getLines} for details.
* </li>
* <li><code>snowball</code> - This format allows for multiple words specified on each
@ -47,6 +47,19 @@ public final class UnicodeWhitespaceTokenizer extends CharTokenizer {
public UnicodeWhitespaceTokenizer(AttributeFactory factory) {
super(factory);
}

/**
* Construct a new UnicodeWhitespaceTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public UnicodeWhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}

/** Collects only characters which do not satisfy Unicode's WHITESPACE property. */
@Override
@ -46,6 +46,19 @@ public final class WhitespaceTokenizer extends CharTokenizer {
public WhitespaceTokenizer(AttributeFactory factory) {
super(factory);
}

/**
* Construct a new WhitespaceTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public WhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}

/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(int)}.*/
@ -22,15 +22,18 @@ import java.util.Collection;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;

/**
* Factory for {@link WhitespaceTokenizer}.
* <pre class="prettyprint">
* <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
* <analyzer>
* <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode"/>
* <tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="256"/>
* </analyzer>
* </fieldType></pre>
*
@ -38,6 +41,9 @@ import org.apache.lucene.util.AttributeFactory;
* <ul>
* <li>rule: either "java" for {@link WhitespaceTokenizer}
* or "unicode" for {@link UnicodeWhitespaceTokenizer}</li>
* <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
* It is rare to need to change this
* else {@link CharTokenizer}::DEFAULT_MAX_TOKEN_LEN</li>
* </ul>
*/
public class WhitespaceTokenizerFactory extends TokenizerFactory {
@ -46,13 +52,17 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
private static final Collection<String> RULE_NAMES = Arrays.asList(RULE_JAVA, RULE_UNICODE);

private final String rule;
private final int maxTokenLen;

/** Creates a new WhitespaceTokenizerFactory */
public WhitespaceTokenizerFactory(Map<String,String> args) {
super(args);

rule = get(args, "rule", RULE_NAMES, RULE_JAVA);

maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -62,9 +72,9 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
public Tokenizer create(AttributeFactory factory) {
switch (rule) {
case RULE_JAVA:
return new WhitespaceTokenizer(factory);
return new WhitespaceTokenizer(factory, maxTokenLen);
case RULE_UNICODE:
return new UnicodeWhitespaceTokenizer(factory);
return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
default:
throw new AssertionError();
}
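A sketch of driving the factory above straight from an args map, mirroring the constructor logic in the diff; the parameter values are arbitrary:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;

public class WhitespaceFactoryDemo {
  public static void main(String[] args) throws IOException {
    Map<String, String> params = new HashMap<>();
    params.put("rule", "unicode");     // "java" (default) or "unicode"
    params.put("maxTokenLen", "10");   // rejected unless 0 < maxTokenLen <= MAX_TOKEN_LENGTH_LIMIT (1024*1024)
    // The factory consumes the known keys; anything left over triggers "Unknown parameters: ...".
    WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(params);
    Tokenizer tokenizer = factory.create(); // UnicodeWhitespaceTokenizer capped at 10 chars per token
    tokenizer.setReader(new StringReader("tokens longer than ten characters get split"));
    tokenizer.close();
  }
}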
@ -33,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;

/**
* An abstract base class for simple, character-oriented tokenizers.
* <p>
@ -50,6 +52,7 @@ public abstract class CharTokenizer extends Tokenizer {
* Creates a new {@link CharTokenizer} instance
*/
public CharTokenizer() {
this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
}

/**
@ -60,6 +63,23 @@ public abstract class CharTokenizer extends Tokenizer {
*/
public CharTokenizer(AttributeFactory factory) {
super(factory);
this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
}

/**
* Creates a new {@link CharTokenizer} instance
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public CharTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
this.maxTokenLen = maxTokenLen;
}

/**
@ -193,9 +213,10 @@ public abstract class CharTokenizer extends Tokenizer {
}

private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255;
public static final int DEFAULT_MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;

private final int maxTokenLen;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

@ -256,7 +277,7 @@ public abstract class CharTokenizer extends Tokenizer {
}
end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}
} else if (length > 0) { // at non-Letter w/ chars
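Because the new two-argument constructor is the hook subclasses use, here is a sketch of a custom CharTokenizer built on it; DigitTokenizer is a hypothetical class, not part of this change:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.AttributeFactory;

/** Hypothetical tokenizer: emits runs of ASCII digits, capped at maxTokenLen chars per token. */
public final class DigitTokenizer extends CharTokenizer {
  public DigitTokenizer(AttributeFactory factory, int maxTokenLen) {
    super(factory, maxTokenLen); // throws IllegalArgumentException unless 0 < maxTokenLen <= 1024*1024
  }

  @Override
  protected boolean isTokenChar(int c) {
    return c >= '0' && c <= '9';
  }

  public static void main(String[] args) throws IOException {
    Tokenizer tok = new DigitTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 4);
    tok.setReader(new StringReader("order 1234567 on 2017-04-27"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString()); // "1234", "567", "2017", "04", "27" with maxTokenLen=4
    }
    tok.end();
    tok.close();
  }
}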
@ -24,15 +24,15 @@ import org.apache.lucene.util.SparseFixedBitSet;

/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was created using ICU4J v56.1.0.0
* The data was created using ICU4J v59.1.0.0
* <p>
* Unicode version: 8.0.0.0
* Unicode version: 9.0.0.0
*/
public final class UnicodeProps {
private UnicodeProps() {}

/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "8.0.0.0";
public static final String UNICODE_VERSION = "9.0.0.0";

/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(
@ -31,6 +31,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package org.tartarus.snowball;

import java.lang.reflect.UndeclaredThrowableException;

import org.apache.lucene.util.ArrayUtil;

/**
@ -313,8 +315,10 @@ public abstract class SnowballProgram {
boolean res = false;
try {
res = (boolean) w.method.invokeExact(this);
} catch (Error | RuntimeException e) {
throw e;
} catch (Throwable e) {
rethrow(e);
throw new UndeclaredThrowableException(e);
}
cursor = c + w.s_size;
if (res) return w.result;
@ -376,8 +380,10 @@ public abstract class SnowballProgram {
boolean res = false;
try {
res = (boolean) w.method.invokeExact(this);
} catch (Error | RuntimeException e) {
throw e;
} catch (Throwable e) {
rethrow(e);
throw new UndeclaredThrowableException(e);
}
cursor = c - w.s_size;
if (res) return w.result;
@ -485,15 +491,5 @@ extern void debug(struct SN_env * z, int number, int line_count)
printf("'\n");
}
*/

// Hack to rethrow unknown Exceptions from {@link MethodHandle#invoke}:
private static void rethrow(Throwable t) {
SnowballProgram.<Error>rethrow0(t);
}

@SuppressWarnings("unchecked")
private static <T extends Throwable> void rethrow0(Throwable t) throws T {
throw (T) t;
}
};
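For readers unfamiliar with the idiom that LUCENE-7800 removes, a self-contained sketch of what the "sneaky throw" hack does, next to the kind of wrapper the patch switches to; the class and method names are illustrative:

import java.lang.reflect.UndeclaredThrowableException;

public class SneakyThrowDemo {
  // The removed idiom: the unchecked cast plus generic type inference lets a checked
  // Throwable propagate from a method that declares no checked exceptions.
  @SuppressWarnings("unchecked")
  static <T extends Throwable> void sneakyThrow(Throwable t) throws T {
    throw (T) t; // T is inferred as RuntimeException at the call site
  }

  // The style of replacement used in the diff above: let Error and RuntimeException
  // propagate unchanged and wrap anything else in UndeclaredThrowableException.
  static RuntimeException wrapUnknown(Throwable t) {
    if (t instanceof Error) {
      throw (Error) t;
    }
    if (t instanceof RuntimeException) {
      return (RuntimeException) t;
    }
    return new UndeclaredThrowableException(t);
  }

  public static void main(String[] args) {
    try {
      sneakyThrow(new Exception("checked, but nobody had to declare it"));
    } catch (Exception e) {
      System.out.println("caught: " + e.getMessage());
    }
  }
}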
@ -53,7 +53,7 @@
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate begining or ending
to zero. The '.' character is reserved to indicate beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>

@ -54,7 +54,7 @@
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate begining or ending
to zero. The '.' character is reserved to indicate beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.core;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.util.AttributeFactory;
|
||||
|
||||
public class TestKeywordTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testSimple() throws IOException {
|
||||
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
|
||||
KeywordTokenizer tokenizer = new KeywordTokenizer();
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"Tokenizer \ud801\udc1ctest"});
|
||||
}
|
||||
|
||||
public void testFactory() {
|
||||
Map<String, String> args = new HashMap<>();
|
||||
KeywordTokenizerFactory factory = new KeywordTokenizerFactory(args);
|
||||
AttributeFactory attributeFactory = newAttributeFactory();
|
||||
Tokenizer tokenizer = factory.create(attributeFactory);
|
||||
assertEquals(KeywordTokenizer.class, tokenizer.getClass());
|
||||
}
|
||||
|
||||
private Map<String, String> makeArgs(String... args) {
|
||||
Map<String, String> ret = new HashMap<>();
|
||||
for (int idx = 0; idx < args.length; idx += 2) {
|
||||
ret.put(args[idx], args[idx + 1]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public void testParamsFactory() throws IOException {
|
||||
// negative maxTokenLen
|
||||
IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
|
||||
new KeywordTokenizerFactory(makeArgs("maxTokenLen", "-1")));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
|
||||
|
||||
// zero maxTokenLen
|
||||
iae = expectThrows(IllegalArgumentException.class, () ->
|
||||
new KeywordTokenizerFactory(makeArgs("maxTokenLen", "0")));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
|
||||
|
||||
// Added random param, should throw illegal error
|
||||
iae = expectThrows(IllegalArgumentException.class, () ->
|
||||
new KeywordTokenizerFactory(makeArgs("maxTokenLen", "255", "randomParam", "rValue")));
|
||||
assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
|
||||
|
||||
// tokeniser will never split, no matter what is passed,
|
||||
// but the buffer will not be more than length of the token
|
||||
|
||||
KeywordTokenizerFactory factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "5"));
|
||||
AttributeFactory attributeFactory = newAttributeFactory();
|
||||
Tokenizer tokenizer = factory.create(attributeFactory);
|
||||
StringReader reader = new StringReader("Tokenizertest");
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"Tokenizertest"});
|
||||
|
||||
// tokeniser will never split, no matter what is passed,
|
||||
// but the buffer will not be more than length of the token
|
||||
factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "2"));
|
||||
attributeFactory = newAttributeFactory();
|
||||
tokenizer = factory.create(attributeFactory);
|
||||
reader = new StringReader("Tokenizer\u00A0test");
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"Tokenizer\u00A0test"});
|
||||
}
|
||||
}
|
@ -54,4 +54,55 @@ public class TestUnicodeWhitespaceTokenizer extends BaseTokenStreamTestCase {
|
||||
assertEquals(UnicodeWhitespaceTokenizer.class, tokenizer.getClass());
|
||||
}
|
||||
|
||||
private Map<String, String> makeArgs(String... args) {
|
||||
Map<String, String> ret = new HashMap<>();
|
||||
for (int idx = 0; idx < args.length; idx += 2) {
|
||||
ret.put(args[idx], args[idx + 1]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public void testParamsFactory() throws IOException {
|
||||
|
||||
|
||||
// negative maxTokenLen
|
||||
IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
|
||||
new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "-1")));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
|
||||
|
||||
// zero maxTokenLen
|
||||
iae = expectThrows(IllegalArgumentException.class, () ->
|
||||
new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "0")));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
|
||||
|
||||
// Added random param, should throw illegal error
|
||||
iae = expectThrows(IllegalArgumentException.class, () ->
|
||||
new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "255", "randomParam", "rValue")));
|
||||
assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
|
||||
|
||||
// tokeniser will split at 5, Token | izer, no matter what happens
|
||||
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "5"));
|
||||
AttributeFactory attributeFactory = newAttributeFactory();
|
||||
Tokenizer tokenizer = factory.create(attributeFactory);
|
||||
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"Token", "izer", "\ud801\udc1ctes", "t"});
|
||||
|
||||
// tokeniser will split at 2, To | ke | ni | ze | r, no matter what happens
|
||||
factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "2"));
|
||||
attributeFactory = newAttributeFactory();
|
||||
tokenizer = factory.create(attributeFactory);
|
||||
reader = new StringReader("Tokenizer\u00A0test");
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"To", "ke", "ni", "ze", "r", "te", "st"});
|
||||
|
||||
// tokeniser will split at 10, no matter what happens,
|
||||
// but tokens' length are less than that
|
||||
factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "10"));
|
||||
attributeFactory = newAttributeFactory();
|
||||
tokenizer = factory.create(attributeFactory);
|
||||
reader = new StringReader("Tokenizer\u00A0test");
|
||||
tokenizer.setReader(reader);
|
||||
assertTokenStreamContents(tokenizer, new String[]{"Tokenizer", "test"});
|
||||
}
|
||||
}
|
||||
|
@ -25,8 +25,10 @@ import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
@ -89,6 +91,99 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
|
||||
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
|
||||
}
|
||||
|
||||
/*
|
||||
* tests the max word length passed as parameter - tokenizer will split at the passed position char no matter what happens
|
||||
*/
|
||||
public void testCustomMaxTokenLength() throws IOException {
|
||||
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
builder.append("A");
|
||||
}
|
||||
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
|
||||
// Tricky, passing two copies of the string to the reader....
|
||||
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT),
|
||||
builder.toString().toLowerCase(Locale.ROOT) });
|
||||
|
||||
Exception e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new LowerCaseTokenizer(newAttributeFactory(), -1));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
|
||||
|
||||
tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
|
||||
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString(), builder.toString()});
|
||||
|
||||
|
||||
// Let's test that we can get a token longer than 255 through.
|
||||
builder.setLength(0);
|
||||
for (int i = 0; i < 500; i++) {
|
||||
builder.append("Z");
|
||||
}
|
||||
tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
|
||||
tokenizer.setReader(new StringReader(builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
|
||||
|
||||
|
||||
// Just to be sure what is happening here, token lengths of zero make no sense,
|
||||
// Let's try the edge cases, token > I/O buffer (4096)
|
||||
builder.setLength(0);
|
||||
for (int i = 0; i < 600; i++) {
|
||||
builder.append("aUrOkIjq"); // 600 * 8 = 4800 chars.
|
||||
}
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new LowerCaseTokenizer(newAttributeFactory(), 0));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
|
||||
|
||||
tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
|
||||
tokenizer.setReader(new StringReader(builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)});
|
||||
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new KeywordTokenizer(newAttributeFactory(), 0));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new KeywordTokenizer(newAttributeFactory(), 10_000_000));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
|
||||
|
||||
|
||||
tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
|
||||
tokenizer.setReader(new StringReader(builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new LetterTokenizer(newAttributeFactory(), 0));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new LetterTokenizer(newAttributeFactory(), 2_000_000));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
|
||||
|
||||
tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
|
||||
tokenizer.setReader(new StringReader(builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new WhitespaceTokenizer(newAttributeFactory(), 0));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
|
||||
|
||||
e = expectThrows(IllegalArgumentException.class, () ->
|
||||
new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
|
||||
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
|
||||
|
||||
tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
|
||||
tokenizer.setReader(new StringReader(builder.toString()));
|
||||
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* tests the max word length of 255 with a surrogate pair at position 255
|
||||
|
@ -168,11 +168,14 @@ FFE3>
|
||||
1134D>
|
||||
11366..1136C>
|
||||
11370..11374>
|
||||
11442>
|
||||
11446>
|
||||
114C2..114C3>
|
||||
115BF..115C0>
|
||||
1163F>
|
||||
116B6..116B7>
|
||||
1172B>
|
||||
11C3F>
|
||||
16AF0..16AF4>
|
||||
16F8F..16F9F>
|
||||
1D167..1D169>
|
||||
@ -181,6 +184,8 @@ FFE3>
|
||||
1D185..1D18B>
|
||||
1D1AA..1D1AD>
|
||||
1E8D0..1E8D6>
|
||||
1E944..1E946>
|
||||
1E948..1E94A>
|
||||
|
||||
# Latin script "composed" that do not further decompose, so decompose here
|
||||
# These are from AsciiFoldingFilter
|
||||
|
@ -510,6 +510,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||
112F7>0037 # KHUDAWADI DIGIT SEVEN
|
||||
112F8>0038 # KHUDAWADI DIGIT EIGHT
|
||||
112F9>0039 # KHUDAWADI DIGIT NINE
|
||||
11450>0030 # NEWA DIGIT ZERO
|
||||
11451>0031 # NEWA DIGIT ONE
|
||||
11452>0032 # NEWA DIGIT TWO
|
||||
11453>0033 # NEWA DIGIT THREE
|
||||
11454>0034 # NEWA DIGIT FOUR
|
||||
11455>0035 # NEWA DIGIT FIVE
|
||||
11456>0036 # NEWA DIGIT SIX
|
||||
11457>0037 # NEWA DIGIT SEVEN
|
||||
11458>0038 # NEWA DIGIT EIGHT
|
||||
11459>0039 # NEWA DIGIT NINE
|
||||
114D0>0030 # TIRHUTA DIGIT ZERO
|
||||
114D1>0031 # TIRHUTA DIGIT ONE
|
||||
114D2>0032 # TIRHUTA DIGIT TWO
|
||||
@ -560,6 +570,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||
118E7>0037 # WARANG CITI DIGIT SEVEN
|
||||
118E8>0038 # WARANG CITI DIGIT EIGHT
|
||||
118E9>0039 # WARANG CITI DIGIT NINE
|
||||
11C50>0030 # BHAIKSUKI DIGIT ZERO
|
||||
11C51>0031 # BHAIKSUKI DIGIT ONE
|
||||
11C52>0032 # BHAIKSUKI DIGIT TWO
|
||||
11C53>0033 # BHAIKSUKI DIGIT THREE
|
||||
11C54>0034 # BHAIKSUKI DIGIT FOUR
|
||||
11C55>0035 # BHAIKSUKI DIGIT FIVE
|
||||
11C56>0036 # BHAIKSUKI DIGIT SIX
|
||||
11C57>0037 # BHAIKSUKI DIGIT SEVEN
|
||||
11C58>0038 # BHAIKSUKI DIGIT EIGHT
|
||||
11C59>0039 # BHAIKSUKI DIGIT NINE
|
||||
16A60>0030 # MRO DIGIT ZERO
|
||||
16A61>0031 # MRO DIGIT ONE
|
||||
16A62>0032 # MRO DIGIT TWO
|
||||
@ -580,4 +600,14 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
|
||||
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
|
||||
16B59>0039 # PAHAWH HMONG DIGIT NINE
|
||||
1E950>0030 # ADLAM DIGIT ZERO
|
||||
1E951>0031 # ADLAM DIGIT ONE
|
||||
1E952>0032 # ADLAM DIGIT TWO
|
||||
1E953>0033 # ADLAM DIGIT THREE
|
||||
1E954>0034 # ADLAM DIGIT FOUR
|
||||
1E955>0035 # ADLAM DIGIT FIVE
|
||||
1E956>0036 # ADLAM DIGIT SIX
|
||||
1E957>0037 # ADLAM DIGIT SEVEN
|
||||
1E958>0038 # ADLAM DIGIT EIGHT
|
||||
1E959>0039 # ADLAM DIGIT NINE
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright (C) 1999-2014, International Business Machines
|
||||
# Copyright (C) 1999-2016, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfc.txt
|
||||
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Complete data for Unicode NFC normalization.
|
||||
|
||||
* Unicode 7.0.0
|
||||
* Unicode 9.0.0
|
||||
|
||||
# Canonical_Combining_Class (ccc) values
|
||||
0300..0314:230
|
||||
@ -129,6 +129,8 @@
|
||||
0825..0827:230
|
||||
0829..082D:230
|
||||
0859..085B:220
|
||||
08D4..08E1:230
|
||||
08E3:220
|
||||
08E4..08E5:230
|
||||
08E6:220
|
||||
08E7..08E8:230
|
||||
@ -232,6 +234,7 @@
|
||||
1DCF:220
|
||||
1DD0:202
|
||||
1DD1..1DF5:230
|
||||
1DFB:230
|
||||
1DFC:233
|
||||
1DFD:220
|
||||
1DFE:230
|
||||
@ -260,7 +263,7 @@
|
||||
3099..309A:8
|
||||
A66F:230
|
||||
A674..A67D:230
|
||||
A69F:230
|
||||
A69E..A69F:230
|
||||
A6F0..A6F1:230
|
||||
A806:9
|
||||
A8C4:9
|
||||
@ -280,6 +283,7 @@ ABED:9
|
||||
FB1E:26
|
||||
FE20..FE26:230
|
||||
FE27..FE2D:220
|
||||
FE2E..FE2F:230
|
||||
101FD:220
|
||||
102E0:220
|
||||
10376..1037A:230
|
||||
@ -299,6 +303,7 @@ FE27..FE2D:220
|
||||
11133..11134:9
|
||||
11173:7
|
||||
111C0:9
|
||||
111CA:7
|
||||
11235:9
|
||||
11236:7
|
||||
112E9:7
|
||||
@ -307,6 +312,8 @@ FE27..FE2D:220
|
||||
1134D:9
|
||||
11366..1136C:230
|
||||
11370..11374:230
|
||||
11442:9
|
||||
11446:7
|
||||
114C2:9
|
||||
114C3:7
|
||||
115BF:9
|
||||
@ -314,6 +321,8 @@ FE27..FE2D:220
|
||||
1163F:9
|
||||
116B6:9
|
||||
116B7:7
|
||||
1172B:9
|
||||
11C3F:9
|
||||
16AF0..16AF4:1
|
||||
16B30..16B36:230
|
||||
1BC9E:1
|
||||
@ -326,7 +335,14 @@ FE27..FE2D:220
|
||||
1D18A..1D18B:220
|
||||
1D1AA..1D1AD:230
|
||||
1D242..1D244:230
|
||||
1E000..1E006:230
|
||||
1E008..1E018:230
|
||||
1E01B..1E021:230
|
||||
1E023..1E024:230
|
||||
1E026..1E02A:230
|
||||
1E8D0..1E8D6:220
|
||||
1E944..1E949:230
|
||||
1E94A:7
|
||||
|
||||
# Canonical decomposition mappings
|
||||
00C0>0041 0300 # one-way: diacritic 0300
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright (C) 1999-2014, International Business Machines
|
||||
# Copyright (C) 1999-2016, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfkc.txt
|
||||
@ -11,7 +11,7 @@
|
||||
# to NFKC one-way mappings.
|
||||
# Use this file as the second gennorm2 input file after nfc.txt.
|
||||
|
||||
* Unicode 7.0.0
|
||||
* Unicode 9.0.0
|
||||
|
||||
00A0>0020
|
||||
00A8>0020 0308
|
||||
@ -3675,6 +3675,7 @@ FFEE>25CB
|
||||
1F238>7533
|
||||
1F239>5272
|
||||
1F23A>55B6
|
||||
1F23B>914D
|
||||
1F240>3014 672C 3015
|
||||
1F241>3014 4E09 3015
|
||||
1F242>3014 4E8C 3015
|
||||
|
@ -1,5 +1,5 @@
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2014 Unicode, Inc.
|
||||
# Copyright (c) 1991-2016 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
@ -12,7 +12,7 @@
|
||||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
* Unicode 7.0.0
|
||||
* Unicode 9.0.0
|
||||
|
||||
0041>0061
|
||||
0042>0062
|
||||
@ -632,8 +632,22 @@
|
||||
10CD>2D2D
|
||||
10FC>10DC
|
||||
115F..1160>
|
||||
13F8>13F0
|
||||
13F9>13F1
|
||||
13FA>13F2
|
||||
13FB>13F3
|
||||
13FC>13F4
|
||||
13FD>13F5
|
||||
17B4..17B5>
|
||||
180B..180E>
|
||||
1C80>0432
|
||||
1C81>0434
|
||||
1C82>043E
|
||||
1C83>0441
|
||||
1C84..1C85>0442
|
||||
1C86>044A
|
||||
1C87>0463
|
||||
1C88>A64B
|
||||
1D2C>0061
|
||||
1D2D>00E6
|
||||
1D2E>0062
|
||||
@ -2382,14 +2396,99 @@ A7AA>0266
|
||||
A7AB>025C
|
||||
A7AC>0261
|
||||
A7AD>026C
|
||||
A7AE>026A
|
||||
A7B0>029E
|
||||
A7B1>0287
|
||||
A7B2>029D
|
||||
A7B3>AB53
|
||||
A7B4>A7B5
|
||||
A7B6>A7B7
|
||||
A7F8>0127
|
||||
A7F9>0153
|
||||
AB5C>A727
|
||||
AB5D>AB37
|
||||
AB5E>026B
|
||||
AB5F>AB52
|
||||
AB70>13A0
|
||||
AB71>13A1
|
||||
AB72>13A2
|
||||
AB73>13A3
|
||||
AB74>13A4
|
||||
AB75>13A5
|
||||
AB76>13A6
|
||||
AB77>13A7
|
||||
AB78>13A8
|
||||
AB79>13A9
|
||||
AB7A>13AA
|
||||
AB7B>13AB
|
||||
AB7C>13AC
|
||||
AB7D>13AD
|
||||
AB7E>13AE
|
||||
AB7F>13AF
|
||||
AB80>13B0
|
||||
AB81>13B1
|
||||
AB82>13B2
|
||||
AB83>13B3
|
||||
AB84>13B4
|
||||
AB85>13B5
|
||||
AB86>13B6
|
||||
AB87>13B7
|
||||
AB88>13B8
|
||||
AB89>13B9
|
||||
AB8A>13BA
|
||||
AB8B>13BB
|
||||
AB8C>13BC
|
||||
AB8D>13BD
|
||||
AB8E>13BE
|
||||
AB8F>13BF
|
||||
AB90>13C0
|
||||
AB91>13C1
|
||||
AB92>13C2
|
||||
AB93>13C3
|
||||
AB94>13C4
|
||||
AB95>13C5
|
||||
AB96>13C6
|
||||
AB97>13C7
|
||||
AB98>13C8
|
||||
AB99>13C9
|
||||
AB9A>13CA
|
||||
AB9B>13CB
|
||||
AB9C>13CC
|
||||
AB9D>13CD
|
||||
AB9E>13CE
|
||||
AB9F>13CF
|
||||
ABA0>13D0
|
||||
ABA1>13D1
|
||||
ABA2>13D2
|
||||
ABA3>13D3
|
||||
ABA4>13D4
|
||||
ABA5>13D5
|
||||
ABA6>13D6
|
||||
ABA7>13D7
|
||||
ABA8>13D8
|
||||
ABA9>13D9
|
||||
ABAA>13DA
|
||||
ABAB>13DB
|
||||
ABAC>13DC
|
||||
ABAD>13DD
|
||||
ABAE>13DE
|
||||
ABAF>13DF
|
||||
ABB0>13E0
|
||||
ABB1>13E1
|
||||
ABB2>13E2
|
||||
ABB3>13E3
|
||||
ABB4>13E4
|
||||
ABB5>13E5
|
||||
ABB6>13E6
|
||||
ABB7>13E7
|
||||
ABB8>13E8
|
||||
ABB9>13E9
|
||||
ABBA>13EA
|
||||
ABBB>13EB
|
||||
ABBC>13EC
|
||||
ABBD>13ED
|
||||
ABBE>13EE
|
||||
ABBF>13EF
|
||||
F900>8C48
|
||||
F901>66F4
|
||||
F902>8ECA
|
||||
@ -3766,6 +3865,93 @@ FFF0..FFF8>
|
||||
10425>1044D
|
||||
10426>1044E
|
||||
10427>1044F
|
||||
104B0>104D8
|
||||
104B1>104D9
|
||||
104B2>104DA
|
||||
104B3>104DB
|
||||
104B4>104DC
|
||||
104B5>104DD
|
||||
104B6>104DE
|
||||
104B7>104DF
|
||||
104B8>104E0
|
||||
104B9>104E1
|
||||
104BA>104E2
|
||||
104BB>104E3
|
||||
104BC>104E4
|
||||
104BD>104E5
|
||||
104BE>104E6
|
||||
104BF>104E7
|
||||
104C0>104E8
|
||||
104C1>104E9
|
||||
104C2>104EA
|
||||
104C3>104EB
|
||||
104C4>104EC
|
||||
104C5>104ED
|
||||
104C6>104EE
|
||||
104C7>104EF
|
||||
104C8>104F0
|
||||
104C9>104F1
|
||||
104CA>104F2
|
||||
104CB>104F3
|
||||
104CC>104F4
|
||||
104CD>104F5
|
||||
104CE>104F6
|
||||
104CF>104F7
|
||||
104D0>104F8
|
||||
104D1>104F9
|
||||
104D2>104FA
|
||||
104D3>104FB
|
||||
10C80>10CC0
|
||||
10C81>10CC1
|
||||
10C82>10CC2
|
||||
10C83>10CC3
|
||||
10C84>10CC4
|
||||
10C85>10CC5
|
||||
10C86>10CC6
|
||||
10C87>10CC7
|
||||
10C88>10CC8
|
||||
10C89>10CC9
|
||||
10C8A>10CCA
|
||||
10C8B>10CCB
|
||||
10C8C>10CCC
|
||||
10C8D>10CCD
|
||||
10C8E>10CCE
|
||||
10C8F>10CCF
|
||||
10C90>10CD0
|
||||
10C91>10CD1
|
||||
10C92>10CD2
|
||||
10C93>10CD3
|
||||
10C94>10CD4
|
||||
10C95>10CD5
|
||||
10C96>10CD6
|
||||
10C97>10CD7
|
||||
10C98>10CD8
|
||||
10C99>10CD9
|
||||
10C9A>10CDA
|
||||
10C9B>10CDB
|
||||
10C9C>10CDC
|
||||
10C9D>10CDD
|
||||
10C9E>10CDE
|
||||
10C9F>10CDF
|
||||
10CA0>10CE0
|
||||
10CA1>10CE1
|
||||
10CA2>10CE2
|
||||
10CA3>10CE3
|
||||
10CA4>10CE4
|
||||
10CA5>10CE5
|
||||
10CA6>10CE6
|
||||
10CA7>10CE7
|
||||
10CA8>10CE8
|
||||
10CA9>10CE9
|
||||
10CAA>10CEA
|
||||
10CAB>10CEB
|
||||
10CAC>10CEC
|
||||
10CAD>10CED
|
||||
10CAE>10CEE
|
||||
10CAF>10CEF
|
||||
10CB0>10CF0
|
||||
10CB1>10CF1
|
||||
10CB2>10CF2
|
||||
118A0>118C0
|
||||
118A1>118C1
|
||||
118A2>118C2
|
||||
@ -4803,6 +4989,40 @@ FFF0..FFF8>
|
||||
1D7FD>0037
|
||||
1D7FE>0038
|
||||
1D7FF>0039
|
||||
1E900>1E922
|
||||
1E901>1E923
|
||||
1E902>1E924
|
||||
1E903>1E925
|
||||
1E904>1E926
|
||||
1E905>1E927
|
||||
1E906>1E928
|
||||
1E907>1E929
|
||||
1E908>1E92A
|
||||
1E909>1E92B
|
||||
1E90A>1E92C
|
||||
1E90B>1E92D
|
||||
1E90C>1E92E
|
||||
1E90D>1E92F
|
||||
1E90E>1E930
|
||||
1E90F>1E931
|
||||
1E910>1E932
|
||||
1E911>1E933
|
||||
1E912>1E934
|
||||
1E913>1E935
|
||||
1E914>1E936
|
||||
1E915>1E937
|
||||
1E916>1E938
|
||||
1E917>1E939
|
||||
1E918>1E93A
|
||||
1E919>1E93B
|
||||
1E91A>1E93C
|
||||
1E91B>1E93D
|
||||
1E91C>1E93E
|
||||
1E91D>1E93F
|
||||
1E91E>1E940
|
||||
1E91F>1E941
|
||||
1E920>1E942
|
||||
1E921>1E943
|
||||
1EE00>0627
|
||||
1EE01>0628
|
||||
1EE02>062C
|
||||
@ -5067,6 +5287,7 @@ FFF0..FFF8>
|
||||
1F238>7533
|
||||
1F239>5272
|
||||
1F23A>55B6
|
||||
1F23B>914D
|
||||
1F240>3014 672C 3015
|
||||
1F241>3014 4E09 3015
|
||||
1F242>3014 4E8C 3015
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -53,7 +53,14 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
        new String[] { "我", "购买", "了", "道具", "和", "服装" }
    );
  }

  public void testTraditionalChinese() throws Exception {
    assertAnalyzesTo(a, "我購買了道具和服裝。",
        new String[] { "我", "購買", "了", "道具", "和", "服裝"});
    assertAnalyzesTo(a, "定義切分字串的基本單位是訂定分詞標準的首要工作", // From http://godel.iis.sinica.edu.tw/CKIP/paper/wordsegment_standard.pdf
        new String[] { "定義", "切", "分", "字串", "的", "基本", "單位", "是", "訂定", "分詞", "標準", "的", "首要", "工作" });
  }

  public void testChineseNumerics() throws Exception {
    assertAnalyzesTo(a, "9483", new String[] { "9483" });
    assertAnalyzesTo(a, "院內分機9483。",
@ -63,7 +63,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
  private static final String ICU_SVN_TAG_URL
      = "http://source.icu-project.org/repos/icu/icu/tags";
  private static final String ICU_RELEASE_TAG = "release-54-1";
  private static final String ICU_RELEASE_TAG = "release-58-1";
  private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
  private static final String NFC_TXT = "nfc.txt";
  private static final String NFKC_TXT = "nfkc.txt";
@ -116,6 +116,8 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
    // ignored characters
    builder.add("\u0301", "");
    builder.add("\u00AD", "");
    builder.add("ґ", "г");
    builder.add("Ґ", "Г");

    NormalizeCharMap normMap = builder.build();
    reader = new MappingCharFilter(normMap, reader);
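(Editor's note: the lines added above act at the char-filter level, before tokenization. Below is a minimal, hypothetical sketch of the same NormalizeCharMap applied to a plain Reader; the class name and the input string are illustrative and not part of this patch.)

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class UkrainianCharMapSketch {
  public static void main(String[] args) throws IOException {
    // Same mappings as the analyzer above: drop the combining acute accent and the
    // soft hyphen, and fold ґ/Ґ to г/Г before the tokenizer ever sees the text.
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("\u0301", "");
    builder.add("\u00AD", "");
    builder.add("ґ", "г");
    builder.add("Ґ", "Г");
    Reader filtered = new MappingCharFilter(builder.build(), new StringReader("Ґюмрі"));
    char[] buf = new char[16];
    int len = filtered.read(buf);      // short input, read in one call for this sketch
    System.out.println(new String(buf, 0, len)); // prints "Гюмрі"
  }
}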
@ -52,10 +52,17 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
  public void testCapsTokenStream() throws Exception {
    Analyzer a = new UkrainianMorfologikAnalyzer();
    assertAnalyzesTo(a, "Цих Чайковського і Ґете.",
        new String[] { "Чайковське", "Чайковський", "Ґете" });
        new String[] { "Чайковське", "Чайковський", "Гете" });
    a.close();
  }

  public void testCharNormalization() throws Exception {
    Analyzer a = new UkrainianMorfologikAnalyzer();
    assertAnalyzesTo(a, "Ґюмрі та Гюмрі.",
        new String[] { "Гюмрі", "Гюмрі" });
    a.close();
  }

  public void testSampleSentence() throws Exception {
    Analyzer a = new UkrainianMorfologikAnalyzer();
    assertAnalyzesTo(a, "Це — проект генерування словника з тегами частин мови для української мови.",
@ -60,10 +60,6 @@ import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
import org.apache.lucene.document.StringField;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.legacy.LegacyIntField;
|
||||
import org.apache.lucene.legacy.LegacyLongField;
|
||||
import org.apache.lucene.legacy.LegacyNumericRangeQuery;
|
||||
import org.apache.lucene.legacy.LegacyNumericUtils;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
@ -299,7 +295,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
      "6.4.2-cfs",
      "6.4.2-nocfs",
      "6.5.0-cfs",
      "6.5.0-nocfs"
      "6.5.0-nocfs",
      "6.5.1-cfs",
      "6.5.1-nocfs"
  };

  final String[] unsupportedNames = {
@ -1112,9 +1110,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||
doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
|
||||
doc.add(new Field("content2", "here is more content with aaa aaa aaa", customType2));
|
||||
doc.add(new Field("fie\u2C77ld", "field with non-ascii name", customType2));
|
||||
// add numeric fields, to test if flex preserves encoding
|
||||
doc.add(new LegacyIntField("trieInt", id, Field.Store.NO));
|
||||
doc.add(new LegacyLongField("trieLong", (long) id, Field.Store.NO));
|
||||
|
||||
// add docvalues fields
|
||||
doc.add(new NumericDocValuesField("dvByte", (byte) id));
|
||||
@ -1292,51 +1287,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
|
||||
}
|
||||
}
|
||||
|
||||
public void testNumericFields() throws Exception {
|
||||
for (String name : oldNames) {
|
||||
|
||||
Directory dir = oldIndexDirs.get(name);
|
||||
IndexReader reader = DirectoryReader.open(dir);
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
|
||||
for (int id=10; id<15; id++) {
|
||||
ScoreDoc[] hits = searcher.search(LegacyNumericRangeQuery.newIntRange("trieInt", LegacyNumericUtils.PRECISION_STEP_DEFAULT_32, Integer.valueOf(id), Integer.valueOf(id), true, true), 100).scoreDocs;
|
||||
assertEquals("wrong number of hits", 1, hits.length);
|
||||
Document d = searcher.doc(hits[0].doc);
|
||||
assertEquals(String.valueOf(id), d.get("id"));
|
||||
|
||||
hits = searcher.search(LegacyNumericRangeQuery.newLongRange("trieLong", LegacyNumericUtils.PRECISION_STEP_DEFAULT, Long.valueOf(id), Long.valueOf(id), true, true), 100).scoreDocs;
|
||||
assertEquals("wrong number of hits", 1, hits.length);
|
||||
d = searcher.doc(hits[0].doc);
|
||||
assertEquals(String.valueOf(id), d.get("id"));
|
||||
}
|
||||
|
||||
// check that also lower-precision fields are ok
|
||||
ScoreDoc[] hits = searcher.search(LegacyNumericRangeQuery.newIntRange("trieInt", LegacyNumericUtils.PRECISION_STEP_DEFAULT_32, Integer.MIN_VALUE, Integer.MAX_VALUE, false, false), 100).scoreDocs;
|
||||
assertEquals("wrong number of hits", 34, hits.length);
|
||||
|
||||
hits = searcher.search(LegacyNumericRangeQuery.newLongRange("trieLong", LegacyNumericUtils.PRECISION_STEP_DEFAULT, Long.MIN_VALUE, Long.MAX_VALUE, false, false), 100).scoreDocs;
|
||||
assertEquals("wrong number of hits", 34, hits.length);
|
||||
|
||||
// check decoding of terms
|
||||
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "trieInt");
|
||||
TermsEnum termsEnum = LegacyNumericUtils.filterPrefixCodedInts(terms.iterator());
|
||||
while (termsEnum.next() != null) {
|
||||
int val = LegacyNumericUtils.prefixCodedToInt(termsEnum.term());
|
||||
assertTrue("value in id bounds", val >= 0 && val < 35);
|
||||
}
|
||||
|
||||
terms = MultiFields.getTerms(searcher.getIndexReader(), "trieLong");
|
||||
termsEnum = LegacyNumericUtils.filterPrefixCodedLongs(terms.iterator());
|
||||
while (termsEnum.next() != null) {
|
||||
long val = LegacyNumericUtils.prefixCodedToLong(termsEnum.term());
|
||||
assertTrue("value in id bounds", val >= 0L && val < 35L);
|
||||
}
|
||||
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
private int checkAllSegmentsUpgraded(Directory dir, int indexCreatedVersion) throws IOException {
|
||||
final SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
|
||||
if (VERBOSE) {
|
||||
|
Binary file not shown.
Binary file not shown.
@ -38,7 +38,7 @@ file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1

highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
highlighter=HlImpl:NONE:SH_A:UH_A:UH_P:UH_PV

{ "Populate"
  CreateIndex
@ -60,6 +60,6 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
  CloseReader

  NewRound
} : 6
} : 5

RepSumByPrefRound HL
@ -42,7 +42,6 @@ import org.apache.lucene.search.highlight.Highlighter;
|
||||
import org.apache.lucene.search.highlight.QueryScorer;
|
||||
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
|
||||
import org.apache.lucene.search.highlight.TokenSources;
|
||||
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
|
||||
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
|
||||
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
|
||||
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
|
||||
@ -133,8 +132,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
|
||||
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
|
||||
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
|
||||
|
||||
case "PH_P": hlImpl = new PostingsHLImpl(); break;
|
||||
|
||||
default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
|
||||
}
|
||||
}
|
||||
@ -224,33 +221,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
|
||||
return clone;
|
||||
}
|
||||
|
||||
private class PostingsHLImpl implements HLImpl {
|
||||
PostingsHighlighter highlighter;
|
||||
String[] fields = hlFields.toArray(new String[hlFields.size()]);
|
||||
int[] maxPassages;
|
||||
PostingsHLImpl() {
|
||||
highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
|
||||
@Override
|
||||
protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
|
||||
return analyzer;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected BreakIterator getBreakIterator(String field) {
|
||||
return BreakIterator.getSentenceInstance(Locale.ENGLISH);
|
||||
}
|
||||
};
|
||||
maxPassages = new int[hlFields.size()];
|
||||
Arrays.fill(maxPassages, maxFrags);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
|
||||
Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
|
||||
preventOptimizeAway = result.size();
|
||||
}
|
||||
}
|
||||
|
||||
private class UnifiedHLImpl implements HLImpl {
|
||||
UnifiedHighlighter highlighter;
|
||||
IndexSearcher lastSearcher;
|
||||
|
File diff suppressed because it is too large
@ -28,6 +28,8 @@
    <path refid="base.classpath"/>
    <pathelement path="${queries.jar}"/>
    <pathelement path="${grouping.jar}"/>
    <pathelement path="${sandbox.jar}"/>
    <pathelement path="${analyzers-common.jar}"/>
  </path>

  <path id="test.classpath">
@ -36,16 +38,18 @@
    <path refid="test.base.classpath"/>
  </path>

  <target name="compile-core" depends="jar-grouping,jar-queries,jar-analyzers-common,common.compile-core" />
  <target name="compile-core" depends="jar-sandbox,jar-grouping,jar-queries,jar-analyzers-common,common.compile-core" />

  <target name="jar-core" depends="common.jar-core" />

  <target name="javadocs" depends="javadocs-grouping,compile-core,check-javadocs-uptodate"
  <target name="javadocs" depends="javadocs-sandbox,javadocs-grouping,compile-core,check-javadocs-uptodate"
          unless="javadocs-uptodate-${name}">
    <invoke-module-javadoc>
      <links>
        <link href="../queries"/>
        <link href="../analyzers-common"/>
        <link href="../grouping"/>
        <link href="../sandbox"/>
      </links>
    </invoke-module-javadoc>
  </target>
@ -0,0 +1,243 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.classification;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* A classifier that approximates a naive Bayes classifier by using pure queries scored with BM25.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BM25NBClassifier implements Classifier<BytesRef> {
|
||||
|
||||
/**
|
||||
* {@link IndexReader} used to access the {@link Classifier}'s
|
||||
* index
|
||||
*/
|
||||
private final IndexReader indexReader;
|
||||
|
||||
/**
|
||||
* names of the fields to be used as input text
|
||||
*/
|
||||
private final String[] textFieldNames;
|
||||
|
||||
/**
|
||||
* name of the field to be used as a class / category output
|
||||
*/
|
||||
private final String classFieldName;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} to be used for tokenizing unseen input text
|
||||
*/
|
||||
private final Analyzer analyzer;
|
||||
|
||||
/**
|
||||
* {@link IndexSearcher} to run searches on the index for retrieving frequencies
|
||||
*/
|
||||
private final IndexSearcher indexSearcher;
|
||||
|
||||
/**
|
||||
* {@link Query} used to eventually filter the document set to be used to classify
|
||||
*/
|
||||
private final Query query;
|
||||
|
||||
/**
|
||||
* Creates a new NaiveBayes classifier.
|
||||
*
|
||||
* @param indexReader the reader on the index to be used for classification
|
||||
* @param analyzer an {@link Analyzer} used to analyze unseen text
|
||||
* @param query a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
|
||||
* if all the indexed docs should be used
|
||||
* @param classFieldName the name of the field used as the output for the classifier. NOTE: it must not be heavily analyzed,
|
||||
* as the returned class will be a token indexed for this field
|
||||
* @param textFieldNames the names of the fields used as the inputs for the classifier; NO per-field boosting is supported
|
||||
*/
|
||||
public BM25NBClassifier(IndexReader indexReader, Analyzer analyzer, Query query, String classFieldName, String... textFieldNames) {
|
||||
this.indexReader = indexReader;
|
||||
this.indexSearcher = new IndexSearcher(this.indexReader);
|
||||
this.indexSearcher.setSimilarity(new BM25Similarity());
|
||||
this.textFieldNames = textFieldNames;
|
||||
this.classFieldName = classFieldName;
|
||||
this.analyzer = analyzer;
|
||||
this.query = query;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
|
||||
return assignClassNormalizedList(inputDocument).get(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
|
||||
Collections.sort(assignedClasses);
|
||||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
|
||||
Collections.sort(assignedClasses);
|
||||
return assignedClasses.subList(0, max);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate probabilities for all classes for a given input text
|
||||
*
|
||||
* @param inputDocument the input text as a {@code String}
|
||||
* @return a {@code List} of {@code ClassificationResult}, one for each existing class
|
||||
* @throws IOException if assigning probabilities fails
|
||||
*/
|
||||
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
|
||||
|
||||
Terms classes = MultiFields.getTerms(indexReader, classFieldName);
|
||||
TermsEnum classesEnum = classes.iterator();
|
||||
BytesRef next;
|
||||
String[] tokenizedText = tokenize(inputDocument);
|
||||
while ((next = classesEnum.next()) != null) {
|
||||
if (next.length > 0) {
|
||||
Term term = new Term(this.classFieldName, next);
|
||||
assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
|
||||
}
|
||||
}
|
||||
|
||||
return normClassificationResults(assignedClasses);
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize the classification results based on the max score available
|
||||
*
|
||||
* @param assignedClasses the list of assigned classes
|
||||
* @return the normalized results
|
||||
*/
|
||||
private ArrayList<ClassificationResult<BytesRef>> normClassificationResults(List<ClassificationResult<BytesRef>> assignedClasses) {
|
||||
// normalization; the values are transformed into a 0-1 range
|
||||
ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
|
||||
if (!assignedClasses.isEmpty()) {
|
||||
Collections.sort(assignedClasses);
|
||||
// the max score is a negative number closest to 0; it is used as 'a' below
|
||||
double smax = assignedClasses.get(0).getScore();
|
||||
|
||||
double sumLog = 0;
|
||||
// log(sum(exp(x_n-a)))
|
||||
for (ClassificationResult<BytesRef> cr : assignedClasses) {
|
||||
// getScore - smax <= 0 (both are negative; smax has the smallest absolute value)
|
||||
sumLog += Math.exp(cr.getScore() - smax);
|
||||
}
|
||||
// loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
|
||||
double loga = smax;
|
||||
loga += Math.log(sumLog);
|
||||
|
||||
// 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
|
||||
for (ClassificationResult<BytesRef> cr : assignedClasses) {
|
||||
double scoreDiff = cr.getScore() - loga;
|
||||
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
|
||||
}
|
||||
}
|
||||
return returnList;
|
||||
}
|
||||
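(Editor's note: the normalization above is the standard log-sum-exp trick. The following is a tiny standalone sketch with made-up log scores, added only to illustrate the arithmetic; it is not part of this patch.)

public class LogSumExpSketch {
  public static void main(String[] args) {
    double[] logScores = {-3.2, -4.1, -7.8}; // assumed example values, sorted descending
    double a = logScores[0];                 // the log score closest to 0
    double sum = 0;
    for (double s : logScores) {
      sum += Math.exp(s - a);                // each term is <= 1, so exp() cannot underflow to 0
    }
    double logTotal = a + Math.log(sum);     // log(sum(exp(x_n)))
    double check = 0;
    for (double s : logScores) {
      double p = Math.exp(s - logTotal);     // normalized into the 0-1 range
      check += p;
      System.out.println(p);
    }
    System.out.println("sum = " + check);    // approximately 1.0
  }
}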
|
||||
/**
|
||||
* tokenize a <code>String</code> on this classifier's text fields and analyzer
|
||||
*
|
||||
* @param text the <code>String</code> representing an input text (to be classified)
|
||||
* @return a <code>String</code> array of the resulting tokens
|
||||
* @throws IOException if tokenization fails
|
||||
*/
|
||||
private String[] tokenize(String text) throws IOException {
|
||||
Collection<String> result = new LinkedList<>();
|
||||
for (String textFieldName : textFieldNames) {
|
||||
try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
|
||||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
||||
tokenStream.reset();
|
||||
while (tokenStream.incrementToken()) {
|
||||
result.add(charTermAttribute.toString());
|
||||
}
|
||||
tokenStream.end();
|
||||
}
|
||||
}
|
||||
return result.toArray(new String[result.size()]);
|
||||
}
|
||||
|
||||
private double calculateLogLikelihood(String[] tokens, Term term) throws IOException {
|
||||
double result = 0d;
|
||||
for (String word : tokens) {
|
||||
result += Math.log(getTermProbForClass(term, word));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private double getTermProbForClass(Term classTerm, String... words) throws IOException {
|
||||
BooleanQuery.Builder builder = new BooleanQuery.Builder();
|
||||
builder.add(new BooleanClause(new TermQuery(classTerm), BooleanClause.Occur.MUST));
|
||||
for (String textFieldName : textFieldNames) {
|
||||
for (String word : words) {
|
||||
builder.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
|
||||
}
|
||||
}
|
||||
if (query != null) {
|
||||
builder.add(query, BooleanClause.Occur.MUST);
|
||||
}
|
||||
TopDocs search = indexSearcher.search(builder.build(), 1);
|
||||
return search.totalHits > 0 ? search.getMaxScore() : 1;
|
||||
}
|
||||
|
||||
private double calculateLogPrior(Term term) throws IOException {
|
||||
TermQuery termQuery = new TermQuery(term);
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
bq.add(termQuery, BooleanClause.Occur.MUST);
|
||||
if (query != null) {
|
||||
bq.add(query, BooleanClause.Occur.MUST);
|
||||
}
|
||||
TopDocs topDocs = indexSearcher.search(bq.build(), 1);
|
||||
return topDocs.totalHits > 0 ? Math.log(topDocs.getMaxScore()) : 0;
|
||||
}
|
||||
|
||||
}
|
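(Editor's note: a hedged usage sketch for the new BM25NBClassifier. It assumes an existing index under ./index whose documents carry a "category" and a "text" field; the field names, index path, and analyzer choice are illustrative, not taken from this patch.)

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.BM25NBClassifier;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class Bm25NbUsageSketch {
  public static void main(String[] args) throws Exception {
    // Assumes an already-built index with "category" (class) and "text" (input) fields.
    try (Directory dir = FSDirectory.open(Paths.get("index"));
         IndexReader reader = DirectoryReader.open(dir)) {
      BM25NBClassifier classifier =
          new BM25NBClassifier(reader, new StandardAnalyzer(), null, "category", "text");
      ClassificationResult<BytesRef> result = classifier.assignClass("some unseen text");
      System.out.println(result.getAssignedClass().utf8ToString() + " -> " + result.getScore());
    }
  }
}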
@ -0,0 +1,224 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.classification;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* A k-Nearest Neighbor classifier based on {@link FuzzyLikeThisQuery}.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
|
||||
|
||||
/**
|
||||
* the names of the fields used as the input text
|
||||
*/
|
||||
protected final String[] textFieldNames;
|
||||
|
||||
/**
|
||||
* the name of the field used as the output text
|
||||
*/
|
||||
protected final String classFieldName;
|
||||
|
||||
/**
|
||||
* an {@link IndexSearcher} used to perform queries
|
||||
*/
|
||||
protected final IndexSearcher indexSearcher;
|
||||
|
||||
/**
|
||||
* the no. of docs to compare in order to find the nearest neighbor to the input text
|
||||
*/
|
||||
protected final int k;
|
||||
|
||||
/**
|
||||
* a {@link Query} used to filter the documents that should be used from this classifier's underlying {@link LeafReader}
|
||||
*/
|
||||
protected final Query query;
|
||||
private final Analyzer analyzer;
|
||||
|
||||
/**
|
||||
* Creates a {@link KNearestFuzzyClassifier}.
|
||||
*
|
||||
* @param indexReader the reader on the index to be used for classification
|
||||
* @param analyzer an {@link Analyzer} used to analyze unseen text
|
||||
* @param similarity the {@link Similarity} to be used by the underlying {@link IndexSearcher} or {@code null}
|
||||
* (defaults to {@link BM25Similarity})
|
||||
* @param query a {@link Query} to eventually filter the docs used for training the classifier, or {@code null}
|
||||
* if all the indexed docs should be used
|
||||
* @param k the no. of docs to select from the fuzzy query results to find the nearest neighbor
|
||||
* @param classFieldName the name of the field used as the output for the classifier
|
||||
* @param textFieldNames the name of the fields used as the inputs for the classifier, they can contain boosting indication e.g. title^10
|
||||
*/
|
||||
public KNearestFuzzyClassifier(IndexReader indexReader, Similarity similarity, Analyzer analyzer, Query query, int k,
|
||||
String classFieldName, String... textFieldNames) {
|
||||
this.textFieldNames = textFieldNames;
|
||||
this.classFieldName = classFieldName;
|
||||
this.analyzer = analyzer;
|
||||
this.indexSearcher = new IndexSearcher(indexReader);
|
||||
if (similarity != null) {
|
||||
this.indexSearcher.setSimilarity(similarity);
|
||||
} else {
|
||||
this.indexSearcher.setSimilarity(new BM25Similarity());
|
||||
}
|
||||
this.query = query;
|
||||
this.k = k;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = buildListFromTopDocs(knnResults);
|
||||
ClassificationResult<BytesRef> assignedClass = null;
|
||||
double maxscore = -Double.MAX_VALUE;
|
||||
for (ClassificationResult<BytesRef> cl : assignedClasses) {
|
||||
if (cl.getScore() > maxscore) {
|
||||
assignedClass = cl;
|
||||
maxscore = cl.getScore();
|
||||
}
|
||||
}
|
||||
return assignedClass;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = buildListFromTopDocs(knnResults);
|
||||
Collections.sort(assignedClasses);
|
||||
return assignedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*/
|
||||
@Override
|
||||
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
|
||||
TopDocs knnResults = knnSearch(text);
|
||||
List<ClassificationResult<BytesRef>> assignedClasses = buildListFromTopDocs(knnResults);
|
||||
Collections.sort(assignedClasses);
|
||||
return assignedClasses.subList(0, max);
|
||||
}
|
||||
|
||||
private TopDocs knnSearch(String text) throws IOException {
|
||||
BooleanQuery.Builder bq = new BooleanQuery.Builder();
|
||||
FuzzyLikeThisQuery fuzzyLikeThisQuery = new FuzzyLikeThisQuery(300, analyzer);
|
||||
for (String fieldName : textFieldNames) {
|
||||
fuzzyLikeThisQuery.addTerms(text, fieldName, 1f, 2); // TODO: make this parameters configurable
|
||||
}
|
||||
bq.add(fuzzyLikeThisQuery, BooleanClause.Occur.MUST);
|
||||
Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*"));
|
||||
bq.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
|
||||
if (query != null) {
|
||||
bq.add(query, BooleanClause.Occur.MUST);
|
||||
}
|
||||
return indexSearcher.search(bq.build(), k);
|
||||
}
|
||||
|
||||
/**
|
||||
* build a list of classification results from search results
|
||||
*
|
||||
* @param topDocs the search results as a {@link TopDocs} object
|
||||
* @return a {@link List} of {@link ClassificationResult}, one for each existing class
|
||||
* @throws IOException if it's not possible to get the stored value of the class field
|
||||
*/
|
||||
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
|
||||
Map<BytesRef, Integer> classCounts = new HashMap<>();
|
||||
Map<BytesRef, Double> classBoosts = new HashMap<>(); // this is a boost based on class ranking positions in topDocs
|
||||
float maxScore = topDocs.getMaxScore();
|
||||
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
|
||||
IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
|
||||
if (storableField != null) {
|
||||
BytesRef cl = new BytesRef(storableField.stringValue());
|
||||
//update count
|
||||
Integer count = classCounts.get(cl);
|
||||
if (count != null) {
|
||||
classCounts.put(cl, count + 1);
|
||||
} else {
|
||||
classCounts.put(cl, 1);
|
||||
}
|
||||
//update boost, the boost is based on the best score
|
||||
Double totalBoost = classBoosts.get(cl);
|
||||
double singleBoost = scoreDoc.score / maxScore;
|
||||
if (totalBoost != null) {
|
||||
classBoosts.put(cl, totalBoost + singleBoost);
|
||||
} else {
|
||||
classBoosts.put(cl, singleBoost);
|
||||
}
|
||||
}
|
||||
}
|
||||
List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
|
||||
List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
|
||||
int sumdoc = 0;
|
||||
for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
|
||||
Integer count = entry.getValue();
|
||||
Double normBoost = classBoosts.get(entry.getKey()) / count; //the boost is normalized to be 0<b<1
|
||||
temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
|
||||
sumdoc += count;
|
||||
}
|
||||
|
||||
//correction
|
||||
if (sumdoc < k) {
|
||||
for (ClassificationResult<BytesRef> cr : temporaryList) {
|
||||
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
|
||||
}
|
||||
} else {
|
||||
returnList = temporaryList;
|
||||
}
|
||||
return returnList;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "KNearestFuzzyClassifier{" +
|
||||
"textFieldNames=" + Arrays.toString(textFieldNames) +
|
||||
", classFieldName='" + classFieldName + '\'' +
|
||||
", k=" + k +
|
||||
", query=" + query +
|
||||
", similarity=" + indexSearcher.getSimilarity(true) +
|
||||
'}';
|
||||
}
|
||||
}
|
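(Editor's note: a comparable, hypothetical usage sketch for the new KNearestFuzzyClassifier, under the same assumptions as the previous sketch: an existing index with "category" and "text" fields, k set to 3, and the default BM25 similarity selected by passing null.)

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.classification.KNearestFuzzyClassifier;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class KnnFuzzyUsageSketch {
  public static void main(String[] args) throws Exception {
    // Assumes an already-built index with "category" (class) and "text" (input) fields.
    try (Directory dir = FSDirectory.open(Paths.get("index"));
         IndexReader reader = DirectoryReader.open(dir)) {
      KNearestFuzzyClassifier classifier = new KNearestFuzzyClassifier(
          reader, null, new StandardAnalyzer(), null, 3, "category", "text");
      for (ClassificationResult<BytesRef> r : classifier.getClasses("some unseen text")) {
        System.out.println(r.getAssignedClass().utf8ToString() + " -> " + r.getScore());
      }
    }
  }
}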
@ -121,7 +121,7 @@ public class DatasetSplitter {
    int b = 0;

    // iterate over existing documents
    for (GroupDocs group : topGroups.groups) {
    for (GroupDocs<Object> group : topGroups.groups) {
      int totalHits = group.totalHits;
      double testSize = totalHits * testRatio;
      int tc = 0;
@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.classification;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests for {@link BM25NBClassifier}
|
||||
*/
|
||||
public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
|
||||
|
||||
@Test
|
||||
public void testBasicUsage() throws Exception {
|
||||
LeafReader leafReader = null;
|
||||
try {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
leafReader = getSampleIndex(analyzer);
|
||||
BM25NBClassifier classifier = new BM25NBClassifier(leafReader, analyzer, null, categoryFieldName, textFieldName);
|
||||
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
} finally {
|
||||
if (leafReader != null) {
|
||||
leafReader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicUsageWithQuery() throws Exception {
|
||||
LeafReader leafReader = null;
|
||||
try {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
leafReader = getSampleIndex(analyzer);
|
||||
TermQuery query = new TermQuery(new Term(textFieldName, "not"));
|
||||
BM25NBClassifier classifier = new BM25NBClassifier(leafReader, analyzer, query, categoryFieldName, textFieldName);
|
||||
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
} finally {
|
||||
if (leafReader != null) {
|
||||
leafReader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNGramUsage() throws Exception {
|
||||
LeafReader leafReader = null;
|
||||
try {
|
||||
Analyzer analyzer = new NGramAnalyzer();
|
||||
leafReader = getSampleIndex(analyzer);
|
||||
BM25NBClassifier classifier = new BM25NBClassifier(leafReader, analyzer, null, categoryFieldName, textFieldName);
|
||||
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
} finally {
|
||||
if (leafReader != null) {
|
||||
leafReader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class NGramAnalyzer extends Analyzer {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
final Tokenizer tokenizer = new KeywordTokenizer();
|
||||
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPerformance() throws Exception {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
LeafReader leafReader = getRandomIndex(analyzer, 100);
|
||||
try {
|
||||
long trainStart = System.currentTimeMillis();
|
||||
BM25NBClassifier classifier = new BM25NBClassifier(leafReader,
|
||||
analyzer, null, categoryFieldName, textFieldName);
|
||||
long trainEnd = System.currentTimeMillis();
|
||||
long trainTime = trainEnd - trainStart;
|
||||
assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
|
||||
|
||||
long evaluationStart = System.currentTimeMillis();
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
assertNotNull(confusionMatrix);
|
||||
long evaluationEnd = System.currentTimeMillis();
|
||||
long evaluationTime = evaluationEnd - evaluationStart;
|
||||
assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
|
||||
double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
|
||||
assertTrue("avg classification time: " + avgClassificationTime, 5000 > avgClassificationTime);
|
||||
|
||||
double f1 = confusionMatrix.getF1Measure();
|
||||
assertTrue(f1 >= 0d);
|
||||
assertTrue(f1 <= 1d);
|
||||
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
|
||||
Terms terms = MultiFields.getTerms(leafReader, categoryFieldName);
|
||||
TermsEnum iterator = terms.iterator();
|
||||
BytesRef term;
|
||||
while ((term = iterator.next()) != null) {
|
||||
String s = term.utf8ToString();
|
||||
recall = confusionMatrix.getRecall(s);
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
precision = confusionMatrix.getPrecision(s);
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure(s);
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
}
|
||||
|
||||
} finally {
|
||||
leafReader.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,119 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.classification;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Testcase for {@link KNearestFuzzyClassifier}
|
||||
*/
|
||||
public class KNearestFuzzyClassifierTest extends ClassificationTestBase<BytesRef> {
|
||||
|
||||
@Test
|
||||
public void testBasicUsage() throws Exception {
|
||||
LeafReader leafReader = null;
|
||||
try {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
leafReader = getSampleIndex(analyzer);
|
||||
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);
|
||||
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
checkCorrectClassification(classifier, POLITICS_INPUT, POLITICS_RESULT);
|
||||
} finally {
|
||||
if (leafReader != null) {
|
||||
leafReader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicUsageWithQuery() throws Exception {
|
||||
LeafReader leafReader = null;
|
||||
try {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
leafReader = getSampleIndex(analyzer);
|
||||
TermQuery query = new TermQuery(new Term(textFieldName, "not"));
|
||||
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, query, 3, categoryFieldName, textFieldName);
|
||||
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
|
||||
} finally {
|
||||
if (leafReader != null) {
|
||||
leafReader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPerformance() throws Exception {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
LeafReader leafReader = getRandomIndex(analyzer, 100);
|
||||
try {
|
||||
long trainStart = System.currentTimeMillis();
|
||||
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);
|
||||
long trainEnd = System.currentTimeMillis();
|
||||
long trainTime = trainEnd - trainStart;
|
||||
assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
|
||||
|
||||
long evaluationStart = System.currentTimeMillis();
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
assertNotNull(confusionMatrix);
|
||||
long evaluationEnd = System.currentTimeMillis();
|
||||
long evaluationTime = evaluationEnd - evaluationStart;
|
||||
assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
|
||||
double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
|
||||
assertTrue(5000 > avgClassificationTime);
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
|
||||
Terms terms = MultiFields.getTerms(leafReader, categoryFieldName);
|
||||
TermsEnum iterator = terms.iterator();
|
||||
BytesRef term;
|
||||
while ((term = iterator.next()) != null) {
|
||||
String s = term.utf8ToString();
|
||||
recall = confusionMatrix.getRecall(s);
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
precision = confusionMatrix.getPrecision(s);
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure(s);
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
}
|
||||
} finally {
|
||||
leafReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -21,11 +21,13 @@ import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.classification.BM25NBClassifier;
|
||||
import org.apache.lucene.classification.BooleanPerceptronClassifier;
|
||||
import org.apache.lucene.classification.CachingNaiveBayesClassifier;
|
||||
import org.apache.lucene.classification.ClassificationResult;
|
||||
import org.apache.lucene.classification.ClassificationTestBase;
|
||||
import org.apache.lucene.classification.Classifier;
|
||||
import org.apache.lucene.classification.KNearestFuzzyClassifier;
|
||||
import org.apache.lucene.classification.KNearestNeighborClassifier;
|
||||
import org.apache.lucene.classification.SimpleNaiveBayesClassifier;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
@ -94,22 +96,43 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
|
||||
Classifier<BytesRef> classifier = new SimpleNaiveBayesClassifier(reader, analyzer, null, categoryFieldName, textFieldName);
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
assertNotNull(confusionMatrix);
|
||||
assertNotNull(confusionMatrix.getLinearizedMatrix());
|
||||
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
|
||||
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure();
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
checkCM(confusionMatrix);
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkCM(ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix) {
|
||||
assertNotNull(confusionMatrix);
|
||||
assertNotNull(confusionMatrix.getLinearizedMatrix());
|
||||
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
|
||||
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure();
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetConfusionMatrixWithBM25NB() throws Exception {
|
||||
LeafReader reader = null;
|
||||
try {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
reader = getSampleIndex(analyzer);
|
||||
Classifier<BytesRef> classifier = new BM25NBClassifier(reader, analyzer, null, categoryFieldName, textFieldName);
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
checkCM(confusionMatrix);
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
@ -126,22 +149,7 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
|
||||
Classifier<BytesRef> classifier = new CachingNaiveBayesClassifier(reader, analyzer, null, categoryFieldName, textFieldName);
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
assertNotNull(confusionMatrix);
|
||||
assertNotNull(confusionMatrix.getLinearizedMatrix());
|
||||
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
|
||||
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure();
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
checkCM(confusionMatrix);
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
@ -158,22 +166,24 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
|
||||
Classifier<BytesRef> classifier = new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, categoryFieldName, textFieldName);
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
assertNotNull(confusionMatrix);
|
||||
assertNotNull(confusionMatrix.getLinearizedMatrix());
|
||||
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
|
||||
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure();
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
checkCM(confusionMatrix);
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetConfusionMatrixWithFLTKNN() throws Exception {
|
||||
LeafReader reader = null;
|
||||
try {
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
reader = getSampleIndex(analyzer);
|
||||
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, categoryFieldName, textFieldName);
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
|
||||
classifier, categoryFieldName, textFieldName, -1);
|
||||
checkCM(confusionMatrix);
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
@ -190,22 +200,7 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
|
||||
Classifier<Boolean> classifier = new BooleanPerceptronClassifier(reader, analyzer, null, 1, null, booleanFieldName, textFieldName);
|
||||
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
|
||||
classifier, booleanFieldName, textFieldName, -1);
|
||||
assertNotNull(confusionMatrix);
|
||||
assertNotNull(confusionMatrix.getLinearizedMatrix());
|
||||
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
|
||||
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
|
||||
double accuracy = confusionMatrix.getAccuracy();
|
||||
assertTrue(accuracy >= 0d);
|
||||
assertTrue(accuracy <= 1d);
|
||||
double precision = confusionMatrix.getPrecision();
|
||||
assertTrue(precision >= 0d);
|
||||
assertTrue(precision <= 1d);
|
||||
double recall = confusionMatrix.getRecall();
|
||||
assertTrue(recall >= 0d);
|
||||
assertTrue(recall <= 1d);
|
||||
double f1Measure = confusionMatrix.getF1Measure();
|
||||
assertTrue(f1Measure >= 0d);
|
||||
assertTrue(f1Measure <= 1d);
|
||||
checkCM(confusionMatrix);
|
||||
assertTrue(confusionMatrix.getPrecision("true") >= 0d);
|
||||
assertTrue(confusionMatrix.getPrecision("true") <= 1d);
|
||||
assertTrue(confusionMatrix.getPrecision("false") >= 0d);
|
||||
|
@ -877,7 +877,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||
};
|
||||
}
|
||||
|
||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {
|
||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {
|
||||
|
||||
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
|
||||
@Override
|
||||
@ -1170,7 +1170,8 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||
|
||||
/** Called on exception, to check whether the checksum is also corrupt in this source, and add that
|
||||
* information (checksum matched or didn't) as a suppressed exception. */
|
||||
private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
|
||||
private Error verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
|
||||
assert priorException != null;
|
||||
// TODO: we could improve this, to always validate checksum as we recurse, if we shared left and
|
||||
// right reader after recursing to children, and possibly within recursed children,
|
||||
// since all together they make a single pass through the file. But this is a sizable re-org,
|
||||
@ -1181,10 +1182,10 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||
try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) {
|
||||
CodecUtil.checkFooter(in, priorException);
|
||||
}
|
||||
} else {
|
||||
// We are reading from heap; nothing to add:
|
||||
IOUtils.reThrow(priorException);
|
||||
}
|
||||
|
||||
// We are reading from heap; nothing to add:
|
||||
throw IOUtils.rethrowAlways(priorException);
|
||||
}
|
||||
|
||||
/** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). */
|
||||
@ -1206,7 +1207,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||
reader.markOrds(rightCount-1, ordBitSet);
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, source.writer);
|
||||
throw verifyChecksum(t, source.writer);
|
||||
}
|
||||
|
||||
return scratch1;
|
||||
@ -1255,10 +1256,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||
}
|
||||
return new PathSlice(writer, 0, count);
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, source.writer);
|
||||
|
||||
// Dead code but javac disagrees:
|
||||
return null;
|
||||
throw verifyChecksum(t, source.writer);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1564,7 +1562,7 @@ final class SimpleTextBKDWriter implements Closeable {
|
||||
leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
|
||||
rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, slices[dim].writer);
|
||||
throw verifyChecksum(t, slices[dim].writer);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -331,6 +331,9 @@ public final class CodecUtil {
|
||||
/** Retrieves the full footer from the provided {@link IndexInput}. This throws
|
||||
* {@link CorruptIndexException} if this file does not have a valid footer. */
|
||||
public static byte[] readFooter(IndexInput in) throws IOException {
|
||||
if (in.length() < footerLength()) {
|
||||
throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in);
|
||||
}
|
||||
in.seek(in.length() - footerLength());
|
||||
validateFooter(in);
|
||||
in.seek(in.length() - footerLength());
|
||||
@ -467,7 +470,7 @@ public final class CodecUtil {
|
||||
// catch-all for things that shouldn't go wrong (e.g. OOM during readInt) but could...
|
||||
priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: unexpected exception", in, t));
|
||||
}
|
||||
IOUtils.reThrow(priorException);
|
||||
throw IOUtils.rethrowAlways(priorException);
|
||||
}
|
||||
}
|
||||
|
||||
@ -516,6 +519,9 @@ public final class CodecUtil {
|
||||
clone.seek(0);
|
||||
ChecksumIndexInput in = new BufferedChecksumIndexInput(clone);
|
||||
assert in.getFilePointer() == 0;
|
||||
if (in.length() < footerLength()) {
|
||||
throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), input);
|
||||
}
|
||||
in.seek(in.length() - footerLength());
|
||||
return checkFooter(in);
|
||||
}
|
||||
|
@ -112,6 +112,7 @@ abstract class RangeFieldQuery extends Query {
|
||||
public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
|
||||
return new ConstantScoreWeight(this, boost) {
|
||||
final RangeFieldComparator target = new RangeFieldComparator();
|
||||
|
||||
private DocIdSet buildMatchingDocIdSet(LeafReader reader, PointValues values) throws IOException {
|
||||
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
|
||||
values.intersect(
|
||||
@ -133,25 +134,29 @@ abstract class RangeFieldQuery extends Query {
|
||||
}
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
byte[] node = getInternalRange(minPackedValue, maxPackedValue);
|
||||
// compute range relation for BKD traversal
|
||||
if (target.intersects(node) == false) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
} else if (target.within(node)) {
|
||||
// target within cell; continue traversing:
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
} else if (target.contains(node)) {
|
||||
// target contains cell; add iff queryType is not a CONTAINS or CROSSES query:
|
||||
return (queryType == QueryType.CONTAINS || queryType == QueryType.CROSSES) ?
|
||||
Relation.CELL_OUTSIDE_QUERY : Relation.CELL_INSIDE_QUERY;
|
||||
}
|
||||
// target intersects cell; continue traversing:
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
return compareRange(minPackedValue, maxPackedValue);
|
||||
}
|
||||
});
|
||||
return result.build();
|
||||
}
|
||||
|
||||
private Relation compareRange(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
byte[] node = getInternalRange(minPackedValue, maxPackedValue);
|
||||
// compute range relation for BKD traversal
|
||||
if (target.intersects(node) == false) {
|
||||
return Relation.CELL_OUTSIDE_QUERY;
|
||||
} else if (target.within(node)) {
|
||||
// target within cell; continue traversing:
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
} else if (target.contains(node)) {
|
||||
// target contains cell; add iff queryType is not a CONTAINS or CROSSES query:
|
||||
return (queryType == QueryType.CONTAINS || queryType == QueryType.CROSSES) ?
|
||||
Relation.CELL_OUTSIDE_QUERY : Relation.CELL_INSIDE_QUERY;
|
||||
}
|
||||
// target intersects cell; continue traversing:
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||
LeafReader reader = context.reader();
|
||||
@ -166,17 +171,10 @@ abstract class RangeFieldQuery extends Query {
|
||||
return null;
|
||||
}
|
||||
checkFieldInfo(fieldInfo);
|
||||
boolean allDocsMatch = true;
|
||||
if (values.getDocCount() == reader.maxDoc()) {
|
||||
// if query crosses, docs need to be further scrutinized
|
||||
byte[] range = getInternalRange(values.getMinPackedValue(), values.getMaxPackedValue());
|
||||
// if the internal node is not equal and not contained by the query, all docs do not match
|
||||
if (queryType == QueryType.CROSSES || (!Arrays.equals(ranges, range)
|
||||
&& (target.contains(range) == false || queryType != QueryType.WITHIN))) {
|
||||
allDocsMatch = false;
|
||||
}
|
||||
} else {
|
||||
allDocsMatch = false;
|
||||
boolean allDocsMatch = false;
|
||||
if (values.getDocCount() == reader.maxDoc()
|
||||
&& compareRange(values.getMinPackedValue(), values.getMaxPackedValue()) == Relation.CELL_INSIDE_QUERY) {
|
||||
allDocsMatch = true;
|
||||
}
|
||||
|
||||
DocIdSetIterator iterator = allDocsMatch == true ?
|
||||
|
@ -463,8 +463,9 @@ class BufferedUpdatesStream implements Accountable {
|
||||
}
|
||||
|
||||
if (success) {
|
||||
// Does nothing if firstExc is null:
|
||||
IOUtils.reThrow(firstExc);
|
||||
if (firstExc != null) {
|
||||
throw IOUtils.rethrowAlways(firstExc);
|
||||
}
|
||||
}
|
||||
|
||||
if (infoStream.isEnabled("BD")) {
|
||||
|
@ -529,7 +529,7 @@ public final class CheckIndex implements Closeable {
|
||||
sis = SegmentInfos.readCommit(dir, lastSegmentsFile);
|
||||
} catch (Throwable t) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
msg(infoStream, "ERROR: could not read any segments file in directory");
|
||||
result.missingSegments = true;
|
||||
@ -565,11 +565,12 @@ public final class CheckIndex implements Closeable {
|
||||
input = dir.openInput(segmentsFileName, IOContext.READONCE);
|
||||
} catch (Throwable t) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
msg(infoStream, "ERROR: could not open segments file in directory");
|
||||
if (infoStream != null)
|
||||
if (infoStream != null) {
|
||||
t.printStackTrace(infoStream);
|
||||
}
|
||||
result.cantOpenSegments = true;
|
||||
return result;
|
||||
}
|
||||
@ -577,11 +578,12 @@ public final class CheckIndex implements Closeable {
|
||||
/*int format =*/ input.readInt();
|
||||
} catch (Throwable t) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
msg(infoStream, "ERROR: could not read segment file version in directory");
|
||||
if (infoStream != null)
|
||||
if (infoStream != null) {
|
||||
t.printStackTrace(infoStream);
|
||||
}
|
||||
result.missingSegmentVersion = true;
|
||||
return result;
|
||||
} finally {
|
||||
@ -789,7 +791,7 @@ public final class CheckIndex implements Closeable {
|
||||
|
||||
} catch (Throwable t) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
msg(infoStream, "FAILED");
|
||||
String comment;
|
||||
@ -883,7 +885,7 @@ public final class CheckIndex implements Closeable {
|
||||
msg(infoStream, String.format(Locale.ROOT, "OK [took %.3f sec]", nsToSec(System.nanoTime()-startNS)));
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
@ -941,7 +943,7 @@ public final class CheckIndex implements Closeable {
|
||||
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
@ -974,7 +976,7 @@ public final class CheckIndex implements Closeable {
|
||||
status.totFields = fieldInfos.size();
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
@ -1013,7 +1015,7 @@ public final class CheckIndex implements Closeable {
|
||||
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields] [took %.3f sec]", status.totFields, nsToSec(System.nanoTime()-startNS)));
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
@ -1769,7 +1771,7 @@ public final class CheckIndex implements Closeable {
|
||||
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, version);
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR: " + e);
|
||||
status = new Status.TermIndexStatus();
|
||||
@ -1845,7 +1847,7 @@ public final class CheckIndex implements Closeable {
|
||||
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR: " + e);
|
||||
status.error = e;
|
||||
@ -2079,7 +2081,7 @@ public final class CheckIndex implements Closeable {
|
||||
nsToSec(System.nanoTime() - startNS)));
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
@ -2126,7 +2128,7 @@ public final class CheckIndex implements Closeable {
|
||||
nsToSec(System.nanoTime()-startNS)));
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
@ -2567,7 +2569,7 @@ public final class CheckIndex implements Closeable {
|
||||
status.totVectors, vectorAvg, nsToSec(System.nanoTime() - startNS)));
|
||||
} catch (Throwable e) {
|
||||
if (failFast) {
|
||||
IOUtils.reThrow(e);
|
||||
throw IOUtils.rethrowAlways(e);
|
||||
}
|
||||
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
|
||||
status.error = e;
|
||||
|
@ -603,7 +603,7 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||
// PerField.invert to allow for later downgrading of the index options:
|
||||
fi.setIndexOptions(fieldType.indexOptions());
|
||||
|
||||
fp = new PerField(fi, invert);
|
||||
fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
|
||||
fp.next = fieldHash[hashPos];
|
||||
fieldHash[hashPos] = fp;
|
||||
totalFieldCount++;
|
||||
@ -633,6 +633,7 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||
/** NOTE: not static: accesses at least docState, termsHash. */
|
||||
private final class PerField implements Comparable<PerField> {
|
||||
|
||||
final int indexCreatedVersionMajor;
|
||||
final FieldInfo fieldInfo;
|
||||
final Similarity similarity;
|
||||
|
||||
@ -659,7 +660,8 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||
// reused
|
||||
TokenStream tokenStream;
|
||||
|
||||
public PerField(FieldInfo fieldInfo, boolean invert) {
|
||||
public PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert) {
|
||||
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
|
||||
this.fieldInfo = fieldInfo;
|
||||
similarity = docState.similarity;
|
||||
if (invert) {
|
||||
@ -668,7 +670,7 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||
}
|
||||
|
||||
void setInvertState() {
|
||||
invertState = new FieldInvertState(fieldInfo.name);
|
||||
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
|
||||
termsHashPerField = termsHash.addField(invertState, fieldInfo);
|
||||
if (fieldInfo.omitsNorms() == false) {
|
||||
assert norms == null;
|
||||
|
@ -193,6 +193,10 @@ class DocumentsWriterPerThread {
|
||||
return fieldInfos;
|
||||
}
|
||||
|
||||
public int getIndexCreatedVersionMajor() {
|
||||
return indexWriter.segmentInfos.getIndexCreatedVersionMajor();
|
||||
}
|
||||
|
||||
final void testPoint(String message) {
|
||||
if (enableTestPoints) {
|
||||
assert infoStream.isEnabled("TP"); // don't enable unless you need them.
|
||||
|
@ -31,7 +31,8 @@ import org.apache.lucene.util.AttributeSource;
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class FieldInvertState {
|
||||
String name;
|
||||
final int indexCreatedVersionMajor;
|
||||
final String name;
|
||||
int position;
|
||||
int length;
|
||||
int numOverlap;
|
||||
@ -50,14 +51,15 @@ public final class FieldInvertState {
|
||||
|
||||
/** Creates {@code FieldInvertState} for the specified
|
||||
* field name. */
|
||||
public FieldInvertState(String name) {
|
||||
public FieldInvertState(int indexCreatedVersionMajor, String name) {
|
||||
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/** Creates {@code FieldInvertState} for the specified
|
||||
* field name and values for all fields. */
|
||||
public FieldInvertState(String name, int position, int length, int numOverlap, int offset) {
|
||||
this.name = name;
|
||||
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
|
||||
this(indexCreatedVersionMajor, name);
|
||||
this.position = position;
|
||||
this.length = length;
|
||||
this.numOverlap = numOverlap;
|
||||
@ -164,4 +166,11 @@ public final class FieldInvertState {
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the version that was used to create the index, or 6 if it was created before 7.0.
|
||||
*/
|
||||
public int getIndexCreatedVersionMajor() {
|
||||
return indexCreatedVersionMajor;
|
||||
}
|
||||
}
|
||||
|
@ -364,7 +364,7 @@ final class IndexFileDeleter implements Closeable {
|
||||
* Remove the CommitPoints in the commitsToDelete List by
|
||||
* DecRef'ing all files from each SegmentInfos.
|
||||
*/
|
||||
private void deleteCommits() {
|
||||
private void deleteCommits() throws IOException {
|
||||
|
||||
int size = commitsToDelete.size();
|
||||
|
||||
@ -388,8 +388,9 @@ final class IndexFileDeleter implements Closeable {
|
||||
}
|
||||
commitsToDelete.clear();
|
||||
|
||||
// NOTE: does nothing if firstThrowable is null
|
||||
IOUtils.reThrowUnchecked(firstThrowable);
|
||||
if (firstThrowable != null) {
|
||||
throw IOUtils.rethrowAlways(firstThrowable);
|
||||
}
|
||||
|
||||
// Now compact commits to remove deleted ones (preserving the sort):
|
||||
size = commits.size();
|
||||
@ -599,8 +600,9 @@ final class IndexFileDeleter implements Closeable {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: does nothing if firstThrowable is null
|
||||
IOUtils.reThrow(firstThrowable);
|
||||
if (firstThrowable != null) {
|
||||
throw IOUtils.rethrowAlways(firstThrowable);
|
||||
}
|
||||
}
|
||||
|
||||
/** Decrefs all provided files, ignoring any exceptions hit; call this if
|
||||
|
@ -144,7 +144,9 @@ public abstract class IndexReader implements Closeable {
|
||||
// overridden by StandardDirectoryReader and SegmentReader
|
||||
void notifyReaderClosedListeners(Throwable th) throws IOException {
|
||||
// nothing to notify in the base impl, just rethrow
|
||||
IOUtils.reThrow(th);
|
||||
if (th != null) {
|
||||
throw IOUtils.rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
|
||||
private void reportCloseToParentReaders() {
|
||||
|
@ -611,7 +611,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
if (doSave) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
} else if (priorE == null) {
|
||||
priorE = t;
|
||||
}
|
||||
@ -631,14 +631,16 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
rld.dropReaders();
|
||||
} catch (Throwable t) {
|
||||
if (doSave) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
} else if (priorE == null) {
|
||||
priorE = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
assert readerMap.size() == 0;
|
||||
IOUtils.reThrow(priorE);
|
||||
if (priorE != null) {
|
||||
throw IOUtils.rethrowAlways(priorE);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3330,7 +3332,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
if (commitCompleted) {
|
||||
tragicEvent(t, "finishCommit");
|
||||
} else {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3898,7 +3900,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
throw (MergePolicy.MergeAbortedException) t;
|
||||
}
|
||||
} else {
|
||||
IOUtils.reThrow(t);
|
||||
assert t != null;
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4238,8 +4241,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
}
|
||||
|
||||
// If any error occurred, throw it.
|
||||
if (!suppressExceptions) {
|
||||
IOUtils.reThrow(th);
|
||||
if (!suppressExceptions && th != null) {
|
||||
throw IOUtils.rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4815,7 +4818,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
// It's possible you could have a really bad day
|
||||
if (this.tragedy != null) {
|
||||
// Another thread is already dealing / has dealt with the tragedy:
|
||||
IOUtils.reThrow(tragedy);
|
||||
throw IOUtils.rethrowAlways(tragedy);
|
||||
}
|
||||
|
||||
this.tragedy = tragedy;
|
||||
@ -4826,7 +4829,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
|
||||
rollbackInternal();
|
||||
}
|
||||
|
||||
IOUtils.reThrow(tragedy);
|
||||
throw IOUtils.rethrowAlways(tragedy);
|
||||
}
|
||||
|
||||
/** If this {@code IndexWriter} was closed as a side-effect of a tragic exception,
|
||||
|
@ -210,7 +210,10 @@ final class SegmentCoreReaders {
|
||||
}
|
||||
}
|
||||
}
|
||||
IOUtils.reThrow(th);
|
||||
|
||||
if (th != null) {
|
||||
throw IOUtils.rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,8 +90,9 @@ final class SegmentDocValues {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (t != null) {
|
||||
IOUtils.reThrow(t);
|
||||
throw IOUtils.rethrowAlways(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -303,7 +303,10 @@ public final class SegmentReader extends CodecReader {
|
||||
}
|
||||
}
|
||||
}
|
||||
IOUtils.reThrow(th);
|
||||
|
||||
if (th != null) {
|
||||
throw IOUtils.rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -391,7 +391,9 @@ public final class StandardDirectoryReader extends DirectoryReader {
|
||||
}
|
||||
|
||||
// throw the first exception
|
||||
IOUtils.reThrow(firstExc);
|
||||
if (firstExc != null) {
|
||||
throw IOUtils.rethrowAlways(firstExc);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -504,7 +506,10 @@ public final class StandardDirectoryReader extends DirectoryReader {
|
||||
}
|
||||
}
|
||||
}
|
||||
IOUtils.reThrow(th);
|
||||
|
||||
if (th != null) {
|
||||
throw IOUtils.rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -58,6 +58,16 @@ public abstract class DoubleValuesSource {
|
||||
*/
|
||||
public abstract boolean needsScores();
|
||||
|
||||
/**
|
||||
* An explanation of the value for the named document.
|
||||
*
|
||||
* @param ctx the reader's context to create the {@link Explanation} for.
|
||||
* @param docId the document's id relative to the given context's reader
|
||||
* @return an Explanation for the value
|
||||
* @throws IOException if an {@link IOException} occurs
|
||||
*/
|
||||
public abstract Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException;
|
||||
|
||||
/**
|
||||
* Create a sort field based on the value of this producer
|
||||
* @param reverse true if the sort should be decreasing
|
||||
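A hedged usage sketch of the new explain hook (not from the patch; it assumes an already-open LeafReaderContext ctx and a doc id, and leans on fromDoubleField and the description-carrying function(...) overload shown in the hunks below):

import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.Explanation;

class ExplainSketch {
  // Illustrative only: a field-backed source wrapped by the new function(...)
  // overload, whose description now shows up in the returned Explanation.
  static Explanation describe(LeafReaderContext ctx, int doc) throws IOException {
    DoubleValuesSource price = DoubleValuesSource.fromDoubleField("price");
    DoubleValuesSource boosted = DoubleValuesSource.function(price, "2*price", v -> 2 * v);
    // the score explanation is only consulted by score-dependent sources; pass a stub here
    return boosted.explain(ctx, doc, Explanation.match(1f, "stub score"));
  }
}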
@ -149,6 +159,11 @@ public abstract class DoubleValuesSource {
|
||||
public boolean needsScores() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) {
|
||||
return scoreExplanation;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
@ -176,6 +191,11 @@ public abstract class DoubleValuesSource {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) {
|
||||
return Explanation.match((float) value, "constant(" + value + ")");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "constant(" + value + ")";
|
||||
@ -186,7 +206,7 @@ public abstract class DoubleValuesSource {
|
||||
/**
|
||||
* Creates a DoubleValuesSource that is a function of another DoubleValuesSource
|
||||
*/
|
||||
public static DoubleValuesSource function(DoubleValuesSource in, DoubleUnaryOperator function) {
|
||||
public static DoubleValuesSource function(DoubleValuesSource in, String description, DoubleUnaryOperator function) {
|
||||
return new DoubleValuesSource() {
|
||||
@Override
|
||||
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
|
||||
@ -208,15 +228,22 @@ public abstract class DoubleValuesSource {
|
||||
public boolean needsScores() {
|
||||
return in.needsScores();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException {
|
||||
Explanation inner = in.explain(ctx, docId, scoreExplanation);
|
||||
return Explanation.match((float) function.applyAsDouble(inner.getValue()), description + ", computed from:", inner, scoreExplanation);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a DoubleValuesSource that is a function of another DoubleValuesSource and a score
|
||||
* @param in the DoubleValuesSource to use as an input
|
||||
* @param description a description of the function
|
||||
* @param function a function of the form (source, score) == result
|
||||
*/
|
||||
public static DoubleValuesSource scoringFunction(DoubleValuesSource in, ToDoubleBiFunction<Double, Double> function) {
|
||||
public static DoubleValuesSource scoringFunction(DoubleValuesSource in, String description, ToDoubleBiFunction<Double, Double> function) {
|
||||
return new DoubleValuesSource() {
|
||||
@Override
|
||||
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
|
||||
@ -238,6 +265,13 @@ public abstract class DoubleValuesSource {
|
||||
public boolean needsScores() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException {
|
||||
Explanation inner = in.explain(ctx, docId, scoreExplanation);
|
||||
return Explanation.match((float) function.applyAsDouble((double)inner.getValue(), (double)scoreExplanation.getValue()),
|
||||
description + ", computed from:", inner, scoreExplanation);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@ -303,6 +337,15 @@ public abstract class DoubleValuesSource {
|
||||
public boolean needsScores() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException {
|
||||
DoubleValues values = getValues(ctx, null);
|
||||
if (values.advanceExact(docId))
|
||||
return Explanation.match((float)values.doubleValue(), "double(" + field + ")");
|
||||
else
|
||||
return Explanation.noMatch("double(" + field + ")");
|
||||
}
|
||||
}
|
||||
|
||||
private static class DoubleValuesSortField extends SortField {
|
||||
|
@ -298,7 +298,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
|
||||
try {
|
||||
Query singleton = uniqueQueries.putIfAbsent(query, query);
|
||||
if (singleton == null) {
|
||||
onQueryCache(singleton, LINKED_HASHTABLE_RAM_BYTES_PER_ENTRY + ramBytesUsed(query));
|
||||
onQueryCache(query, LINKED_HASHTABLE_RAM_BYTES_PER_ENTRY + ramBytesUsed(query));
|
||||
} else {
|
||||
query = singleton;
|
||||
}
|
||||
|
@ -96,20 +96,6 @@ public class BM25Similarity extends Similarity {
|
||||
}
|
||||
}
|
||||
|
||||
/** The default implementation encodes <code>1 / sqrt(length)</code>
|
||||
* with {@link SmallFloat#floatToByte315(float)}. This is compatible with
|
||||
* Lucene's historic implementation: {@link ClassicSimilarity}. If you
|
||||
* change this, then you should change {@link #decodeNormValue(byte)} to match. */
|
||||
protected byte encodeNormValue(int fieldLength) {
|
||||
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength)));
|
||||
}
|
||||
|
||||
/** The default implementation returns <code>1 / f<sup>2</sup></code>
|
||||
* where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
|
||||
protected float decodeNormValue(byte b) {
|
||||
return NORM_TABLE[b & 0xFF];
|
||||
}
|
||||
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position increment of zero) are
|
||||
* discounted from the document's length.
|
||||
@ -132,21 +118,31 @@ public class BM25Similarity extends Similarity {
|
||||
}
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] NORM_TABLE = new float[256];
|
||||
private static final float[] OLD_LENGTH_TABLE = new float[256];
|
||||
private static final float[] LENGTH_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 1; i < 256; i++) {
|
||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||
NORM_TABLE[i] = 1.0f / (f*f);
|
||||
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
|
||||
}
|
||||
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
|
||||
|
||||
for (int i = 0; i < 256; i++) {
|
||||
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
|
||||
}
|
||||
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
|
||||
return encodeNormValue(numTerms);
|
||||
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
|
||||
if (indexCreatedVersionMajor >= 7) {
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
} else {
|
||||
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
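A small stand-alone sketch (not part of the patch) of what the two tables above decode: pre-7.0 norms store floatToByte315(1/sqrt(length)) and come back through 1/(f*f), while 7.0+ norms store the field length itself, lossily compressed with intToByte4/byte4ToInt:

import org.apache.lucene.util.SmallFloat;

class NormEncodingSketch {
  public static void main(String[] args) {
    int length = 7;                                        // number of indexed terms in the field

    // 7.0+ encoding: store the length itself, compressed to one byte
    byte newNorm = SmallFloat.intToByte4(length);
    int decodedLength = SmallFloat.byte4ToInt(newNorm);    // 7 again; small lengths round-trip exactly

    // pre-7.0 encoding: store 1/sqrt(length) as a 3-bit-mantissa float
    byte oldNorm = SmallFloat.floatToByte315((float) (1 / Math.sqrt(length)));
    float f = SmallFloat.byte315ToFloat(oldNorm);
    float approxLength = 1.0f / (f * f);                   // roughly 7, with precision loss

    System.out.println(decodedLength + " vs " + approxLength);
  }
}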
@ -207,34 +203,43 @@ public class BM25Similarity extends Similarity {
|
||||
@Override
|
||||
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
|
||||
|
||||
float avgdl = avgFieldLength(collectionStats);
|
||||
|
||||
// compute freq-independent part of bm25 equation across all norm values
|
||||
float cache[] = new float[256];
|
||||
float[] oldCache = new float[256];
|
||||
float[] cache = new float[256];
|
||||
for (int i = 0; i < cache.length; i++) {
|
||||
cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
|
||||
oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl);
|
||||
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
|
||||
}
|
||||
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
|
||||
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
BM25Stats bm25stats = (BM25Stats) stats;
|
||||
return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
|
||||
return new BM25DocScorer(bm25stats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(bm25stats.field));
|
||||
}
|
||||
|
||||
private class BM25DocScorer extends SimScorer {
|
||||
private final BM25Stats stats;
|
||||
private final float weightValue; // boost * idf * (k1 + 1)
|
||||
private final NumericDocValues norms;
|
||||
/** precomputed cache for all length values */
|
||||
private final float[] lengthCache;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
|
||||
private final float[] cache;
|
||||
|
||||
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
|
||||
BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.weight * (k1 + 1);
|
||||
this.cache = stats.cache;
|
||||
this.norms = norms;
|
||||
if (indexCreatedVersionMajor >= 7) {
|
||||
lengthCache = LENGTH_TABLE;
|
||||
cache = stats.cache;
|
||||
} else {
|
||||
lengthCache = OLD_LENGTH_TABLE;
|
||||
cache = stats.oldCache;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -245,7 +250,7 @@ public class BM25Similarity extends Similarity {
|
||||
norm = k1;
|
||||
} else {
|
||||
if (norms.advanceExact(doc)) {
|
||||
norm = cache[(byte)norms.longValue() & 0xFF];
|
||||
norm = cache[((byte) norms.longValue()) & 0xFF];
|
||||
} else {
|
||||
norm = cache[0];
|
||||
}
|
||||
@ -255,7 +260,7 @@ public class BM25Similarity extends Similarity {
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return explainScore(doc, freq, stats, norms);
|
||||
return explainScore(doc, freq, stats, norms, lengthCache);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -281,21 +286,23 @@ public class BM25Similarity extends Similarity {
|
||||
private final float weight;
|
||||
/** field name, for pulling norms */
|
||||
private final String field;
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
|
||||
private final float cache[];
|
||||
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
|
||||
* for both OLD_LENGTH_TABLE and LENGTH_TABLE */
|
||||
private final float[] oldCache, cache;
|
||||
|
||||
BM25Stats(String field, float boost, Explanation idf, float avgdl, float cache[]) {
|
||||
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
|
||||
this.field = field;
|
||||
this.boost = boost;
|
||||
this.idf = idf;
|
||||
this.avgdl = avgdl;
|
||||
this.cache = cache;
|
||||
this.weight = idf.getValue() * boost;
|
||||
this.oldCache = oldCache;
|
||||
this.cache = cache;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
|
||||
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
subs.add(freq);
|
||||
subs.add(Explanation.match(k1, "parameter k1"));
|
||||
@ -311,7 +318,7 @@ public class BM25Similarity extends Similarity {
|
||||
} else {
|
||||
norm = 0;
|
||||
}
|
||||
float doclen = decodeNormValue(norm);
|
||||
float doclen = lengthCache[norm & 0xff];
|
||||
subs.add(Explanation.match(b, "parameter b"));
|
||||
subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
|
||||
subs.add(Explanation.match(doclen, "fieldLength"));
|
||||
@ -321,13 +328,13 @@ public class BM25Similarity extends Similarity {
|
||||
}
|
||||
}
|
||||
|
||||
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
|
||||
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
|
||||
Explanation boostExpl = Explanation.match(stats.boost, "boost");
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
if (boostExpl.getValue() != 1.0f)
|
||||
subs.add(boostExpl);
|
||||
subs.add(stats.idf);
|
||||
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms);
|
||||
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
|
||||
subs.add(tfNormExpl);
|
||||
return Explanation.match(
|
||||
boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
|
||||
|
@ -17,91 +17,27 @@
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
/**
|
||||
* Expert: Default scoring implementation which {@link #encodeNormValue(float)
|
||||
* encodes} norm values as a single byte before being stored. At search time,
|
||||
* the norm byte value is read from the index
|
||||
* {@link org.apache.lucene.store.Directory directory} and
|
||||
* {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
|
||||
* This encoding/decoding, while reducing index size, comes with the price of
|
||||
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
|
||||
* instance, <i>decode(encode(0.89)) = 0.875</i>.
|
||||
* <p>
|
||||
* Compression of norm values to a single byte saves memory at search time,
|
||||
* because once a field is referenced at search time, its norms - for all
|
||||
* documents - are maintained in memory.
|
||||
* <p>
|
||||
* The rationale supporting such lossy compression of norm values is that given
|
||||
* the difficulty (and inaccuracy) of users to express their true information
|
||||
* need by a query, only big differences matter. <br>
|
||||
* <br>
|
||||
* Last, note that search time is too late to modify this <i>norm</i> part of
|
||||
* scoring, e.g. by using a different {@link Similarity} for search.
|
||||
* Expert: Historical scoring implementation. You might want to consider using
|
||||
* {@link BM25Similarity} instead, which is generally considered superior to
|
||||
* TF-IDF.
|
||||
*/
|
||||
public class ClassicSimilarity extends TFIDFSimilarity {
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public ClassicSimilarity() {}
|
||||
|
||||
/**
|
||||
* Encodes a normalization factor for storage in an index.
|
||||
* <p>
|
||||
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
|
||||
* zero-exponent point at 15, thus representing values from around 7x10^9 to
|
||||
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
|
||||
* represented. Negative numbers are rounded up to zero. Values too large to
|
||||
* represent are rounded down to the largest representable value. Positive
|
||||
* values too small to represent are rounded up to the smallest positive
|
||||
* representable value.
|
||||
*
|
||||
* @see org.apache.lucene.util.SmallFloat
|
||||
*/
|
||||
@Override
|
||||
public final long encodeNormValue(float f) {
|
||||
return SmallFloat.floatToByte315(f);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes the norm value, assuming it is a single byte.
|
||||
*
|
||||
* @see #encodeNormValue(float)
|
||||
*/
|
||||
@Override
|
||||
public final float decodeNormValue(long norm) {
|
||||
return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
|
||||
}
|
||||
|
||||
/** Implemented as
|
||||
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
|
||||
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
|
||||
* #setDiscountOverlaps} is false, else it's {@link
|
||||
* FieldInvertState#getLength()} - {@link
|
||||
* FieldInvertState#getNumOverlap()}.
|
||||
* <code>1/sqrt(length)</code>.
|
||||
*
|
||||
* @lucene.experimental */
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (discountOverlaps)
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
numTerms = state.getLength();
|
||||
public float lengthNorm(int numTerms) {
|
||||
return (float) (1.0 / Math.sqrt(numTerms));
|
||||
}
|
||||
|
||||
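For illustration (numbers are not from the patch): a field with five positions, one of them an overlap token, gives numTerms = 5 - 1 = 4 when discountOverlaps is on, so the simplified lengthNorm(4) above returns 1/sqrt(4) = 0.5; TFIDFSimilarity.computeNorm further below then stores floatToByte315(0.5) for pre-7.0 style norms, or the raw length 4 via intToByte4 for 7.0+ indexes.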
@ -138,33 +74,6 @@ public class ClassicSimilarity extends TFIDFSimilarity {
|
||||
public float idf(long docFreq, long docCount) {
|
||||
return (float)(Math.log((docCount+1)/(double)(docFreq+1)) + 1.0);
|
||||
}
|
||||
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position increment of zero) are
|
||||
* discounted from the document's length.
|
||||
*/
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/** Determines whether overlap tokens (Tokens with
|
||||
* 0 position increment) are ignored when computing
|
||||
* norm. By default this is true, meaning overlap
|
||||
* tokens do not count when computing norms.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
* @see #setDiscountOverlaps
|
||||
*/
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
@ -190,7 +190,8 @@ public abstract class SimilarityBase extends Similarity {
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
int indexCreatedVersionMajor = context.reader().getMetaData().getCreatedVersionMajor();
|
||||
if (stats instanceof MultiSimilarity.MultiStats) {
|
||||
// a multi term query (e.g. phrase). return the summation,
|
||||
// scoring almost as if it were boolean query
|
||||
@ -198,12 +199,12 @@ public abstract class SimilarityBase extends Similarity {
|
||||
SimScorer subScorers[] = new SimScorer[subStats.length];
|
||||
for (int i = 0; i < subScorers.length; i++) {
|
||||
BasicStats basicstats = (BasicStats) subStats[i];
|
||||
subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
|
||||
subScorers[i] = new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
|
||||
}
|
||||
return new MultiSimilarity.MultiSimScorer(subScorers);
|
||||
} else {
|
||||
BasicStats basicstats = (BasicStats) stats;
|
||||
return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
|
||||
return new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
|
||||
}
|
||||
}
|
||||
|
||||
@ -216,40 +217,38 @@ public abstract class SimilarityBase extends Similarity {
|
||||
|
||||
// ------------------------------ Norm handling ------------------------------
|
||||
|
||||
/** Norm to document length map. */
|
||||
private static final float[] NORM_TABLE = new float[256];
|
||||
/** Cache of decoded bytes. */
|
||||
private static final float[] OLD_LENGTH_TABLE = new float[256];
|
||||
private static final float[] LENGTH_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 1; i < 256; i++) {
|
||||
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
|
||||
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
|
||||
float f = SmallFloat.byte315ToFloat((byte)i);
|
||||
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
|
||||
}
|
||||
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
|
||||
|
||||
for (int i = 0; i < 256; i++) {
|
||||
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
|
||||
}
|
||||
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
|
||||
}
|
||||
|
||||
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
|
||||
/** Encodes the document length in the same way as {@link BM25Similarity}. */
|
||||
@Override
|
||||
public long computeNorm(FieldInvertState state) {
|
||||
final float numTerms;
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
final int numTerms;
|
||||
if (discountOverlaps)
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
numTerms = state.getLength();
|
||||
return encodeNormValue(numTerms);
|
||||
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
|
||||
if (indexCreatedVersionMajor >= 7) {
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
} else {
|
||||
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
|
||||
}
|
||||
}
|
||||
|
||||
/** Decodes a normalization factor (document length) stored in an index.
|
||||
* @see #encodeNormValue(float)
|
||||
*/
|
||||
protected float decodeNormValue(byte norm) {
|
||||
return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
|
||||
}
|
||||
|
||||
/** Encodes the length to a byte via SmallFloat. */
|
||||
protected byte encodeNormValue(float length) {
|
||||
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(length)));
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------- Static methods ------------------------------
|
||||
|
||||
/** Returns the base two logarithm of {@code x}. */
|
||||
@ -266,35 +265,37 @@ public abstract class SimilarityBase extends Similarity {
|
||||
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
|
||||
* respectively.
|
||||
*/
|
||||
private class BasicSimScorer extends SimScorer {
|
||||
final class BasicSimScorer extends SimScorer {
|
||||
private final BasicStats stats;
|
||||
private final NumericDocValues norms;
|
||||
private final float[] normCache;
|
||||
|
||||
BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
|
||||
BasicSimScorer(BasicStats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
|
||||
this.stats = stats;
|
||||
this.norms = norms;
|
||||
this.normCache = indexCreatedVersionMajor >= 7 ? LENGTH_TABLE : OLD_LENGTH_TABLE;
|
||||
}
|
||||
|
||||
private float getNormValue(int doc) throws IOException {
|
||||
float getLengthValue(int doc) throws IOException {
|
||||
if (norms == null) {
|
||||
return 1F;
|
||||
}
|
||||
if (norms.advanceExact(doc)) {
|
||||
return decodeNormValue((byte) norms.longValue());
|
||||
return normCache[Byte.toUnsignedInt((byte) norms.longValue())];
|
||||
} else {
|
||||
return decodeNormValue((byte) 0);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
// We have to supply something in case norms are omitted
|
||||
return SimilarityBase.this.score(stats, freq, getNormValue(doc));
|
||||
return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return SimilarityBase.this.explain(stats, doc, freq, getNormValue(doc));
|
||||
return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.SmallFloat;
|
||||
|
||||
|
||||
/**
|
||||
@ -233,11 +234,6 @@ import org.apache.lucene.util.BytesRef;
|
||||
* And this is exactly what normalizing the query vector <i>V(q)</i>
|
||||
* provides: comparability (to a certain extent) of two or more queries.
|
||||
* </li>
|
||||
*
|
||||
* <li>Applying query normalization on the scores helps to keep the
|
||||
* scores around the unit vector, hence preventing loss of score data
|
||||
* because of floating point precision limitations.
|
||||
* </li>
|
||||
* </ul>
|
||||
* </li>
|
||||
*
|
||||
@ -379,13 +375,49 @@ import org.apache.lucene.util.BytesRef;
|
||||
* @see IndexSearcher#setSimilarity(Similarity)
|
||||
*/
|
||||
public abstract class TFIDFSimilarity extends Similarity {
|
||||
|
||||
|
||||
/** Cache of decoded bytes. */
|
||||
static final float[] OLD_NORM_TABLE = new float[256];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < 256; i++) {
|
||||
OLD_NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass
|
||||
* constructors, typically implicit.)
|
||||
*/
|
||||
public TFIDFSimilarity() {}
|
||||
|
||||
|
||||
/**
|
||||
* True if overlap tokens (tokens with a position increment of zero) are
|
||||
* discounted from the document's length.
|
||||
*/
|
||||
protected boolean discountOverlaps = true;
|
||||
|
||||
/** Determines whether overlap tokens (Tokens with
|
||||
* 0 position increment) are ignored when computing
|
||||
* norm. By default this is true, meaning overlap
|
||||
* tokens do not count when computing norms.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*
|
||||
* @see #computeNorm
|
||||
*/
|
||||
public void setDiscountOverlaps(boolean v) {
|
||||
discountOverlaps = v;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if overlap tokens are discounted from the document's length.
|
||||
* @see #setDiscountOverlaps
|
||||
*/
|
||||
public boolean getDiscountOverlaps() {
|
||||
return discountOverlaps;
|
||||
}
|
||||
|
||||
/** Computes a score factor based on a term or phrase's frequency in a
|
||||
* document. This value is multiplied by the {@link #idf(long, long)}
|
||||
* factor for each term in the query and these products are then summed to
|
||||
@ -471,30 +503,25 @@ public abstract class TFIDFSimilarity extends Similarity {
|
||||
|
||||
/**
|
||||
* Compute an index-time normalization value for this field instance.
|
||||
* <p>
|
||||
* This value will be stored in a single byte lossy representation by
|
||||
* {@link #encodeNormValue(float)}.
|
||||
*
|
||||
* @param state statistics of the current field (such as length, boost, etc)
|
||||
* @return an index-time normalization value
|
||||
* @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) discounting overlaps}
|
||||
* @return a length normalization value
|
||||
*/
|
||||
public abstract float lengthNorm(FieldInvertState state);
|
||||
public abstract float lengthNorm(int length);
|
||||
|
||||
@Override
|
||||
public final long computeNorm(FieldInvertState state) {
|
||||
float normValue = lengthNorm(state);
|
||||
return encodeNormValue(normValue);
|
||||
final int numTerms;
|
||||
if (discountOverlaps)
|
||||
numTerms = state.getLength() - state.getNumOverlap();
|
||||
else
|
||||
numTerms = state.getLength();
|
||||
if (state.getIndexCreatedVersionMajor() >= 7) {
|
||||
return SmallFloat.intToByte4(numTerms);
|
||||
} else {
|
||||
return SmallFloat.floatToByte315(lengthNorm(numTerms));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decodes a normalization factor stored in an index.
|
||||
*
|
||||
* @see #encodeNormValue(float)
|
||||
*/
|
||||
public abstract float decodeNormValue(long norm);
|
||||
|
||||
/** Encodes a normalization factor for storage in an index. */
|
||||
public abstract long encodeNormValue(float f);
|
||||
|
||||
/** Computes the amount of a sloppy phrase match, based on an edit distance.
|
||||
* This value is summed for each sloppy phrase match in a document to form
|
||||
@ -529,24 +556,41 @@ public abstract class TFIDFSimilarity extends Similarity {
|
||||
final Explanation idf = termStats.length == 1
|
||||
? idfExplain(collectionStats, termStats[0])
|
||||
: idfExplain(collectionStats, termStats);
|
||||
return new IDFStats(collectionStats.field(), boost, idf);
|
||||
float[] normTable = new float[256];
|
||||
for (int i = 1; i < 256; ++i) {
|
||||
int length = SmallFloat.byte4ToInt((byte) i);
|
||||
float norm = lengthNorm(length);
|
||||
normTable[i] = norm;
|
||||
}
|
||||
normTable[0] = 1f / normTable[255];
|
||||
return new IDFStats(collectionStats.field(), boost, idf, normTable);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
|
||||
IDFStats idfstats = (IDFStats) stats;
|
||||
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
|
||||
final float[] normTable;
|
||||
if (context.reader().getMetaData().getCreatedVersionMajor() >= 7) {
|
||||
// the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
|
||||
normTable = idfstats.normTable;
|
||||
} else {
|
||||
// the norm is directly encoded in the index
|
||||
normTable = OLD_NORM_TABLE;
|
||||
}
|
||||
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
|
||||
}
|
||||
|
||||
private final class TFIDFSimScorer extends SimScorer {
|
||||
private final IDFStats stats;
|
||||
private final float weightValue;
|
||||
private final NumericDocValues norms;
|
||||
private final float[] normTable;
|
||||
|
||||
TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
|
||||
TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
|
||||
this.stats = stats;
|
||||
this.weightValue = stats.queryWeight;
|
||||
this.norms = norms;
|
||||
this.normTable = normTable;
|
||||
}
|
||||
|
||||
@Override
|
||||
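A hedged stand-alone rendering of the table the computeWeight hunk above builds (identifiers are illustrative): with 7.0+ norms the stored byte is a compressed length, so each TFIDFSimilarity subclass needs its own byte-to-norm table derived from its lengthNorm, while for older indexes the byte already is the encoded norm and OLD_NORM_TABLE applies.

import org.apache.lucene.util.SmallFloat;

class NormTableSketch {
  // mirrors ClassicSimilarity.lengthNorm(int)
  static float lengthNorm(int numTerms) {
    return (float) (1.0 / Math.sqrt(numTerms));
  }

  static float[] buildNormTable() {
    float[] normTable = new float[256];
    for (int i = 1; i < 256; ++i) {
      normTable[i] = lengthNorm(SmallFloat.byte4ToInt((byte) i));
    }
    normTable[0] = 1f / normTable[255];   // same guard against a zero-length entry as the patch
    return normTable;
  }

  public static void main(String[] args) {
    float[] table = buildNormTable();
    byte storedNorm = SmallFloat.intToByte4(4);     // a 7.0+ norm byte for a 4-term field
    System.out.println(table[storedNorm & 0xFF]);   // 0.5 = 1/sqrt(4)
  }
}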
@ -556,13 +600,13 @@ public abstract class TFIDFSimilarity extends Similarity {
|
||||
if (norms == null) {
|
||||
return raw;
|
||||
} else {
|
||||
long normValue;
|
||||
float normValue;
|
||||
if (norms.advanceExact(doc)) {
|
||||
normValue = norms.longValue();
|
||||
normValue = normTable[(int) (norms.longValue() & 0xFF)];
|
||||
} else {
|
||||
normValue = 0;
|
||||
}
|
||||
return raw * decodeNormValue(normValue); // normalize for field
|
||||
return raw * normValue; // normalize for field
|
||||
}
|
||||
}
|
||||
|
||||
@ -578,35 +622,39 @@ public abstract class TFIDFSimilarity extends Similarity {
|
||||
|
||||
@Override
|
||||
public Explanation explain(int doc, Explanation freq) throws IOException {
|
||||
return explainScore(doc, freq, stats, norms);
|
||||
return explainScore(doc, freq, stats, norms, normTable);
|
||||
}
|
||||
}
|
||||
|
||||
/** Collection statistics for the TF-IDF model. The only statistic of interest
|
||||
* to this model is idf. */
|
||||
private static class IDFStats extends SimWeight {
|
||||
static class IDFStats extends SimWeight {
|
||||
private final String field;
|
||||
/** The idf and its explanation */
|
||||
private final Explanation idf;
|
||||
private final float boost;
|
||||
private final float queryWeight;
|
||||
final float[] normTable;
|
||||
|
||||
public IDFStats(String field, float boost, Explanation idf) {
|
||||
public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
|
||||
// TODO: Validate?
|
||||
this.field = field;
|
||||
this.idf = idf;
|
||||
this.boost = boost;
|
||||
this.queryWeight = boost * idf.getValue();
|
||||
this.normTable = normTable;
|
||||
}
|
||||
}
|
||||
|
||||
private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
|
||||
private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
|
||||
Explanation tfExplanation = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
|
||||
float norm;
|
||||
if (norms != null && norms.advanceExact(doc)) {
|
||||
norm = decodeNormValue(norms.longValue());
|
||||
} else {
|
||||
if (norms == null) {
|
||||
norm = 1f;
|
||||
} else if (norms.advanceExact(doc) == false) {
|
||||
norm = 0f;
|
||||
} else {
|
||||
norm = normTable[(int) (norms.longValue() & 0xFF)];
|
||||
}
|
||||
|
||||
Explanation fieldNormExpl = Explanation.match(
|
||||
@ -619,9 +667,9 @@ public abstract class TFIDFSimilarity extends Similarity {
|
||||
tfExplanation, stats.idf, fieldNormExpl);
|
||||
}
|
||||
|
||||
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
|
||||
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
|
||||
Explanation queryExpl = Explanation.match(stats.boost, "boost");
|
||||
Explanation fieldExpl = explainField(doc, freq, stats, norms);
|
||||
Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
|
||||
if (stats.boost == 1f) {
|
||||
return fieldExpl;
|
||||
}
|
||||
|
@ -215,7 +215,7 @@ public abstract class FSDirectory extends BaseDirectory {
|
||||
try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) {
|
||||
for (Path path : stream) {
|
||||
String name = path.getFileName().toString();
|
||||
if (skipNames != null && skipNames.contains(name) == false) {
|
||||
if (skipNames == null || skipNames.contains(name) == false) {
|
||||
entries.add(name);
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ package org.apache.lucene.util;
|
||||
import java.lang.invoke.MethodHandle;
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.lang.invoke.MethodType;
|
||||
import java.lang.reflect.UndeclaredThrowableException;
|
||||
|
||||
/**
|
||||
* An AttributeFactory creates instances of {@link AttributeImpl}s.
|
||||
@ -28,8 +29,14 @@ public abstract class AttributeFactory {
|
||||
|
||||
/**
|
||||
* Returns an {@link AttributeImpl} for the supplied {@link Attribute} interface class.
|
||||
*
|
||||
* @throws UndeclaredThrowableException A wrapper runtime exception thrown if the
|
||||
* constructor of the attribute class throws a checked exception.
|
||||
* Note that attributes should not throw or declare
|
||||
* checked exceptions; this may be verified and fail early in the future.
|
||||
*/
|
||||
public abstract AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass);
|
||||
public abstract AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass)
|
||||
throws UndeclaredThrowableException;
|
||||
|
||||
/**
|
||||
* Returns a correctly typed {@link MethodHandle} for the no-arg ctor of the given class.
|
||||
@ -61,17 +68,18 @@ public abstract class AttributeFactory {
|
||||
};
|
||||
|
||||
DefaultAttributeFactory() {}
|
||||
|
||||
|
||||
@Override
|
||||
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
|
||||
try {
|
||||
return (AttributeImpl) constructors.get(attClass).invokeExact();
|
||||
} catch (Throwable t) {
|
||||
rethrow(t);
|
||||
throw new AssertionError();
|
||||
} catch (Error | RuntimeException e) {
|
||||
throw e;
|
||||
} catch (Throwable e) {
|
||||
throw new UndeclaredThrowableException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Class<? extends AttributeImpl> findImplClass(Class<? extends Attribute> attClass) {
|
||||
try {
|
||||
return Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader()).asSubclass(AttributeImpl.class);
|
||||
@ -138,23 +146,12 @@ public abstract class AttributeFactory {
|
||||
protected A createInstance() {
|
||||
try {
|
||||
return (A) constr.invokeExact();
|
||||
} catch (Throwable t) {
|
||||
rethrow(t);
|
||||
throw new AssertionError();
|
||||
} catch (Error | RuntimeException e) {
|
||||
throw e;
|
||||
} catch (Throwable e) {
|
||||
throw new UndeclaredThrowableException(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Hack to rethrow unknown Exceptions from {@link MethodHandle#invoke}:
|
||||
// TODO: remove the impl in test-framework, this one is more elegant :-)
|
||||
static void rethrow(Throwable t) {
|
||||
AttributeFactory.<Error>rethrow0(t);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static <T extends Throwable> void rethrow0(Throwable t) throws T {
|
||||
throw (T) t;
|
||||
}
|
||||
|
||||
}
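
With this change, createAttributeInstance wraps a checked exception thrown by an attribute's no-arg constructor in UndeclaredThrowableException instead of sneaky-rethrowing it. A small sketch of how calling code might surface the original cause; CharTermAttribute is used only as an example interface, and the catch branch is reached only for a misbehaving attribute implementation:

    import java.lang.reflect.UndeclaredThrowableException;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.AttributeFactory;

    class AttributeFactorySketch {
      public static void main(String[] args) {
        try {
          // DEFAULT_ATTRIBUTE_FACTORY resolves CharTermAttributeImpl reflectively.
          AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(CharTermAttribute.class);
        } catch (UndeclaredThrowableException e) {
          // Only thrown when the attribute's constructor threw a checked exception.
          System.err.println("attribute ctor failed: " + e.getCause());
        }
      }
    }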
@ -96,7 +96,9 @@ public final class IOUtils {
|
||||
}
|
||||
}
|
||||
|
||||
reThrow(th);
|
||||
if (th != null) {
|
||||
throw rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -229,7 +231,9 @@ public final class IOUtils {
|
||||
}
|
||||
}
|
||||
|
||||
reThrow(th);
|
||||
if (th != null) {
|
||||
throw rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
|
||||
public static void deleteFiles(Directory dir, String... files) throws IOException {
|
||||
@ -300,7 +304,9 @@ public final class IOUtils {
|
||||
}
|
||||
}
|
||||
|
||||
reThrow(th);
|
||||
if (th != null) {
|
||||
throw rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -376,37 +382,83 @@ public final class IOUtils {
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple utility method that takes a previously caught
|
||||
* {@code Throwable} and rethrows either {@code
|
||||
* IOException} or an unchecked exception. If the
|
||||
* argument is null then this method does nothing.
|
||||
* This utility method takes a previously caught (non-null)
|
||||
* {@code Throwable} and rethrows either the original argument
|
||||
   * if it was a subclass of {@code IOException}, or a
   * {@code RuntimeException} with the cause set to the argument.
|
||||
*
|
||||
* <p>This method <strong>never returns any value</strong>, even though it declares
|
||||
* a return value of type {@link Error}. The return value declaration
|
||||
* is very useful to let the compiler know that the code path following
|
||||
* the invocation of this method is unreachable. So in most cases the
|
||||
* invocation of this method will be guarded by an {@code if} and
|
||||
* used together with a {@code throw} statement, as in:
|
||||
* </p>
|
||||
* <pre>{@code
|
||||
   * if (t != null) throw IOUtils.rethrowAlways(t);
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param th The throwable to rethrow, <strong>must not be null</strong>.
|
||||
   * @return This method always results in an exception; it never returns any value.
   *         See the method documentation for details and a usage example.
|
||||
* @throws IOException if the argument was an instance of IOException
|
||||
* @throws RuntimeException with the {@link RuntimeException#getCause()} set
|
||||
* to the argument, if it was not an instance of IOException.
|
||||
*/
|
||||
public static void reThrow(Throwable th) throws IOException {
|
||||
if (th != null) {
|
||||
if (th instanceof IOException) {
|
||||
throw (IOException) th;
|
||||
}
|
||||
reThrowUnchecked(th);
|
||||
public static Error rethrowAlways(Throwable th) throws IOException, RuntimeException {
|
||||
if (th == null) {
|
||||
throw new AssertionError("rethrow argument must not be null.");
|
||||
}
|
||||
|
||||
if (th instanceof IOException) {
|
||||
throw (IOException) th;
|
||||
}
|
||||
|
||||
if (th instanceof RuntimeException) {
|
||||
throw (RuntimeException) th;
|
||||
}
|
||||
|
||||
if (th instanceof Error) {
|
||||
throw (Error) th;
|
||||
}
|
||||
|
||||
throw new RuntimeException(th);
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple utility method that takes a previously caught
|
||||
* {@code Throwable} and rethrows it as an unchecked exception.
|
||||
* If the argument is null then this method does nothing.
|
||||
* Rethrows the argument as {@code IOException} or {@code RuntimeException}
|
||||
* if it's not null.
|
||||
*
|
||||
* @deprecated This method is deprecated in favor of {@link #rethrowAlways}. Code should
|
||||
* be updated to {@link #rethrowAlways} and guarded with an additional null-argument check
|
||||
* (because {@link #rethrowAlways} is not accepting null arguments).
|
||||
*/
|
||||
@Deprecated
|
||||
public static void reThrow(Throwable th) throws IOException {
|
||||
if (th != null) {
|
||||
throw rethrowAlways(th);
|
||||
}
|
||||
}
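
As the deprecation notes say, call sites of the old reThrow(th) are expected to add their own null check around rethrowAlways. A hedged sketch of the migrated call-site pattern, using a simplified close-and-rethrow helper (the helper itself is invented; only the IOUtils call is from the patch):

    import java.io.IOException;
    import org.apache.lucene.util.IOUtils;

    class RethrowMigrationSketch {
      static void closeAll(AutoCloseable... objects) throws IOException {
        Throwable th = null;
        for (AutoCloseable o : objects) {
          try {
            if (o != null) o.close();
          } catch (Throwable t) {
            if (th == null) th = t;   // remember only the first failure
          }
        }
        // Old pattern (deprecated): IOUtils.reThrow(th) silently ignored a null argument.
        // New pattern: the null check moves to the caller, and the declared Error return
        // type lets the compiler see that nothing runs after the throw.
        if (th != null) {
          throw IOUtils.rethrowAlways(th);
        }
      }
    }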
|
||||
|
||||
/**
|
||||
* @deprecated This method is deprecated in favor of {@link #rethrowAlways}. Code should
|
||||
* be updated to {@link #rethrowAlways} and guarded with an additional null-argument check
|
||||
* (because {@link #rethrowAlways} is not accepting null arguments).
|
||||
*/
|
||||
@Deprecated
|
||||
public static void reThrowUnchecked(Throwable th) {
|
||||
if (th != null) {
|
||||
if (th instanceof RuntimeException) {
|
||||
throw (RuntimeException) th;
|
||||
}
|
||||
if (th instanceof Error) {
|
||||
throw (Error) th;
|
||||
}
|
||||
if (th instanceof RuntimeException) {
|
||||
throw (RuntimeException) th;
|
||||
}
|
||||
throw new RuntimeException(th);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Ensure that any writes to the given file is written to the storage device that contains it.
|
||||
* @param fileToSync the file to fsync
|
||||
|
@ -24,7 +24,12 @@ import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.store.ChecksumIndexInput;
|
||||
@ -73,6 +78,9 @@ public class OfflineSorter {
|
||||
private final int valueLength;
|
||||
private final String tempFileNamePrefix;
|
||||
|
||||
private final ExecutorService exec;
|
||||
private final Semaphore partitionsInRAM;
|
||||
|
||||
/**
|
||||
* A bit more descriptive unit for constructors.
|
||||
*
|
||||
@ -145,13 +153,13 @@ public class OfflineSorter {
|
||||
/** number of lines of data read */
|
||||
public int lineCount;
|
||||
/** time spent merging sorted partitions (in milliseconds) */
|
||||
public long mergeTime;
|
||||
public final AtomicLong mergeTimeMS = new AtomicLong();
|
||||
/** time spent sorting data (in milliseconds) */
|
||||
public long sortTime;
|
||||
public final AtomicLong sortTimeMS = new AtomicLong();
|
||||
/** total time spent (in milliseconds) */
|
||||
public long totalTime;
|
||||
public long totalTimeMS;
|
||||
/** time spent in i/o read (in milliseconds) */
|
||||
public long readTime;
|
||||
public long readTimeMS;
|
||||
/** read buffer size (in bytes) */
|
||||
public final long bufferSize = ramBufferSize.bytes;
|
||||
|
||||
@ -161,17 +169,15 @@ public class OfflineSorter {
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format(Locale.ROOT,
|
||||
"time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
|
||||
totalTime / 1000.0d, readTime / 1000.0d, sortTime / 1000.0d, mergeTime / 1000.0d,
|
||||
lineCount, tempMergeFiles, mergeRounds,
|
||||
(double) bufferSize / MB);
|
||||
"time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
|
||||
totalTimeMS / 1000.0d, readTimeMS / 1000.0d, sortTimeMS.get() / 1000.0d, mergeTimeMS.get() / 1000.0d,
|
||||
lineCount, tempMergeFiles, mergeRounds,
|
||||
(double) bufferSize / MB);
|
||||
}
|
||||
}
|
||||
|
||||
private final BufferSize ramBufferSize;
|
||||
|
||||
private final Counter bufferBytesUsed = Counter.newCounter();
|
||||
private final SortableBytesRefArray buffer;
|
||||
SortInfo sortInfo;
|
||||
private int maxTempFiles;
|
||||
private final Comparator<BytesRef> comparator;
|
||||
@ -185,7 +191,7 @@ public class OfflineSorter {
|
||||
* @see BufferSize#automatic()
|
||||
*/
|
||||
public OfflineSorter(Directory dir, String tempFileNamePrefix) throws IOException {
|
||||
this(dir, tempFileNamePrefix, DEFAULT_COMPARATOR, BufferSize.automatic(), MAX_TEMPFILES, -1);
|
||||
this(dir, tempFileNamePrefix, DEFAULT_COMPARATOR, BufferSize.automatic(), MAX_TEMPFILES, -1, null, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -194,14 +200,30 @@ public class OfflineSorter {
|
||||
* @see BufferSize#automatic()
|
||||
*/
|
||||
public OfflineSorter(Directory dir, String tempFileNamePrefix, Comparator<BytesRef> comparator) throws IOException {
|
||||
this(dir, tempFileNamePrefix, comparator, BufferSize.automatic(), MAX_TEMPFILES, -1);
|
||||
this(dir, tempFileNamePrefix, comparator, BufferSize.automatic(), MAX_TEMPFILES, -1, null, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* All-details constructor. If {@code valueLength} is -1 (the default), the length of each value differs; otherwise,
|
||||
* all values have the specified length.
|
||||
* all values have the specified length. If you pass a non-null {@code ExecutorService} then it will be
|
||||
* used to run sorting operations that can be run concurrently, and maxPartitionsInRAM is the maximum
|
||||
* concurrent in-memory partitions. Thus the maximum possible RAM used by this class while sorting is
|
||||
* {@code maxPartitionsInRAM * ramBufferSize}.
|
||||
*/
|
||||
public OfflineSorter(Directory dir, String tempFileNamePrefix, Comparator<BytesRef> comparator, BufferSize ramBufferSize, int maxTempfiles, int valueLength) {
|
||||
public OfflineSorter(Directory dir, String tempFileNamePrefix, Comparator<BytesRef> comparator,
|
||||
BufferSize ramBufferSize, int maxTempfiles, int valueLength, ExecutorService exec,
|
||||
int maxPartitionsInRAM) {
|
||||
if (exec != null) {
|
||||
this.exec = exec;
|
||||
if (maxPartitionsInRAM <= 0) {
|
||||
throw new IllegalArgumentException("maxPartitionsInRAM must be > 0; got " + maxPartitionsInRAM);
|
||||
}
|
||||
} else {
|
||||
this.exec = new SameThreadExecutorService();
|
||||
maxPartitionsInRAM = 1;
|
||||
}
|
||||
this.partitionsInRAM = new Semaphore(maxPartitionsInRAM);
|
||||
|
||||
if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
|
||||
throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
|
||||
}
|
||||
@ -209,14 +231,11 @@ public class OfflineSorter {
|
||||
if (maxTempfiles < 2) {
|
||||
throw new IllegalArgumentException("maxTempFiles must be >= 2");
|
||||
}
|
||||
if (valueLength == -1) {
|
||||
buffer = new BytesRefArray(bufferBytesUsed);
|
||||
} else {
|
||||
if (valueLength == 0 || valueLength > Short.MAX_VALUE) {
|
||||
throw new IllegalArgumentException("valueLength must be 1 .. " + Short.MAX_VALUE + "; got: " + valueLength);
|
||||
}
|
||||
buffer = new FixedLengthBytesRefArray(valueLength);
|
||||
|
||||
if (valueLength != -1 && (valueLength == 0 || valueLength > Short.MAX_VALUE)) {
|
||||
throw new IllegalArgumentException("valueLength must be 1 .. " + Short.MAX_VALUE + "; got: " + valueLength);
|
||||
}
|
||||
|
||||
this.valueLength = valueLength;
|
||||
this.ramBufferSize = ramBufferSize;
|
||||
this.maxTempFiles = maxTempfiles;
|
@ -241,26 +260,31 @@ public class OfflineSorter {
|
||||
public String sort(String inputFileName) throws IOException {
|
||||
|
||||
sortInfo = new SortInfo();
|
||||
sortInfo.totalTime = System.currentTimeMillis();
|
||||
long startMS = System.currentTimeMillis();
|
||||
|
||||
List<PartitionAndCount> segments = new ArrayList<>();
|
||||
List<Future<Partition>> segments = new ArrayList<>();
|
||||
int[] levelCounts = new int[1];
|
||||
|
||||
// So we can remove any partially written temp files on exception:
|
||||
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir);
|
||||
|
||||
boolean success = false;
|
||||
boolean[] isExhausted = new boolean[1];
|
||||
try (ByteSequencesReader is = getReader(dir.openChecksumInput(inputFileName, IOContext.READONCE), inputFileName)) {
|
||||
while (isExhausted[0] == false) {
|
||||
int lineCount = readPartition(is, isExhausted);
|
||||
if (lineCount == 0) {
|
||||
assert isExhausted[0];
|
||||
while (true) {
|
||||
Partition part = readPartition(is);
|
||||
if (part.count == 0) {
|
||||
if (partitionsInRAM != null) {
|
||||
partitionsInRAM.release();
|
||||
}
|
||||
assert part.exhausted;
|
||||
break;
|
||||
}
|
||||
segments.add(sortPartition(trackingDir, lineCount));
|
||||
|
||||
Callable<Partition> job = new SortPartitionTask(trackingDir, part);
|
||||
|
||||
segments.add(exec.submit(job));
|
||||
sortInfo.tempMergeFiles++;
|
||||
sortInfo.lineCount += lineCount;
|
||||
sortInfo.lineCount += part.count;
|
||||
levelCounts[0]++;
|
||||
|
||||
// Handle intermediate merges; we need a while loop to "cascade" the merge when necessary:
|
||||
@ -274,6 +298,10 @@ public class OfflineSorter {
|
||||
levelCounts[mergeLevel] = 0;
|
||||
mergeLevel++;
|
||||
}
|
||||
|
||||
if (part.exhausted) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: we shouldn't have to do this? Can't we return a merged reader to
|
||||
@ -292,13 +320,13 @@ public class OfflineSorter {
|
||||
result = out.getName();
|
||||
}
|
||||
} else {
|
||||
result = segments.get(0).fileName;
|
||||
result = getPartition(segments.get(0)).fileName;
|
||||
}
|
||||
|
||||
// We should be explicitly removing all intermediate files ourselves unless there is an exception:
|
||||
assert trackingDir.getCreatedFiles().size() == 1 && trackingDir.getCreatedFiles().contains(result);
|
||||
|
||||
sortInfo.totalTime = System.currentTimeMillis() - sortInfo.totalTime;
|
||||
sortInfo.totalTimeMS = System.currentTimeMillis() - startMS;
|
||||
|
||||
CodecUtil.checkFooter(is.in);
|
||||
|
||||
@ -306,6 +334,8 @@ public class OfflineSorter {
|
||||
|
||||
return result;
|
||||
|
||||
} catch (InterruptedException ie) {
|
||||
throw new ThreadInterruptedException(ie);
|
||||
} finally {
|
||||
if (success == false) {
|
||||
IOUtils.deleteFilesIgnoringExceptions(trackingDir, trackingDir.getCreatedFiles());
|
||||
@ -313,36 +343,6 @@ public class OfflineSorter {
|
||||
}
|
||||
}
|
||||
|
||||
/** Sort a single partition in-memory. */
|
||||
protected PartitionAndCount sortPartition(TrackingDirectoryWrapper trackingDir, int lineCount) throws IOException {
|
||||
|
||||
try (IndexOutput tempFile = trackingDir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT);
|
||||
ByteSequencesWriter out = getWriter(tempFile, lineCount);) {
|
||||
|
||||
BytesRef spare;
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
BytesRefIterator iter = buffer.iterator(comparator);
|
||||
sortInfo.sortTime += System.currentTimeMillis() - start;
|
||||
|
||||
int count = 0;
|
||||
while ((spare = iter.next()) != null) {
|
||||
assert spare.length <= Short.MAX_VALUE;
|
||||
out.write(spare);
|
||||
count++;
|
||||
}
|
||||
|
||||
assert count == lineCount;
|
||||
|
||||
// Clean up the buffer for the next partition.
|
||||
buffer.clear();
|
||||
|
||||
CodecUtil.writeFooter(out.out);
|
||||
|
||||
return new PartitionAndCount(lineCount, tempFile.getName());
|
||||
}
|
||||
}
|
||||
|
||||
/** Called on exception, to check whether the checksum is also corrupt in this source, and add that
|
||||
* information (checksum matched or didn't) as a suppressed exception. */
|
||||
private void verifyChecksum(Throwable priorException, ByteSequencesReader reader) throws IOException {
|
||||
@ -352,129 +352,107 @@ public class OfflineSorter {
|
||||
}
|
||||
|
||||
  /** Merge the most recent {@code maxTempFiles} partitions into a new partition. */
|
||||
void mergePartitions(Directory trackingDir, List<PartitionAndCount> segments) throws IOException {
|
||||
void mergePartitions(Directory trackingDir, List<Future<Partition>> segments) throws IOException {
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
List<PartitionAndCount> segmentsToMerge;
|
||||
List<Future<Partition>> segmentsToMerge;
|
||||
if (segments.size() > maxTempFiles) {
|
||||
segmentsToMerge = segments.subList(segments.size() - maxTempFiles, segments.size());
|
||||
} else {
|
||||
segmentsToMerge = segments;
|
||||
}
|
||||
|
||||
long totalCount = 0;
|
||||
for (PartitionAndCount segment : segmentsToMerge) {
|
||||
totalCount += segment.count;
|
||||
}
|
||||
sortInfo.mergeRounds++;
|
||||
|
||||
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segmentsToMerge.size()) {
|
||||
@Override
|
||||
protected boolean lessThan(FileAndTop a, FileAndTop b) {
|
||||
return comparator.compare(a.current, b.current) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
ByteSequencesReader[] streams = new ByteSequencesReader[segmentsToMerge.size()];
|
||||
|
||||
String newSegmentName = null;
|
||||
|
||||
try (ByteSequencesWriter writer = getWriter(trackingDir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT), totalCount)) {
|
||||
|
||||
newSegmentName = writer.out.getName();
|
||||
|
||||
// Open streams and read the top for each file
|
||||
for (int i = 0; i < segmentsToMerge.size(); i++) {
|
||||
streams[i] = getReader(dir.openChecksumInput(segmentsToMerge.get(i).fileName, IOContext.READONCE), segmentsToMerge.get(i).fileName);
|
||||
BytesRef item = null;
|
||||
try {
|
||||
item = streams[i].next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, streams[i]);
|
||||
}
|
||||
assert item != null;
|
||||
queue.insertWithOverflow(new FileAndTop(i, item));
|
||||
}
|
||||
|
||||
// Unix utility sort() uses ordered array of files to pick the next line from, updating
|
||||
// it as it reads new lines. The PQ used here is a more elegant solution and has
|
||||
// a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway
|
||||
// so it shouldn't make much of a difference (didn't check).
|
||||
FileAndTop top;
|
||||
while ((top = queue.top()) != null) {
|
||||
writer.write(top.current);
|
||||
try {
|
||||
top.current = streams[top.fd].next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, streams[top.fd]);
|
||||
}
|
||||
|
||||
if (top.current != null) {
|
||||
queue.updateTop();
|
||||
} else {
|
||||
queue.pop();
|
||||
}
|
||||
}
|
||||
|
||||
CodecUtil.writeFooter(writer.out);
|
||||
|
||||
for(ByteSequencesReader reader : streams) {
|
||||
CodecUtil.checkFooter(reader.in);
|
||||
}
|
||||
|
||||
sortInfo.mergeTime += System.currentTimeMillis() - start;
|
||||
sortInfo.mergeRounds++;
|
||||
} finally {
|
||||
IOUtils.close(streams);
|
||||
}
|
||||
|
||||
IOUtils.deleteFiles(trackingDir, segmentsToMerge.stream().map(segment -> segment.fileName).collect(Collectors.toList()));
|
||||
MergePartitionsTask task = new MergePartitionsTask(trackingDir, new ArrayList<>(segmentsToMerge));
|
||||
|
||||
segmentsToMerge.clear();
|
||||
segments.add(new PartitionAndCount(totalCount, newSegmentName));
|
||||
segments.add(exec.submit(task));
|
||||
|
||||
sortInfo.tempMergeFiles++;
|
||||
}
|
||||
|
||||
/** Holds one partition of items, either loaded into memory or based on a file. */
|
||||
private static class Partition {
|
||||
public final SortableBytesRefArray buffer;
|
||||
public final boolean exhausted;
|
||||
public final long count;
|
||||
public final String fileName;
|
||||
|
||||
/** A partition loaded into memory. */
|
||||
public Partition(SortableBytesRefArray buffer, boolean exhausted) {
|
||||
this.buffer = buffer;
|
||||
this.fileName = null;
|
||||
this.count = buffer.size();
|
||||
this.exhausted = exhausted;
|
||||
}
|
||||
|
||||
/** An on-disk partition. */
|
||||
public Partition(String fileName, long count) {
|
||||
this.buffer = null;
|
||||
this.fileName = fileName;
|
||||
this.count = count;
|
||||
this.exhausted = true;
|
||||
}
|
||||
}
|
||||
|
||||
  /** Read in a single partition of data; the returned Partition is marked exhausted when there are no more items. */
|
||||
int readPartition(ByteSequencesReader reader, boolean[] isExhausted) throws IOException {
|
||||
long start = System.currentTimeMillis();
|
||||
if (valueLength != -1) {
|
||||
int limit = ramBufferSize.bytes / valueLength;
|
||||
for(int i=0;i<limit;i++) {
|
||||
BytesRef item = null;
|
||||
try {
|
||||
item = reader.next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, reader);
|
||||
Partition readPartition(ByteSequencesReader reader) throws IOException, InterruptedException {
|
||||
if (partitionsInRAM != null) {
|
||||
partitionsInRAM.acquire();
|
||||
}
|
||||
boolean success = false;
|
||||
try {
|
||||
long start = System.currentTimeMillis();
|
||||
SortableBytesRefArray buffer;
|
||||
boolean exhausted = false;
|
||||
int count;
|
||||
if (valueLength != -1) {
|
||||
// fixed length case
|
||||
buffer = new FixedLengthBytesRefArray(valueLength);
|
||||
int limit = ramBufferSize.bytes / valueLength;
|
||||
for(int i=0;i<limit;i++) {
|
||||
BytesRef item = null;
|
||||
try {
|
||||
item = reader.next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, reader);
|
||||
}
|
||||
if (item == null) {
|
||||
exhausted = true;
|
||||
break;
|
||||
}
|
||||
buffer.append(item);
|
||||
}
|
||||
if (item == null) {
|
||||
isExhausted[0] = true;
|
||||
break;
|
||||
} else {
|
||||
Counter bufferBytesUsed = Counter.newCounter();
|
||||
buffer = new BytesRefArray(bufferBytesUsed);
|
||||
while (true) {
|
||||
BytesRef item = null;
|
||||
try {
|
||||
item = reader.next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, reader);
|
||||
}
|
||||
if (item == null) {
|
||||
exhausted = true;
|
||||
break;
|
||||
}
|
||||
buffer.append(item);
|
||||
// Account for the created objects.
|
||||
// (buffer slots do not account to buffer size.)
|
||||
if (bufferBytesUsed.get() > ramBufferSize.bytes) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
buffer.append(item);
|
||||
}
|
||||
} else {
|
||||
while (true) {
|
||||
BytesRef item = null;
|
||||
try {
|
||||
item = reader.next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, reader);
|
||||
}
|
||||
if (item == null) {
|
||||
isExhausted[0] = true;
|
||||
break;
|
||||
}
|
||||
buffer.append(item);
|
||||
// Account for the created objects.
|
||||
// (buffer slots do not account to buffer size.)
|
||||
if (bufferBytesUsed.get() > ramBufferSize.bytes) {
|
||||
break;
|
||||
}
|
||||
sortInfo.readTimeMS += System.currentTimeMillis() - start;
|
||||
success = true;
|
||||
return new Partition(buffer, exhausted);
|
||||
} finally {
|
||||
if (success == false && partitionsInRAM != null) {
|
||||
partitionsInRAM.release();
|
||||
}
|
||||
}
|
||||
sortInfo.readTime += System.currentTimeMillis() - start;
|
||||
return buffer.size();
|
||||
}
|
||||
|
||||
static class FileAndTop {
|
||||
@ -606,13 +584,146 @@ public class OfflineSorter {
|
||||
return comparator;
|
||||
}
|
||||
|
||||
private static class PartitionAndCount {
|
||||
final long count;
|
||||
final String fileName;
|
||||
/** Sorts one in-memory partition, writes it to disk, and returns the resulting file-based partition. */
|
||||
private class SortPartitionTask implements Callable<Partition> {
|
||||
|
||||
public PartitionAndCount(long count, String fileName) {
|
||||
this.count = count;
|
||||
this.fileName = fileName;
|
||||
private final Directory dir;
|
||||
private final Partition part;
|
||||
|
||||
public SortPartitionTask(Directory dir, Partition part) {
|
||||
this.dir = dir;
|
||||
this.part = part;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Partition call() throws IOException {
|
||||
try (IndexOutput tempFile = dir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT);
|
||||
ByteSequencesWriter out = getWriter(tempFile, part.buffer.size());) {
|
||||
|
||||
BytesRef spare;
|
||||
|
||||
long startMS = System.currentTimeMillis();
|
||||
BytesRefIterator iter = part.buffer.iterator(comparator);
|
||||
sortInfo.sortTimeMS.addAndGet(System.currentTimeMillis() - startMS);
|
||||
|
||||
int count = 0;
|
||||
while ((spare = iter.next()) != null) {
|
||||
assert spare.length <= Short.MAX_VALUE;
|
||||
out.write(spare);
|
||||
count++;
|
||||
}
|
||||
|
||||
assert count == part.count;
|
||||
|
||||
CodecUtil.writeFooter(out.out);
|
||||
part.buffer.clear();
|
||||
|
||||
return new Partition(tempFile.getName(), part.count);
|
||||
} finally {
|
||||
if (partitionsInRAM != null) {
|
||||
partitionsInRAM.release();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Partition getPartition(Future<Partition> future) throws IOException {
|
||||
try {
|
||||
return future.get();
|
||||
} catch (InterruptedException ie) {
|
||||
throw new ThreadInterruptedException(ie);
|
||||
} catch (ExecutionException ee) {
|
||||
// Theoretically cause can be null; guard against that.
|
||||
Throwable cause = ee.getCause();
|
||||
throw IOUtils.rethrowAlways(cause != null ? cause : ee);
|
||||
}
|
||||
}
|
||||
|
||||
/** Merges multiple file-based partitions to a single on-disk partition. */
|
||||
private class MergePartitionsTask implements Callable<Partition> {
|
||||
private final Directory dir;
|
||||
private final List<Future<Partition>> segmentsToMerge;
|
||||
|
||||
public MergePartitionsTask(Directory dir, List<Future<Partition>> segmentsToMerge) {
|
||||
this.dir = dir;
|
||||
this.segmentsToMerge = segmentsToMerge;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Partition call() throws IOException {
|
||||
long totalCount = 0;
|
||||
for (Future<Partition> segment : segmentsToMerge) {
|
||||
totalCount += getPartition(segment).count;
|
||||
}
|
||||
|
||||
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segmentsToMerge.size()) {
|
||||
@Override
|
||||
protected boolean lessThan(FileAndTop a, FileAndTop b) {
|
||||
return comparator.compare(a.current, b.current) < 0;
|
||||
}
|
||||
};
|
||||
|
||||
ByteSequencesReader[] streams = new ByteSequencesReader[segmentsToMerge.size()];
|
||||
|
||||
String newSegmentName = null;
|
||||
|
||||
long startMS = System.currentTimeMillis();
|
||||
try (ByteSequencesWriter writer = getWriter(dir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT), totalCount)) {
|
||||
|
||||
newSegmentName = writer.out.getName();
|
||||
|
||||
// Open streams and read the top for each file
|
||||
for (int i = 0; i < segmentsToMerge.size(); i++) {
|
||||
Partition segment = getPartition(segmentsToMerge.get(i));
|
||||
streams[i] = getReader(dir.openChecksumInput(segment.fileName, IOContext.READONCE), segment.fileName);
|
||||
|
||||
BytesRef item = null;
|
||||
try {
|
||||
item = streams[i].next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, streams[i]);
|
||||
}
|
||||
assert item != null;
|
||||
queue.insertWithOverflow(new FileAndTop(i, item));
|
||||
}
|
||||
|
||||
// Unix utility sort() uses ordered array of files to pick the next line from, updating
|
||||
// it as it reads new lines. The PQ used here is a more elegant solution and has
|
||||
// a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway
|
||||
// so it shouldn't make much of a difference (didn't check).
|
||||
FileAndTop top;
|
||||
while ((top = queue.top()) != null) {
|
||||
writer.write(top.current);
|
||||
try {
|
||||
top.current = streams[top.fd].next();
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, streams[top.fd]);
|
||||
}
|
||||
|
||||
if (top.current != null) {
|
||||
queue.updateTop();
|
||||
} else {
|
||||
queue.pop();
|
||||
}
|
||||
}
|
||||
|
||||
CodecUtil.writeFooter(writer.out);
|
||||
|
||||
for(ByteSequencesReader reader : streams) {
|
||||
CodecUtil.checkFooter(reader.in);
|
||||
}
|
||||
|
||||
sortInfo.mergeTimeMS.addAndGet(System.currentTimeMillis() - startMS);
|
||||
} finally {
|
||||
IOUtils.close(streams);
|
||||
}
|
||||
List<String> toDelete = new ArrayList<>();
|
||||
for (Future<Partition> segment : segmentsToMerge) {
|
||||
toDelete.add(getPartition(segment).fileName);
|
||||
}
|
||||
IOUtils.deleteFiles(dir, toDelete);
|
||||
|
||||
return new Partition(newSegmentName, totalCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.AbstractExecutorService;
|
||||
import java.util.concurrent.RejectedExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/** An {@code ExecutorService} that executes tasks immediately in the calling thread during submit.
|
||||
*
|
||||
* @lucene.internal */
|
||||
public final class SameThreadExecutorService extends AbstractExecutorService {
|
||||
private volatile boolean shutdown;
|
||||
|
||||
@Override
|
||||
public void execute(Runnable command) {
|
||||
checkShutdown();
|
||||
command.run();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Runnable> shutdownNow() {
|
||||
shutdown();
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
this.shutdown = true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isTerminated() {
|
||||
// Simplified: we don't check for any threads hanging in execute (we could
|
||||
// introduce an atomic counter, but there seems to be no point).
|
||||
return shutdown == true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isShutdown() {
|
||||
return shutdown == true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException {
|
||||
// See comment in isTerminated();
|
||||
return true;
|
||||
}
|
||||
|
||||
private void checkShutdown() {
|
||||
if (shutdown) {
|
||||
throw new RejectedExecutionException("Executor is shut down.");
|
||||
}
|
||||
}
|
||||
}
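
A small sketch of what this same-thread executor guarantees: submitted work runs synchronously in the caller, so the Future is already done when submit returns. This is the fallback OfflineSorter installs when no executor is supplied. The demo class is invented; only SameThreadExecutorService comes from the patch:

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Future;
    import org.apache.lucene.util.SameThreadExecutorService;

    class SameThreadSketch {
      public static void main(String[] args) throws Exception {
        ExecutorService exec = new SameThreadExecutorService();
        // The callable executes inline, on the calling thread, during submit().
        Future<String> f = exec.submit(() -> Thread.currentThread().getName());
        System.out.println(f.isDone());                                           // true
        System.out.println(f.get().equals(Thread.currentThread().getName()));     // true
        exec.shutdown();
        // After shutdown, further submissions fail with RejectedExecutionException.
      }
    }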
@ -97,31 +97,74 @@ public class SmallFloat {
|
||||
return Float.intBitsToFloat(bits);
|
||||
}
|
||||
|
||||
|
||||
/** floatToByte(b, mantissaBits=5, zeroExponent=2)
|
||||
* <br>smallest nonzero value = 0.033203125
|
||||
* <br>largest value = 1984.0
|
||||
* <br>epsilon = 0.03125
|
||||
*/
|
||||
public static byte floatToByte52(float f) {
|
||||
int bits = Float.floatToRawIntBits(f);
|
||||
int smallfloat = bits >> (24-5);
|
||||
if (smallfloat <= (63-2)<<5) {
|
||||
return (bits<=0) ? (byte)0 : (byte)1;
|
||||
/** Float-like encoding for positive longs that preserves ordering and 4 significant bits. */
|
||||
public static int longToInt4(long i) {
|
||||
if (i < 0) {
|
||||
throw new IllegalArgumentException("Only supports positive values, got " + i);
|
||||
}
|
||||
if (smallfloat >= ((63-2)<<5) + 0x100) {
|
||||
return -1;
|
||||
int numBits = 64 - Long.numberOfLeadingZeros(i);
|
||||
if (numBits < 4) {
|
||||
// subnormal value
|
||||
return Math.toIntExact(i);
|
||||
} else {
|
||||
// normal value
|
||||
int shift = numBits - 4;
|
||||
      // only keep the 4 most significant bits
|
||||
int encoded = Math.toIntExact(i >>> shift);
|
||||
// clear the most significant bit, which is implicit
|
||||
encoded &= 0x07;
|
||||
// encode the shift, adding 1 because 0 is reserved for subnormal values
|
||||
encoded |= (shift + 1) << 3;
|
||||
return encoded;
|
||||
}
|
||||
return (byte)(smallfloat - ((63-2)<<5));
|
||||
}
|
||||
|
||||
/** byteToFloat(b, mantissaBits=5, zeroExponent=2) */
|
||||
public static float byte52ToFloat(byte b) {
|
||||
// on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup
|
||||
// is only a little bit faster (anywhere from 0% to 7%)
|
||||
if (b == 0) return 0.0f;
|
||||
int bits = (b&0xff) << (24-5);
|
||||
bits += (63-2) << 24;
|
||||
return Float.intBitsToFloat(bits);
|
||||
/**
|
||||
* Decode values encoded with {@link #longToInt4(long)}.
|
||||
*/
|
||||
public static final long int4ToLong(int i) {
|
||||
long bits = i & 0x07;
|
||||
int shift = (i >>> 3) - 1;
|
||||
long decoded;
|
||||
if (shift == -1) {
|
||||
// subnormal value
|
||||
decoded = bits;
|
||||
} else {
|
||||
// normal value
|
||||
decoded = (bits | 0x08) << shift;
|
||||
}
|
||||
return decoded;
|
||||
}
|
||||
|
||||
private static final int MAX_INT4 = longToInt4(Integer.MAX_VALUE);
|
||||
private static final int NUM_FREE_VALUES = 255 - MAX_INT4;
|
||||
|
||||
/**
|
||||
* Encode an integer to a byte. It is built upon {@link #longToInt4(long)}
|
||||
* and leverages the fact that {@code longToInt4(Integer.MAX_VALUE)} is
|
||||
* less than 255 to encode low values more accurately.
|
||||
*/
|
||||
public static byte intToByte4(int i) {
|
||||
if (i < 0) {
|
||||
throw new IllegalArgumentException("Only supports positive values, got " + i);
|
||||
}
|
||||
if (i < NUM_FREE_VALUES) {
|
||||
return (byte) i;
|
||||
} else {
|
||||
return (byte) (NUM_FREE_VALUES + longToInt4(i - NUM_FREE_VALUES));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode values that have been encoded with {@link #intToByte4(int)}.
|
||||
*/
|
||||
public static int byte4ToInt(byte b) {
|
||||
int i = Byte.toUnsignedInt(b);
|
||||
if (i < NUM_FREE_VALUES) {
|
||||
return i;
|
||||
} else {
|
||||
long decoded = NUM_FREE_VALUES + int4ToLong(i - NUM_FREE_VALUES);
|
||||
return Math.toIntExact(decoded);
|
||||
}
|
||||
}
|
||||
}
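
A brief sketch of what the new encodings preserve and what they throw away; the sample values are arbitrary, and the expected outputs in the comments follow from the encoding shown above (ordering is kept, precision beyond about 4 significant bits is not):

    import org.apache.lucene.util.SmallFloat;

    class SmallFloatSketch {
      public static void main(String[] args) {
        // longToInt4 keeps ordering but only ~4 significant bits: nearby values collapse.
        System.out.println(SmallFloat.int4ToLong(SmallFloat.longToInt4(100)));    // 96 (rounded down)
        System.out.println(SmallFloat.int4ToLong(SmallFloat.longToInt4(1000)));   // 960
        System.out.println(SmallFloat.longToInt4(100) < SmallFloat.longToInt4(1000)); // true, order kept

        // intToByte4 packs the same idea into one byte; small values round-trip exactly,
        // larger ones are rounded down.
        for (int i : new int[] {3, 50, 100_000}) {
          int back = SmallFloat.byte4ToInt(SmallFloat.intToByte4(i));
          System.out.println(i + " -> " + back);
        }
      }
    }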
@ -115,6 +115,13 @@ public final class Version {
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_6_0 = new Version(6, 6, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 6.7.0 release.
|
||||
* @deprecated Use latest
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Version LUCENE_6_7_0 = new Version(6, 7, 0);
|
||||
|
||||
/**
|
||||
* Match settings and bugs in Lucene's 7.0.0 release.
|
||||
* <p>
|
||||
|
@ -884,7 +884,7 @@ public class BKDWriter implements Closeable {
|
||||
};
|
||||
}
|
||||
|
||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {
|
||||
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {
|
||||
|
||||
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
|
||||
@Override
|
||||
@ -1362,7 +1362,9 @@ public class BKDWriter implements Closeable {
|
||||
|
||||
/** Called on exception, to check whether the checksum is also corrupt in this source, and add that
|
||||
* information (checksum matched or didn't) as a suppressed exception. */
|
||||
private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
|
||||
private Error verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
|
||||
assert priorException != null;
|
||||
|
||||
// TODO: we could improve this, to always validate checksum as we recurse, if we shared left and
|
||||
// right reader after recursing to children, and possibly within recursed children,
|
||||
// since all together they make a single pass through the file. But this is a sizable re-org,
|
||||
@ -1373,10 +1375,10 @@ public class BKDWriter implements Closeable {
|
||||
try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) {
|
||||
CodecUtil.checkFooter(in, priorException);
|
||||
}
|
||||
} else {
|
||||
// We are reading from heap; nothing to add:
|
||||
IOUtils.reThrow(priorException);
|
||||
}
|
||||
|
||||
// We are reading from heap; nothing to add:
|
||||
throw IOUtils.rethrowAlways(priorException);
|
||||
}
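
Changing verifyChecksum to return Error (even though it always throws) is what lets the call sites below write `throw verifyChecksum(t, ...)` and drop their dead `return null` branches. A minimal sketch of the same idiom, independent of BKDWriter; the helper and its callers are invented for illustration:

    import java.io.IOException;

    class AlwaysThrowsSketch {
      // Declares Error so callers can write `throw fail(...)`; the method never actually returns.
      static Error fail(Throwable cause) throws IOException {
        if (cause instanceof IOException) {
          throw (IOException) cause;
        }
        throw new RuntimeException(cause);
      }

      static byte[] read(boolean broken) throws IOException {
        try {
          if (broken) {
            throw new IOException("simulated read failure");
          }
          return new byte[0];
        } catch (Throwable t) {
          // javac sees this statement completes abruptly, so no unreachable `return null` is needed.
          throw fail(t);
        }
      }
    }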
|
||||
|
||||
/** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). */
|
||||
@ -1398,7 +1400,7 @@ public class BKDWriter implements Closeable {
|
||||
reader.markOrds(rightCount-1, ordBitSet);
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, source.writer);
|
||||
throw verifyChecksum(t, source.writer);
|
||||
}
|
||||
|
||||
return scratch1;
|
||||
@ -1469,10 +1471,7 @@ public class BKDWriter implements Closeable {
|
||||
}
|
||||
return new PathSlice(writer, 0, count);
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, source.writer);
|
||||
|
||||
// Dead code but javac disagrees:
|
||||
return null;
|
||||
throw verifyChecksum(t, source.writer);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1797,7 +1796,7 @@ public class BKDWriter implements Closeable {
|
||||
leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
|
||||
rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
|
||||
} catch (Throwable t) {
|
||||
verifyChecksum(t, slices[dim].writer);
|
||||
throw verifyChecksum(t, slices[dim].writer);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -48,7 +48,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
|
||||
* This class also provides helpers to explore the different paths of the {@link Automaton}.
|
||||
*/
|
||||
public final class GraphTokenStreamFiniteStrings {
|
||||
private final Map<BytesRef, Integer> termToID = new HashMap<>();
|
||||
private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
|
||||
private final Map<Integer, Integer> idToInc = new HashMap<>();
|
||||
private final Automaton det;
|
||||
@ -247,35 +246,18 @@ public final class GraphTokenStreamFiniteStrings {
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets an integer id for a given term.
|
||||
*
|
||||
   * If there are no position gaps for this token then we can reuse the id for the same term if it appeared at another
|
||||
* position without a gap. If we have a position gap generate a new id so we can keep track of the position
|
||||
* increment.
|
||||
* Gets an integer id for a given term and saves the position increment if needed.
|
||||
*/
|
||||
private int getTermID(int incr, int prevIncr, BytesRef term) {
|
||||
assert term != null;
|
||||
boolean isStackedGap = incr == 0 && prevIncr > 1;
|
||||
boolean hasGap = incr > 1;
|
||||
Integer id;
|
||||
if (hasGap || isStackedGap) {
|
||||
id = idToTerm.size();
|
||||
idToTerm.put(id, BytesRef.deepCopyOf(term));
|
||||
|
||||
// stacked token should have the same increment as original token at this position
|
||||
if (isStackedGap) {
|
||||
idToInc.put(id, prevIncr);
|
||||
} else {
|
||||
idToInc.put(id, incr);
|
||||
}
|
||||
} else {
|
||||
id = termToID.get(term);
|
||||
if (id == null) {
|
||||
term = BytesRef.deepCopyOf(term);
|
||||
id = idToTerm.size();
|
||||
termToID.put(term, id);
|
||||
idToTerm.put(id, term);
|
||||
}
|
||||
int id = idToTerm.size();
|
||||
idToTerm.put(id, BytesRef.deepCopyOf(term));
|
||||
// stacked token should have the same increment as original token at this position
|
||||
if (isStackedGap) {
|
||||
idToInc.put(id, prevIncr);
|
||||
} else if (incr > 1) {
|
||||
idToInc.put(id, incr);
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
@ -303,4 +303,17 @@ public class TestCodecUtil extends LuceneTestCase {
|
||||
fakeChecksum.set((1L << 32) - 1); // ok
|
||||
CodecUtil.writeCRC(fakeOutput);
|
||||
}
|
||||
|
||||
public void testTruncatedFileThrowsCorruptIndexException() throws IOException {
|
||||
RAMFile file = new RAMFile();
|
||||
IndexOutput output = new RAMOutputStream(file, false);
|
||||
output.close();
|
||||
IndexInput input = new RAMInputStream("file", file);
|
||||
CorruptIndexException e = expectThrows(CorruptIndexException.class,
|
||||
() -> CodecUtil.checksumEntireFile(input));
|
||||
assertEquals("misplaced codec footer (file truncated?): length=0 but footerLength==16 (resource=RAMInputStream(name=file))", e.getMessage());
|
||||
e = expectThrows(CorruptIndexException.class,
|
||||
() -> CodecUtil.retrieveChecksum(input));
|
||||
assertEquals("misplaced codec footer (file truncated?): length=0 but footerLength==16 (resource=RAMInputStream(name=file))", e.getMessage());
|
||||
}
|
||||
}
|
||||
|
@ -237,8 +237,11 @@ public class TestDemoParallelLeafReader extends LuceneTestCase {
|
||||
firstExc = t;
|
||||
}
|
||||
}
|
||||
|
||||
// throw the first exception
|
||||
IOUtils.reThrow(firstExc);
|
||||
if (firstExc != null) {
|
||||
throw IOUtils.rethrowAlways(firstExc);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -549,10 +552,11 @@ public class TestDemoParallelLeafReader extends LuceneTestCase {
|
||||
}
|
||||
}
|
||||
|
||||
    // If any error occurred, throw it.
|
||||
IOUtils.reThrow(th);
|
||||
if (th != null) {
|
||||
throw IOUtils.rethrowAlways(th);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setMergeInfo(SegmentCommitInfo info) {
|
||||
// Record that this merged segment is current as of this schemaGen:
|
||||
|
@ -2403,4 +2403,86 @@ public class TestIndexSorting extends LuceneTestCase {
|
||||
}
|
||||
IOUtils.close(r, w, dir);
|
||||
}
|
||||
|
||||
public void testIndexSortWithSparseField() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||
SortField sortField = new SortField("dense_int", SortField.Type.INT, true);
|
||||
Sort indexSort = new Sort(sortField);
|
||||
iwc.setIndexSort(indexSort);
|
||||
IndexWriter w = new IndexWriter(dir, iwc);
|
||||
Field textField = newTextField("sparse_text", "", Field.Store.NO);
|
||||
for (int i = 0; i < 128; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new NumericDocValuesField("dense_int", i));
|
||||
if (i < 64) {
|
||||
doc.add(new NumericDocValuesField("sparse_int", i));
|
||||
doc.add(new BinaryDocValuesField("sparse_binary", new BytesRef(Integer.toString(i))));
|
||||
textField.setStringValue("foo");
|
||||
doc.add(textField);
|
||||
}
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.commit();
|
||||
w.forceMerge(1);
|
||||
DirectoryReader r = DirectoryReader.open(w);
|
||||
assertEquals(1, r.leaves().size());
|
||||
LeafReader leafReader = r.leaves().get(0).reader();
|
||||
|
||||
NumericDocValues denseValues = leafReader.getNumericDocValues("dense_int");
|
||||
NumericDocValues sparseValues = leafReader.getNumericDocValues("sparse_int");
|
||||
BinaryDocValues sparseBinaryValues = leafReader.getBinaryDocValues("sparse_binary");
|
||||
NumericDocValues normsValues = leafReader.getNormValues("sparse_text");
|
||||
for(int docID = 0; docID < 128; docID++) {
|
||||
assertTrue(denseValues.advanceExact(docID));
|
||||
assertEquals(127-docID, (int) denseValues.longValue());
|
||||
if (docID >= 64) {
|
||||
assertTrue(denseValues.advanceExact(docID));
|
||||
assertTrue(sparseValues.advanceExact(docID));
|
||||
assertTrue(sparseBinaryValues.advanceExact(docID));
|
||||
assertTrue(normsValues.advanceExact(docID));
|
||||
assertEquals(1, normsValues.longValue());
|
||||
assertEquals(127-docID, (int) sparseValues.longValue());
|
||||
assertEquals(new BytesRef(Integer.toString(127-docID)), sparseBinaryValues.binaryValue());
|
||||
} else {
|
||||
assertFalse(sparseBinaryValues.advanceExact(docID));
|
||||
assertFalse(sparseValues.advanceExact(docID));
|
||||
assertFalse(normsValues.advanceExact(docID));
|
||||
}
|
||||
}
|
||||
IOUtils.close(r, w, dir);
|
||||
}
|
||||
|
||||
public void testIndexSortOnSparseField() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||
SortField sortField = new SortField("sparse", SortField.Type.INT, false);
|
||||
sortField.setMissingValue(Integer.MIN_VALUE);
|
||||
Sort indexSort = new Sort(sortField);
|
||||
iwc.setIndexSort(indexSort);
|
||||
IndexWriter w = new IndexWriter(dir, iwc);
|
||||
for (int i = 0; i < 128; i++) {
|
||||
Document doc = new Document();
|
||||
if (i < 64) {
|
||||
doc.add(new NumericDocValuesField("sparse", i));
|
||||
}
|
||||
w.addDocument(doc);
|
||||
}
|
||||
w.commit();
|
||||
w.forceMerge(1);
|
||||
DirectoryReader r = DirectoryReader.open(w);
|
||||
assertEquals(1, r.leaves().size());
|
||||
LeafReader leafReader = r.leaves().get(0).reader();
|
||||
NumericDocValues sparseValues = leafReader.getNumericDocValues("sparse");
|
||||
for(int docID = 0; docID < 128; docID++) {
|
||||
if (docID >= 64) {
|
||||
assertTrue(sparseValues.advanceExact(docID));
|
||||
assertEquals(docID-64, (int) sparseValues.longValue());
|
||||
} else {
|
||||
assertFalse(sparseValues.advanceExact(docID));
|
||||
}
|
||||
}
|
||||
IOUtils.close(r, w, dir);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -17,6 +17,7 @@
|
||||
package org.apache.lucene.index;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
@ -26,7 +27,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
@ -35,12 +38,12 @@ import org.apache.lucene.util.TestUtil;
|
||||
/**
|
||||
* Tests the maxTermFrequency statistic in FieldInvertState
|
||||
*/
|
||||
public class TestMaxTermFrequency extends LuceneTestCase {
|
||||
public class TestMaxTermFrequency extends LuceneTestCase {
|
||||
Directory dir;
|
||||
IndexReader reader;
|
||||
/* expected maxTermFrequency values for our documents */
|
||||
ArrayList<Integer> expected = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
@ -59,14 +62,14 @@ public class TestMaxTermFrequency extends LuceneTestCase {
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
reader.close();
|
||||
dir.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
|
||||
public void test() throws Exception {
|
||||
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
|
||||
for (int i = 0; i < reader.maxDoc(); i++) {
|
||||
@ -95,30 +98,42 @@ public class TestMaxTermFrequency extends LuceneTestCase {
|
||||
Collections.shuffle(terms, random());
|
||||
return Arrays.toString(terms.toArray(new String[terms.size()]));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Simple similarity that encodes maxTermFrequency directly as a byte
|
||||
*/
|
||||
static class TestSimilarity extends TFIDFSimilarity {
|
||||
static class TestSimilarity extends Similarity {
|
||||
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
public long computeNorm(FieldInvertState state) {
|
||||
return state.getMaxTermFrequency();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long encodeNormValue(float f) {
|
||||
return (byte) f;
|
||||
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
|
||||
return new SimWeight() {};
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(long norm) {
|
||||
return norm;
|
||||
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
|
||||
return new SimScorer() {
|
||||
|
||||
@Override
|
||||
public float score(int doc, float freq) throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computeSlopFactor(int distance) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override public float tf(float freq) { return 0; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 0; }
|
||||
@Override public float sloppyFreq(int distance) { return 0; }
|
||||
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
|
||||
}
|
||||
}
|
||||
|
@ -32,13 +32,11 @@ import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LineFileDocs;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
|
||||
/**
|
||||
@ -49,67 +47,6 @@ import org.apache.lucene.util.TestUtil;
|
||||
@Slow
|
||||
public class TestNorms extends LuceneTestCase {
|
||||
static final String BYTE_TEST_FIELD = "normsTestByte";
|
||||
|
||||
static class CustomNormEncodingSimilarity extends TFIDFSimilarity {
|
||||
|
||||
@Override
|
||||
public long encodeNormValue(float f) {
|
||||
return (long) f;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float decodeNormValue(long norm) {
|
||||
return norm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
return state.getLength();
|
||||
}
|
||||
|
||||
@Override public float tf(float freq) { return 0; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 0; }
|
||||
@Override public float sloppyFreq(int distance) { return 0; }
|
||||
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
|
||||
}
|
||||
|
||||
// LUCENE-1260
|
||||
public void testCustomEncoder() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
MockAnalyzer analyzer = new MockAnalyzer(random());
|
||||
|
||||
IndexWriterConfig config = newIndexWriterConfig(analyzer);
|
||||
config.setSimilarity(new CustomNormEncodingSimilarity());
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
|
||||
Document doc = new Document();
|
||||
Field foo = newTextField("foo", "", Field.Store.NO);
|
||||
Field bar = newTextField("bar", "", Field.Store.NO);
|
||||
doc.add(foo);
|
||||
doc.add(bar);
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
bar.setStringValue("singleton");
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
|
||||
for (int i = 0; i < reader.maxDoc(); i++) {
|
||||
assertEquals(i, fooNorms.nextDoc());
|
||||
assertEquals(0, fooNorms.longValue());
|
||||
}
|
||||
|
||||
NumericDocValues barNorms = MultiDocValues.getNormValues(reader, "bar");
|
||||
for (int i = 0; i < reader.maxDoc(); i++) {
|
||||
assertEquals(i, barNorms.nextDoc());
|
||||
assertEquals(1, barNorms.longValue());
|
||||
}
|
||||
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testMaxByteNorms() throws IOException {
|
||||
Directory dir = newFSDirectory(createTempDir("TestNorms.testMaxByteNorms"));
|
||||
|
@ -44,9 +44,7 @@ import org.apache.lucene.util.LuceneTestCase;
|
||||
public class TestOmitTf extends LuceneTestCase {
|
||||
|
||||
public static class SimpleSimilarity extends TFIDFSimilarity {
|
||||
@Override public float decodeNormValue(long norm) { return norm; }
|
||||
@Override public long encodeNormValue(float f) { return (long) f; }
|
||||
@Override public float lengthNorm(FieldInvertState state) { return 1; }
|
||||
@Override public float lengthNorm(int length) { return 1; }
|
||||
@Override public float tf(float freq) { return freq; }
|
||||
@Override public float sloppyFreq(int distance) { return 2.0f; }
|
||||
@Override public float idf(long docFreq, long docCount) { return 1.0f; }
|
||||
|
@ -30,7 +30,6 @@ import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.FieldInvertState;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
@ -72,7 +71,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
|
||||
}
|
||||
|
||||
@Override
|
||||
public float lengthNorm(FieldInvertState state) {
|
||||
public float lengthNorm(int length) {
|
||||
// Disable length norm
|
||||
return 1;
|
||||
}
|
||||
|
@ -31,11 +31,18 @@ public class TestDoubleRangeFieldQueries extends BaseRangeFieldQueryTestCase {
|
||||
private static final String FIELD_NAME = "doubleRangeField";
|
||||
|
||||
private double nextDoubleInternal() {
|
||||
if (rarely()) {
|
||||
return random().nextBoolean() ? Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY;
|
||||
switch (random().nextInt(5)) {
|
||||
case 0:
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
case 1:
|
||||
return Double.POSITIVE_INFINITY;
|
||||
default:
|
||||
if (random().nextBoolean()) {
|
||||
return random().nextDouble();
|
||||
} else {
|
||||
return (random().nextInt(15) - 7) / 3d;
|
||||
}
|
||||
}
|
||||
double max = Double.MAX_VALUE / 2;
|
||||
return (max + max) * random().nextDouble() - max;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -17,6 +17,7 @@

package org.apache.lucene.search;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

@ -26,6 +27,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;

@ -164,4 +166,65 @@ public class TestDoubleValuesSource extends LuceneTestCase {
    CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs);
  }
}

  static final Query[] testQueries = new Query[]{
      new MatchAllDocsQuery(),
      new TermQuery(new Term("oddeven", "odd")),
      new BooleanQuery.Builder()
          .add(new TermQuery(new Term("english", "one")), BooleanClause.Occur.MUST)
          .add(new TermQuery(new Term("english", "two")), BooleanClause.Occur.MUST)
          .build()
  };

  public void testExplanations() throws Exception {
    for (Query q : testQueries) {
      testExplanations(q, DoubleValuesSource.fromIntField("int"));
      testExplanations(q, DoubleValuesSource.fromLongField("long"));
      testExplanations(q, DoubleValuesSource.fromFloatField("float"));
      testExplanations(q, DoubleValuesSource.fromDoubleField("double"));
      testExplanations(q, DoubleValuesSource.fromDoubleField("onefield"));
      testExplanations(q, DoubleValuesSource.constant(5.45));
      testExplanations(q, DoubleValuesSource.function(
          DoubleValuesSource.fromDoubleField("double"), "v * 4 + 73",
          v -> v * 4 + 73
      ));
      testExplanations(q, DoubleValuesSource.scoringFunction(
          DoubleValuesSource.fromDoubleField("double"), "v * score", (v, s) -> v * s
      ));
    }
  }

  private void testExplanations(Query q, DoubleValuesSource vs) throws IOException {
    searcher.search(q, new SimpleCollector() {

      DoubleValues v;
      LeafReaderContext ctx;

      @Override
      protected void doSetNextReader(LeafReaderContext context) throws IOException {
        this.ctx = context;
      }

      @Override
      public void setScorer(Scorer scorer) throws IOException {
        this.v = vs.getValues(this.ctx, DoubleValuesSource.fromScorer(scorer));
      }

      @Override
      public void collect(int doc) throws IOException {
        Explanation scoreExpl = searcher.explain(q, ctx.docBase + doc);
        if (this.v.advanceExact(doc)) {
          CheckHits.verifyExplanation("", doc, (float) v.doubleValue(), true, vs.explain(ctx, doc, scoreExpl));
        }
        else {
          assertFalse(vs.explain(ctx, doc, scoreExpl).isMatch());
        }
      }

      @Override
      public boolean needsScores() {
        return vs.needsScores();
      }
    });
  }
}
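The new testExplanations harness above visits every hit with a SimpleCollector and checks that vs.explain(ctx, doc, scoreExpl) matches the value the source actually produces for that document. For orientation, here is a hedged sketch of reading a DoubleValuesSource per leaf in the same way; only API calls visible in the diff are relied on, while the index setup, field name, and class name are assumptions:

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.LeafReaderContext;
    import org.apache.lucene.search.DoubleValues;
    import org.apache.lucene.search.DoubleValuesSource;
    import org.apache.lucene.store.Directory;

    // Sketch, not part of the patch: dumps per-document values of a double docvalues field.
    public class DoubleValuesSourceSketch {
      // Assumes 'dir' holds an index with a docvalues field named "double".
      static void dumpValues(Directory dir) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
          DoubleValuesSource vs = DoubleValuesSource.fromDoubleField("double");
          for (LeafReaderContext ctx : reader.leaves()) {
            // This source does not need scores, so no score values are passed.
            DoubleValues values = vs.getValues(ctx, null);
            for (int doc = 0; doc < ctx.reader().maxDoc(); doc++) {
              if (values.advanceExact(doc)) {
                System.out.println((ctx.docBase + doc) + " -> " + values.doubleValue());
              }
            }
          }
        }
      }
    }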
@ -33,6 +33,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FieldValueHitQueue.Entry;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

@ -63,7 +64,7 @@ public class TestElevationComparator extends LuceneTestCase {
    writer.close();

    IndexSearcher searcher = newSearcher(r);
    searcher.setSimilarity(new ClassicSimilarity());
    searcher.setSimilarity(new BM25Similarity());

    runTest(searcher, true);
    runTest(searcher, false);

@ -98,11 +99,11 @@ public class TestElevationComparator extends LuceneTestCase {
    assertEquals(3, topDocs.scoreDocs[1].doc);

    if (reversed) {
      assertEquals(2, topDocs.scoreDocs[2].doc);
      assertEquals(1, topDocs.scoreDocs[3].doc);
    } else {
      assertEquals(1, topDocs.scoreDocs[2].doc);
      assertEquals(2, topDocs.scoreDocs[3].doc);
    } else {
      assertEquals(2, topDocs.scoreDocs[2].doc);
      assertEquals(1, topDocs.scoreDocs[3].doc);
    }

 /*
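The only functional change in this file is swapping the searcher's similarity from ClassicSimilarity (TF-IDF) to BM25Similarity, which is presumably why the expected document order in the reversed/non-reversed assertions above changes as well. A minimal sketch of configuring a searcher this way (the index contents and helper name are assumed):

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.similarities.BM25Similarity;
    import org.apache.lucene.store.Directory;

    // Sketch: open a searcher that scores with BM25 instead of the classic TF-IDF similarity.
    public class Bm25SearcherSketch {
      static IndexSearcher openBm25Searcher(Directory dir) throws Exception {
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
        // Default BM25 parameters (k1 = 1.2, b = 0.75); ranking can differ from ClassicSimilarity.
        searcher.setSimilarity(new BM25Similarity());
        return searcher;
      }
    }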
@ -31,11 +31,18 @@ public class TestFloatRangeFieldQueries extends BaseRangeFieldQueryTestCase {
  private static final String FIELD_NAME = "floatRangeField";

  private float nextFloatInternal() {
    if (rarely()) {
      return random().nextBoolean() ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
    switch (random().nextInt(5)) {
      case 0:
        return Float.NEGATIVE_INFINITY;
      case 1:
        return Float.POSITIVE_INFINITY;
      default:
        if (random().nextBoolean()) {
          return random().nextFloat();
        } else {
          return (random().nextInt(15) - 7) / 3f;
        }
    }
    float max = Float.MAX_VALUE / 2;
    return (max + max) * random().nextFloat() - max;
  }

  @Override
@ -23,6 +23,7 @@ import org.apache.lucene.document.IntRange;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;

/**
 * Random testing for IntRange Queries.

@ -31,11 +32,25 @@ public class TestIntRangeFieldQueries extends BaseRangeFieldQueryTestCase {
  private static final String FIELD_NAME = "intRangeField";

  private int nextIntInternal() {
    if (rarely()) {
      return random().nextBoolean() ? Integer.MAX_VALUE : Integer.MIN_VALUE;
    switch (random().nextInt(5)) {
      case 0:
        return Integer.MIN_VALUE;
      case 1:
        return Integer.MAX_VALUE;
      default:
        int bpv = random().nextInt(32);
        switch (bpv) {
          case 32:
            return random().nextInt();
          default:
            int v = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
            if (bpv > 0) {
              // negative values sometimes
              v -= 1 << (bpv - 1);
            }
            return v;
        }
    }
    int max = Integer.MAX_VALUE / 2;
    return (max + max) * random().nextInt() - max;
  }

  @Override
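In the default branch above, the test first draws a bit width bpv, then a non-negative value that fits in bpv bits, and shifts about half of those values negative, so small magnitudes are heavily over-represented compared with a uniform 32-bit draw. A standalone sketch of that sampling idea (plain java.util.Random in place of the test framework's random() and TestUtil; names are illustrative):

    import java.util.Random;

    // Sketch only: small-magnitude-biased random ints, mirroring the pattern in the patch.
    public class SmallIntSamplerSketch {
      private static final Random R = new Random();

      static int nextSmallBiasedInt() {
        int bpv = R.nextInt(32);                    // bit width 0..31
        if (bpv == 0) {
          return 0;                                 // the only value representable in zero bits
        }
        int v = R.nextInt() & ((1 << bpv) - 1);     // keep bpv uniform random bits: [0, 2^bpv)
        // Centre the range so negative values appear about half the time.
        return v - (1 << (bpv - 1));
      }

      public static void main(String[] args) {
        for (int i = 0; i < 10; i++) {
          System.out.println(nextSmallBiasedInt());
        }
      }
    }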
@ -660,12 +660,14 @@ public class TestLRUQueryCache extends LuceneTestCase {
      @Override
      protected void onQueryCache(Query query, long ramBytesUsed) {
        super.onQueryCache(query, ramBytesUsed);
        assertNotNull("cached query is null", query);
        ramBytesUsage.addAndGet(ramBytesUsed);
      }

      @Override
      protected void onQueryEviction(Query query, long ramBytesUsed) {
        super.onQueryEviction(query, ramBytesUsed);
        assertNotNull("evicted query is null", query);
        ramBytesUsage.addAndGet(-ramBytesUsed);
      }

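The added assertNotNull calls harden the cache listener callbacks; onQueryCache and onQueryEviction are the protected hooks LRUQueryCache exposes for observing what gets cached and how much memory it is charged for. A hedged sketch of a RAM-tracking subclass built on those hooks (the class name and constructor arguments are illustrative; the two-argument LRUQueryCache constructor is assumed to be available in this version):

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.search.LRUQueryCache;
    import org.apache.lucene.search.Query;

    // Sketch: counts RAM attributed to cached queries via the protected listener hooks.
    public class RamTrackingQueryCache extends LRUQueryCache {
      private final AtomicLong ramBytesUsage = new AtomicLong();

      public RamTrackingQueryCache(int maxSize, long maxRamBytesUsed) {
        super(maxSize, maxRamBytesUsed);
      }

      @Override
      protected void onQueryCache(Query query, long ramBytesUsed) {
        super.onQueryCache(query, ramBytesUsed);
        ramBytesUsage.addAndGet(ramBytesUsed);
      }

      @Override
      protected void onQueryEviction(Query query, long ramBytesUsed) {
        super.onQueryEviction(query, ramBytesUsed);
        ramBytesUsage.addAndGet(-ramBytesUsed);
      }

      public long ramBytesUsedByQueries() {
        return ramBytesUsage.get();
      }
    }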
@ -23,6 +23,7 @@ import org.apache.lucene.document.LongRange;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;

/**
 * Random testing for LongRange Queries.

@ -31,11 +32,25 @@ public class TestLongRangeFieldQueries extends BaseRangeFieldQueryTestCase {
  private static final String FIELD_NAME = "longRangeField";

  private long nextLongInternal() {
    if (rarely()) {
      return random().nextBoolean() ? Long.MAX_VALUE : Long.MIN_VALUE;
    switch (random().nextInt(5)) {
      case 0:
        return Long.MIN_VALUE;
      case 1:
        return Long.MAX_VALUE;
      default:
        int bpv = random().nextInt(64);
        switch (bpv) {
          case 64:
            return random().nextLong();
          default:
            long v = TestUtil.nextLong(random(), 0, (1L << bpv) - 1);
            if (bpv > 0) {
              // negative values sometimes
              v -= 1L << (bpv - 1);
            }
            return v;
        }
    }
    long max = Long.MAX_VALUE / 2;
    return (max + max) * random().nextLong() - max;
  }

  @Override
Some files were not shown because too many files have changed in this diff.