Merge branch 'master' into feature/autoscaling

commit 6a8768e395
Shalin Shekhar Mangar, 2017-06-06 19:30:00 +05:30
1423 changed files with 84567 additions and 62622 deletions

.gitignore

@ -1,20 +1,19 @@
# .
/eclipse-build
/classes
**/build
build
/idea-build
**/dist
**/lib
**/test-lib
dist
lib
test-lib
/*~
/velocity.log
/build.properties
/.idea
lucene/**/*.iml
solr/**/*.iml
parent.iml
**/*.ipr
**/*.iws
*.ipr
*.iws
/.project
/.classpath
/.settings
@ -22,33 +21,7 @@ parent.iml
/prj.el
/bin
/bin.*
**/pom.xml
pom.xml
/nbproject
/nb-build
.pydevproject
/solr/package
# can this be minimized?
/solr/example/start.jar
/solr/example/webapps/*
/solr/example/logs/*.log
/solr/example/**/data
/solr/example/solr/lib
/solr/example/solr/logs
/solr/example/solr/zoo_data
/solr/example/work/*
/solr/example/exampledocs/post.jar
/solr/example/example-DIH/**/data
/solr/example/example-DIH/**/dataimport.properties
/solr/example/example-DIH/solr/mail/lib/*.jar
solr/contrib/dataimporthandler/test-lib/
solr/core/test-lib/
solr/server/logs/
solr/server/solr/zoo_data/
solr/server/solr-webapp
solr/server/start.jar


@ -66,6 +66,13 @@
</foaf:Person>
</maintainer>
<release>
<Version>
<name>lucene-6.5.1</name>
<created>2017-04-27</created>
<revision>6.5.1</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-6.5.0</name>


@ -66,6 +66,13 @@
</foaf:Person>
</maintainer>
<release>
<Version>
<name>solr-6.5.1</name>
<created>2017-04-27</created>
<revision>6.5.1</revision>
</Version>
</release>
<release>
<Version>
<name>solr-6.5.0</name>


@ -1,7 +1,7 @@
<component name="libraryTable">
<library name="HSQLDB">
<CLASSES>
<root url="jar://$PROJECT_DIR$/solr/example/example-DIH/solr/db/lib/hsqldb-1.8.0.10.jar!/" />
<root url="jar://$PROJECT_DIR$/solr/example/example-DIH/solr/db/lib/hsqldb-2.4.0.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />


@ -16,8 +16,9 @@
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="queries" />
<orderEntry type="module" scope="TEST" module-name="analysis-common" />
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module" module-name="grouping" />
<orderEntry type="module" module-name="misc" />
<orderEntry type="module" module-name="sandbox" />
</component>
</module>


@ -32,5 +32,6 @@
<orderEntry type="module" module-name="join" />
<orderEntry type="module" module-name="sandbox" />
<orderEntry type="module" module-name="backward-codecs" />
<orderEntry type="module" module-name="codecs" />
</component>
</module>


@ -97,8 +97,8 @@ def prepare(root, version, gpgKeyID, gpgPassword):
print(' Check DOAP files')
checkDOAPfiles(version)
print(' ant clean test')
run('ant clean test')
print(' ant clean test validate documentation-lint')
run('ant clean test validate documentation-lint')
open('rev.txt', mode='wb').write(rev.encode('UTF-8'))


@ -296,7 +296,7 @@ def checkSummary(fullPath):
print()
print(fullPath)
printed = True
print(' missing: %s' % unescapeHTML(lastHREF))
print(' missing description: %s' % unescapeHTML(lastHREF))
anyMissing = True
elif lineLower.find('licensed to the apache software foundation') != -1 or lineLower.find('copyright 2004 the apache software foundation') != -1:
if not printed:


@ -266,7 +266,10 @@ def checkAll(dirName):
if __name__ == '__main__':
if checkAll(sys.argv[1]):
print()
print('Broken javadocs links were found!')
print('Broken javadocs links were found! Common root causes:')
# please feel free to add to this list
print('* A typo of some sort for manually created links.')
print('* Public methods referencing non-public classes in their signature.')
sys.exit(1)
sys.exit(0)


@ -707,8 +707,10 @@ def verifyUnpacked(java, project, artifact, unpackPath, gitRevision, version, te
print(' %s' % line.strip())
raise RuntimeError('source release has WARs...')
print(' run "ant validate"')
java.run_java8('ant validate', '%s/validate.log' % unpackPath)
# Can't run documentation-lint in lucene src, because dev-tools is missing
validateCmd = 'ant validate' if project == 'lucene' else 'ant validate documentation-lint';
print(' run "%s"' % validateCmd)
java.run_java8(validateCmd, '%s/validate.log' % unpackPath)
if project == 'lucene':
print(" run tests w/ Java 8 and testArgs='%s'..." % testArgs)


@ -50,16 +50,31 @@ API Changes
* LUCENE-7701: Grouping collectors have been refactored, such that groups are
now defined by a GroupSelector implementation. (Alan Woodward)
* LUCENE-7741: DoubleValuesSource now has an explain() method (Alan Woodward,
Adrien Grand)
* LUCENE-7815: Removed the PostingsHighlighter; you should use the UnifiedHighlighter
instead, which was derived from it. WholeBreakIterator and
CustomSeparatorBreakIterator were moved to the UnifiedHighlighter's package. (David Smiley)
* LUCENE-7850: Removed support for legacy numerics. (Adrien Grand)
Bug Fixes
* LUCENE-7626: IndexWriter will no longer accept broken token offsets
(Mike McCandless)
* LUCENE-7859: Spatial-extras PackedQuadPrefixTree bug that only revealed itself
with the new pointsOnly optimizations in LUCENE-7845. (David Smiley)
Improvements
* LUCENE-7489: Better storage of sparse doc-values fields with the default
codec. (Adrien Grand)
* LUCENE-7730: More accurate encoding of the length normalization factor
thanks to the removal of index-time boosts. (Adrien Grand)
Optimizations
* LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
@ -78,6 +93,10 @@ Optimizations
values using different numbers of bits per value if this proves to save
storage. (Adrien Grand)
* LUCENE-7845: Enhance spatial-extras RecursivePrefixTreeStrategy queries when the
query is a point (for 2D) or is a simple date interval (e.g. 1 month). When
the strategy is marked as pointsOnly, the result is a TermQuery. (David Smiley)
Other
* LUCENE-7328: Remove LegacyNumericEncoding from GeoPointField. (Nick Knize)
@ -89,14 +108,76 @@ Other
* LUCENE-7753: Make fields static when possible.
(Daniel Jelinski via Adrien Grand)
* LUCENE-7540: Upgrade ICU to 59.1 (Mike McCandless, Jim Ferenczi)
* LUCENE-7852: Correct copyright year(s) in lucene/LICENSE.txt file.
(Christine Poerschke, Steve Rowe)
======================= Lucene 6.7.0 =======================
Other
* LUCENE-7800: Remove code that potentially rethrows checked exceptions
from methods that don't declare them ("sneaky throw" hack). (Robert Muir,
Uwe Schindler, Dawid Weiss)
Improvements
* LUCENE-7841: Normalize ґ to г in Ukrainian analyzer. (Andriy Rysin via Dawid Weiss)
======================= Lucene 6.6.0 =======================
New Features
* LUCENE-7811: Add a concurrent SortedSet facets implementation.
(Mike McCandless)
Bug Fixes
* LUCENE-7777: ByteBlockPool.readBytes sometimes throws
ArrayIndexOutOfBoundsException when byte blocks larger than 32 KB
were added (Mike McCandless)
* LUCENE-7797: The static FSDirectory.listAll(Path) method was always
returning an empty array. (Atkins Chang via Mike McCandless)
* LUCENE-7481: Fixed missing rewrite methods for SpanPayloadCheckQuery
and PayloadScoreQuery. (Erik Hatcher)
* LUCENE-7808: Fixed PayloadScoreQuery and SpanPayloadCheckQuery
.equals and .hashCode methods. (Erik Hatcher)
* LUCENE-7798: Add .equals and .hashCode to ToParentBlockJoinSortField
(Mikhail Khludnev)
* LUCENE-7814: DateRangePrefixTree (in spatial-extras) had edge-case bugs for
years >= 292,000,000. (David Smiley)
* LUCENE-5365, LUCENE-7818: Fix incorrect condition in queryparser's
QueryNodeOperation#logicalAnd(). (Olivier Binda, Amrit Sarkar,
AppChecker via Uwe Schindler)
* LUCENE-7821: The classic and flexible query parsers, as well as Solr's
"lucene"/standard query parser, should require " TO " in range queries,
and accept "TO" as endpoints in range queries. (hossman, Steve Rowe)
* LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (eg. new york, new york city).
(Jim Ferenczi)
* LUCENE-7817: Pass cached query to onQueryCache instead of null.
(Christoph Kaser via Adrien Grand)
* LUCENE-7831: CodecUtil should not seek to negative offsets. (Adrien Grand)
* LUCENE-7833: ToParentBlockJoinQuery computed the min score instead of the max
score with ScoreMode.MAX. (Adrien Grand)
* LUCENE-7847: Fixed all-docs-match optimization of range queries on range
fields. (Adrien Grand)
* LUCENE-7810: Fix equals() and hashCode() methods of several join queries.
(Hossman, Adrien Grand, Martijn van Groningen)
Improvements
* LUCENE-7782: OfflineSorter now passes the total number of items it
@ -105,6 +186,16 @@ Improvements
* LUCENE-7785: Move dictionary for Ukrainian analyzer to external dependency.
(Andriy Rysin via Steve Rowe, Dawid Weiss)
* LUCENE-7801: SortedSetDocValuesReaderState now implements
Accountable so you can see how much RAM it's using (Robert Muir,
Mike McCandless)
* LUCENE-7792: OfflineSorter can now run concurrently if you pass it
an optional ExecutorService (Dawid Weiss, Mike McCandless)
* LUCENE-7811: Sorted set facets now use sparse storage when
collecting hits, when appropriate. (Mike McCandless)
Optimizations
* LUCENE-7787: spatial-extras HeatmapFacetCounter will now short-circuit its
@ -112,6 +203,12 @@ Optimizations
Other
* LUCENE-7796: Make IOUtils.reThrow idiom declare Error return type so
callers may use it in a way that compiler knows subsequent code is
unreachable. reThrow is now deprecated in favor of IOUtils.rethrowAlways
with a slightly different semantics (see javadoc). (Hossman, Robert Muir,
Dawid Weiss)
* LUCENE-7754: Inner classes should be static whenever possible.
(Daniel Jelinski via Adrien Grand)
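Not part of the CHANGES text above; a hedged sketch of the rethrowAlways idiom described in the LUCENE-7796 entry. Only IOUtils.rethrowAlways is real API here; the surrounding method and names are illustrative.
import java.io.IOException;
import org.apache.lucene.util.IOUtils;

class RethrowIdiomSketch {
  static int readOrFail(int value, Throwable priorException) throws IOException {
    if (priorException != null) {
      // rethrowAlways never returns normally; because it is declared to return
      // Error, writing "throw ..." tells the compiler this path is terminated,
      // so no dummy return is needed after it.
      throw IOUtils.rethrowAlways(priorException);
    }
    return value;
  }
}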


@ -74,3 +74,9 @@ collecting TopDocs for each group, but instead takes a GroupReducer that will
perform any type of reduction on the top groups collected on a first-pass. To
reproduce the old behaviour of SecondPassGroupingCollector, you should instead
use TopGroupsCollector.
## Removed legacy numerics (LUCENE-7850)
Support for legacy numerics has been removed since legacy numerics had been
deprecated since Lucene 6.0. Points should be used instead, see
org.apache.lucene.index.PointValues for an introduction.
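Not part of the migration notes above, but as a hedged illustration of the points replacement they point to: the sketch below swaps LegacyIntField/LegacyNumericRangeQuery for IntPoint and IntPoint.newRangeQuery. The class and field names are illustrative only.
import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.Query;

public class PointsMigrationSketch {
  // Index-time: a point field is index-only, so also add a StoredField
  // if the value must still be retrievable from the document.
  static Document makeDoc(int id) {
    Document doc = new Document();
    doc.add(new IntPoint("trieInt", id));     // replaces new LegacyIntField("trieInt", id, Field.Store.NO)
    doc.add(new StoredField("trieInt", id));  // optional stored copy of the value
    return doc;
  }

  // Query-time: replaces LegacyNumericRangeQuery.newIntRange(...); bounds are inclusive.
  static Query rangeQuery(int lower, int upper) {
    return IntPoint.newRangeQuery("trieInt", lower, upper);
  }
}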


@ -1,5 +1,5 @@
Apache Lucene
Copyright 2014 The Apache Software Foundation
Copyright 2001-2017 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
@ -18,13 +18,13 @@ Some data files (under analysis/icu/src/data) are derived from Unicode data such
as the Unicode Character Database. See http://unicode.org/copyright.html for more
details.
Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
automatically generated with the moman/finenight FSA library, created by
Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
see http://sites.google.com/site/rrettesite/moman and
see http://sites.google.com/site/rrettesite/moman and
http://bitbucket.org/jpbarrette/moman/overview/
The class org.apache.lucene.util.WeakIdentityMap was derived from
@ -78,7 +78,7 @@ analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.ja
analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
The Stempel analyzer (stempel) includes BSD-licensed software developed
The Stempel analyzer (stempel) includes BSD-licensed software developed
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
and Edmond Nolan.
@ -90,8 +90,8 @@ See http://project.carrot2.org/license.html.
The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
is derived from Unicode data such as the Unicode Character Database.
See http://unicode.org/copyright.html for more details.
The Morfologik analyzer (morfologik) includes BSD-licensed software


@ -24,6 +24,8 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* Emits the entire input as a single token.
*/
@ -41,16 +43,16 @@ public final class KeywordTokenizer extends Tokenizer {
}
public KeywordTokenizer(int bufferSize) {
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
}
termAtt.resizeBuffer(bufferSize);
}
public KeywordTokenizer(AttributeFactory factory, int bufferSize) {
super(factory);
if (bufferSize <= 0) {
throw new IllegalArgumentException("bufferSize must be > 0");
if (bufferSize > MAX_TOKEN_LENGTH_LIMIT || bufferSize <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + bufferSize);
}
termAtt.resizeBuffer(bufferSize);
}


@ -16,26 +16,39 @@
*/
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.util.Map;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* Factory for {@link KeywordTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.KeywordTokenizerFactory"/&gt;
* &lt;tokenizer class="solr.KeywordTokenizerFactory" maxTokenLen="256"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* Options:
* <ul>
* <li>maxTokenLen: max token length, should be greater than 0 and less than
* MAX_TOKEN_LENGTH_LIMIT (1024*1024). It is rare to need to change this
* else {@link KeywordTokenizer}::DEFAULT_BUFFER_SIZE</li>
* </ul>
*/
public class KeywordTokenizerFactory extends TokenizerFactory {
private final int maxTokenLen;
/** Creates a new KeywordTokenizerFactory */
public KeywordTokenizerFactory(Map<String,String> args) {
super(args);
maxTokenLen = getInt(args, "maxTokenLen", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -43,6 +56,6 @@ public class KeywordTokenizerFactory extends TokenizerFactory {
@Override
public KeywordTokenizer create(AttributeFactory factory) {
return new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
return new KeywordTokenizer(factory, maxTokenLen);
}
}
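Not part of the diff above; a minimal sketch of how the new maxTokenLen option is consumed programmatically, mirroring what the new factory tests further down exercise. The class and argument values are illustrative; for KeywordTokenizer, maxTokenLen only sizes the initial term buffer and the whole input is still emitted as a single token.
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class KeywordFactorySketch {
  public static void main(String[] args) {
    Map<String, String> params = new HashMap<>();
    params.put("maxTokenLen", "256");  // must be > 0 and < 1048576, else the factory throws
    KeywordTokenizerFactory factory = new KeywordTokenizerFactory(params);
    Tokenizer tokenizer = factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    System.out.println(tokenizer.getClass().getSimpleName());  // KeywordTokenizer
  }
}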


@ -50,6 +50,20 @@ public class LetterTokenizer extends CharTokenizer {
super(factory);
}
/**
* Construct a new LetterTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public LetterTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}
/** Collects only characters which satisfy
* {@link Character#isLetter(int)}.*/
@Override


@ -17,25 +17,40 @@
package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.util.Map;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* Factory for {@link LetterTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.LetterTokenizerFactory"/&gt;
* &lt;tokenizer class="solr.LetterTokenizerFactory" maxTokenLen="256"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* Options:
* <ul>
* <li>maxTokenLen: max token length, must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
* It is rare to need to change this
* else {@link CharTokenizer}::DEFAULT_MAX_TOKEN_LEN</li>
* </ul>
*/
public class LetterTokenizerFactory extends TokenizerFactory {
private final int maxTokenLen;
/** Creates a new LetterTokenizerFactory */
public LetterTokenizerFactory(Map<String,String> args) {
super(args);
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -43,6 +58,6 @@ public class LetterTokenizerFactory extends TokenizerFactory {
@Override
public LetterTokenizer create(AttributeFactory factory) {
return new LetterTokenizer(factory);
return new LetterTokenizer(factory, maxTokenLen);
}
}


@ -50,6 +50,19 @@ public final class LowerCaseTokenizer extends LetterTokenizer {
super(factory);
}
/**
* Construct a new LowerCaseTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public LowerCaseTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}
/** Converts char to lower case
* {@link Character#toLowerCase(int)}.*/
@Override


@ -18,6 +18,7 @@ package org.apache.lucene.analysis.core;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
@ -25,20 +26,36 @@ import org.apache.lucene.util.AttributeFactory;
import java.util.HashMap;
import java.util.Map;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* Factory for {@link LowerCaseTokenizer}.
* Factory for {@link LowerCaseTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.LowerCaseTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.LowerCaseTokenizerFactory" maxTokenLen="256"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* <p>
* Options:
* <ul>
* <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
* It is rare to need to change this
* else {@link CharTokenizer}::DEFAULT_MAX_WORD_LEN</li>
* </ul>
*/
public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent {
/** Creates a new LowerCaseTokenizerFactory */
public LowerCaseTokenizerFactory(Map<String,String> args) {
private final int maxTokenLen;
/**
* Creates a new LowerCaseTokenizerFactory
*/
public LowerCaseTokenizerFactory(Map<String, String> args) {
super(args);
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -46,11 +63,13 @@ public class LowerCaseTokenizerFactory extends TokenizerFactory implements Multi
@Override
public LowerCaseTokenizer create(AttributeFactory factory) {
return new LowerCaseTokenizer(factory);
return new LowerCaseTokenizer(factory, maxTokenLen);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return new LowerCaseFilterFactory(new HashMap<>(getOriginalArgs()));
Map map = new HashMap<>(getOriginalArgs());
map.remove("maxTokenLen"); //removing "maxTokenLen" argument for LowerCaseFilterFactory init
return new LowerCaseFilterFactory(map);
}
}


@ -58,7 +58,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* <ul>
* <li><code>wordset</code> - This is the default format, which supports one word per
* line (including any intra-word whitespace) and allows whole line comments
* begining with the "#" character. Blank lines are ignored. See
* beginning with the "#" character. Blank lines are ignored. See
* {@link WordlistLoader#getLines WordlistLoader.getLines} for details.
* </li>
* <li><code>snowball</code> - This format allows for multiple words specified on each


@ -47,6 +47,19 @@ public final class UnicodeWhitespaceTokenizer extends CharTokenizer {
public UnicodeWhitespaceTokenizer(AttributeFactory factory) {
super(factory);
}
/**
* Construct a new UnicodeWhitespaceTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public UnicodeWhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}
/** Collects only characters which do not satisfy Unicode's WHITESPACE property. */
@Override


@ -46,6 +46,19 @@ public final class WhitespaceTokenizer extends CharTokenizer {
public WhitespaceTokenizer(AttributeFactory factory) {
super(factory);
}
/**
* Construct a new WhitespaceTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public WhitespaceTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory, maxTokenLen);
}
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(int)}.*/


@ -22,15 +22,18 @@ import java.util.Collection;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* Factory for {@link WhitespaceTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode"/&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode" maxTokenLen="256"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
@ -38,6 +41,9 @@ import org.apache.lucene.util.AttributeFactory;
* <ul>
* <li>rule: either "java" for {@link WhitespaceTokenizer}
* or "unicode" for {@link UnicodeWhitespaceTokenizer}</li>
* <li>maxTokenLen: max token length, should be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024).
* It is rare to need to change this
* else {@link CharTokenizer}::DEFAULT_MAX_TOKEN_LEN</li>
* </ul>
*/
public class WhitespaceTokenizerFactory extends TokenizerFactory {
@ -46,13 +52,17 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
private static final Collection<String> RULE_NAMES = Arrays.asList(RULE_JAVA, RULE_UNICODE);
private final String rule;
private final int maxTokenLen;
/** Creates a new WhitespaceTokenizerFactory */
public WhitespaceTokenizerFactory(Map<String,String> args) {
super(args);
rule = get(args, "rule", RULE_NAMES, RULE_JAVA);
maxTokenLen = getInt(args, "maxTokenLen", CharTokenizer.DEFAULT_MAX_WORD_LEN);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -62,9 +72,9 @@ public class WhitespaceTokenizerFactory extends TokenizerFactory {
public Tokenizer create(AttributeFactory factory) {
switch (rule) {
case RULE_JAVA:
return new WhitespaceTokenizer(factory);
return new WhitespaceTokenizer(factory, maxTokenLen);
case RULE_UNICODE:
return new UnicodeWhitespaceTokenizer(factory);
return new UnicodeWhitespaceTokenizer(factory, maxTokenLen);
default:
throw new AssertionError();
}


@ -33,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;
import static org.apache.lucene.analysis.standard.StandardTokenizer.MAX_TOKEN_LENGTH_LIMIT;
/**
* An abstract base class for simple, character-oriented tokenizers.
* <p>
@ -50,6 +52,7 @@ public abstract class CharTokenizer extends Tokenizer {
* Creates a new {@link CharTokenizer} instance
*/
public CharTokenizer() {
this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
}
/**
@ -60,6 +63,23 @@ public abstract class CharTokenizer extends Tokenizer {
*/
public CharTokenizer(AttributeFactory factory) {
super(factory);
this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
}
/**
* Creates a new {@link CharTokenizer} instance
*
* @param factory the attribute factory to use for this {@link Tokenizer}
* @param maxTokenLen maximum token length the tokenizer will emit.
* Must be greater than 0 and less than MAX_TOKEN_LENGTH_LIMIT (1024*1024)
* @throws IllegalArgumentException if maxTokenLen is invalid.
*/
public CharTokenizer(AttributeFactory factory, int maxTokenLen) {
super(factory);
if (maxTokenLen > MAX_TOKEN_LENGTH_LIMIT || maxTokenLen <= 0) {
throw new IllegalArgumentException("maxTokenLen must be greater than 0 and less than " + MAX_TOKEN_LENGTH_LIMIT + " passed: " + maxTokenLen);
}
this.maxTokenLen = maxTokenLen;
}
/**
@ -193,9 +213,10 @@ public abstract class CharTokenizer extends Tokenizer {
}
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255;
public static final int DEFAULT_MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final int maxTokenLen;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -256,7 +277,7 @@ public abstract class CharTokenizer extends Tokenizer {
}
end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
if (length >= maxTokenLen) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}
} else if (length > 0) { // at non-Letter w/ chars
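Not part of the diff; a hedged end-to-end sketch of the maxTokenLen constructor plumbed through CharTokenizer above. With a limit of 5, an 8-character run is emitted as two tokens; the names and values are illustrative.
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class MaxTokenLenSketch {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer =
        new WhitespaceTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 5);
    tokenizer.setReader(new StringReader("aaaaaaaa bb"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString());  // prints "aaaaa", "aaa", "bb"
    }
    tokenizer.end();
    tokenizer.close();
  }
}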


@ -24,15 +24,15 @@ import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was created using ICU4J v56.1.0.0
* The data was created using ICU4J v59.1.0.0
* <p>
* Unicode version: 8.0.0.0
* Unicode version: 9.0.0.0
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "8.0.0.0";
public static final String UNICODE_VERSION = "9.0.0.0";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(


@ -31,6 +31,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package org.tartarus.snowball;
import java.lang.reflect.UndeclaredThrowableException;
import org.apache.lucene.util.ArrayUtil;
/**
@ -313,8 +315,10 @@ public abstract class SnowballProgram {
boolean res = false;
try {
res = (boolean) w.method.invokeExact(this);
} catch (Error | RuntimeException e) {
throw e;
} catch (Throwable e) {
rethrow(e);
throw new UndeclaredThrowableException(e);
}
cursor = c + w.s_size;
if (res) return w.result;
@ -376,8 +380,10 @@ public abstract class SnowballProgram {
boolean res = false;
try {
res = (boolean) w.method.invokeExact(this);
} catch (Error | RuntimeException e) {
throw e;
} catch (Throwable e) {
rethrow(e);
throw new UndeclaredThrowableException(e);
}
cursor = c - w.s_size;
if (res) return w.result;
@ -485,15 +491,5 @@ extern void debug(struct SN_env * z, int number, int line_count)
printf("'\n");
}
*/
// Hack to rethrow unknown Exceptions from {@link MethodHandle#invoke}:
private static void rethrow(Throwable t) {
SnowballProgram.<Error>rethrow0(t);
}
@SuppressWarnings("unchecked")
private static <T extends Throwable> void rethrow0(Throwable t) throws T {
throw (T) t;
}
};


@ -53,7 +53,7 @@
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate begining or ending
to zero. The '.' character is reserved to indicate beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>


@ -54,7 +54,7 @@
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate begining or ending
to zero. The '.' character is reserved to indicate beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>


@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.AttributeFactory;
public class TestKeywordTokenizer extends BaseTokenStreamTestCase {
public void testSimple() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
KeywordTokenizer tokenizer = new KeywordTokenizer();
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[]{"Tokenizer \ud801\udc1ctest"});
}
public void testFactory() {
Map<String, String> args = new HashMap<>();
KeywordTokenizerFactory factory = new KeywordTokenizerFactory(args);
AttributeFactory attributeFactory = newAttributeFactory();
Tokenizer tokenizer = factory.create(attributeFactory);
assertEquals(KeywordTokenizer.class, tokenizer.getClass());
}
private Map<String, String> makeArgs(String... args) {
Map<String, String> ret = new HashMap<>();
for (int idx = 0; idx < args.length; idx += 2) {
ret.put(args[idx], args[idx + 1]);
}
return ret;
}
public void testParamsFactory() throws IOException {
// negative maxTokenLen
IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
new KeywordTokenizerFactory(makeArgs("maxTokenLen", "-1")));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
// zero maxTokenLen
iae = expectThrows(IllegalArgumentException.class, () ->
new KeywordTokenizerFactory(makeArgs("maxTokenLen", "0")));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
// Added random param, should throw illegal error
iae = expectThrows(IllegalArgumentException.class, () ->
new KeywordTokenizerFactory(makeArgs("maxTokenLen", "255", "randomParam", "rValue")));
assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
// tokeniser will never split, no matter what is passed,
// but the buffer will not be more than length of the token
KeywordTokenizerFactory factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "5"));
AttributeFactory attributeFactory = newAttributeFactory();
Tokenizer tokenizer = factory.create(attributeFactory);
StringReader reader = new StringReader("Tokenizertest");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[]{"Tokenizertest"});
// tokeniser will never split, no matter what is passed,
// but the buffer will not be more than length of the token
factory = new KeywordTokenizerFactory(makeArgs("maxTokenLen", "2"));
attributeFactory = newAttributeFactory();
tokenizer = factory.create(attributeFactory);
reader = new StringReader("Tokenizer\u00A0test");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[]{"Tokenizer\u00A0test"});
}
}


@ -54,4 +54,55 @@ public class TestUnicodeWhitespaceTokenizer extends BaseTokenStreamTestCase {
assertEquals(UnicodeWhitespaceTokenizer.class, tokenizer.getClass());
}
private Map<String, String> makeArgs(String... args) {
Map<String, String> ret = new HashMap<>();
for (int idx = 0; idx < args.length; idx += 2) {
ret.put(args[idx], args[idx + 1]);
}
return ret;
}
public void testParamsFactory() throws IOException {
// negative maxTokenLen
IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, () ->
new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "-1")));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
// zero maxTokenLen
iae = expectThrows(IllegalArgumentException.class, () ->
new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "0")));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
// Added random param, should throw illegal error
iae = expectThrows(IllegalArgumentException.class, () ->
new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "255", "randomParam", "rValue")));
assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
// tokeniser will split at 5, Token | izer, no matter what happens
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "5"));
AttributeFactory attributeFactory = newAttributeFactory();
Tokenizer tokenizer = factory.create(attributeFactory);
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[]{"Token", "izer", "\ud801\udc1ctes", "t"});
// tokeniser will split at 2, To | ke | ni | ze | r, no matter what happens
factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "2"));
attributeFactory = newAttributeFactory();
tokenizer = factory.create(attributeFactory);
reader = new StringReader("Tokenizer\u00A0test");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[]{"To", "ke", "ni", "ze", "r", "te", "st"});
// tokeniser will split at 10, no matter what happens,
// but tokens' length are less than that
factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "10"));
attributeFactory = newAttributeFactory();
tokenizer = factory.create(attributeFactory);
reader = new StringReader("Tokenizer\u00A0test");
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[]{"Tokenizer", "test"});
}
}


@ -25,8 +25,10 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;
@ -89,6 +91,99 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase {
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
/*
* tests the max word length passed as parameter - tokenizer will split at the passed position char no matter what happens
*/
public void testCustomMaxTokenLength() throws IOException {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < 100; i++) {
builder.append("A");
}
Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
// Tricky, passing two copies of the string to the reader....
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT),
builder.toString().toLowerCase(Locale.ROOT) });
Exception e = expectThrows(IllegalArgumentException.class, () ->
new LowerCaseTokenizer(newAttributeFactory(), -1));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString(), builder.toString()});
// Let's test that we can get a token longer than 255 through.
builder.setLength(0);
for (int i = 0; i < 500; i++) {
builder.append("Z");
}
tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
// Just to be sure what is happening here, token lengths of zero make no sense,
// Let's try the edge cases, token > I/O buffer (4096)
builder.setLength(0);
for (int i = 0; i < 600; i++) {
builder.append("aUrOkIjq"); // 600 * 8 = 4800 chars.
}
e = expectThrows(IllegalArgumentException.class, () ->
new LowerCaseTokenizer(newAttributeFactory(), 0));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () ->
new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString().toLowerCase(Locale.ROOT)});
e = expectThrows(IllegalArgumentException.class, () ->
new KeywordTokenizer(newAttributeFactory(), 0));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () ->
new KeywordTokenizer(newAttributeFactory(), 10_000_000));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
e = expectThrows(IllegalArgumentException.class, () ->
new LetterTokenizer(newAttributeFactory(), 0));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () ->
new LetterTokenizer(newAttributeFactory(), 2_000_000));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
e = expectThrows(IllegalArgumentException.class, () ->
new WhitespaceTokenizer(newAttributeFactory(), 0));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
e = expectThrows(IllegalArgumentException.class, () ->
new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
tokenizer.setReader(new StringReader(builder.toString()));
assertTokenStreamContents(tokenizer, new String[]{builder.toString()});
}
/*
* tests the max word length of 255 with a surrogate pair at position 255

View File

@ -168,11 +168,14 @@ FFE3>
1134D>
11366..1136C>
11370..11374>
11442>
11446>
114C2..114C3>
115BF..115C0>
1163F>
116B6..116B7>
1172B>
11C3F>
16AF0..16AF4>
16F8F..16F9F>
1D167..1D169>
@ -181,6 +184,8 @@ FFE3>
1D185..1D18B>
1D1AA..1D1AD>
1E8D0..1E8D6>
1E944..1E946>
1E948..1E94A>
# Latin script "composed" that do not further decompose, so decompose here
# These are from AsciiFoldingFilter


@ -510,6 +510,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
112F7>0037 # KHUDAWADI DIGIT SEVEN
112F8>0038 # KHUDAWADI DIGIT EIGHT
112F9>0039 # KHUDAWADI DIGIT NINE
11450>0030 # NEWA DIGIT ZERO
11451>0031 # NEWA DIGIT ONE
11452>0032 # NEWA DIGIT TWO
11453>0033 # NEWA DIGIT THREE
11454>0034 # NEWA DIGIT FOUR
11455>0035 # NEWA DIGIT FIVE
11456>0036 # NEWA DIGIT SIX
11457>0037 # NEWA DIGIT SEVEN
11458>0038 # NEWA DIGIT EIGHT
11459>0039 # NEWA DIGIT NINE
114D0>0030 # TIRHUTA DIGIT ZERO
114D1>0031 # TIRHUTA DIGIT ONE
114D2>0032 # TIRHUTA DIGIT TWO
@ -560,6 +570,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
118E7>0037 # WARANG CITI DIGIT SEVEN
118E8>0038 # WARANG CITI DIGIT EIGHT
118E9>0039 # WARANG CITI DIGIT NINE
11C50>0030 # BHAIKSUKI DIGIT ZERO
11C51>0031 # BHAIKSUKI DIGIT ONE
11C52>0032 # BHAIKSUKI DIGIT TWO
11C53>0033 # BHAIKSUKI DIGIT THREE
11C54>0034 # BHAIKSUKI DIGIT FOUR
11C55>0035 # BHAIKSUKI DIGIT FIVE
11C56>0036 # BHAIKSUKI DIGIT SIX
11C57>0037 # BHAIKSUKI DIGIT SEVEN
11C58>0038 # BHAIKSUKI DIGIT EIGHT
11C59>0039 # BHAIKSUKI DIGIT NINE
16A60>0030 # MRO DIGIT ZERO
16A61>0031 # MRO DIGIT ONE
16A62>0032 # MRO DIGIT TWO
@ -580,4 +600,14 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
16B59>0039 # PAHAWH HMONG DIGIT NINE
1E950>0030 # ADLAM DIGIT ZERO
1E951>0031 # ADLAM DIGIT ONE
1E952>0032 # ADLAM DIGIT TWO
1E953>0033 # ADLAM DIGIT THREE
1E954>0034 # ADLAM DIGIT FOUR
1E955>0035 # ADLAM DIGIT FIVE
1E956>0036 # ADLAM DIGIT SIX
1E957>0037 # ADLAM DIGIT SEVEN
1E958>0038 # ADLAM DIGIT EIGHT
1E959>0039 # ADLAM DIGIT NINE


@ -1,4 +1,4 @@
# Copyright (C) 1999-2014, International Business Machines
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 7.0.0
* Unicode 9.0.0
# Canonical_Combining_Class (ccc) values
0300..0314:230
@ -129,6 +129,8 @@
0825..0827:230
0829..082D:230
0859..085B:220
08D4..08E1:230
08E3:220
08E4..08E5:230
08E6:220
08E7..08E8:230
@ -232,6 +234,7 @@
1DCF:220
1DD0:202
1DD1..1DF5:230
1DFB:230
1DFC:233
1DFD:220
1DFE:230
@ -260,7 +263,7 @@
3099..309A:8
A66F:230
A674..A67D:230
A69F:230
A69E..A69F:230
A6F0..A6F1:230
A806:9
A8C4:9
@ -280,6 +283,7 @@ ABED:9
FB1E:26
FE20..FE26:230
FE27..FE2D:220
FE2E..FE2F:230
101FD:220
102E0:220
10376..1037A:230
@ -299,6 +303,7 @@ FE27..FE2D:220
11133..11134:9
11173:7
111C0:9
111CA:7
11235:9
11236:7
112E9:7
@ -307,6 +312,8 @@ FE27..FE2D:220
1134D:9
11366..1136C:230
11370..11374:230
11442:9
11446:7
114C2:9
114C3:7
115BF:9
@ -314,6 +321,8 @@ FE27..FE2D:220
1163F:9
116B6:9
116B7:7
1172B:9
11C3F:9
16AF0..16AF4:1
16B30..16B36:230
1BC9E:1
@ -326,7 +335,14 @@ FE27..FE2D:220
1D18A..1D18B:220
1D1AA..1D1AD:230
1D242..1D244:230
1E000..1E006:230
1E008..1E018:230
1E01B..1E021:230
1E023..1E024:230
1E026..1E02A:230
1E8D0..1E8D6:220
1E944..1E949:230
1E94A:7
# Canonical decomposition mappings
00C0>0041 0300 # one-way: diacritic 0300


@ -1,4 +1,4 @@
# Copyright (C) 1999-2014, International Business Machines
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 7.0.0
* Unicode 9.0.0
00A0>0020
00A8>0020 0308
@ -3675,6 +3675,7 @@ FFEE>25CB
1F238>7533
1F239>5272
1F23A>55B6
1F23B>914D
1F240>3014 672C 3015
1F241>3014 4E09 3015
1F242>3014 4E8C 3015


@ -1,5 +1,5 @@
# Unicode Character Database
# Copyright (c) 1991-2014 Unicode, Inc.
# Copyright (c) 1991-2016 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 7.0.0
* Unicode 9.0.0
0041>0061
0042>0062
@ -632,8 +632,22 @@
10CD>2D2D
10FC>10DC
115F..1160>
13F8>13F0
13F9>13F1
13FA>13F2
13FB>13F3
13FC>13F4
13FD>13F5
17B4..17B5>
180B..180E>
1C80>0432
1C81>0434
1C82>043E
1C83>0441
1C84..1C85>0442
1C86>044A
1C87>0463
1C88>A64B
1D2C>0061
1D2D>00E6
1D2E>0062
@ -2382,14 +2396,99 @@ A7AA>0266
A7AB>025C
A7AC>0261
A7AD>026C
A7AE>026A
A7B0>029E
A7B1>0287
A7B2>029D
A7B3>AB53
A7B4>A7B5
A7B6>A7B7
A7F8>0127
A7F9>0153
AB5C>A727
AB5D>AB37
AB5E>026B
AB5F>AB52
AB70>13A0
AB71>13A1
AB72>13A2
AB73>13A3
AB74>13A4
AB75>13A5
AB76>13A6
AB77>13A7
AB78>13A8
AB79>13A9
AB7A>13AA
AB7B>13AB
AB7C>13AC
AB7D>13AD
AB7E>13AE
AB7F>13AF
AB80>13B0
AB81>13B1
AB82>13B2
AB83>13B3
AB84>13B4
AB85>13B5
AB86>13B6
AB87>13B7
AB88>13B8
AB89>13B9
AB8A>13BA
AB8B>13BB
AB8C>13BC
AB8D>13BD
AB8E>13BE
AB8F>13BF
AB90>13C0
AB91>13C1
AB92>13C2
AB93>13C3
AB94>13C4
AB95>13C5
AB96>13C6
AB97>13C7
AB98>13C8
AB99>13C9
AB9A>13CA
AB9B>13CB
AB9C>13CC
AB9D>13CD
AB9E>13CE
AB9F>13CF
ABA0>13D0
ABA1>13D1
ABA2>13D2
ABA3>13D3
ABA4>13D4
ABA5>13D5
ABA6>13D6
ABA7>13D7
ABA8>13D8
ABA9>13D9
ABAA>13DA
ABAB>13DB
ABAC>13DC
ABAD>13DD
ABAE>13DE
ABAF>13DF
ABB0>13E0
ABB1>13E1
ABB2>13E2
ABB3>13E3
ABB4>13E4
ABB5>13E5
ABB6>13E6
ABB7>13E7
ABB8>13E8
ABB9>13E9
ABBA>13EA
ABBB>13EB
ABBC>13EC
ABBD>13ED
ABBE>13EE
ABBF>13EF
F900>8C48
F901>66F4
F902>8ECA
@ -3766,6 +3865,93 @@ FFF0..FFF8>
10425>1044D
10426>1044E
10427>1044F
104B0>104D8
104B1>104D9
104B2>104DA
104B3>104DB
104B4>104DC
104B5>104DD
104B6>104DE
104B7>104DF
104B8>104E0
104B9>104E1
104BA>104E2
104BB>104E3
104BC>104E4
104BD>104E5
104BE>104E6
104BF>104E7
104C0>104E8
104C1>104E9
104C2>104EA
104C3>104EB
104C4>104EC
104C5>104ED
104C6>104EE
104C7>104EF
104C8>104F0
104C9>104F1
104CA>104F2
104CB>104F3
104CC>104F4
104CD>104F5
104CE>104F6
104CF>104F7
104D0>104F8
104D1>104F9
104D2>104FA
104D3>104FB
10C80>10CC0
10C81>10CC1
10C82>10CC2
10C83>10CC3
10C84>10CC4
10C85>10CC5
10C86>10CC6
10C87>10CC7
10C88>10CC8
10C89>10CC9
10C8A>10CCA
10C8B>10CCB
10C8C>10CCC
10C8D>10CCD
10C8E>10CCE
10C8F>10CCF
10C90>10CD0
10C91>10CD1
10C92>10CD2
10C93>10CD3
10C94>10CD4
10C95>10CD5
10C96>10CD6
10C97>10CD7
10C98>10CD8
10C99>10CD9
10C9A>10CDA
10C9B>10CDB
10C9C>10CDC
10C9D>10CDD
10C9E>10CDE
10C9F>10CDF
10CA0>10CE0
10CA1>10CE1
10CA2>10CE2
10CA3>10CE3
10CA4>10CE4
10CA5>10CE5
10CA6>10CE6
10CA7>10CE7
10CA8>10CE8
10CA9>10CE9
10CAA>10CEA
10CAB>10CEB
10CAC>10CEC
10CAD>10CED
10CAE>10CEE
10CAF>10CEF
10CB0>10CF0
10CB1>10CF1
10CB2>10CF2
118A0>118C0
118A1>118C1
118A2>118C2
@ -4803,6 +4989,40 @@ FFF0..FFF8>
1D7FD>0037
1D7FE>0038
1D7FF>0039
1E900>1E922
1E901>1E923
1E902>1E924
1E903>1E925
1E904>1E926
1E905>1E927
1E906>1E928
1E907>1E929
1E908>1E92A
1E909>1E92B
1E90A>1E92C
1E90B>1E92D
1E90C>1E92E
1E90D>1E92F
1E90E>1E930
1E90F>1E931
1E910>1E932
1E911>1E933
1E912>1E934
1E913>1E935
1E914>1E936
1E915>1E937
1E916>1E938
1E917>1E939
1E918>1E93A
1E919>1E93B
1E91A>1E93C
1E91B>1E93D
1E91C>1E93E
1E91D>1E93F
1E91E>1E940
1E91F>1E941
1E920>1E942
1E921>1E943
1EE00>0627
1EE01>0628
1EE02>062C
@ -5067,6 +5287,7 @@ FFF0..FFF8>
1F238>7533
1F239>5272
1F23A>55B6
1F23B>914D
1F240>3014 672C 3015
1F241>3014 4E09 3015
1F242>3014 4E8C 3015


@ -53,7 +53,14 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
new String[] { "", "购买", "", "道具", "", "服装" }
);
}
public void testTraditionalChinese() throws Exception {
assertAnalyzesTo(a, "我購買了道具和服裝。",
new String[] { "", "購買", "", "道具", "", "服裝"});
assertAnalyzesTo(a, "定義切分字串的基本單位是訂定分詞標準的首要工作", // From http://godel.iis.sinica.edu.tw/CKIP/paper/wordsegment_standard.pdf
new String[] { "定義", "", "", "字串", "", "基本", "單位", "", "訂定", "分詞", "標準", "", "首要", "工作" });
}
public void testChineseNumerics() throws Exception {
assertAnalyzesTo(a, "", new String[] { "" });
assertAnalyzesTo(a, "院內分機9483。",


@ -63,7 +63,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-54-1";
private static final String ICU_RELEASE_TAG = "release-58-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";


@ -116,6 +116,8 @@ public final class UkrainianMorfologikAnalyzer extends StopwordAnalyzerBase {
// ignored characters
builder.add("\u0301", "");
builder.add("\u00AD", "");
builder.add("ґ", "г");
builder.add("Ґ", "Г");
NormalizeCharMap normMap = builder.build();
reader = new MappingCharFilter(normMap, reader);


@ -52,10 +52,17 @@ public class TestUkrainianAnalyzer extends BaseTokenStreamTestCase {
public void testCapsTokenStream() throws Exception {
Analyzer a = new UkrainianMorfologikAnalyzer();
assertAnalyzesTo(a, "Цих Чайковського і Ґете.",
new String[] { "Чайковське", "Чайковський", "Ґете" });
new String[] { "Чайковське", "Чайковський", "Гете" });
a.close();
}
public void testCharNormalization() throws Exception {
Analyzer a = new UkrainianMorfologikAnalyzer();
assertAnalyzesTo(a, "Ґюмрі та Гюмрі.",
new String[] { "Гюмрі", "Гюмрі" });
a.close();
}
public void testSampleSentence() throws Exception {
Analyzer a = new UkrainianMorfologikAnalyzer();
assertAnalyzesTo(a, "Це — проект генерування словника з тегами частин мови для української мови.",


@ -60,10 +60,6 @@ import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.legacy.LegacyIntField;
import org.apache.lucene.legacy.LegacyLongField;
import org.apache.lucene.legacy.LegacyNumericRangeQuery;
import org.apache.lucene.legacy.LegacyNumericUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
@ -299,7 +295,9 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
"6.4.2-cfs",
"6.4.2-nocfs",
"6.5.0-cfs",
"6.5.0-nocfs"
"6.5.0-nocfs",
"6.5.1-cfs",
"6.5.1-nocfs"
};
final String[] unsupportedNames = {
@ -1112,9 +1110,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620 ab\ud917\udc17cd", customType2));
doc.add(new Field("content2", "here is more content with aaa aaa aaa", customType2));
doc.add(new Field("fie\u2C77ld", "field with non-ascii name", customType2));
// add numeric fields, to test if flex preserves encoding
doc.add(new LegacyIntField("trieInt", id, Field.Store.NO));
doc.add(new LegacyLongField("trieLong", (long) id, Field.Store.NO));
// add docvalues fields
doc.add(new NumericDocValuesField("dvByte", (byte) id));
@ -1292,51 +1287,6 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
}
}
public void testNumericFields() throws Exception {
for (String name : oldNames) {
Directory dir = oldIndexDirs.get(name);
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = newSearcher(reader);
for (int id=10; id<15; id++) {
ScoreDoc[] hits = searcher.search(LegacyNumericRangeQuery.newIntRange("trieInt", LegacyNumericUtils.PRECISION_STEP_DEFAULT_32, Integer.valueOf(id), Integer.valueOf(id), true, true), 100).scoreDocs;
assertEquals("wrong number of hits", 1, hits.length);
Document d = searcher.doc(hits[0].doc);
assertEquals(String.valueOf(id), d.get("id"));
hits = searcher.search(LegacyNumericRangeQuery.newLongRange("trieLong", LegacyNumericUtils.PRECISION_STEP_DEFAULT, Long.valueOf(id), Long.valueOf(id), true, true), 100).scoreDocs;
assertEquals("wrong number of hits", 1, hits.length);
d = searcher.doc(hits[0].doc);
assertEquals(String.valueOf(id), d.get("id"));
}
// check that also lower-precision fields are ok
ScoreDoc[] hits = searcher.search(LegacyNumericRangeQuery.newIntRange("trieInt", LegacyNumericUtils.PRECISION_STEP_DEFAULT_32, Integer.MIN_VALUE, Integer.MAX_VALUE, false, false), 100).scoreDocs;
assertEquals("wrong number of hits", 34, hits.length);
hits = searcher.search(LegacyNumericRangeQuery.newLongRange("trieLong", LegacyNumericUtils.PRECISION_STEP_DEFAULT, Long.MIN_VALUE, Long.MAX_VALUE, false, false), 100).scoreDocs;
assertEquals("wrong number of hits", 34, hits.length);
// check decoding of terms
Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "trieInt");
TermsEnum termsEnum = LegacyNumericUtils.filterPrefixCodedInts(terms.iterator());
while (termsEnum.next() != null) {
int val = LegacyNumericUtils.prefixCodedToInt(termsEnum.term());
assertTrue("value in id bounds", val >= 0 && val < 35);
}
terms = MultiFields.getTerms(searcher.getIndexReader(), "trieLong");
termsEnum = LegacyNumericUtils.filterPrefixCodedLongs(terms.iterator());
while (termsEnum.next() != null) {
long val = LegacyNumericUtils.prefixCodedToLong(termsEnum.term());
assertTrue("value in id bounds", val >= 0L && val < 35L);
}
reader.close();
}
}
private int checkAllSegmentsUpgraded(Directory dir, int indexCreatedVersion) throws IOException {
final SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
if (VERBOSE) {


@ -38,7 +38,7 @@ file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
highlighter=HlImpl:NONE:SH_A:UH_A:UH_P:UH_PV
{ "Populate"
CreateIndex
@ -60,6 +60,6 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
CloseReader
NewRound
} : 6
} : 5
RepSumByPrefRound HL


@ -42,7 +42,6 @@ import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
@ -133,8 +132,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
case "PH_P": hlImpl = new PostingsHLImpl(); break;
default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
}
}
@ -224,33 +221,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
return clone;
}
private class PostingsHLImpl implements HLImpl {
PostingsHighlighter highlighter;
String[] fields = hlFields.toArray(new String[hlFields.size()]);
int[] maxPassages;
PostingsHLImpl() {
highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
@Override
protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
return analyzer;
}
@Override
protected BreakIterator getBreakIterator(String field) {
return BreakIterator.getSentenceInstance(Locale.ENGLISH);
}
};
maxPassages = new int[hlFields.size()];
Arrays.fill(maxPassages, maxFrags);
}
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
preventOptimizeAway = result.size();
}
}
private class UnifiedHLImpl implements HLImpl {
UnifiedHighlighter highlighter;
IndexSearcher lastSearcher;

View File

@ -28,6 +28,8 @@
<path refid="base.classpath"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${grouping.jar}"/>
<pathelement path="${sandbox.jar}"/>
<pathelement path="${analyzers-common.jar}"/>
</path>
<path id="test.classpath">
@ -36,16 +38,18 @@
<path refid="test.base.classpath"/>
</path>
<target name="compile-core" depends="jar-grouping,jar-queries,jar-analyzers-common,common.compile-core" />
<target name="compile-core" depends="jar-sandbox,jar-grouping,jar-queries,jar-analyzers-common,common.compile-core" />
<target name="jar-core" depends="common.jar-core" />
<target name="javadocs" depends="javadocs-grouping,compile-core,check-javadocs-uptodate"
<target name="javadocs" depends="javadocs-sandbox,javadocs-grouping,compile-core,check-javadocs-uptodate"
unless="javadocs-uptodate-${name}">
<invoke-module-javadoc>
<links>
<link href="../queries"/>
<link href="../analyzers-common"/>
<link href="../grouping"/>
<link href="../sandbox"/>
</links>
</invoke-module-javadoc>
</target>

View File

@ -0,0 +1,243 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.classification;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.util.BytesRef;
/**
 * A classifier that approximates a naive Bayes classifier by using pure queries scored with BM25.
*
* @lucene.experimental
*/
public class BM25NBClassifier implements Classifier<BytesRef> {
/**
* {@link IndexReader} used to access the {@link Classifier}'s
* index
*/
private final IndexReader indexReader;
/**
* names of the fields to be used as input text
*/
private final String[] textFieldNames;
/**
* name of the field to be used as a class / category output
*/
private final String classFieldName;
/**
* {@link Analyzer} to be used for tokenizing unseen input text
*/
private final Analyzer analyzer;
/**
* {@link IndexSearcher} to run searches on the index for retrieving frequencies
*/
private final IndexSearcher indexSearcher;
/**
 * {@link Query} used to optionally filter the document set to be classified
*/
private final Query query;
/**
* Creates a new NaiveBayes classifier.
*
* @param indexReader the reader on the index to be used for classification
* @param analyzer an {@link Analyzer} used to analyze unseen text
 * @param query a {@link Query} to optionally filter the docs used for training the classifier, or {@code null}
 * if all the indexed docs should be used
 * @param classFieldName the name of the field used as the output for the classifier. NOTE: it must not be heavily analyzed,
 * as the returned class will be a token indexed for this field
 * @param textFieldNames the names of the fields used as the inputs for the classifier; NO per-field boosting is supported
*/
public BM25NBClassifier(IndexReader indexReader, Analyzer analyzer, Query query, String classFieldName, String... textFieldNames) {
this.indexReader = indexReader;
this.indexSearcher = new IndexSearcher(this.indexReader);
this.indexSearcher.setSimilarity(new BM25Similarity());
this.textFieldNames = textFieldNames;
this.classFieldName = classFieldName;
this.analyzer = analyzer;
this.query = query;
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
return assignClassNormalizedList(inputDocument).get(0);
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
Collections.sort(assignedClasses);
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = assignClassNormalizedList(text);
Collections.sort(assignedClasses);
return assignedClasses.subList(0, max);
}
/**
* Calculate probabilities for all classes for a given input text
*
* @param inputDocument the input text as a {@code String}
* @return a {@code List} of {@code ClassificationResult}, one for each existing class
* @throws IOException if assigning probabilities fails
*/
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
Terms classes = MultiFields.getTerms(indexReader, classFieldName);
TermsEnum classesEnum = classes.iterator();
BytesRef next;
String[] tokenizedText = tokenize(inputDocument);
while ((next = classesEnum.next()) != null) {
if (next.length > 0) {
Term term = new Term(this.classFieldName, next);
assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
}
}
return normClassificationResults(assignedClasses);
}
/**
* Normalize the classification results based on the max score available
*
* @param assignedClasses the list of assigned classes
* @return the normalized results
*/
private ArrayList<ClassificationResult<BytesRef>> normClassificationResults(List<ClassificationResult<BytesRef>> assignedClasses) {
 // normalization; the values are transformed to a 0-1 range
ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
if (!assignedClasses.isEmpty()) {
Collections.sort(assignedClasses);
 // the best score is a negative number closest to 0; use it as 'a'
double smax = assignedClasses.get(0).getScore();
double sumLog = 0;
// log(sum(exp(x_n-a)))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
 // getScore - smax <= 0 (both are negative; smax has the smallest absolute value)
sumLog += Math.exp(cr.getScore() - smax);
}
// loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
double loga = smax;
loga += Math.log(sumLog);
// 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
for (ClassificationResult<BytesRef> cr : assignedClasses) {
double scoreDiff = cr.getScore() - loga;
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(scoreDiff)));
}
}
return returnList;
}
/**
 * tokenize a <code>String</code> using this classifier's text fields and analyzer
*
* @param text the <code>String</code> representing an input text (to be classified)
* @return a <code>String</code> array of the resulting tokens
* @throws IOException if tokenization fails
*/
private String[] tokenize(String text) throws IOException {
Collection<String> result = new LinkedList<>();
for (String textFieldName : textFieldNames) {
try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
result.add(charTermAttribute.toString());
}
tokenStream.end();
}
}
return result.toArray(new String[result.size()]);
}
private double calculateLogLikelihood(String[] tokens, Term term) throws IOException {
double result = 0d;
for (String word : tokens) {
result += Math.log(getTermProbForClass(term, word));
}
return result;
}
private double getTermProbForClass(Term classTerm, String... words) throws IOException {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new BooleanClause(new TermQuery(classTerm), BooleanClause.Occur.MUST));
for (String textFieldName : textFieldNames) {
for (String word : words) {
builder.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.SHOULD));
}
}
if (query != null) {
builder.add(query, BooleanClause.Occur.MUST);
}
TopDocs search = indexSearcher.search(builder.build(), 1);
return search.totalHits > 0 ? search.getMaxScore() : 1;
}
private double calculateLogPrior(Term term) throws IOException {
TermQuery termQuery = new TermQuery(term);
BooleanQuery.Builder bq = new BooleanQuery.Builder();
bq.add(termQuery, BooleanClause.Occur.MUST);
if (query != null) {
bq.add(query, BooleanClause.Occur.MUST);
}
TopDocs topDocs = indexSearcher.search(bq.build(), 1);
return topDocs.totalHits > 0 ? Math.log(topDocs.getMaxScore()) : 0;
}
}
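For reviewers, a minimal usage sketch of the new classifier follows. It is illustrative only: the Directory, the StandardAnalyzer, and the "category"/"body" field names are assumptions, not part of this change.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.BM25NBClassifier;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class BM25NBClassifierExample {
  public static String classify(Directory directory, String unseenText) throws Exception {
    try (IndexReader reader = DirectoryReader.open(directory)) {
      Analyzer analyzer = new StandardAnalyzer();
      // null query: train on all indexed docs; "category" holds the class token, "body" the input text
      BM25NBClassifier classifier = new BM25NBClassifier(reader, analyzer, null, "category", "body");
      ClassificationResult<BytesRef> result = classifier.assignClass(unseenText);
      return result.getAssignedClass().utf8ToString();
    }
  }
}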

View File

@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.classification;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
/**
* A k-Nearest Neighbor classifier based on {@link FuzzyLikeThisQuery}.
*
* @lucene.experimental
*/
public class KNearestFuzzyClassifier implements Classifier<BytesRef> {
/**
 * the names of the fields used as the input text
*/
protected final String[] textFieldNames;
/**
* the name of the field used as the output text
*/
protected final String classFieldName;
/**
* an {@link IndexSearcher} used to perform queries
*/
protected final IndexSearcher indexSearcher;
/**
* the no. of docs to compare in order to find the nearest neighbor to the input text
*/
protected final int k;
/**
 * a {@link Query} used to filter the documents drawn from this classifier's underlying {@link LeafReader}
*/
protected final Query query;
private final Analyzer analyzer;
/**
* Creates a {@link KNearestFuzzyClassifier}.
*
* @param indexReader the reader on the index to be used for classification
* @param analyzer an {@link Analyzer} used to analyze unseen text
* @param similarity the {@link Similarity} to be used by the underlying {@link IndexSearcher} or {@code null}
* (defaults to {@link BM25Similarity})
 * @param query a {@link Query} to optionally filter the docs used for training the classifier, or {@code null}
* if all the indexed docs should be used
 * @param k the no. of docs to select from the query results to find the nearest neighbor
 * @param classFieldName the name of the field used as the output for the classifier
 * @param textFieldNames the names of the fields used as the inputs for the classifier; they can contain a boosting indication, e.g. title^10
*/
public KNearestFuzzyClassifier(IndexReader indexReader, Similarity similarity, Analyzer analyzer, Query query, int k,
String classFieldName, String... textFieldNames) {
this.textFieldNames = textFieldNames;
this.classFieldName = classFieldName;
this.analyzer = analyzer;
this.indexSearcher = new IndexSearcher(indexReader);
if (similarity != null) {
this.indexSearcher.setSimilarity(similarity);
} else {
this.indexSearcher.setSimilarity(new BM25Similarity());
}
this.query = query;
this.k = k;
}
/**
* {@inheritDoc}
*/
@Override
public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
TopDocs knnResults = knnSearch(text);
List<ClassificationResult<BytesRef>> assignedClasses = buildListFromTopDocs(knnResults);
ClassificationResult<BytesRef> assignedClass = null;
double maxscore = -Double.MAX_VALUE;
for (ClassificationResult<BytesRef> cl : assignedClasses) {
if (cl.getScore() > maxscore) {
assignedClass = cl;
maxscore = cl.getScore();
}
}
return assignedClass;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text) throws IOException {
TopDocs knnResults = knnSearch(text);
List<ClassificationResult<BytesRef>> assignedClasses = buildListFromTopDocs(knnResults);
Collections.sort(assignedClasses);
return assignedClasses;
}
/**
* {@inheritDoc}
*/
@Override
public List<ClassificationResult<BytesRef>> getClasses(String text, int max) throws IOException {
TopDocs knnResults = knnSearch(text);
List<ClassificationResult<BytesRef>> assignedClasses = buildListFromTopDocs(knnResults);
Collections.sort(assignedClasses);
return assignedClasses.subList(0, max);
}
private TopDocs knnSearch(String text) throws IOException {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
FuzzyLikeThisQuery fuzzyLikeThisQuery = new FuzzyLikeThisQuery(300, analyzer);
for (String fieldName : textFieldNames) {
 fuzzyLikeThisQuery.addTerms(text, fieldName, 1f, 2); // TODO: make these parameters configurable
}
bq.add(fuzzyLikeThisQuery, BooleanClause.Occur.MUST);
Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*"));
bq.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
if (query != null) {
bq.add(query, BooleanClause.Occur.MUST);
}
return indexSearcher.search(bq.build(), k);
}
/**
* build a list of classification results from search results
*
* @param topDocs the search results as a {@link TopDocs} object
* @return a {@link List} of {@link ClassificationResult}, one for each existing class
 * @throws IOException if it's not possible to get the stored value of the class field
*/
protected List<ClassificationResult<BytesRef>> buildListFromTopDocs(TopDocs topDocs) throws IOException {
Map<BytesRef, Integer> classCounts = new HashMap<>();
Map<BytesRef, Double> classBoosts = new HashMap<>(); // this is a boost based on class ranking positions in topDocs
float maxScore = topDocs.getMaxScore();
for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
IndexableField storableField = indexSearcher.doc(scoreDoc.doc).getField(classFieldName);
if (storableField != null) {
BytesRef cl = new BytesRef(storableField.stringValue());
//update count
Integer count = classCounts.get(cl);
if (count != null) {
classCounts.put(cl, count + 1);
} else {
classCounts.put(cl, 1);
}
//update boost, the boost is based on the best score
Double totalBoost = classBoosts.get(cl);
double singleBoost = scoreDoc.score / maxScore;
if (totalBoost != null) {
classBoosts.put(cl, totalBoost + singleBoost);
} else {
classBoosts.put(cl, singleBoost);
}
}
}
List<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
List<ClassificationResult<BytesRef>> temporaryList = new ArrayList<>();
int sumdoc = 0;
for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
Integer count = entry.getValue();
Double normBoost = classBoosts.get(entry.getKey()) / count; //the boost is normalized to be 0<b<1
temporaryList.add(new ClassificationResult<>(entry.getKey().clone(), (count * normBoost) / (double) k));
sumdoc += count;
}
//correction
if (sumdoc < k) {
for (ClassificationResult<BytesRef> cr : temporaryList) {
returnList.add(new ClassificationResult<>(cr.getAssignedClass(), cr.getScore() * k / (double) sumdoc));
}
} else {
returnList = temporaryList;
}
return returnList;
}
@Override
public String toString() {
return "KNearestFuzzyClassifier{" +
"textFieldNames=" + Arrays.toString(textFieldNames) +
", classFieldName='" + classFieldName + '\'' +
", k=" + k +
", query=" + query +
", similarity=" + indexSearcher.getSimilarity(true) +
'}';
}
}
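Similarly, a hedged usage sketch for the fuzzy k-NN classifier; the field names, k = 3, and the StandardAnalyzer setup below are illustrative assumptions that mirror the tests further down.

import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.classification.Classifier;
import org.apache.lucene.classification.KNearestFuzzyClassifier;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;

public class KNearestFuzzyClassifierExample {
  public static void printTopClasses(IndexReader reader, String unseenText) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    // null similarity defaults to BM25, null query uses all indexed docs, k = 3 nearest neighbors
    Classifier<BytesRef> classifier =
        new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, "category", "body");
    List<ClassificationResult<BytesRef>> ranked = classifier.getClasses(unseenText, 2);
    for (ClassificationResult<BytesRef> r : ranked) {
      System.out.println(r.getAssignedClass().utf8ToString() + " -> " + r.getScore());
    }
  }
}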

View File

@ -121,7 +121,7 @@ public class DatasetSplitter {
int b = 0;
// iterate over existing documents
for (GroupDocs group : topGroups.groups) {
for (GroupDocs<Object> group : topGroups.groups) {
int totalHits = group.totalHits;
double testSize = totalHits * testRatio;
int tc = 0;

View File

@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.classification;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;
/**
* Tests for {@link BM25NBClassifier}
*/
public class BM25NBClassifierTest extends ClassificationTestBase<BytesRef> {
@Test
public void testBasicUsage() throws Exception {
LeafReader leafReader = null;
try {
MockAnalyzer analyzer = new MockAnalyzer(random());
leafReader = getSampleIndex(analyzer);
BM25NBClassifier classifier = new BM25NBClassifier(leafReader, analyzer, null, categoryFieldName, textFieldName);
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
} finally {
if (leafReader != null) {
leafReader.close();
}
}
}
@Test
public void testBasicUsageWithQuery() throws Exception {
LeafReader leafReader = null;
try {
MockAnalyzer analyzer = new MockAnalyzer(random());
leafReader = getSampleIndex(analyzer);
TermQuery query = new TermQuery(new Term(textFieldName, "not"));
BM25NBClassifier classifier = new BM25NBClassifier(leafReader, analyzer, query, categoryFieldName, textFieldName);
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
} finally {
if (leafReader != null) {
leafReader.close();
}
}
}
@Test
public void testNGramUsage() throws Exception {
LeafReader leafReader = null;
try {
Analyzer analyzer = new NGramAnalyzer();
leafReader = getSampleIndex(analyzer);
BM25NBClassifier classifier = new BM25NBClassifier(leafReader, analyzer, null, categoryFieldName, textFieldName);
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
} finally {
if (leafReader != null) {
leafReader.close();
}
}
}
private class NGramAnalyzer extends Analyzer {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ReverseStringFilter(new EdgeNGramTokenFilter(new ReverseStringFilter(tokenizer), 10, 20)));
}
}
@Test
public void testPerformance() throws Exception {
MockAnalyzer analyzer = new MockAnalyzer(random());
LeafReader leafReader = getRandomIndex(analyzer, 100);
try {
long trainStart = System.currentTimeMillis();
BM25NBClassifier classifier = new BM25NBClassifier(leafReader,
analyzer, null, categoryFieldName, textFieldName);
long trainEnd = System.currentTimeMillis();
long trainTime = trainEnd - trainStart;
assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
long evaluationStart = System.currentTimeMillis();
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
classifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
long evaluationEnd = System.currentTimeMillis();
long evaluationTime = evaluationEnd - evaluationStart;
assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
assertTrue("avg classification time: " + avgClassificationTime, 5000 > avgClassificationTime);
double f1 = confusionMatrix.getF1Measure();
assertTrue(f1 >= 0d);
assertTrue(f1 <= 1d);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
Terms terms = MultiFields.getTerms(leafReader, categoryFieldName);
TermsEnum iterator = terms.iterator();
BytesRef term;
while ((term = iterator.next()) != null) {
String s = term.utf8ToString();
recall = confusionMatrix.getRecall(s);
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
precision = confusionMatrix.getPrecision(s);
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double f1Measure = confusionMatrix.getF1Measure(s);
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
}
} finally {
leafReader.close();
}
}
}

View File

@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.classification;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.classification.utils.ConfusionMatrixGenerator;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;
/**
 * Tests for {@link KNearestFuzzyClassifier}
*/
public class KNearestFuzzyClassifierTest extends ClassificationTestBase<BytesRef> {
@Test
public void testBasicUsage() throws Exception {
LeafReader leafReader = null;
try {
MockAnalyzer analyzer = new MockAnalyzer(random());
leafReader = getSampleIndex(analyzer);
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
checkCorrectClassification(classifier, POLITICS_INPUT, POLITICS_RESULT);
} finally {
if (leafReader != null) {
leafReader.close();
}
}
}
@Test
public void testBasicUsageWithQuery() throws Exception {
LeafReader leafReader = null;
try {
MockAnalyzer analyzer = new MockAnalyzer(random());
leafReader = getSampleIndex(analyzer);
TermQuery query = new TermQuery(new Term(textFieldName, "not"));
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, query, 3, categoryFieldName, textFieldName);
checkCorrectClassification(classifier, TECHNOLOGY_INPUT, TECHNOLOGY_RESULT);
} finally {
if (leafReader != null) {
leafReader.close();
}
}
}
@Test
public void testPerformance() throws Exception {
MockAnalyzer analyzer = new MockAnalyzer(random());
LeafReader leafReader = getRandomIndex(analyzer, 100);
try {
long trainStart = System.currentTimeMillis();
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(leafReader, null, analyzer, null, 3, categoryFieldName, textFieldName);
long trainEnd = System.currentTimeMillis();
long trainTime = trainEnd - trainStart;
assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
long evaluationStart = System.currentTimeMillis();
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader,
classifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
long evaluationEnd = System.currentTimeMillis();
long evaluationTime = evaluationEnd - evaluationStart;
assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
assertTrue(5000 > avgClassificationTime);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
Terms terms = MultiFields.getTerms(leafReader, categoryFieldName);
TermsEnum iterator = terms.iterator();
BytesRef term;
while ((term = iterator.next()) != null) {
String s = term.utf8ToString();
recall = confusionMatrix.getRecall(s);
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
precision = confusionMatrix.getPrecision(s);
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double f1Measure = confusionMatrix.getF1Measure(s);
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
}
} finally {
leafReader.close();
}
}
}

View File

@ -21,11 +21,13 @@ import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.classification.BM25NBClassifier;
import org.apache.lucene.classification.BooleanPerceptronClassifier;
import org.apache.lucene.classification.CachingNaiveBayesClassifier;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.classification.ClassificationTestBase;
import org.apache.lucene.classification.Classifier;
import org.apache.lucene.classification.KNearestFuzzyClassifier;
import org.apache.lucene.classification.KNearestNeighborClassifier;
import org.apache.lucene.classification.SimpleNaiveBayesClassifier;
import org.apache.lucene.index.LeafReader;
@ -94,22 +96,43 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
Classifier<BytesRef> classifier = new SimpleNaiveBayesClassifier(reader, analyzer, null, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
classifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
assertNotNull(confusionMatrix.getLinearizedMatrix());
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double f1Measure = confusionMatrix.getF1Measure();
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
checkCM(confusionMatrix);
} finally {
if (reader != null) {
reader.close();
}
}
}
private void checkCM(ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix) {
assertNotNull(confusionMatrix);
assertNotNull(confusionMatrix.getLinearizedMatrix());
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double f1Measure = confusionMatrix.getF1Measure();
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
}
@Test
public void testGetConfusionMatrixWithBM25NB() throws Exception {
LeafReader reader = null;
try {
MockAnalyzer analyzer = new MockAnalyzer(random());
reader = getSampleIndex(analyzer);
Classifier<BytesRef> classifier = new BM25NBClassifier(reader, analyzer, null, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
classifier, categoryFieldName, textFieldName, -1);
checkCM(confusionMatrix);
} finally {
if (reader != null) {
reader.close();
@ -126,22 +149,7 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
Classifier<BytesRef> classifier = new CachingNaiveBayesClassifier(reader, analyzer, null, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
classifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
assertNotNull(confusionMatrix.getLinearizedMatrix());
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double f1Measure = confusionMatrix.getF1Measure();
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
checkCM(confusionMatrix);
} finally {
if (reader != null) {
reader.close();
@ -158,22 +166,24 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
Classifier<BytesRef> classifier = new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
classifier, categoryFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
assertNotNull(confusionMatrix.getLinearizedMatrix());
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double f1Measure = confusionMatrix.getF1Measure();
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
checkCM(confusionMatrix);
} finally {
if (reader != null) {
reader.close();
}
}
}
@Test
public void testGetConfusionMatrixWithFLTKNN() throws Exception {
LeafReader reader = null;
try {
MockAnalyzer analyzer = new MockAnalyzer(random());
reader = getSampleIndex(analyzer);
Classifier<BytesRef> classifier = new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, categoryFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
classifier, categoryFieldName, textFieldName, -1);
checkCM(confusionMatrix);
} finally {
if (reader != null) {
reader.close();
@ -190,22 +200,7 @@ public class ConfusionMatrixGeneratorTest extends ClassificationTestBase<Object>
Classifier<Boolean> classifier = new BooleanPerceptronClassifier(reader, analyzer, null, 1, null, booleanFieldName, textFieldName);
ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(reader,
classifier, booleanFieldName, textFieldName, -1);
assertNotNull(confusionMatrix);
assertNotNull(confusionMatrix.getLinearizedMatrix());
assertEquals(7, confusionMatrix.getNumberOfEvaluatedDocs());
assertTrue(confusionMatrix.getAvgClassificationTime() >= 0d);
double accuracy = confusionMatrix.getAccuracy();
assertTrue(accuracy >= 0d);
assertTrue(accuracy <= 1d);
double precision = confusionMatrix.getPrecision();
assertTrue(precision >= 0d);
assertTrue(precision <= 1d);
double recall = confusionMatrix.getRecall();
assertTrue(recall >= 0d);
assertTrue(recall <= 1d);
double f1Measure = confusionMatrix.getF1Measure();
assertTrue(f1Measure >= 0d);
assertTrue(f1Measure <= 1d);
checkCM(confusionMatrix);
assertTrue(confusionMatrix.getPrecision("true") >= 0d);
assertTrue(confusionMatrix.getPrecision("true") <= 1d);
assertTrue(confusionMatrix.getPrecision("false") >= 0d);

View File

@ -877,7 +877,7 @@ final class SimpleTextBKDWriter implements Closeable {
};
}
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
@Override
@ -1170,7 +1170,8 @@ final class SimpleTextBKDWriter implements Closeable {
/** Called on exception, to check whether the checksum is also corrupt in this source, and add that
* information (checksum matched or didn't) as a suppressed exception. */
private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
private Error verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
assert priorException != null;
// TODO: we could improve this, to always validate checksum as we recurse, if we shared left and
// right reader after recursing to children, and possibly within recursed children,
// since all together they make a single pass through the file. But this is a sizable re-org,
@ -1181,10 +1182,10 @@ final class SimpleTextBKDWriter implements Closeable {
try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) {
CodecUtil.checkFooter(in, priorException);
}
} else {
// We are reading from heap; nothing to add:
IOUtils.reThrow(priorException);
}
// We are reading from heap; nothing to add:
throw IOUtils.rethrowAlways(priorException);
}
/** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). */
@ -1206,7 +1207,7 @@ final class SimpleTextBKDWriter implements Closeable {
reader.markOrds(rightCount-1, ordBitSet);
}
} catch (Throwable t) {
verifyChecksum(t, source.writer);
throw verifyChecksum(t, source.writer);
}
return scratch1;
@ -1255,10 +1256,7 @@ final class SimpleTextBKDWriter implements Closeable {
}
return new PathSlice(writer, 0, count);
} catch (Throwable t) {
verifyChecksum(t, source.writer);
// Dead code but javac disagrees:
return null;
throw verifyChecksum(t, source.writer);
}
}
@ -1564,7 +1562,7 @@ final class SimpleTextBKDWriter implements Closeable {
leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
} catch (Throwable t) {
verifyChecksum(t, slices[dim].writer);
throw verifyChecksum(t, slices[dim].writer);
}
}

View File

@ -331,6 +331,9 @@ public final class CodecUtil {
/** Retrieves the full footer from the provided {@link IndexInput}. This throws
* {@link CorruptIndexException} if this file does not have a valid footer. */
public static byte[] readFooter(IndexInput in) throws IOException {
if (in.length() < footerLength()) {
throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), in);
}
in.seek(in.length() - footerLength());
validateFooter(in);
in.seek(in.length() - footerLength());
@ -467,7 +470,7 @@ public final class CodecUtil {
// catch-all for things that shouldn't go wrong (e.g. OOM during readInt) but could...
priorException.addSuppressed(new CorruptIndexException("checksum status indeterminate: unexpected exception", in, t));
}
IOUtils.reThrow(priorException);
throw IOUtils.rethrowAlways(priorException);
}
}
@ -516,6 +519,9 @@ public final class CodecUtil {
clone.seek(0);
ChecksumIndexInput in = new BufferedChecksumIndexInput(clone);
assert in.getFilePointer() == 0;
if (in.length() < footerLength()) {
throw new CorruptIndexException("misplaced codec footer (file truncated?): length=" + in.length() + " but footerLength==" + footerLength(), input);
}
in.seek(in.length() - footerLength());
return checkFooter(in);
}
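Many hunks in this change follow the same mechanical migration: the old IOUtils.reThrow(t) was a silent no-op when t was null, whereas IOUtils.rethrowAlways(t) requires a non-null argument and is declared to return an Error so the call site can be written as a throw statement that javac recognizes as terminating. A schematic before/after; the enclosing helper method is hypothetical and only illustrates the pattern.

import java.io.IOException;
import org.apache.lucene.util.IOUtils;

class RethrowMigrationSketch {
  // Old style (pre-7.0): a bare call that silently did nothing when firstExc was null.
  //   IOUtils.reThrow(firstExc);
  // New style: the null check is explicit and the call becomes an unambiguous throw.
  static void rethrowFirst(Throwable firstExc) throws IOException {
    if (firstExc != null) {
      throw IOUtils.rethrowAlways(firstExc); // nominally returns Error, but always throws
    }
  }
}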

View File

@ -112,6 +112,7 @@ abstract class RangeFieldQuery extends Query {
public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {
final RangeFieldComparator target = new RangeFieldComparator();
private DocIdSet buildMatchingDocIdSet(LeafReader reader, PointValues values) throws IOException {
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
values.intersect(
@ -133,25 +134,29 @@ abstract class RangeFieldQuery extends Query {
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
byte[] node = getInternalRange(minPackedValue, maxPackedValue);
// compute range relation for BKD traversal
if (target.intersects(node) == false) {
return Relation.CELL_OUTSIDE_QUERY;
} else if (target.within(node)) {
// target within cell; continue traversing:
return Relation.CELL_CROSSES_QUERY;
} else if (target.contains(node)) {
// target contains cell; add iff queryType is not a CONTAINS or CROSSES query:
return (queryType == QueryType.CONTAINS || queryType == QueryType.CROSSES) ?
Relation.CELL_OUTSIDE_QUERY : Relation.CELL_INSIDE_QUERY;
}
// target intersects cell; continue traversing:
return Relation.CELL_CROSSES_QUERY;
return compareRange(minPackedValue, maxPackedValue);
}
});
return result.build();
}
private Relation compareRange(byte[] minPackedValue, byte[] maxPackedValue) {
byte[] node = getInternalRange(minPackedValue, maxPackedValue);
// compute range relation for BKD traversal
if (target.intersects(node) == false) {
return Relation.CELL_OUTSIDE_QUERY;
} else if (target.within(node)) {
// target within cell; continue traversing:
return Relation.CELL_CROSSES_QUERY;
} else if (target.contains(node)) {
// target contains cell; add iff queryType is not a CONTAINS or CROSSES query:
return (queryType == QueryType.CONTAINS || queryType == QueryType.CROSSES) ?
Relation.CELL_OUTSIDE_QUERY : Relation.CELL_INSIDE_QUERY;
}
// target intersects cell; continue traversing:
return Relation.CELL_CROSSES_QUERY;
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
@ -166,17 +171,10 @@ abstract class RangeFieldQuery extends Query {
return null;
}
checkFieldInfo(fieldInfo);
boolean allDocsMatch = true;
if (values.getDocCount() == reader.maxDoc()) {
// if query crosses, docs need to be further scrutinized
byte[] range = getInternalRange(values.getMinPackedValue(), values.getMaxPackedValue());
// if the internal node is not equal and not contained by the query, all docs do not match
if (queryType == QueryType.CROSSES || (!Arrays.equals(ranges, range)
&& (target.contains(range) == false || queryType != QueryType.WITHIN))) {
allDocsMatch = false;
}
} else {
allDocsMatch = false;
boolean allDocsMatch = false;
if (values.getDocCount() == reader.maxDoc()
&& compareRange(values.getMinPackedValue(), values.getMaxPackedValue()) == Relation.CELL_INSIDE_QUERY) {
allDocsMatch = true;
}
DocIdSetIterator iterator = allDocsMatch == true ?

View File

@ -463,8 +463,9 @@ class BufferedUpdatesStream implements Accountable {
}
if (success) {
// Does nothing if firstExc is null:
IOUtils.reThrow(firstExc);
if (firstExc != null) {
throw IOUtils.rethrowAlways(firstExc);
}
}
if (infoStream.isEnabled("BD")) {

View File

@ -529,7 +529,7 @@ public final class CheckIndex implements Closeable {
sis = SegmentInfos.readCommit(dir, lastSegmentsFile);
} catch (Throwable t) {
if (failFast) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
}
msg(infoStream, "ERROR: could not read any segments file in directory");
result.missingSegments = true;
@ -565,11 +565,12 @@ public final class CheckIndex implements Closeable {
input = dir.openInput(segmentsFileName, IOContext.READONCE);
} catch (Throwable t) {
if (failFast) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
}
msg(infoStream, "ERROR: could not open segments file in directory");
if (infoStream != null)
if (infoStream != null) {
t.printStackTrace(infoStream);
}
result.cantOpenSegments = true;
return result;
}
@ -577,11 +578,12 @@ public final class CheckIndex implements Closeable {
/*int format =*/ input.readInt();
} catch (Throwable t) {
if (failFast) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
}
msg(infoStream, "ERROR: could not read segment file version in directory");
if (infoStream != null)
if (infoStream != null) {
t.printStackTrace(infoStream);
}
result.missingSegmentVersion = true;
return result;
} finally {
@ -789,7 +791,7 @@ public final class CheckIndex implements Closeable {
} catch (Throwable t) {
if (failFast) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
}
msg(infoStream, "FAILED");
String comment;
@ -883,7 +885,7 @@ public final class CheckIndex implements Closeable {
msg(infoStream, String.format(Locale.ROOT, "OK [took %.3f sec]", nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
@ -941,7 +943,7 @@ public final class CheckIndex implements Closeable {
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
@ -974,7 +976,7 @@ public final class CheckIndex implements Closeable {
status.totFields = fieldInfos.size();
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
@ -1013,7 +1015,7 @@ public final class CheckIndex implements Closeable {
msg(infoStream, String.format(Locale.ROOT, "OK [%d fields] [took %.3f sec]", status.totFields, nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
@ -1769,7 +1771,7 @@ public final class CheckIndex implements Closeable {
status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, version);
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR: " + e);
status = new Status.TermIndexStatus();
@ -1845,7 +1847,7 @@ public final class CheckIndex implements Closeable {
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR: " + e);
status.error = e;
@ -2079,7 +2081,7 @@ public final class CheckIndex implements Closeable {
nsToSec(System.nanoTime() - startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
@ -2126,7 +2128,7 @@ public final class CheckIndex implements Closeable {
nsToSec(System.nanoTime()-startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;
@ -2567,7 +2569,7 @@ public final class CheckIndex implements Closeable {
status.totVectors, vectorAvg, nsToSec(System.nanoTime() - startNS)));
} catch (Throwable e) {
if (failFast) {
IOUtils.reThrow(e);
throw IOUtils.rethrowAlways(e);
}
msg(infoStream, "ERROR [" + String.valueOf(e.getMessage()) + "]");
status.error = e;

View File

@ -603,7 +603,7 @@ final class DefaultIndexingChain extends DocConsumer {
// PerField.invert to allow for later downgrading of the index options:
fi.setIndexOptions(fieldType.indexOptions());
fp = new PerField(fi, invert);
fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;
totalFieldCount++;
@ -633,6 +633,7 @@ final class DefaultIndexingChain extends DocConsumer {
/** NOTE: not static: accesses at least docState, termsHash. */
private final class PerField implements Comparable<PerField> {
final int indexCreatedVersionMajor;
final FieldInfo fieldInfo;
final Similarity similarity;
@ -659,7 +660,8 @@ final class DefaultIndexingChain extends DocConsumer {
// reused
TokenStream tokenStream;
public PerField(FieldInfo fieldInfo, boolean invert) {
public PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.fieldInfo = fieldInfo;
similarity = docState.similarity;
if (invert) {
@ -668,7 +670,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
void setInvertState() {
invertState = new FieldInvertState(fieldInfo.name);
invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) {
assert norms == null;

View File

@ -193,6 +193,10 @@ class DocumentsWriterPerThread {
return fieldInfos;
}
public int getIndexCreatedVersionMajor() {
return indexWriter.segmentInfos.getIndexCreatedVersionMajor();
}
final void testPoint(String message) {
if (enableTestPoints) {
assert infoStream.isEnabled("TP"); // don't enable unless you need them.

View File

@ -31,7 +31,8 @@ import org.apache.lucene.util.AttributeSource;
* @lucene.experimental
*/
public final class FieldInvertState {
String name;
final int indexCreatedVersionMajor;
final String name;
int position;
int length;
int numOverlap;
@ -50,14 +51,15 @@ public final class FieldInvertState {
 /** Creates {@code FieldInvertState} for the specified
* field name. */
public FieldInvertState(String name) {
public FieldInvertState(int indexCreatedVersionMajor, String name) {
this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.name = name;
}
 /** Creates {@code FieldInvertState} for the specified
* field name and values for all fields. */
public FieldInvertState(String name, int position, int length, int numOverlap, int offset) {
this.name = name;
public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
this(indexCreatedVersionMajor, name);
this.position = position;
this.length = length;
this.numOverlap = numOverlap;
@ -164,4 +166,11 @@ public final class FieldInvertState {
public String getName() {
return name;
}
/**
* Return the version that was used to create the index, or 6 if it was created before 7.0.
*/
public int getIndexCreatedVersionMajor() {
return indexCreatedVersionMajor;
}
}
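A short fragment showing how a Similarity can use the new accessor to branch on the index-time format, which is what the BM25Similarity hunk further down does. It assumes SmallFloat#intToByte4 as the encoding counterpart of the byte4ToInt call visible in that hunk, and is a sketch rather than a complete Similarity implementation.

// Fragment of a hypothetical Similarity#computeNorm override.
@Override
public long computeNorm(FieldInvertState state) {
  int numTerms = state.getLength() - state.getNumOverlap();
  if (state.getIndexCreatedVersionMajor() >= 7) {
    // 7.0+ segments: encode the (compressed) field length directly
    return SmallFloat.intToByte4(numTerms);
  } else {
    // pre-7.0 segments: keep the historic 1/sqrt(length) encoding for index compatibility
    return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
  }
}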

View File

@ -364,7 +364,7 @@ final class IndexFileDeleter implements Closeable {
* Remove the CommitPoints in the commitsToDelete List by
* DecRef'ing all files from each SegmentInfos.
*/
private void deleteCommits() {
private void deleteCommits() throws IOException {
int size = commitsToDelete.size();
@ -388,8 +388,9 @@ final class IndexFileDeleter implements Closeable {
}
commitsToDelete.clear();
// NOTE: does nothing if firstThrowable is null
IOUtils.reThrowUnchecked(firstThrowable);
if (firstThrowable != null) {
throw IOUtils.rethrowAlways(firstThrowable);
}
// Now compact commits to remove deleted ones (preserving the sort):
size = commits.size();
@ -599,8 +600,9 @@ final class IndexFileDeleter implements Closeable {
}
}
// NOTE: does nothing if firstThrowable is null
IOUtils.reThrow(firstThrowable);
if (firstThrowable != null) {
throw IOUtils.rethrowAlways(firstThrowable);
}
}
/** Decrefs all provided files, ignoring any exceptions hit; call this if

View File

@ -144,7 +144,9 @@ public abstract class IndexReader implements Closeable {
// overridden by StandardDirectoryReader and SegmentReader
void notifyReaderClosedListeners(Throwable th) throws IOException {
// nothing to notify in the base impl, just rethrow
IOUtils.reThrow(th);
if (th != null) {
throw IOUtils.rethrowAlways(th);
}
}
private void reportCloseToParentReaders() {

View File

@ -611,7 +611,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
} catch (Throwable t) {
if (doSave) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
} else if (priorE == null) {
priorE = t;
}
@ -631,14 +631,16 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
rld.dropReaders();
} catch (Throwable t) {
if (doSave) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
} else if (priorE == null) {
priorE = t;
}
}
}
assert readerMap.size() == 0;
IOUtils.reThrow(priorE);
if (priorE != null) {
throw IOUtils.rethrowAlways(priorE);
}
}
/**
@ -3330,7 +3332,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
if (commitCompleted) {
tragicEvent(t, "finishCommit");
} else {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
}
}
@ -3898,7 +3900,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
throw (MergePolicy.MergeAbortedException) t;
}
} else {
IOUtils.reThrow(t);
assert t != null;
throw IOUtils.rethrowAlways(t);
}
}
@ -4238,8 +4241,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
}
// If any error occurred, throw it.
if (!suppressExceptions) {
IOUtils.reThrow(th);
if (!suppressExceptions && th != null) {
throw IOUtils.rethrowAlways(th);
}
}
@ -4815,7 +4818,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
// It's possible you could have a really bad day
if (this.tragedy != null) {
// Another thread is already dealing / has dealt with the tragedy:
IOUtils.reThrow(tragedy);
throw IOUtils.rethrowAlways(tragedy);
}
this.tragedy = tragedy;
@ -4826,7 +4829,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit, Accountable {
rollbackInternal();
}
IOUtils.reThrow(tragedy);
throw IOUtils.rethrowAlways(tragedy);
}
/** If this {@code IndexWriter} was closed as a side-effect of a tragic exception,

View File

@ -210,7 +210,10 @@ final class SegmentCoreReaders {
}
}
}
IOUtils.reThrow(th);
if (th != null) {
throw IOUtils.rethrowAlways(th);
}
}
}

View File

@ -90,8 +90,9 @@ final class SegmentDocValues {
}
}
}
if (t != null) {
IOUtils.reThrow(t);
throw IOUtils.rethrowAlways(t);
}
}
}

View File

@ -303,7 +303,10 @@ public final class SegmentReader extends CodecReader {
}
}
}
IOUtils.reThrow(th);
if (th != null) {
IOUtils.rethrowAlways(th);
}
}
}

View File

@ -391,7 +391,9 @@ public final class StandardDirectoryReader extends DirectoryReader {
}
// throw the first exception
IOUtils.reThrow(firstExc);
if (firstExc != null) {
throw IOUtils.rethrowAlways(firstExc);
}
}
@Override
@ -504,7 +506,10 @@ public final class StandardDirectoryReader extends DirectoryReader {
}
}
}
IOUtils.reThrow(th);
if (th != null) {
throw IOUtils.rethrowAlways(th);
}
}
}

View File

@ -58,6 +58,16 @@ public abstract class DoubleValuesSource {
*/
public abstract boolean needsScores();
/**
 * An explanation of the value for the given document.
 *
 * @param ctx the reader's context to create the {@link Explanation} for.
* @param docId the document's id relative to the given context's reader
* @return an Explanation for the value
* @throws IOException if an {@link IOException} occurs
*/
public abstract Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException;
/**
* Create a sort field based on the value of this producer
* @param reverse true if the sort should be decreasing
@ -149,6 +159,11 @@ public abstract class DoubleValuesSource {
public boolean needsScores() {
return true;
}
@Override
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) {
return scoreExplanation;
}
};
/**
@ -176,6 +191,11 @@ public abstract class DoubleValuesSource {
return false;
}
@Override
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) {
return Explanation.match((float) value, "constant(" + value + ")");
}
@Override
public String toString() {
return "constant(" + value + ")";
@ -186,7 +206,7 @@ public abstract class DoubleValuesSource {
/**
* Creates a DoubleValuesSource that is a function of another DoubleValuesSource
*/
public static DoubleValuesSource function(DoubleValuesSource in, DoubleUnaryOperator function) {
public static DoubleValuesSource function(DoubleValuesSource in, String description, DoubleUnaryOperator function) {
return new DoubleValuesSource() {
@Override
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
@ -208,15 +228,22 @@ public abstract class DoubleValuesSource {
public boolean needsScores() {
return in.needsScores();
}
@Override
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException {
Explanation inner = in.explain(ctx, docId, scoreExplanation);
return Explanation.match((float) function.applyAsDouble(inner.getValue()), description + ", computed from:", inner, scoreExplanation);
}
};
}
/**
* Creates a DoubleValuesSource that is a function of another DoubleValuesSource and a score
* @param in the DoubleValuesSource to use as an input
* @param description a description of the function
* @param function a function of the form (source, score) == result
*/
public static DoubleValuesSource scoringFunction(DoubleValuesSource in, ToDoubleBiFunction<Double, Double> function) {
public static DoubleValuesSource scoringFunction(DoubleValuesSource in, String description, ToDoubleBiFunction<Double, Double> function) {
return new DoubleValuesSource() {
@Override
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
@ -238,6 +265,13 @@ public abstract class DoubleValuesSource {
public boolean needsScores() {
return true;
}
@Override
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException {
Explanation inner = in.explain(ctx, docId, scoreExplanation);
return Explanation.match((float) function.applyAsDouble((double)inner.getValue(), (double)scoreExplanation.getValue()),
description + ", computed from:", inner, scoreExplanation);
}
};
}
@ -303,6 +337,15 @@ public abstract class DoubleValuesSource {
public boolean needsScores() {
return false;
}
@Override
public Explanation explain(LeafReaderContext ctx, int docId, Explanation scoreExplanation) throws IOException {
DoubleValues values = getValues(ctx, null);
if (values.advanceExact(docId))
return Explanation.match((float)values.doubleValue(), "double(" + field + ")");
else
return Explanation.noMatch("double(" + field + ")");
}
}
private static class DoubleValuesSortField extends SortField {
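A minimal usage sketch of the new explain() hook together with the description argument now taken by function(); the "price" field and the 0.9 factor are invented for illustration, and leafContext, docId and scoreExplanation stand for values available at explain time:

DoubleValuesSource price = DoubleValuesSource.fromDoubleField("price");
DoubleValuesSource discounted = DoubleValuesSource.function(price, "0.9 * price", v -> 0.9 * v);
// The wrapper builds "0.9 * price, computed from:" with the inner explanation
// and the score explanation attached as details:
Explanation e = discounted.explain(leafContext, docId, scoreExplanation);

The description string is what ends up in the resulting Explanation, so it should read like the computed expression.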

View File

@ -298,7 +298,7 @@ public class LRUQueryCache implements QueryCache, Accountable {
try {
Query singleton = uniqueQueries.putIfAbsent(query, query);
if (singleton == null) {
onQueryCache(singleton, LINKED_HASHTABLE_RAM_BYTES_PER_ENTRY + ramBytesUsed(query));
onQueryCache(query, LINKED_HASHTABLE_RAM_BYTES_PER_ENTRY + ramBytesUsed(query));
} else {
query = singleton;
}
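The one-line change above fixes a listener notification that passed null: Map.putIfAbsent returns the previous value, or null when the key was absent and the new mapping was installed, so inside the singleton == null branch the freshly cached object is query. A tiny sketch of that contract, outside Lucene:

Map<String, String> m = new HashMap<>();
String prev1 = m.putIfAbsent("k", "v1");   // null: "k" was absent and "v1" was inserted
String prev2 = m.putIfAbsent("k", "v2");   // "v1": key already present, map unchanged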

View File

@ -96,20 +96,6 @@ public class BM25Similarity extends Similarity {
}
}
/** The default implementation encodes <code>1 / sqrt(length)</code>
* with {@link SmallFloat#floatToByte315(float)}. This is compatible with
* Lucene's historic implementation: {@link ClassicSimilarity}. If you
* change this, then you should change {@link #decodeNormValue(byte)} to match. */
protected byte encodeNormValue(int fieldLength) {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength)));
}
/** The default implementation returns <code>1 / f<sup>2</sup></code>
* where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
protected float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF];
}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
@ -132,21 +118,31 @@ public class BM25Similarity extends Similarity {
}
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
private static final float[] OLD_LENGTH_TABLE = new float[256];
private static final float[] LENGTH_TABLE = new float[256];
static {
for (int i = 1; i < 256; i++) {
float f = SmallFloat.byte315ToFloat((byte)i);
NORM_TABLE[i] = 1.0f / (f*f);
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
}
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
for (int i = 0; i < 256; i++) {
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
}
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
return encodeNormValue(numTerms);
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
if (indexCreatedVersionMajor >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
}
}
/**
@ -207,34 +203,43 @@ public class BM25Similarity extends Similarity {
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
float avgdl = avgFieldLength(collectionStats);
// compute freq-independent part of bm25 equation across all norm values
float cache[] = new float[256];
float[] oldCache = new float[256];
float[] cache = new float[256];
for (int i = 0; i < cache.length; i++) {
cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl);
cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
}
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
BM25Stats bm25stats = (BM25Stats) stats;
return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
return new BM25DocScorer(bm25stats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(bm25stats.field));
}
private class BM25DocScorer extends SimScorer {
private final BM25Stats stats;
private final float weightValue; // boost * idf * (k1 + 1)
private final NumericDocValues norms;
/** precomputed cache for all length values */
private final float[] lengthCache;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float[] cache;
BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
this.stats = stats;
this.weightValue = stats.weight * (k1 + 1);
this.cache = stats.cache;
this.norms = norms;
if (indexCreatedVersionMajor >= 7) {
lengthCache = LENGTH_TABLE;
cache = stats.cache;
} else {
lengthCache = OLD_LENGTH_TABLE;
cache = stats.oldCache;
}
}
@Override
@ -245,7 +250,7 @@ public class BM25Similarity extends Similarity {
norm = k1;
} else {
if (norms.advanceExact(doc)) {
norm = cache[(byte)norms.longValue() & 0xFF];
norm = cache[((byte) norms.longValue()) & 0xFF];
} else {
norm = cache[0];
}
@ -255,7 +260,7 @@ public class BM25Similarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return explainScore(doc, freq, stats, norms);
return explainScore(doc, freq, stats, norms, lengthCache);
}
@Override
@ -281,21 +286,23 @@ public class BM25Similarity extends Similarity {
private final float weight;
/** field name, for pulling norms */
private final String field;
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float cache[];
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
* for both OLD_LENGTH_TABLE and LENGTH_TABLE */
private final float[] oldCache, cache;
BM25Stats(String field, float boost, Explanation idf, float avgdl, float cache[]) {
BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
this.field = field;
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
this.cache = cache;
this.weight = idf.getValue() * boost;
this.oldCache = oldCache;
this.cache = cache;
}
}
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
List<Explanation> subs = new ArrayList<>();
subs.add(freq);
subs.add(Explanation.match(k1, "parameter k1"));
@ -311,7 +318,7 @@ public class BM25Similarity extends Similarity {
} else {
norm = 0;
}
float doclen = decodeNormValue(norm);
float doclen = lengthCache[norm & 0xff];
subs.add(Explanation.match(b, "parameter b"));
subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
subs.add(Explanation.match(doclen, "fieldLength"));
@ -321,13 +328,13 @@ public class BM25Similarity extends Similarity {
}
}
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
Explanation boostExpl = Explanation.match(stats.boost, "boost");
List<Explanation> subs = new ArrayList<>();
if (boostExpl.getValue() != 1.0f)
subs.add(boostExpl);
subs.add(stats.idf);
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms);
Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
subs.add(tfNormExpl);
return Explanation.match(
boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
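To make the two cache tables above concrete, here is a sketch of what the stored norm byte holds under each encoding; the field length of 42 is an arbitrary example:

int fieldLength = 42;
byte newNorm = SmallFloat.intToByte4(fieldLength);                              // 7.0+ segments: a lossily compressed length, decoded via LENGTH_TABLE
byte oldNorm = SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength))); // older segments: 1/sqrt(length), decoded via OLD_LENGTH_TABLE
// BM25DocScorer picks the matching table from the segment's created-version major,
// so each segment's norm bytes are decoded the same way they were written.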

View File

@ -17,91 +17,27 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/**
* Expert: Default scoring implementation which {@link #encodeNormValue(float)
* encodes} norm values as a single byte before being stored. At search time,
* the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
* instance, <i>decode(encode(0.89)) = 0.875</i>.
* <p>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for all
* documents - are maintained in memory.
* <p>
* The rationale supporting such lossy compression of norm values is that given
* the difficulty (and inaccuracy) of users to express their true information
* need by a query, only big differences matter. <br>
* &nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of
* scoring, e.g. by using a different {@link Similarity} for search.
* Expert: Historical scoring implementation. You might want to consider using
* {@link BM25Similarity} instead, which is generally considered superior to
* TF-IDF.
*/
public class ClassicSimilarity extends TFIDFSimilarity {
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/** Sole constructor: parameter-free */
public ClassicSimilarity() {}
/**
* Encodes a normalization factor for storage in an index.
* <p>
* The encoding uses a three-bit mantissa, a five-bit exponent, and the
* zero-exponent point at 15, thus representing values from around 7x10^9 to
* 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
* represented. Negative numbers are rounded up to zero. Values too large to
* represent are rounded down to the largest representable value. Positive
* values too small to represent are rounded up to the smallest positive
* representable value.
*
* @see org.apache.lucene.util.SmallFloat
*/
@Override
public final long encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
/**
* Decodes the norm value, assuming it is a single byte.
*
* @see #encodeNormValue(float)
*/
@Override
public final float decodeNormValue(long norm) {
return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
}
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
* <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
* #setDiscountOverlaps} is false, else it's {@link
* FieldInvertState#getLength()} - {@link
* FieldInvertState#getNumOverlap()}.
* <code>1/sqrt(length)</code>.
*
* @lucene.experimental */
@Override
public float lengthNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
public float lengthNorm(int numTerms) {
return (float) (1.0 / Math.sqrt(numTerms));
}
@ -138,33 +74,6 @@ public class ClassicSimilarity extends TFIDFSimilarity {
public float idf(long docFreq, long docCount) {
return (float)(Math.log((docCount+1)/(double)(docFreq+1)) + 1.0);
}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
protected boolean discountOverlaps = true;
/** Determines whether overlap tokens (Tokens with
* 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap
* tokens do not count when computing norms.
*
* @lucene.experimental
*
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
}
/**
* Returns true if overlap tokens are discounted from the document's length.
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
@Override
public String toString() {
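Since length normalization is now funneled through the single lengthNorm(int) hook, customizing it is a one-method override. A hypothetical subclass (the class name is invented) that disables length normalization entirely:

public class NoLengthNormSimilarity extends ClassicSimilarity {
  @Override
  public float lengthNorm(int numTerms) {
    return 1f; // score every field as if it had the same length
  }
}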

View File

@ -190,7 +190,8 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
int indexCreatedVersionMajor = context.reader().getMetaData().getCreatedVersionMajor();
if (stats instanceof MultiSimilarity.MultiStats) {
// a multi term query (e.g. phrase). return the summation,
// scoring almost as if it were boolean query
@ -198,12 +199,12 @@ public abstract class SimilarityBase extends Similarity {
SimScorer subScorers[] = new SimScorer[subStats.length];
for (int i = 0; i < subScorers.length; i++) {
BasicStats basicstats = (BasicStats) subStats[i];
subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
subScorers[i] = new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
}
return new MultiSimilarity.MultiSimScorer(subScorers);
} else {
BasicStats basicstats = (BasicStats) stats;
return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
return new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
}
}
@ -216,40 +217,38 @@ public abstract class SimilarityBase extends Similarity {
// ------------------------------ Norm handling ------------------------------
/** Norm to document length map. */
private static final float[] NORM_TABLE = new float[256];
/** Cache of decoded bytes. */
private static final float[] OLD_LENGTH_TABLE = new float[256];
private static final float[] LENGTH_TABLE = new float[256];
static {
for (int i = 1; i < 256; i++) {
float floatNorm = SmallFloat.byte315ToFloat((byte)i);
NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
float f = SmallFloat.byte315ToFloat((byte)i);
OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
}
OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
for (int i = 0; i < 256; i++) {
LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
}
NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
/** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
/** Encodes the document length in the same way as {@link BM25Similarity}. */
@Override
public long computeNorm(FieldInvertState state) {
final float numTerms;
public final long computeNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
return encodeNormValue(numTerms);
int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
if (indexCreatedVersionMajor >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
}
}
/** Decodes a normalization factor (document length) stored in an index.
* @see #encodeNormValue(float)
*/
protected float decodeNormValue(byte norm) {
return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** Encodes the length to a byte via SmallFloat. */
protected byte encodeNormValue(float length) {
return SmallFloat.floatToByte315((float) (1 / Math.sqrt(length)));
}
// ----------------------------- Static methods ------------------------------
/** Returns the base two logarithm of {@code x}. */
@ -266,35 +265,37 @@ public abstract class SimilarityBase extends Similarity {
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
* respectively.
*/
private class BasicSimScorer extends SimScorer {
final class BasicSimScorer extends SimScorer {
private final BasicStats stats;
private final NumericDocValues norms;
private final float[] normCache;
BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
BasicSimScorer(BasicStats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
this.stats = stats;
this.norms = norms;
this.normCache = indexCreatedVersionMajor >= 7 ? LENGTH_TABLE : OLD_LENGTH_TABLE;
}
private float getNormValue(int doc) throws IOException {
float getLengthValue(int doc) throws IOException {
if (norms == null) {
return 1F;
}
if (norms.advanceExact(doc)) {
return decodeNormValue((byte) norms.longValue());
return normCache[Byte.toUnsignedInt((byte) norms.longValue())];
} else {
return decodeNormValue((byte) 0);
return 0;
}
}
@Override
public float score(int doc, float freq) throws IOException {
// We have to supply something in case norms are omitted
return SimilarityBase.this.score(stats, freq, getNormValue(doc));
return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return SimilarityBase.this.explain(stats, doc, freq, getNormValue(doc));
return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
}
@Override

View File

@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
/**
@ -233,11 +234,6 @@ import org.apache.lucene.util.BytesRef;
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
@ -379,13 +375,49 @@ import org.apache.lucene.util.BytesRef;
* @see IndexSearcher#setSimilarity(Similarity)
*/
public abstract class TFIDFSimilarity extends Similarity {
/** Cache of decoded bytes. */
static final float[] OLD_NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++) {
OLD_NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
}
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
public TFIDFSimilarity() {}
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
protected boolean discountOverlaps = true;
/** Determines whether overlap tokens (Tokens with
* 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap
* tokens do not count when computing norms.
*
* @lucene.experimental
*
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
}
/**
* Returns true if overlap tokens are discounted from the document's length.
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(long, long)}
* factor for each term in the query and these products are then summed to
@ -471,30 +503,25 @@ public abstract class TFIDFSimilarity extends Similarity {
/**
* Compute an index-time normalization value for this field instance.
* <p>
* This value will be stored in a single byte lossy representation by
* {@link #encodeNormValue(float)}.
*
* @param state statistics of the current field (such as length, boost, etc)
* @return an index-time normalization value
* @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) discounting overlaps}
* @return a length normalization value
*/
public abstract float lengthNorm(FieldInvertState state);
public abstract float lengthNorm(int length);
@Override
public final long computeNorm(FieldInvertState state) {
float normValue = lengthNorm(state);
return encodeNormValue(normValue);
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
if (state.getIndexCreatedVersionMajor() >= 7) {
return SmallFloat.intToByte4(numTerms);
} else {
return SmallFloat.floatToByte315(lengthNorm(numTerms));
}
}
/**
* Decodes a normalization factor stored in an index.
*
* @see #encodeNormValue(float)
*/
public abstract float decodeNormValue(long norm);
/** Encodes a normalization factor for storage in an index. */
public abstract long encodeNormValue(float f);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
@ -529,24 +556,41 @@ public abstract class TFIDFSimilarity extends Similarity {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
return new IDFStats(collectionStats.field(), boost, idf);
float[] normTable = new float[256];
for (int i = 1; i < 256; ++i) {
int length = SmallFloat.byte4ToInt((byte) i);
float norm = lengthNorm(length);
normTable[i] = norm;
}
normTable[0] = 1f / normTable[255];
return new IDFStats(collectionStats.field(), boost, idf, normTable);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
IDFStats idfstats = (IDFStats) stats;
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
final float[] normTable;
if (context.reader().getMetaData().getCreatedVersionMajor() >= 7) {
// the norms only encode the length, we need a translation table that depends on how lengthNorm is implemented
normTable = idfstats.normTable;
} else {
// the norm is directly encoded in the index
normTable = OLD_NORM_TABLE;
}
return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
}
private final class TFIDFSimScorer extends SimScorer {
private final IDFStats stats;
private final float weightValue;
private final NumericDocValues norms;
private final float[] normTable;
TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
this.stats = stats;
this.weightValue = stats.queryWeight;
this.norms = norms;
this.normTable = normTable;
}
@Override
@ -556,13 +600,13 @@ public abstract class TFIDFSimilarity extends Similarity {
if (norms == null) {
return raw;
} else {
long normValue;
float normValue;
if (norms.advanceExact(doc)) {
normValue = norms.longValue();
normValue = normTable[(int) (norms.longValue() & 0xFF)];
} else {
normValue = 0;
}
return raw * decodeNormValue(normValue); // normalize for field
return raw * normValue; // normalize for field
}
}
@ -578,35 +622,39 @@ public abstract class TFIDFSimilarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
return explainScore(doc, freq, stats, norms);
return explainScore(doc, freq, stats, norms, normTable);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
private static class IDFStats extends SimWeight {
static class IDFStats extends SimWeight {
private final String field;
/** The idf and its explanation */
private final Explanation idf;
private final float boost;
private final float queryWeight;
final float[] normTable;
public IDFStats(String field, float boost, Explanation idf) {
public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
// TODO: Validate?
this.field = field;
this.idf = idf;
this.boost = boost;
this.queryWeight = boost * idf.getValue();
this.normTable = normTable;
}
}
private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation tfExplanation = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
float norm;
if (norms != null && norms.advanceExact(doc)) {
norm = decodeNormValue(norms.longValue());
} else {
if (norms == null) {
norm = 1f;
} else if (norms.advanceExact(doc) == false) {
norm = 0f;
} else {
norm = normTable[(int) (norms.longValue() & 0xFF)];
}
Explanation fieldNormExpl = Explanation.match(
@ -619,9 +667,9 @@ public abstract class TFIDFSimilarity extends Similarity {
tfExplanation, stats.idf, fieldNormExpl);
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation queryExpl = Explanation.match(stats.boost, "boost");
Explanation fieldExpl = explainField(doc, freq, stats, norms);
Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
if (stats.boost == 1f) {
return fieldExpl;
}

View File

@ -215,7 +215,7 @@ public abstract class FSDirectory extends BaseDirectory {
try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) {
for (Path path : stream) {
String name = path.getFileName().toString();
if (skipNames != null && skipNames.contains(name) == false) {
if (skipNames == null || skipNames.contains(name) == false) {
entries.add(name);
}
}
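The rewritten condition above fixes an inverted null check: with the old skipNames != null && ... form, calling the method without a skip set filtered out every file. Written out as a standalone predicate (the helper name is invented):

static boolean keep(String name, Set<String> skipNames) {
  return skipNames == null || skipNames.contains(name) == false;
}
// keep("x", null)                       -> true  (no skip set: list everything)
// keep("x", Collections.singleton("x")) -> false (explicitly skipped)
// keep("x", Collections.singleton("y")) -> true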

View File

@ -20,6 +20,7 @@ package org.apache.lucene.util;
import java.lang.invoke.MethodHandle;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.lang.reflect.UndeclaredThrowableException;
/**
* An AttributeFactory creates instances of {@link AttributeImpl}s.
@ -28,8 +29,14 @@ public abstract class AttributeFactory {
/**
* Returns an {@link AttributeImpl} for the supplied {@link Attribute} interface class.
*
* @throws UndeclaredThrowableException A wrapper runtime exception thrown if the
* constructor of the attribute class throws a checked exception.
* Note that attributes should not throw or declare
* checked exceptions; this may be verified and fail early in the future.
*/
public abstract AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass);
public abstract AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass)
throws UndeclaredThrowableException;
/**
* Returns a correctly typed {@link MethodHandle} for the no-arg ctor of the given class.
@ -61,17 +68,18 @@ public abstract class AttributeFactory {
};
DefaultAttributeFactory() {}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
try {
return (AttributeImpl) constructors.get(attClass).invokeExact();
} catch (Throwable t) {
rethrow(t);
throw new AssertionError();
} catch (Error | RuntimeException e) {
throw e;
} catch (Throwable e) {
throw new UndeclaredThrowableException(e);
}
}
private Class<? extends AttributeImpl> findImplClass(Class<? extends Attribute> attClass) {
try {
return Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader()).asSubclass(AttributeImpl.class);
@ -138,23 +146,12 @@ public abstract class AttributeFactory {
protected A createInstance() {
try {
return (A) constr.invokeExact();
} catch (Throwable t) {
rethrow(t);
throw new AssertionError();
} catch (Error | RuntimeException e) {
throw e;
} catch (Throwable e) {
throw new UndeclaredThrowableException(e);
}
}
};
}
// Hack to rethrow unknown Exceptions from {@link MethodHandle#invoke}:
// TODO: remove the impl in test-framework, this one is more elegant :-)
static void rethrow(Throwable t) {
AttributeFactory.<Error>rethrow0(t);
}
@SuppressWarnings("unchecked")
private static <T extends Throwable> void rethrow0(Throwable t) throws T {
throw (T) t;
}
}
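With the sneaky-rethrow helper gone, a checked exception thrown by an attribute's no-arg constructor now surfaces as an UndeclaredThrowableException. A hedged sketch of what a caller might see; standard attributes such as CharTermAttribute never throw, so the catch below is purely illustrative:

try {
  AttributeImpl impl =
      AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY.createAttributeInstance(CharTermAttribute.class);
} catch (UndeclaredThrowableException e) {
  // only reachable for a misbehaving custom attribute whose constructor threw a checked exception
  Throwable cause = e.getUndeclaredThrowable();
  throw new IllegalStateException("could not instantiate attribute", cause);
}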

View File

@ -96,7 +96,9 @@ public final class IOUtils {
}
}
reThrow(th);
if (th != null) {
throw rethrowAlways(th);
}
}
/**
@ -229,7 +231,9 @@ public final class IOUtils {
}
}
reThrow(th);
if (th != null) {
throw rethrowAlways(th);
}
}
public static void deleteFiles(Directory dir, String... files) throws IOException {
@ -300,7 +304,9 @@ public final class IOUtils {
}
}
reThrow(th);
if (th != null) {
throw rethrowAlways(th);
}
}
/**
@ -376,37 +382,83 @@ public final class IOUtils {
}
/**
* Simple utility method that takes a previously caught
* {@code Throwable} and rethrows either {@code
* IOException} or an unchecked exception. If the
* argument is null then this method does nothing.
* This utility method takes a previously caught (non-null)
* {@code Throwable} and rethrows either the original argument
* if it was a subclass of {@code IOException}, or a
* {@code RuntimeException} with the cause set to the argument.
*
* <p>This method <strong>never returns any value</strong>, even though it declares
* a return value of type {@link Error}. The return value declaration
* is very useful to let the compiler know that the code path following
* the invocation of this method is unreachable. So in most cases the
* invocation of this method will be guarded by an {@code if} and
* used together with a {@code throw} statement, as in:
* </p>
* <pre>{@code
* if (t != null) throw IOUtils.rethrowAlways(t)
* }
* </pre>
*
* @param th The throwable to rethrow; <strong>must not be null</strong>.
* @return This method always results in an exception; it never returns any value.
* See the method documentation for details and a usage example.
* @throws IOException if the argument was an instance of IOException
* @throws RuntimeException with the {@link RuntimeException#getCause()} set
* to the argument, if it was not an instance of IOException.
*/
public static void reThrow(Throwable th) throws IOException {
if (th != null) {
if (th instanceof IOException) {
throw (IOException) th;
}
reThrowUnchecked(th);
public static Error rethrowAlways(Throwable th) throws IOException, RuntimeException {
if (th == null) {
throw new AssertionError("rethrow argument must not be null.");
}
if (th instanceof IOException) {
throw (IOException) th;
}
if (th instanceof RuntimeException) {
throw (RuntimeException) th;
}
if (th instanceof Error) {
throw (Error) th;
}
throw new RuntimeException(th);
}
/**
* Simple utility method that takes a previously caught
* {@code Throwable} and rethrows it as an unchecked exception.
* If the argument is null then this method does nothing.
* Rethrows the argument as {@code IOException} or {@code RuntimeException}
* if it's not null.
*
* @deprecated This method is deprecated in favor of {@link #rethrowAlways}. Code should
* be updated to {@link #rethrowAlways} and guarded with an additional null-argument check
* (because {@link #rethrowAlways} does not accept null arguments).
*/
@Deprecated
public static void reThrow(Throwable th) throws IOException {
if (th != null) {
throw rethrowAlways(th);
}
}
/**
* @deprecated This method is deprecated in favor of {@link #rethrowAlways}. Code should
* be updated to {@link #rethrowAlways} and guarded with an additional null-argument check
* (because {@link #rethrowAlways} does not accept null arguments).
*/
@Deprecated
public static void reThrowUnchecked(Throwable th) {
if (th != null) {
if (th instanceof RuntimeException) {
throw (RuntimeException) th;
}
if (th instanceof Error) {
throw (Error) th;
}
if (th instanceof RuntimeException) {
throw (RuntimeException) th;
}
throw new RuntimeException(th);
}
}
}
/**
* Ensure that any writes to the given file are written to the storage device that contains it.
* @param fileToSync the file to fsync
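The migration for the deprecated reThrow call sites, as applied throughout this patch, follows a pattern along these lines; resources is a stand-in for whatever is being closed:

Throwable th = null;
for (Closeable resource : resources) {
  try {
    resource.close();
  } catch (Throwable t) {
    if (th == null) {
      th = t;              // remember the first failure
    } else {
      th.addSuppressed(t); // keep later failures as suppressed exceptions
    }
  }
}
if (th != null) {
  // rethrowAlways never returns; its Error return type only lets callers write
  // `throw ...` so the compiler knows this path ends here.
  throw IOUtils.rethrowAlways(th);
}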

View File

@ -24,7 +24,12 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ChecksumIndexInput;
@ -73,6 +78,9 @@ public class OfflineSorter {
private final int valueLength;
private final String tempFileNamePrefix;
private final ExecutorService exec;
private final Semaphore partitionsInRAM;
/**
* A bit more descriptive unit for constructors.
*
@ -145,13 +153,13 @@ public class OfflineSorter {
/** number of lines of data read */
public int lineCount;
/** time spent merging sorted partitions (in milliseconds) */
public long mergeTime;
public final AtomicLong mergeTimeMS = new AtomicLong();
/** time spent sorting data (in milliseconds) */
public long sortTime;
public final AtomicLong sortTimeMS = new AtomicLong();
/** total time spent (in milliseconds) */
public long totalTime;
public long totalTimeMS;
/** time spent in i/o read (in milliseconds) */
public long readTime;
public long readTimeMS;
/** read buffer size (in bytes) */
public final long bufferSize = ramBufferSize.bytes;
@ -161,17 +169,15 @@ public class OfflineSorter {
@Override
public String toString() {
return String.format(Locale.ROOT,
"time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
totalTime / 1000.0d, readTime / 1000.0d, sortTime / 1000.0d, mergeTime / 1000.0d,
lineCount, tempMergeFiles, mergeRounds,
(double) bufferSize / MB);
"time=%.2f sec. total (%.2f reading, %.2f sorting, %.2f merging), lines=%d, temp files=%d, merges=%d, soft ram limit=%.2f MB",
totalTimeMS / 1000.0d, readTimeMS / 1000.0d, sortTimeMS.get() / 1000.0d, mergeTimeMS.get() / 1000.0d,
lineCount, tempMergeFiles, mergeRounds,
(double) bufferSize / MB);
}
}
private final BufferSize ramBufferSize;
private final Counter bufferBytesUsed = Counter.newCounter();
private final SortableBytesRefArray buffer;
SortInfo sortInfo;
private int maxTempFiles;
private final Comparator<BytesRef> comparator;
@ -185,7 +191,7 @@ public class OfflineSorter {
* @see BufferSize#automatic()
*/
public OfflineSorter(Directory dir, String tempFileNamePrefix) throws IOException {
this(dir, tempFileNamePrefix, DEFAULT_COMPARATOR, BufferSize.automatic(), MAX_TEMPFILES, -1);
this(dir, tempFileNamePrefix, DEFAULT_COMPARATOR, BufferSize.automatic(), MAX_TEMPFILES, -1, null, 0);
}
/**
@ -194,14 +200,30 @@ public class OfflineSorter {
* @see BufferSize#automatic()
*/
public OfflineSorter(Directory dir, String tempFileNamePrefix, Comparator<BytesRef> comparator) throws IOException {
this(dir, tempFileNamePrefix, comparator, BufferSize.automatic(), MAX_TEMPFILES, -1);
this(dir, tempFileNamePrefix, comparator, BufferSize.automatic(), MAX_TEMPFILES, -1, null, 0);
}
/**
* All-details constructor. If {@code valueLength} is -1 (the default), the length of each value differs; otherwise,
* all values have the specified length.
* all values have the specified length. If you pass a non-null {@code ExecutorService} then it will be
* used to run sorting operations that can be run concurrently, and {@code maxPartitionsInRAM} is the maximum
* number of partitions that may be held in memory at once. Thus the maximum possible RAM used by this class while sorting is
* {@code maxPartitionsInRAM * ramBufferSize}.
*/
public OfflineSorter(Directory dir, String tempFileNamePrefix, Comparator<BytesRef> comparator, BufferSize ramBufferSize, int maxTempfiles, int valueLength) {
public OfflineSorter(Directory dir, String tempFileNamePrefix, Comparator<BytesRef> comparator,
BufferSize ramBufferSize, int maxTempfiles, int valueLength, ExecutorService exec,
int maxPartitionsInRAM) {
if (exec != null) {
this.exec = exec;
if (maxPartitionsInRAM <= 0) {
throw new IllegalArgumentException("maxPartitionsInRAM must be > 0; got " + maxPartitionsInRAM);
}
} else {
this.exec = new SameThreadExecutorService();
maxPartitionsInRAM = 1;
}
this.partitionsInRAM = new Semaphore(maxPartitionsInRAM);
if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) {
throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes);
}
@ -209,14 +231,11 @@ public class OfflineSorter {
if (maxTempfiles < 2) {
throw new IllegalArgumentException("maxTempFiles must be >= 2");
}
if (valueLength == -1) {
buffer = new BytesRefArray(bufferBytesUsed);
} else {
if (valueLength == 0 || valueLength > Short.MAX_VALUE) {
throw new IllegalArgumentException("valueLength must be 1 .. " + Short.MAX_VALUE + "; got: " + valueLength);
}
buffer = new FixedLengthBytesRefArray(valueLength);
if (valueLength != -1 && (valueLength == 0 || valueLength > Short.MAX_VALUE)) {
throw new IllegalArgumentException("valueLength must be 1 .. " + Short.MAX_VALUE + "; got: " + valueLength);
}
this.valueLength = valueLength;
this.ramBufferSize = ramBufferSize;
this.maxTempFiles = maxTempfiles;
@ -241,26 +260,31 @@ public class OfflineSorter {
public String sort(String inputFileName) throws IOException {
sortInfo = new SortInfo();
sortInfo.totalTime = System.currentTimeMillis();
long startMS = System.currentTimeMillis();
List<PartitionAndCount> segments = new ArrayList<>();
List<Future<Partition>> segments = new ArrayList<>();
int[] levelCounts = new int[1];
// So we can remove any partially written temp files on exception:
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(dir);
boolean success = false;
boolean[] isExhausted = new boolean[1];
try (ByteSequencesReader is = getReader(dir.openChecksumInput(inputFileName, IOContext.READONCE), inputFileName)) {
while (isExhausted[0] == false) {
int lineCount = readPartition(is, isExhausted);
if (lineCount == 0) {
assert isExhausted[0];
while (true) {
Partition part = readPartition(is);
if (part.count == 0) {
if (partitionsInRAM != null) {
partitionsInRAM.release();
}
assert part.exhausted;
break;
}
segments.add(sortPartition(trackingDir, lineCount));
Callable<Partition> job = new SortPartitionTask(trackingDir, part);
segments.add(exec.submit(job));
sortInfo.tempMergeFiles++;
sortInfo.lineCount += lineCount;
sortInfo.lineCount += part.count;
levelCounts[0]++;
// Handle intermediate merges; we need a while loop to "cascade" the merge when necessary:
@ -274,6 +298,10 @@ public class OfflineSorter {
levelCounts[mergeLevel] = 0;
mergeLevel++;
}
if (part.exhausted) {
break;
}
}
// TODO: we shouldn't have to do this? Can't we return a merged reader to
@ -292,13 +320,13 @@ public class OfflineSorter {
result = out.getName();
}
} else {
result = segments.get(0).fileName;
result = getPartition(segments.get(0)).fileName;
}
// We should be explicitly removing all intermediate files ourselves unless there is an exception:
assert trackingDir.getCreatedFiles().size() == 1 && trackingDir.getCreatedFiles().contains(result);
sortInfo.totalTime = System.currentTimeMillis() - sortInfo.totalTime;
sortInfo.totalTimeMS = System.currentTimeMillis() - startMS;
CodecUtil.checkFooter(is.in);
@ -306,6 +334,8 @@ public class OfflineSorter {
return result;
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
} finally {
if (success == false) {
IOUtils.deleteFilesIgnoringExceptions(trackingDir, trackingDir.getCreatedFiles());
@ -313,36 +343,6 @@ public class OfflineSorter {
}
}
/** Sort a single partition in-memory. */
protected PartitionAndCount sortPartition(TrackingDirectoryWrapper trackingDir, int lineCount) throws IOException {
try (IndexOutput tempFile = trackingDir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT);
ByteSequencesWriter out = getWriter(tempFile, lineCount);) {
BytesRef spare;
long start = System.currentTimeMillis();
BytesRefIterator iter = buffer.iterator(comparator);
sortInfo.sortTime += System.currentTimeMillis() - start;
int count = 0;
while ((spare = iter.next()) != null) {
assert spare.length <= Short.MAX_VALUE;
out.write(spare);
count++;
}
assert count == lineCount;
// Clean up the buffer for the next partition.
buffer.clear();
CodecUtil.writeFooter(out.out);
return new PartitionAndCount(lineCount, tempFile.getName());
}
}
/** Called on exception, to check whether the checksum is also corrupt in this source, and add that
* information (checksum matched or didn't) as a suppressed exception. */
private void verifyChecksum(Throwable priorException, ByteSequencesReader reader) throws IOException {
@ -352,129 +352,107 @@ public class OfflineSorter {
}
/** Merge the most recent {@code maxTempFile} partitions into a new partition. */
void mergePartitions(Directory trackingDir, List<PartitionAndCount> segments) throws IOException {
void mergePartitions(Directory trackingDir, List<Future<Partition>> segments) throws IOException {
long start = System.currentTimeMillis();
List<PartitionAndCount> segmentsToMerge;
List<Future<Partition>> segmentsToMerge;
if (segments.size() > maxTempFiles) {
segmentsToMerge = segments.subList(segments.size() - maxTempFiles, segments.size());
} else {
segmentsToMerge = segments;
}
long totalCount = 0;
for (PartitionAndCount segment : segmentsToMerge) {
totalCount += segment.count;
}
sortInfo.mergeRounds++;
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segmentsToMerge.size()) {
@Override
protected boolean lessThan(FileAndTop a, FileAndTop b) {
return comparator.compare(a.current, b.current) < 0;
}
};
ByteSequencesReader[] streams = new ByteSequencesReader[segmentsToMerge.size()];
String newSegmentName = null;
try (ByteSequencesWriter writer = getWriter(trackingDir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT), totalCount)) {
newSegmentName = writer.out.getName();
// Open streams and read the top for each file
for (int i = 0; i < segmentsToMerge.size(); i++) {
streams[i] = getReader(dir.openChecksumInput(segmentsToMerge.get(i).fileName, IOContext.READONCE), segmentsToMerge.get(i).fileName);
BytesRef item = null;
try {
item = streams[i].next();
} catch (Throwable t) {
verifyChecksum(t, streams[i]);
}
assert item != null;
queue.insertWithOverflow(new FileAndTop(i, item));
}
// Unix utility sort() uses ordered array of files to pick the next line from, updating
// it as it reads new lines. The PQ used here is a more elegant solution and has
// a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway
// so it shouldn't make much of a difference (didn't check).
FileAndTop top;
while ((top = queue.top()) != null) {
writer.write(top.current);
try {
top.current = streams[top.fd].next();
} catch (Throwable t) {
verifyChecksum(t, streams[top.fd]);
}
if (top.current != null) {
queue.updateTop();
} else {
queue.pop();
}
}
CodecUtil.writeFooter(writer.out);
for(ByteSequencesReader reader : streams) {
CodecUtil.checkFooter(reader.in);
}
sortInfo.mergeTime += System.currentTimeMillis() - start;
sortInfo.mergeRounds++;
} finally {
IOUtils.close(streams);
}
IOUtils.deleteFiles(trackingDir, segmentsToMerge.stream().map(segment -> segment.fileName).collect(Collectors.toList()));
MergePartitionsTask task = new MergePartitionsTask(trackingDir, new ArrayList<>(segmentsToMerge));
segmentsToMerge.clear();
segments.add(new PartitionAndCount(totalCount, newSegmentName));
segments.add(exec.submit(task));
sortInfo.tempMergeFiles++;
}
/** Holds one partition of items, either loaded into memory or based on a file. */
private static class Partition {
public final SortableBytesRefArray buffer;
public final boolean exhausted;
public final long count;
public final String fileName;
/** A partition loaded into memory. */
public Partition(SortableBytesRefArray buffer, boolean exhausted) {
this.buffer = buffer;
this.fileName = null;
this.count = buffer.size();
this.exhausted = exhausted;
}
/** An on-disk partition. */
public Partition(String fileName, long count) {
this.buffer = null;
this.fileName = fileName;
this.count = count;
this.exhausted = true;
}
}
/** Read in a single partition of data, setting isExhausted[0] to true if there are no more items. */
int readPartition(ByteSequencesReader reader, boolean[] isExhausted) throws IOException {
long start = System.currentTimeMillis();
if (valueLength != -1) {
int limit = ramBufferSize.bytes / valueLength;
for(int i=0;i<limit;i++) {
BytesRef item = null;
try {
item = reader.next();
} catch (Throwable t) {
verifyChecksum(t, reader);
Partition readPartition(ByteSequencesReader reader) throws IOException, InterruptedException {
if (partitionsInRAM != null) {
partitionsInRAM.acquire();
}
boolean success = false;
try {
long start = System.currentTimeMillis();
SortableBytesRefArray buffer;
boolean exhausted = false;
int count;
if (valueLength != -1) {
// fixed length case
buffer = new FixedLengthBytesRefArray(valueLength);
int limit = ramBufferSize.bytes / valueLength;
for(int i=0;i<limit;i++) {
BytesRef item = null;
try {
item = reader.next();
} catch (Throwable t) {
verifyChecksum(t, reader);
}
if (item == null) {
exhausted = true;
break;
}
buffer.append(item);
}
if (item == null) {
isExhausted[0] = true;
break;
} else {
Counter bufferBytesUsed = Counter.newCounter();
buffer = new BytesRefArray(bufferBytesUsed);
while (true) {
BytesRef item = null;
try {
item = reader.next();
} catch (Throwable t) {
verifyChecksum(t, reader);
}
if (item == null) {
exhausted = true;
break;
}
buffer.append(item);
// Account for the created objects.
// (buffer slots do not account to buffer size.)
if (bufferBytesUsed.get() > ramBufferSize.bytes) {
break;
}
}
buffer.append(item);
}
} else {
while (true) {
BytesRef item = null;
try {
item = reader.next();
} catch (Throwable t) {
verifyChecksum(t, reader);
}
if (item == null) {
isExhausted[0] = true;
break;
}
buffer.append(item);
// Account for the created objects.
// (buffer slots do not account to buffer size.)
if (bufferBytesUsed.get() > ramBufferSize.bytes) {
break;
}
sortInfo.readTimeMS += System.currentTimeMillis() - start;
success = true;
return new Partition(buffer, exhausted);
} finally {
if (success == false && partitionsInRAM != null) {
partitionsInRAM.release();
}
}
sortInfo.readTime += System.currentTimeMillis() - start;
return buffer.size();
}
static class FileAndTop {
@ -606,13 +584,146 @@ public class OfflineSorter {
return comparator;
}
private static class PartitionAndCount {
final long count;
final String fileName;
/** Sorts one in-memory partition, writes it to disk, and returns the resulting file-based partition. */
private class SortPartitionTask implements Callable<Partition> {
public PartitionAndCount(long count, String fileName) {
this.count = count;
this.fileName = fileName;
private final Directory dir;
private final Partition part;
public SortPartitionTask(Directory dir, Partition part) {
this.dir = dir;
this.part = part;
}
@Override
public Partition call() throws IOException {
try (IndexOutput tempFile = dir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT);
ByteSequencesWriter out = getWriter(tempFile, part.buffer.size());) {
BytesRef spare;
long startMS = System.currentTimeMillis();
BytesRefIterator iter = part.buffer.iterator(comparator);
sortInfo.sortTimeMS.addAndGet(System.currentTimeMillis() - startMS);
int count = 0;
while ((spare = iter.next()) != null) {
assert spare.length <= Short.MAX_VALUE;
out.write(spare);
count++;
}
assert count == part.count;
CodecUtil.writeFooter(out.out);
part.buffer.clear();
return new Partition(tempFile.getName(), part.count);
} finally {
if (partitionsInRAM != null) {
partitionsInRAM.release();
}
}
}
}
private Partition getPartition(Future<Partition> future) throws IOException {
try {
return future.get();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
} catch (ExecutionException ee) {
// Theoretically cause can be null; guard against that.
Throwable cause = ee.getCause();
throw IOUtils.rethrowAlways(cause != null ? cause : ee);
}
}
/** Merges multiple file-based partitions to a single on-disk partition. */
private class MergePartitionsTask implements Callable<Partition> {
private final Directory dir;
private final List<Future<Partition>> segmentsToMerge;
public MergePartitionsTask(Directory dir, List<Future<Partition>> segmentsToMerge) {
this.dir = dir;
this.segmentsToMerge = segmentsToMerge;
}
@Override
public Partition call() throws IOException {
long totalCount = 0;
for (Future<Partition> segment : segmentsToMerge) {
totalCount += getPartition(segment).count;
}
PriorityQueue<FileAndTop> queue = new PriorityQueue<FileAndTop>(segmentsToMerge.size()) {
@Override
protected boolean lessThan(FileAndTop a, FileAndTop b) {
return comparator.compare(a.current, b.current) < 0;
}
};
ByteSequencesReader[] streams = new ByteSequencesReader[segmentsToMerge.size()];
String newSegmentName = null;
long startMS = System.currentTimeMillis();
try (ByteSequencesWriter writer = getWriter(dir.createTempOutput(tempFileNamePrefix, "sort", IOContext.DEFAULT), totalCount)) {
newSegmentName = writer.out.getName();
// Open streams and read the top for each file
for (int i = 0; i < segmentsToMerge.size(); i++) {
Partition segment = getPartition(segmentsToMerge.get(i));
streams[i] = getReader(dir.openChecksumInput(segment.fileName, IOContext.READONCE), segment.fileName);
BytesRef item = null;
try {
item = streams[i].next();
} catch (Throwable t) {
verifyChecksum(t, streams[i]);
}
assert item != null;
queue.insertWithOverflow(new FileAndTop(i, item));
}
// Unix utility sort() uses ordered array of files to pick the next line from, updating
// it as it reads new lines. The PQ used here is a more elegant solution and has
// a nicer theoretical complexity bound :) The entire sorting process is I/O bound anyway
// so it shouldn't make much of a difference (didn't check).
FileAndTop top;
while ((top = queue.top()) != null) {
writer.write(top.current);
try {
top.current = streams[top.fd].next();
} catch (Throwable t) {
verifyChecksum(t, streams[top.fd]);
}
if (top.current != null) {
queue.updateTop();
} else {
queue.pop();
}
}
CodecUtil.writeFooter(writer.out);
for(ByteSequencesReader reader : streams) {
CodecUtil.checkFooter(reader.in);
}
sortInfo.mergeTimeMS.addAndGet(System.currentTimeMillis() - startMS);
} finally {
IOUtils.close(streams);
}
List<String> toDelete = new ArrayList<>();
for (Future<Partition> segment : segmentsToMerge) {
toDelete.add(getPartition(segment).fileName);
}
IOUtils.deleteFiles(dir, toDelete);
return new Partition(newSegmentName, totalCount);
}
}
}
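A construction sketch for the new concurrent mode; dir and inputFileName stand for an existing Directory and a previously written temp file, and the pool size, buffer size and partition limit are arbitrary choices rather than defaults from this patch:

ExecutorService exec = Executors.newFixedThreadPool(4);
try {
  OfflineSorter sorter = new OfflineSorter(dir, "sort", OfflineSorter.DEFAULT_COMPARATOR,
      OfflineSorter.BufferSize.megabytes(64), OfflineSorter.MAX_TEMPFILES,
      -1 /* variable-length values */, exec, 4 /* maxPartitionsInRAM */);
  // worst-case sorting RAM is roughly maxPartitionsInRAM * ramBufferSize, here 4 * 64 MB
  String sortedFileName = sorter.sort(inputFileName);
} finally {
  exec.shutdown();
}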

View File

@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.AbstractExecutorService;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
/** An {@code ExecutorService} that executes tasks immediately in the calling thread during submit.
*
* @lucene.internal */
public final class SameThreadExecutorService extends AbstractExecutorService {
private volatile boolean shutdown;
@Override
public void execute(Runnable command) {
checkShutdown();
command.run();
}
@Override
public List<Runnable> shutdownNow() {
shutdown();
return Collections.emptyList();
}
@Override
public void shutdown() {
this.shutdown = true;
}
@Override
public boolean isTerminated() {
// Simplified: we don't check for any threads hanging in execute (we could
// introduce an atomic counter, but there seems to be no point).
return shutdown == true;
}
@Override
public boolean isShutdown() {
return shutdown == true;
}
@Override
public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException {
// See comment in isTerminated();
return true;
}
private void checkShutdown() {
if (shutdown) {
throw new RejectedExecutionException("Executor is shut down.");
}
}
}
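A quick usage sketch: tasks submitted to this executor run inline in the calling thread, so the returned future is already complete when submit returns.

ExecutorService exec = new SameThreadExecutorService();
Future<Integer> f = exec.submit(() -> 21 * 2);
int answer = f.get();  // no blocking; the task already ran in this thread, answer == 42
exec.shutdown();
// any further submit() would now throw RejectedExecutionException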

View File

@ -97,31 +97,74 @@ public class SmallFloat {
return Float.intBitsToFloat(bits);
}
/** floatToByte(b, mantissaBits=5, zeroExponent=2)
* <br>smallest nonzero value = 0.033203125
* <br>largest value = 1984.0
* <br>epsilon = 0.03125
*/
public static byte floatToByte52(float f) {
int bits = Float.floatToRawIntBits(f);
int smallfloat = bits >> (24-5);
if (smallfloat <= (63-2)<<5) {
return (bits<=0) ? (byte)0 : (byte)1;
/** Float-like encoding for positive longs that preserves ordering and 4 significant bits. */
public static int longToInt4(long i) {
if (i < 0) {
throw new IllegalArgumentException("Only supports positive values, got " + i);
}
if (smallfloat >= ((63-2)<<5) + 0x100) {
return -1;
int numBits = 64 - Long.numberOfLeadingZeros(i);
if (numBits < 4) {
// subnormal value
return Math.toIntExact(i);
} else {
// normal value
int shift = numBits - 4;
// only keep the 4 most significant bits
int encoded = Math.toIntExact(i >>> shift);
// clear the most significant bit, which is implicit
encoded &= 0x07;
// encode the shift, adding 1 because 0 is reserved for subnormal values
encoded |= (shift + 1) << 3;
return encoded;
}
return (byte)(smallfloat - ((63-2)<<5));
}
/** byteToFloat(b, mantissaBits=5, zeroExponent=2) */
public static float byte52ToFloat(byte b) {
// on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup
// is only a little bit faster (anywhere from 0% to 7%)
if (b == 0) return 0.0f;
int bits = (b&0xff) << (24-5);
bits += (63-2) << 24;
return Float.intBitsToFloat(bits);
/**
* Decode values encoded with {@link #longToInt4(long)}.
*/
public static final long int4ToLong(int i) {
long bits = i & 0x07;
int shift = (i >>> 3) - 1;
long decoded;
if (shift == -1) {
// subnormal value
decoded = bits;
} else {
// normal value
decoded = (bits | 0x08) << shift;
}
return decoded;
}
private static final int MAX_INT4 = longToInt4(Integer.MAX_VALUE);
private static final int NUM_FREE_VALUES = 255 - MAX_INT4;
/**
* Encode an integer to a byte. It is built upon {@link #longToInt4(long)}
* and leverages the fact that {@code longToInt4(Integer.MAX_VALUE)} is
* less than 255 to encode low values more accurately.
*/
public static byte intToByte4(int i) {
if (i < 0) {
throw new IllegalArgumentException("Only supports positive values, got " + i);
}
if (i < NUM_FREE_VALUES) {
return (byte) i;
} else {
return (byte) (NUM_FREE_VALUES + longToInt4(i - NUM_FREE_VALUES));
}
}
/**
* Decode values that have been encoded with {@link #intToByte4(int)}.
*/
public static int byte4ToInt(byte b) {
int i = Byte.toUnsignedInt(b);
if (i < NUM_FREE_VALUES) {
return i;
} else {
long decoded = NUM_FREE_VALUES + int4ToLong(i - NUM_FREE_VALUES);
return Math.toIntExact(decoded);
}
}
}
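A round-trip sketch of the 4-significant-bit encoding; the concrete numbers below follow from the code above (NUM_FREE_VALUES works out to 24):

int length = 1000;
byte b = SmallFloat.intToByte4(length);   // 87
int decoded = SmallFloat.byte4ToInt(b);   // 984: low-order bits are truncated, so decoded <= length
// Ordering is preserved (a <= b implies intToByte4(a) <= intToByte4(b), compared as unsigned bytes),
// and small values, below the free range of 24, round-trip exactly.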

View File

@ -115,6 +115,13 @@ public final class Version {
@Deprecated
public static final Version LUCENE_6_6_0 = new Version(6, 6, 0);
/**
* Match settings and bugs in Lucene's 6.7.0 release.
* @deprecated Use latest
*/
@Deprecated
public static final Version LUCENE_6_7_0 = new Version(6, 7, 0);
/**
* Match settings and bugs in Lucene's 7.0.0 release.
* <p>

View File

@ -884,7 +884,7 @@ public class BKDWriter implements Closeable {
};
}
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {
/** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
@Override
@ -1362,7 +1362,9 @@ public class BKDWriter implements Closeable {
/** Called on exception, to check whether the checksum is also corrupt in this source, and add that
* information (checksum matched or didn't) as a suppressed exception. */
private void verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
private Error verifyChecksum(Throwable priorException, PointWriter writer) throws IOException {
assert priorException != null;
// TODO: we could improve this, to always validate checksum as we recurse, if we shared left and
// right reader after recursing to children, and possibly within recursed children,
// since all together they make a single pass through the file. But this is a sizable re-org,
@ -1373,10 +1375,10 @@ public class BKDWriter implements Closeable {
try (ChecksumIndexInput in = tempDir.openChecksumInput(tempFileName, IOContext.READONCE)) {
CodecUtil.checkFooter(in, priorException);
}
} else {
// We are reading from heap; nothing to add:
IOUtils.reThrow(priorException);
}
// We are reading from heap; nothing to add:
throw IOUtils.rethrowAlways(priorException);
}
/** Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue). */
@ -1398,7 +1400,7 @@ public class BKDWriter implements Closeable {
reader.markOrds(rightCount-1, ordBitSet);
}
} catch (Throwable t) {
verifyChecksum(t, source.writer);
throw verifyChecksum(t, source.writer);
}
return scratch1;
@ -1469,10 +1471,7 @@ public class BKDWriter implements Closeable {
}
return new PathSlice(writer, 0, count);
} catch (Throwable t) {
verifyChecksum(t, source.writer);
// Dead code but javac disagrees:
return null;
throw verifyChecksum(t, source.writer);
}
}
@ -1797,7 +1796,7 @@ public class BKDWriter implements Closeable {
leftSlices[dim] = new PathSlice(leftPointWriter, 0, leftCount);
rightSlices[dim] = new PathSlice(rightPointWriter, 0, rightCount);
} catch (Throwable t) {
verifyChecksum(t, slices[dim].writer);
throw verifyChecksum(t, slices[dim].writer);
}
}
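As an aside on the verifyChecksum signature change above (void to Error): returning Error lets each caller write throw verifyChecksum(...), so javac treats the catch branch as terminated and the old "Dead code but javac disagrees" return null workaround can go away. Below is a minimal sketch of that pattern with made-up class and method names; only IOUtils.rethrowAlways is taken from this patch.

import java.io.IOException;
import org.apache.lucene.util.IOUtils;

final class RethrowPatternSketch {
  // Declared to return Error so call sites can `throw` the result,
  // even though rethrowAlways never actually returns normally.
  private static Error rethrowWithContext(Throwable priorException) throws IOException {
    // ...in the real method, checksum information is attached as a suppressed exception here...
    throw IOUtils.rethrowAlways(priorException);
  }

  static long doWork() throws IOException {
    try {
      return 42L; // stand-in for the real work
    } catch (Throwable t) {
      throw rethrowWithContext(t); // compiles as a throw statement; no unreachable return needed
    }
  }
}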

View File

@ -48,7 +48,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
* This class also provides helpers to explore the different paths of the {@link Automaton}.
*/
public final class GraphTokenStreamFiniteStrings {
private final Map<BytesRef, Integer> termToID = new HashMap<>();
private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
private final Map<Integer, Integer> idToInc = new HashMap<>();
private final Automaton det;
@ -247,35 +246,18 @@ public final class GraphTokenStreamFiniteStrings {
}
/**
* Gets an integer id for a given term.
*
* If there are no position gaps for this token then we can reuse the id for the same term if it appeared at another
* position without a gap. If we have a position gap, generate a new id so we can keep track of the position
* increment.
* Gets an integer id for a given term and saves the position increment if needed.
*/
private int getTermID(int incr, int prevIncr, BytesRef term) {
assert term != null;
boolean isStackedGap = incr == 0 && prevIncr > 1;
boolean hasGap = incr > 1;
Integer id;
if (hasGap || isStackedGap) {
id = idToTerm.size();
idToTerm.put(id, BytesRef.deepCopyOf(term));
// stacked token should have the same increment as original token at this position
if (isStackedGap) {
idToInc.put(id, prevIncr);
} else {
idToInc.put(id, incr);
}
} else {
id = termToID.get(term);
if (id == null) {
term = BytesRef.deepCopyOf(term);
id = idToTerm.size();
termToID.put(term, id);
idToTerm.put(id, term);
}
int id = idToTerm.size();
idToTerm.put(id, BytesRef.deepCopyOf(term));
// stacked token should have the same increment as original token at this position
if (isStackedGap) {
idToInc.put(id, prevIncr);
} else if (incr > 1) {
idToInc.put(id, incr);
}
return id;
}

View File

@ -303,4 +303,17 @@ public class TestCodecUtil extends LuceneTestCase {
fakeChecksum.set((1L << 32) - 1); // ok
CodecUtil.writeCRC(fakeOutput);
}
public void testTruncatedFileThrowsCorruptIndexException() throws IOException {
RAMFile file = new RAMFile();
IndexOutput output = new RAMOutputStream(file, false);
output.close();
IndexInput input = new RAMInputStream("file", file);
CorruptIndexException e = expectThrows(CorruptIndexException.class,
() -> CodecUtil.checksumEntireFile(input));
assertEquals("misplaced codec footer (file truncated?): length=0 but footerLength==16 (resource=RAMInputStream(name=file))", e.getMessage());
e = expectThrows(CorruptIndexException.class,
() -> CodecUtil.retrieveChecksum(input));
assertEquals("misplaced codec footer (file truncated?): length=0 but footerLength==16 (resource=RAMInputStream(name=file))", e.getMessage());
}
}

View File

@ -237,8 +237,11 @@ public class TestDemoParallelLeafReader extends LuceneTestCase {
firstExc = t;
}
}
// throw the first exception
IOUtils.reThrow(firstExc);
if (firstExc != null) {
throw IOUtils.rethrowAlways(firstExc);
}
}
@Override
@ -549,10 +552,11 @@ public class TestDemoParallelLeafReader extends LuceneTestCase {
}
}
// If any error occurred, throw it.
IOUtils.reThrow(th);
if (th != null) {
throw IOUtils.rethrowAlways(th);
}
}
@Override
public void setMergeInfo(SegmentCommitInfo info) {
// Record that this merged segment is current as of this schemaGen:

View File

@ -2403,4 +2403,86 @@ public class TestIndexSorting extends LuceneTestCase {
}
IOUtils.close(r, w, dir);
}
public void testIndexSortWithSparseField() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
SortField sortField = new SortField("dense_int", SortField.Type.INT, true);
Sort indexSort = new Sort(sortField);
iwc.setIndexSort(indexSort);
IndexWriter w = new IndexWriter(dir, iwc);
Field textField = newTextField("sparse_text", "", Field.Store.NO);
for (int i = 0; i < 128; i++) {
Document doc = new Document();
doc.add(new NumericDocValuesField("dense_int", i));
if (i < 64) {
doc.add(new NumericDocValuesField("sparse_int", i));
doc.add(new BinaryDocValuesField("sparse_binary", new BytesRef(Integer.toString(i))));
textField.setStringValue("foo");
doc.add(textField);
}
w.addDocument(doc);
}
w.commit();
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w);
assertEquals(1, r.leaves().size());
LeafReader leafReader = r.leaves().get(0).reader();
NumericDocValues denseValues = leafReader.getNumericDocValues("dense_int");
NumericDocValues sparseValues = leafReader.getNumericDocValues("sparse_int");
BinaryDocValues sparseBinaryValues = leafReader.getBinaryDocValues("sparse_binary");
NumericDocValues normsValues = leafReader.getNormValues("sparse_text");
for(int docID = 0; docID < 128; docID++) {
assertTrue(denseValues.advanceExact(docID));
assertEquals(127-docID, (int) denseValues.longValue());
if (docID >= 64) {
assertTrue(denseValues.advanceExact(docID));
assertTrue(sparseValues.advanceExact(docID));
assertTrue(sparseBinaryValues.advanceExact(docID));
assertTrue(normsValues.advanceExact(docID));
assertEquals(1, normsValues.longValue());
assertEquals(127-docID, (int) sparseValues.longValue());
assertEquals(new BytesRef(Integer.toString(127-docID)), sparseBinaryValues.binaryValue());
} else {
assertFalse(sparseBinaryValues.advanceExact(docID));
assertFalse(sparseValues.advanceExact(docID));
assertFalse(normsValues.advanceExact(docID));
}
}
IOUtils.close(r, w, dir);
}
public void testIndexSortOnSparseField() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
SortField sortField = new SortField("sparse", SortField.Type.INT, false);
sortField.setMissingValue(Integer.MIN_VALUE);
Sort indexSort = new Sort(sortField);
iwc.setIndexSort(indexSort);
IndexWriter w = new IndexWriter(dir, iwc);
for (int i = 0; i < 128; i++) {
Document doc = new Document();
if (i < 64) {
doc.add(new NumericDocValuesField("sparse", i));
}
w.addDocument(doc);
}
w.commit();
w.forceMerge(1);
DirectoryReader r = DirectoryReader.open(w);
assertEquals(1, r.leaves().size());
LeafReader leafReader = r.leaves().get(0).reader();
NumericDocValues sparseValues = leafReader.getNumericDocValues("sparse");
for(int docID = 0; docID < 128; docID++) {
if (docID >= 64) {
assertTrue(sparseValues.advanceExact(docID));
assertEquals(docID-64, (int) sparseValues.longValue());
} else {
assertFalse(sparseValues.advanceExact(docID));
}
}
IOUtils.close(r, w, dir);
}
}

View File

@ -17,6 +17,7 @@
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -26,7 +27,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -35,12 +38,12 @@ import org.apache.lucene.util.TestUtil;
/**
* Tests the maxTermFrequency statistic in FieldInvertState
*/
public class TestMaxTermFrequency extends LuceneTestCase {
public class TestMaxTermFrequency extends LuceneTestCase {
Directory dir;
IndexReader reader;
/* expected maxTermFrequency values for our documents */
ArrayList<Integer> expected = new ArrayList<>();
@Override
public void setUp() throws Exception {
super.setUp();
@ -59,14 +62,14 @@ public class TestMaxTermFrequency extends LuceneTestCase {
reader = writer.getReader();
writer.close();
}
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
super.tearDown();
}
public void test() throws Exception {
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
for (int i = 0; i < reader.maxDoc(); i++) {
@ -95,30 +98,42 @@ public class TestMaxTermFrequency extends LuceneTestCase {
Collections.shuffle(terms, random());
return Arrays.toString(terms.toArray(new String[terms.size()]));
}
/**
* Simple similarity that encodes maxTermFrequency directly as a byte
*/
static class TestSimilarity extends TFIDFSimilarity {
static class TestSimilarity extends Similarity {
@Override
public float lengthNorm(FieldInvertState state) {
public long computeNorm(FieldInvertState state) {
return state.getMaxTermFrequency();
}
@Override
public long encodeNormValue(float f) {
return (byte) f;
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
return new SimWeight() {};
}
@Override
public float decodeNormValue(long norm) {
return norm;
public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
return new SimScorer() {
@Override
public float score(int doc, float freq) throws IOException {
return 0;
}
@Override
public float computeSlopFactor(int distance) {
return 0;
}
@Override
public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
return 0;
}
};
}
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
}

View File

@ -32,13 +32,11 @@ import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
@ -49,67 +47,6 @@ import org.apache.lucene.util.TestUtil;
@Slow
public class TestNorms extends LuceneTestCase {
static final String BYTE_TEST_FIELD = "normsTestByte";
static class CustomNormEncodingSimilarity extends TFIDFSimilarity {
@Override
public long encodeNormValue(float f) {
return (long) f;
}
@Override
public float decodeNormValue(long norm) {
return norm;
}
@Override
public float lengthNorm(FieldInvertState state) {
return state.getLength();
}
@Override public float tf(float freq) { return 0; }
@Override public float idf(long docFreq, long docCount) { return 0; }
@Override public float sloppyFreq(int distance) { return 0; }
@Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
// LUCENE-1260
public void testCustomEncoder() throws Exception {
Directory dir = newDirectory();
MockAnalyzer analyzer = new MockAnalyzer(random());
IndexWriterConfig config = newIndexWriterConfig(analyzer);
config.setSimilarity(new CustomNormEncodingSimilarity());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
Document doc = new Document();
Field foo = newTextField("foo", "", Field.Store.NO);
Field bar = newTextField("bar", "", Field.Store.NO);
doc.add(foo);
doc.add(bar);
for (int i = 0; i < 100; i++) {
bar.setStringValue("singleton");
writer.addDocument(doc);
}
IndexReader reader = writer.getReader();
writer.close();
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
for (int i = 0; i < reader.maxDoc(); i++) {
assertEquals(i, fooNorms.nextDoc());
assertEquals(0, fooNorms.longValue());
}
NumericDocValues barNorms = MultiDocValues.getNormValues(reader, "bar");
for (int i = 0; i < reader.maxDoc(); i++) {
assertEquals(i, barNorms.nextDoc());
assertEquals(1, barNorms.longValue());
}
reader.close();
dir.close();
}
public void testMaxByteNorms() throws IOException {
Directory dir = newFSDirectory(createTempDir("TestNorms.testMaxByteNorms"));

View File

@ -44,9 +44,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestOmitTf extends LuceneTestCase {
public static class SimpleSimilarity extends TFIDFSimilarity {
@Override public float decodeNormValue(long norm) { return norm; }
@Override public long encodeNormValue(float f) { return (long) f; }
@Override public float lengthNorm(FieldInvertState state) { return 1; }
@Override public float lengthNorm(int length) { return 1; }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long docCount) { return 1.0f; }

View File

@ -30,7 +30,6 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@ -72,7 +71,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
}
@Override
public float lengthNorm(FieldInvertState state) {
public float lengthNorm(int length) {
// Disable length norm
return 1;
}

View File

@ -31,11 +31,18 @@ public class TestDoubleRangeFieldQueries extends BaseRangeFieldQueryTestCase {
private static final String FIELD_NAME = "doubleRangeField";
private double nextDoubleInternal() {
if (rarely()) {
return random().nextBoolean() ? Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY;
switch (random().nextInt(5)) {
case 0:
return Double.NEGATIVE_INFINITY;
case 1:
return Double.POSITIVE_INFINITY;
default:
if (random().nextBoolean()) {
return random().nextDouble();
} else {
return (random().nextInt(15) - 7) / 3d;
}
}
double max = Double.MAX_VALUE / 2;
return (max + max) * random().nextDouble() - max;
}
@Override

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
@ -26,6 +27,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
@ -164,4 +166,65 @@ public class TestDoubleValuesSource extends LuceneTestCase {
CheckHits.checkEqual(query, expected.scoreDocs, actual.scoreDocs);
}
}
static final Query[] testQueries = new Query[]{
new MatchAllDocsQuery(),
new TermQuery(new Term("oddeven", "odd")),
new BooleanQuery.Builder()
.add(new TermQuery(new Term("english", "one")), BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("english", "two")), BooleanClause.Occur.MUST)
.build()
};
public void testExplanations() throws Exception {
for (Query q : testQueries) {
testExplanations(q, DoubleValuesSource.fromIntField("int"));
testExplanations(q, DoubleValuesSource.fromLongField("long"));
testExplanations(q, DoubleValuesSource.fromFloatField("float"));
testExplanations(q, DoubleValuesSource.fromDoubleField("double"));
testExplanations(q, DoubleValuesSource.fromDoubleField("onefield"));
testExplanations(q, DoubleValuesSource.constant(5.45));
testExplanations(q, DoubleValuesSource.function(
DoubleValuesSource.fromDoubleField("double"), "v * 4 + 73",
v -> v * 4 + 73
));
testExplanations(q, DoubleValuesSource.scoringFunction(
DoubleValuesSource.fromDoubleField("double"), "v * score", (v, s) -> v * s
));
}
}
private void testExplanations(Query q, DoubleValuesSource vs) throws IOException {
searcher.search(q, new SimpleCollector() {
DoubleValues v;
LeafReaderContext ctx;
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
this.ctx = context;
}
@Override
public void setScorer(Scorer scorer) throws IOException {
this.v = vs.getValues(this.ctx, DoubleValuesSource.fromScorer(scorer));
}
@Override
public void collect(int doc) throws IOException {
Explanation scoreExpl = searcher.explain(q, ctx.docBase + doc);
if (this.v.advanceExact(doc)) {
CheckHits.verifyExplanation("", doc, (float) v.doubleValue(), true, vs.explain(ctx, doc, scoreExpl));
}
else {
assertFalse(vs.explain(ctx, doc, scoreExpl).isMatch());
}
}
@Override
public boolean needsScores() {
return vs.needsScores();
}
});
}
}

View File

@ -33,6 +33,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FieldValueHitQueue.Entry;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@ -63,7 +64,7 @@ public class TestElevationComparator extends LuceneTestCase {
writer.close();
IndexSearcher searcher = newSearcher(r);
searcher.setSimilarity(new ClassicSimilarity());
searcher.setSimilarity(new BM25Similarity());
runTest(searcher, true);
runTest(searcher, false);
@ -98,11 +99,11 @@ public class TestElevationComparator extends LuceneTestCase {
assertEquals(3, topDocs.scoreDocs[1].doc);
if (reversed) {
assertEquals(2, topDocs.scoreDocs[2].doc);
assertEquals(1, topDocs.scoreDocs[3].doc);
} else {
assertEquals(1, topDocs.scoreDocs[2].doc);
assertEquals(2, topDocs.scoreDocs[3].doc);
} else {
assertEquals(2, topDocs.scoreDocs[2].doc);
assertEquals(1, topDocs.scoreDocs[3].doc);
}
/*

View File

@ -31,11 +31,18 @@ public class TestFloatRangeFieldQueries extends BaseRangeFieldQueryTestCase {
private static final String FIELD_NAME = "floatRangeField";
private float nextFloatInternal() {
if (rarely()) {
return random().nextBoolean() ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
switch (random().nextInt(5)) {
case 0:
return Float.NEGATIVE_INFINITY;
case 1:
return Float.POSITIVE_INFINITY;
default:
if (random().nextBoolean()) {
return random().nextFloat();
} else {
return (random().nextInt(15) - 7) / 3f;
}
}
float max = Float.MAX_VALUE / 2;
return (max + max) * random().nextFloat() - max;
}
@Override

View File

@ -23,6 +23,7 @@ import org.apache.lucene.document.IntRange;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;
/**
* Random testing for IntRange Queries.
@ -31,11 +32,25 @@ public class TestIntRangeFieldQueries extends BaseRangeFieldQueryTestCase {
private static final String FIELD_NAME = "intRangeField";
private int nextIntInternal() {
if (rarely()) {
return random().nextBoolean() ? Integer.MAX_VALUE : Integer.MIN_VALUE;
switch (random().nextInt(5)) {
case 0:
return Integer.MIN_VALUE;
case 1:
return Integer.MAX_VALUE;
default:
int bpv = random().nextInt(32);
switch (bpv) {
case 32:
return random().nextInt();
default:
int v = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
if (bpv > 0) {
// negative values sometimes
v -= 1 << (bpv - 1);
}
return v;
}
}
int max = Integer.MAX_VALUE / 2;
return (max + max) * random().nextInt() - max;
}
@Override

View File

@ -660,12 +660,14 @@ public class TestLRUQueryCache extends LuceneTestCase {
@Override
protected void onQueryCache(Query query, long ramBytesUsed) {
super.onQueryCache(query, ramBytesUsed);
assertNotNull("cached query is null", query);
ramBytesUsage.addAndGet(ramBytesUsed);
}
@Override
protected void onQueryEviction(Query query, long ramBytesUsed) {
super.onQueryEviction(query, ramBytesUsed);
assertNotNull("evicted query is null", query);
ramBytesUsage.addAndGet(-ramBytesUsed);
}

View File

@ -23,6 +23,7 @@ import org.apache.lucene.document.LongRange;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.TestUtil;
/**
* Random testing for LongRange Queries.
@ -31,11 +32,25 @@ public class TestLongRangeFieldQueries extends BaseRangeFieldQueryTestCase {
private static final String FIELD_NAME = "longRangeField";
private long nextLongInternal() {
if (rarely()) {
return random().nextBoolean() ? Long.MAX_VALUE : Long.MIN_VALUE;
switch (random().nextInt(5)) {
case 0:
return Long.MIN_VALUE;
case 1:
return Long.MAX_VALUE;
default:
int bpv = random().nextInt(64);
switch (bpv) {
case 64:
return random().nextLong();
default:
long v = TestUtil.nextLong(random(), 0, (1L << bpv) - 1);
if (bpv > 0) {
// negative values sometimes
v -= 1L << (bpv - 1);
}
return v;
}
}
long max = Long.MAX_VALUE / 2;
return (max + max) * random().nextLong() - max;
}
@Override

Some files were not shown because too many files have changed in this diff