Mirror of https://github.com/apache/lucene.git

Commit 54a2d7aab4: merged with trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/docvalues@1102677 13f79535-47bb-0310-9956-ffa450edef68
@@ -7,6 +7,7 @@ modules/ is shared code
To compile the sources run 'ant compile'
To run all the tests run 'ant test'
To setup your ide run 'ant idea' or 'ant eclipse'
For Maven info, see dev-tools/maven/README.maven.

For more information on how to contribute see:
http://wiki.apache.org/lucene-java/HowToContribute
@@ -95,7 +95,7 @@
  <classpathentry kind="lib" path="modules/benchmark/lib/commons-digester-1.7.jar"/>
  <classpathentry kind="lib" path="modules/benchmark/lib/commons-logging-1.0.4.jar"/>
  <classpathentry kind="lib" path="modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar"/>
  <classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r944541.jar"/>
  <classpathentry kind="lib" path="solr/lib/apache-solr-noggit-r1099557.jar"/>
  <classpathentry kind="lib" path="solr/lib/commons-beanutils-1.7.0.jar"/>
  <classpathentry kind="lib" path="solr/lib/commons-codec-1.4.jar"/>
  <classpathentry kind="lib" path="solr/lib/commons-collections-3.2.1.jar"/>
@@ -0,0 +1,131 @@
====================================
Lucene/Solr Maven build instructions
====================================

Contents:

A. How to use nightly Jenkins-built Lucene/Solr Maven artifacts
B. How to generate Lucene Maven artifacts
C. How to generate Solr Maven artifacts
D. How to use Maven to build Lucene/Solr

-----

A. How to use nightly Jenkins-built Lucene/Solr Maven artifacts

   The most recently produced nightly Jenkins-built Lucene and Solr Maven
   artifacts are available in Maven repository layout here:

   <https://builds.apache.org/hudson/job/Lucene-Solr-Maven-trunk/lastSuccessfulBuild/artifact/maven_artifacts/>


B. How to generate Lucene Maven artifacts

   1. Prerequisites: JDK 1.5+, Ant 1.7.X, and maven-ant-tasks-2.1.1.jar

      In order to generate Maven artifacts for Lucene/Solr, you must first
      download the Maven ant tasks JAR (maven-ant-tasks-2.1.1.jar), e.g.
      from <http://maven.apache.org/ant-tasks/download.html>, and add it
      to any one of the following:

      a. Your $HOME/.ant/lib/ directory (C:\Users\username\.ant\lib\ under
         Windows Vista/7); or
      b. Your $ANT_HOME/lib/ directory (%ANT_HOME%\lib\ under Windows); or
      c. Your $CLASSPATH (%CLASSPATH% under Windows); or
      d. Your ant command line: "-lib /path/to/maven-ant-tasks-2.1.1.jar".

   2. Run the following command from the lucene/ directory:

         ant generate-maven-artifacts

      The above command will create an internal Maven repository under
      lucene/dist/maven/, including POMs, binary .jars, source .jars,
      and javadoc .jars, for Lucene Core, for the Lucene test framework,
      for each contrib, and for each module under the top-level modules/
      directory.


C. How to generate Solr Maven artifacts

   1. Prerequisites: JDK 1.6+; Ant 1.7.X; and maven-ant-tasks-2.1.1.jar
      (see item B.1. above for where to put the Maven ant tasks jar).

   2. Run the following from the solr/ directory:

         ant generate-maven-artifacts

      The above command will create an internal Maven repository under
      solr/package/maven/, including POMs, binary .jars, source .jars,
      and javadoc .jars, for Solr Core, for the Solr test framework,
      for each contrib, and for the Solr .war (for which there are no
      source or javadoc .jars).


D. How to use Maven to build Lucene/Solr

   In summary, to enable Maven builds, perform the following:

      svn update
      ant get-maven-poms
      mvn -N -Pbootstrap install

   The details, followed by some example Maven commands:

   1. Prerequisites: JDK 1.5+ (for Lucene); JDK 1.6+ (for Solr);
      Maven 2.2.1 or 3.0.X

   2. Make sure your sources are up to date.  If you checked your sources out
      from the Apache Subversion repository, run "svn update" from the top
      level.

   3. Copy the Maven POM templates from under dev-tools/maven/ to where they
      need to go in order to drive the Maven build, using the following
      command from the top-level directory:

         ant get-maven-poms

      Note that you will need to do this whenever changes to the POM
      templates are committed.  It's a good idea to follow every "svn update"
      with "ant get-maven-poms" for this reason.

      The above command copies all of the POM templates from dev-tools/maven/,
      filling in the project version with the default "X.X-SNAPSHOT".  If you
      want the POMs and the Maven-built artifacts to have a version other than
      the default, you can supply an alternate version on the command line
      with the above command, e.g.:

         ant -Dversion=4.0-my-special-version get-maven-poms

   4. Populate your local repository with .jars & POMs for dependencies that
      are not available from public Maven repositories (a.k.a. "non-mavenized
      dependencies"):

         mvn -N -Pbootstrap install

      Note that you will need to do this whenever changes to the non-Mavenized
      dependencies are committed.  It's a good idea to follow every
      "svn update" with "ant get-maven-poms" and "mvn -N -Pbootstrap install"
      for this reason.


   Some example Maven commands you can use after you perform the above
   preparatory steps:

   - Compile, package, and install all artifacts to your local repository:

         mvn install

     After compiling and packaging, but before installing each module's
     artifact, the above command will also run all the module's tests.

     To compile, package and install all artifacts without running any tests:

         mvn -DskipTests install

   - Run tests:

         mvn test

     To run all test methods defined in a test class:

         mvn -Dtest=TestClassName test
@@ -699,7 +699,7 @@
              <artifactId>solr-noggit</artifactId>
              <version>${project.version}</version>
              <packaging>jar</packaging>
              <file>solr/lib/apache-solr-noggit-r944541.jar</file>
              <file>solr/lib/apache-solr-noggit-r1099557.jar</file>
            </configuration>
          </execution>
          <execution>

@@ -103,8 +103,8 @@
  </dependencies>
  <build>
    <directory>${build-directory}</directory>
    <outputDirectory>${build-directory}/extras/classes</outputDirectory>
    <testOutputDirectory>${build-directory}/extras/test-classes</testOutputDirectory>
    <outputDirectory>${build-directory}/classes</outputDirectory>
    <testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
    <sourceDirectory>main/java</sourceDirectory>
    <testSourceDirectory>test/java</testSourceDirectory>
    <testResources>

@@ -159,7 +159,6 @@
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>

@@ -85,7 +85,7 @@
  </dependencies>
  <build>
    <directory>${build-directory}</directory>
    <outputDirectory>${build-directory}</outputDirectory>
    <outputDirectory>${build-directory}/classes</outputDirectory>
    <sourceDirectory>.</sourceDirectory>
    <testResources/>
    <plugins>
@@ -0,0 +1,61 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
import sys

# recursive, unified output format, treat missing files as present but empty
DIFF_FLAGS = '-ruN'

if '-skipWhitespace' in sys.argv:
  sys.argv.remove('-skipWhitespace')
  # ignores only whitespace changes
  DIFF_FLAGS += 'bBw'

if len(sys.argv) != 3:
  print
  print 'Usage: python -u diffSources.py <dir1> <dir2> [-skipWhitespace]'
  print
  print '''This tool creates an applying patch between two directories.

While you could use this to make a committable patch from a branch, that approach loses
the svn history from the branch (better to use "svn merge --reintegrate", for example).  This
diff output should not be considered "authoritative" from a merging standpoint as it does
not reflect what svn will do on merge.
'''
  print
  sys.exit(0)

p = subprocess.Popen(['diff', DIFF_FLAGS, '-x', '.svn', '-x', 'build', sys.argv[1], sys.argv[2]], shell=False, stdout=subprocess.PIPE)

keep = False
while True:
  l = p.stdout.readline()
  if l == '':
    break
  if l.endswith('\r\n'):
    l = l[:-2]
  elif l.endswith('\n'):
    l = l[:-1]
  if l.startswith('diff ') or l.startswith('Binary files '):
    keep = l.lower().find('/build/') == -1 and (l.lower().startswith('Only in') or ((l.lower().endswith('.java') or l.lower().endswith('.txt') or l.lower().endswith('.xml') or l.lower().endswith('.iml')) and l.find('/.svn/') == -1))
    if keep:
      print
      print
      print l.strip()
  elif keep:
    print l
  elif l.startswith('Only in'):
    print l.strip()
@@ -472,13 +472,63 @@ Changes in backwards compatibility policy
  a method getHeapArray() was added to retrieve the internal heap array as a
  non-generic Object[].  (Uwe Schindler, Yonik Seeley)

* LUCENE-1076: IndexWriter.setInfoStream now throws IOException
  (Mike McCandless, Shai Erera)

* LUCENE-3084: MergePolicy.OneMerge.segments was changed from
  SegmentInfos to a List<SegmentInfo>; this is actually a minor change
  because SegmentInfos itself extends Vector<SegmentInfo>.  (Uwe
  Schindler, Mike McCandless)

Changes in runtime behavior

* LUCENE-3065: When a NumericField is retrieved from a Document loaded
  from IndexReader (or IndexSearcher), it will now come back as a
  NumericField, not as a Field holding a string-ified version of the
  numeric value you had indexed.  Note that this only applies to
  newly-indexed Documents; older indices will still return a Field
  with the string-ified numeric value.  If you call Document.get(),
  the value still comes back as a String, but Document.getFieldable()
  returns NumericField instances.  (Uwe Schindler, Ryan McKinley,
  Mike McCandless)
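
  For illustration, a minimal sketch of the difference (assuming the 3.x
  Document/NumericField API; the "price" field name and the surrounding
  class are hypothetical):

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Fieldable;
    import org.apache.lucene.document.NumericField;
    import org.apache.lucene.index.IndexReader;

    class NumericFieldExample {
      static void show(IndexReader reader, int docID) throws IOException {
        Document doc = reader.document(docID);
        String asString = doc.get("price");       // Document.get() still returns a String
        Fieldable f = doc.getFieldable("price");  // a NumericField for newly-indexed docs
        if (f instanceof NumericField) {
          Number value = ((NumericField) f).getNumericValue();  // typed numeric value
        }
      }
    }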

New features

* LUCENE-3082: Added index upgrade tool oal.index.IndexUpgrader
  that upgrades all segments to the most recent supported index
  format without fully optimizing.  (Uwe Schindler, Mike McCandless)
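
  A minimal usage sketch (assuming the Directory-based constructor and the
  upgrade() method; the index path is hypothetical). The tool is also
  designed to be run from the command line via its main() method.

    import java.io.File;
    import org.apache.lucene.index.IndexUpgrader;
    import org.apache.lucene.store.FSDirectory;

    class UpgradeExample {
      public static void main(String[] args) throws Exception {
        // Rewrites all older-format segments of the index to the current format.
        new IndexUpgrader(FSDirectory.open(new File("/path/to/index"))).upgrade();
      }
    }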

* LUCENE-1076: Added TieredMergePolicy which is able to merge non-contiguous
  segments, which means docIDs no longer necessarily stay "in order".
  (Mike McCandless, Shai Erera)
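
  For example, a sketch of switching an IndexWriter over to the new policy
  (assuming the 3.x IndexWriterConfig API; the Version constant and analyzer
  choice are placeholders):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.TieredMergePolicy;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.Version;

    class TieredMergePolicyExample {
      static IndexWriter open(Directory dir) throws Exception {
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_31,
            new WhitespaceAnalyzer(Version.LUCENE_31));
        conf.setMergePolicy(new TieredMergePolicy());  // may merge non-adjacent segments
        return new IndexWriter(dir, conf);
      }
    }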

* LUCENE-3071: Added ReversePathHierarchyTokenizer, and added a skip parameter
  to PathHierarchyTokenizer.  (Olivier Favre via ryan)

API Changes

* LUCENE-3061: IndexWriter's getNextMerge() and merge(OneMerge) are now public
  (though @lucene.experimental), allowing for custom MergeScheduler
  implementations.  (Shai Erera)
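
  As an illustration, a scheduler along the lines of Lucene's own
  SerialMergeScheduler can now be written outside the index package (a
  sketch, assuming MergeScheduler's merge(IndexWriter)/close() contract):

    import java.io.IOException;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.MergePolicy;
    import org.apache.lucene.index.MergeScheduler;

    class MySerialMergeScheduler extends MergeScheduler {
      @Override
      public synchronized void merge(IndexWriter writer) throws IOException {
        MergePolicy.OneMerge merge;
        while ((merge = writer.getNextMerge()) != null) {  // public as of LUCENE-3061
          writer.merge(merge);                             // public as of LUCENE-3061
        }
      }

      @Override
      public void close() {}
    }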

* LUCENE-3065: Document.getField() was deprecated, as it throws
  ClassCastException when loading lazy fields or NumericFields.
  (Uwe Schindler, Ryan McKinley, Mike McCandless)

Optimizations

* LUCENE-2990: ArrayUtil/CollectionUtil.*Sort() methods now exit early
  on empty or one-element lists/arrays.  (Uwe Schindler)

* LUCENE-2897: Apply deleted terms while flushing a segment.  We still
  buffer deleted terms to later apply to past segments.  (Mike McCandless)

Bug fixes

* LUCENE-2996: addIndexes(IndexReader) did not flush before adding the new
  indexes, causing existing deletions to be applied on the incoming indexes
  as well.  (Shai Erera, Mike McCandless)

* LUCENE-3024: Index with more than 2.1B terms was hitting AIOOBE when
  seeking TermEnum (e.g. used by Solr's faceting).  (Tom Burton-West, Mike
  McCandless)

@@ -491,6 +541,17 @@ Bug fixes
  very special use cases of the TokenStream API; most users would not
  have recognized it.  (Uwe Schindler, Robert Muir)

* LUCENE-3054: PhraseQuery can in some cases stack overflow in
  SorterTemplate.quickSort().  This fix also adds an optimization to
  PhraseQuery, as a term with lower doc freq will also have fewer positions.
  (Uwe Schindler, Robert Muir, Otis Gospodnetic)

Test Cases

* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing
  iteration to stop once at least 'tests.iter.min' iterations ran and a
  failure occurred.  (Shai Erera, Chris Hostetter)

======================= Lucene 3.1.0 =======================

Changes in backwards compatibility policy
@@ -1472,6 +1533,10 @@ Bug fixes
  that warming is free to do whatever it needs to.  (Earwin Burrfoot
  via Mike McCandless)

* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero
  position-increment tokens that would sometimes assign different
  scores to identical docs.  (Mike McCandless)

* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store
  files when a mergedSegmentWarmer is set on IndexWriter.  (Mike
  McCandless)
@@ -312,6 +312,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
  - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
  - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
  - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
  - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
  - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils

* LUCENE-2514: The option to use a Collator's order (instead of binary order) for
  sorting and range queries has been moved to contrib/queries.
@@ -73,6 +73,7 @@
  </condition>
  <property name="tests.multiplier" value="1" />
  <property name="tests.codec" value="randomPerField" />
  <property name="tests.codecprovider" value="random" />
  <property name="tests.locale" value="random" />
  <property name="tests.timezone" value="random" />
  <property name="tests.directory" value="random" />

@@ -499,6 +500,8 @@
  <sysproperty key="tests.verbose" value="${tests.verbose}"/>
  <!-- set the codec tests should run with -->
  <sysproperty key="tests.codec" value="${tests.codec}"/>
  <!-- set the codec provider tests should run with -->
  <sysproperty key="tests.codecprovider" value="${tests.codecprovider}"/>
  <!-- set the locale tests should run with -->
  <sysproperty key="tests.locale" value="${tests.locale}"/>
  <!-- set the timezone tests should run with -->
@@ -50,6 +50,11 @@ Bug Fixes

======================= Lucene 3.x (not yet released) =======================

Changes in runtime behavior

* LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian
  contractions by default.  (Robert Muir)
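
  A small sketch of the new default behavior (assuming the contrib analyzers
  module on the classpath and a matchVersion recent enough to enable the new
  behavior; the field name and sample text are hypothetical). It also shows
  the reset()/incrementToken()/end()/close() consumption pattern that this
  commit adds in several other places:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.it.ItalianAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    class ElisionExample {
      public static void main(String[] args) throws Exception {
        ItalianAnalyzer analyzer = new ItalianAnalyzer(Version.LUCENE_32);
        TokenStream ts = analyzer.tokenStream("f", new StringReader("dell'arte"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // the elided article ("dell'") is now stripped before stemming
          System.out.println(termAtt.toString());
        }
        ts.end();
        ts.close();
      }
    }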

Bug Fixes

* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was

@@ -183,6 +188,10 @@ Bug fixes
* LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter.
  (Robert Muir)

* LUCENE-3087: Highlighter: fix a case that was preventing highlighting
  of an exact phrase when tokens overlap.  (Pierre Gossé via Mike
  McCandless)

API Changes

* LUCENE-2867: Some contrib queryparser methods that receive CharSequence as
@ -355,6 +355,7 @@ public class Highlighter
|
|||
{
|
||||
try
|
||||
{
|
||||
tokenStream.end();
|
||||
tokenStream.close();
|
||||
}
|
||||
catch (Exception e)
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Token;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.TermFreqVector;
|
||||
|
@ -158,10 +159,13 @@ public class TokenSources {
|
|||
|
||||
OffsetAttribute offsetAtt;
|
||||
|
||||
PositionIncrementAttribute posincAtt;
|
||||
|
||||
StoredTokenStream(Token tokens[]) {
|
||||
this.tokens = tokens;
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
posincAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -173,6 +177,10 @@ public class TokenSources {
|
|||
clearAttributes();
|
||||
termAtt.setEmpty().append(token);
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
posincAtt
|
||||
.setPositionIncrement(currentToken <= 1
|
||||
|| tokens[currentToken - 1].startOffset() > tokens[currentToken - 2]
|
||||
.startOffset() ? 1 : 0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -180,7 +188,6 @@ public class TokenSources {
|
|||
BytesRef[] terms = tpv.getTerms();
|
||||
int[] freq = tpv.getTermFrequencies();
|
||||
int totalTokens = 0;
|
||||
|
||||
for (int t = 0; t < freq.length; t++) {
|
||||
totalTokens += freq[t];
|
||||
}
|
||||
|
@ -189,7 +196,8 @@ public class TokenSources {
|
|||
for (int t = 0; t < freq.length; t++) {
|
||||
TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
|
||||
if (offsets == null) {
|
||||
throw new IllegalArgumentException("Required TermVector Offset information was not found");
|
||||
throw new IllegalArgumentException(
|
||||
"Required TermVector Offset information was not found");
|
||||
}
|
||||
|
||||
int[] pos = null;
|
||||
|
@ -205,8 +213,8 @@ public class TokenSources {
|
|||
unsortedTokens = new ArrayList<Token>();
|
||||
}
|
||||
for (int tp = 0; tp < offsets.length; tp++) {
|
||||
Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp]
|
||||
.getEndOffset());
|
||||
Token token = new Token(terms[t].utf8ToString(),
|
||||
offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
|
||||
unsortedTokens.add(token);
|
||||
}
|
||||
} else {
|
||||
|
@ -221,8 +229,8 @@ public class TokenSources {
|
|||
// tokens stored with positions - can use this to index straight into
|
||||
// sorted array
|
||||
for (int tp = 0; tp < pos.length; tp++) {
|
||||
Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(),
|
||||
offsets[tp].getEndOffset());
|
||||
Token token = new Token(terms[t].utf8ToString(),
|
||||
offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
|
||||
tokensInOriginalOrder[pos[tp]] = token;
|
||||
}
|
||||
}
|
||||
|
@ -231,12 +239,11 @@ public class TokenSources {
|
|||
if (unsortedTokens != null) {
|
||||
tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
|
||||
.size()]);
|
||||
ArrayUtil.quickSort(tokensInOriginalOrder, new Comparator<Token>() {
|
||||
ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator<Token>() {
|
||||
public int compare(Token t1, Token t2) {
|
||||
if (t1.startOffset() == t2.startOffset())
|
||||
return t1.endOffset() - t2.endOffset();
|
||||
else
|
||||
return t1.startOffset() - t2.startOffset();
|
||||
if (t1.startOffset() == t2.startOffset()) return t1.endOffset()
|
||||
- t2.endOffset();
|
||||
else return t1.startOffset() - t2.startOffset();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
|
@ -1093,6 +1093,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
}
|
||||
|
||||
public void testMaxSizeHighlight() throws Exception {
|
||||
final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true);
|
||||
// we disable MockTokenizer checks because we will forcefully limit the
|
||||
// tokenstream and call end() before incrementToken() returns false.
|
||||
analyzer.setEnableChecks(false);
|
||||
TestHighlightRunner helper = new TestHighlightRunner() {
|
||||
|
||||
@Override
|
||||
|
@ -1122,7 +1126,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
public void run() throws Exception {
|
||||
String goodWord = "goodtoken";
|
||||
CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken"));
|
||||
|
||||
// we disable MockTokenizer checks because we will forcefully limit the
|
||||
// tokenstream and call end() before incrementToken() returns false.
|
||||
final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true);
|
||||
analyzer.setEnableChecks(false);
|
||||
TermQuery query = new TermQuery(new Term("data", goodWord));
|
||||
|
||||
String match;
|
||||
|
@ -1134,13 +1141,13 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
sb.append("stoppedtoken");
|
||||
}
|
||||
SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
|
||||
Highlighter hg = getHighlighter(query, "data", new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true).tokenStream(
|
||||
Highlighter hg = getHighlighter(query, "data", analyzer.tokenStream(
|
||||
"data", new StringReader(sb.toString())), fm);// new Highlighter(fm,
|
||||
// new
|
||||
// QueryTermScorer(query));
|
||||
hg.setTextFragmenter(new NullFragmenter());
|
||||
hg.setMaxDocCharsToAnalyze(100);
|
||||
match = hg.getBestFragment(new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true), "data", sb.toString());
|
||||
match = hg.getBestFragment(analyzer, "data", sb.toString());
|
||||
assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg
|
||||
.getMaxDocCharsToAnalyze());
|
||||
|
||||
|
@ -1151,7 +1158,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
|
|||
// + whitespace)
|
||||
sb.append(" ");
|
||||
sb.append(goodWord);
|
||||
match = hg.getBestFragment(new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true), "data", sb.toString());
|
||||
match = hg.getBestFragment(analyzer, "data", sb.toString());
|
||||
assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg
|
||||
.getMaxDocCharsToAnalyze());
|
||||
}
|
||||
|
@ -1726,6 +1733,11 @@ final class SynonymAnalyzer extends Analyzer {
|
|||
stream.addAttribute(CharTermAttribute.class);
|
||||
stream.addAttribute(PositionIncrementAttribute.class);
|
||||
stream.addAttribute(OffsetAttribute.class);
|
||||
try {
|
||||
stream.reset();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return new SynonymTokenizer(stream, synonyms);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,32 +28,38 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testFilter() throws Exception {
|
||||
TokenStream stream = new MockTokenizer(new StringReader(
|
||||
// we disable MockTokenizer checks because we will forcefully limit the
|
||||
// tokenstream and call end() before incrementToken() returns false.
|
||||
MockTokenizer stream = new MockTokenizer(new StringReader(
|
||||
"short toolong evenmuchlongertext a ab toolong foo"),
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
stream.setEnableChecks(false);
|
||||
OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
|
||||
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
|
||||
|
||||
stream = new MockTokenizer(new StringReader(
|
||||
"short toolong evenmuchlongertext a ab toolong foo"),
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
stream.setEnableChecks(false);
|
||||
filter = new OffsetLimitTokenFilter(stream, 12);
|
||||
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
|
||||
|
||||
stream = new MockTokenizer(new StringReader(
|
||||
"short toolong evenmuchlongertext a ab toolong foo"),
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
stream.setEnableChecks(false);
|
||||
filter = new OffsetLimitTokenFilter(stream, 30);
|
||||
assertTokenStreamContents(filter, new String[] {"short", "toolong",
|
||||
"evenmuchlongertext"});
|
||||
|
||||
|
||||
// TODO: This is not actually testing reuse! (reusableTokenStream is not implemented)
|
||||
checkOneTermReuse(new Analyzer() {
|
||||
|
||||
@Override
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new OffsetLimitTokenFilter(new MockTokenizer(reader,
|
||||
MockTokenizer.WHITESPACE, false), 10);
|
||||
MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
|
||||
tokenizer.setEnableChecks(false);
|
||||
return new OffsetLimitTokenFilter(tokenizer, 10);
|
||||
}
|
||||
}, "llenges", "llenges");
|
||||
}
|
||||
|
|
|
@ -36,7 +36,10 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.TermPositionVector;
|
||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.LockObtainFailedException;
|
||||
|
@ -86,12 +89,12 @@ public class TokenSourcesTest extends LuceneTestCase {
|
|||
public void reset() {
|
||||
this.i = -1;
|
||||
this.tokens = new Token[] {
|
||||
new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
|
||||
new Token(new char[] { '{', 'f', 'o', 'x', '}' }, 0, 5, 0, 7),
|
||||
new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
|
||||
new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
|
||||
new Token(new char[] { 'n', 'o', 't' }, 0, 3, 12, 15),
|
||||
new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
|
||||
new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
|
||||
new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
|
||||
new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
|
||||
new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
|
||||
new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
|
||||
new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
|
||||
this.tokens[1].setPositionIncrement(0);
|
||||
}
|
||||
}
|
||||
|
@ -188,4 +191,97 @@ public class TokenSourcesTest extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
|
||||
LockObtainFailedException, IOException, InvalidTokenOffsetsException {
|
||||
final String TEXT = "the fox did not jump";
|
||||
final Directory directory = newDirectory();
|
||||
final IndexWriter indexWriter = new IndexWriter(directory,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
|
||||
try {
|
||||
final Document document = new Document();
|
||||
document.add(new Field(FIELD, new TokenStreamOverlap(),
|
||||
TermVector.WITH_OFFSETS));
|
||||
indexWriter.addDocument(document);
|
||||
} finally {
|
||||
indexWriter.close();
|
||||
}
|
||||
final IndexReader indexReader = IndexReader.open(directory, true);
|
||||
try {
|
||||
assertEquals(1, indexReader.numDocs());
|
||||
final IndexSearcher indexSearcher = newSearcher(indexReader);
|
||||
try {
|
||||
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
|
||||
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
|
||||
new SpanTermQuery(new Term(FIELD, "the")),
|
||||
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
|
||||
|
||||
TopDocs hits = indexSearcher.search(phraseQuery, 1);
|
||||
assertEquals(1, hits.totalHits);
|
||||
final Highlighter highlighter = new Highlighter(
|
||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||
new QueryScorer(phraseQuery));
|
||||
final TokenStream tokenStream = TokenSources
|
||||
.getTokenStream(
|
||||
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
|
||||
false);
|
||||
assertEquals("<B>the fox</B> did not jump",
|
||||
highlighter.getBestFragment(tokenStream, TEXT));
|
||||
} finally {
|
||||
indexSearcher.close();
|
||||
}
|
||||
} finally {
|
||||
indexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testOverlapWithPositionsAndOffsetExactPhrase()
|
||||
throws CorruptIndexException, LockObtainFailedException, IOException,
|
||||
InvalidTokenOffsetsException {
|
||||
final String TEXT = "the fox did not jump";
|
||||
final Directory directory = newDirectory();
|
||||
final IndexWriter indexWriter = new IndexWriter(directory,
|
||||
newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
|
||||
try {
|
||||
final Document document = new Document();
|
||||
document.add(new Field(FIELD, new TokenStreamOverlap(),
|
||||
TermVector.WITH_POSITIONS_OFFSETS));
|
||||
indexWriter.addDocument(document);
|
||||
} finally {
|
||||
indexWriter.close();
|
||||
}
|
||||
final IndexReader indexReader = IndexReader.open(directory, true);
|
||||
try {
|
||||
assertEquals(1, indexReader.numDocs());
|
||||
final IndexSearcher indexSearcher = newSearcher(indexReader);
|
||||
try {
|
||||
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "the")));
|
||||
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
|
||||
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
|
||||
new SpanTermQuery(new Term(FIELD, "the")),
|
||||
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
|
||||
|
||||
TopDocs hits = indexSearcher.search(phraseQuery, 1);
|
||||
assertEquals(1, hits.totalHits);
|
||||
final Highlighter highlighter = new Highlighter(
|
||||
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
|
||||
new QueryScorer(phraseQuery));
|
||||
final TokenStream tokenStream = TokenSources
|
||||
.getTokenStream(
|
||||
(TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
|
||||
false);
|
||||
assertEquals("<B>the fox</B> did not jump",
|
||||
highlighter.getBestFragment(tokenStream, TEXT));
|
||||
} finally {
|
||||
indexSearcher.close();
|
||||
}
|
||||
} finally {
|
||||
indexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -192,6 +192,7 @@ public class FuzzyLikeThisQuery extends Query
|
|||
int corpusNumDocs=reader.numDocs();
|
||||
Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
|
||||
HashSet<String> processedTerms=new HashSet<String>();
|
||||
ts.reset();
|
||||
while (ts.incrementToken())
|
||||
{
|
||||
String term = termAtt.toString();
|
||||
|
@ -213,17 +214,15 @@ public class FuzzyLikeThisQuery extends Query
|
|||
BoostAttribute boostAtt =
|
||||
fe.attributes().addAttribute(BoostAttribute.class);
|
||||
while ((possibleMatch = fe.next()) != null) {
|
||||
if (possibleMatch!=null) {
|
||||
numVariants++;
|
||||
totalVariantDocFreqs+=fe.docFreq();
|
||||
float score=boostAtt.getBoost();
|
||||
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
|
||||
ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), new BytesRef(possibleMatch)),score,startTerm);
|
||||
variantsQ.insertWithOverflow(st);
|
||||
minScore = variantsQ.top().score; // maintain minScore
|
||||
}
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||
numVariants++;
|
||||
totalVariantDocFreqs+=fe.docFreq();
|
||||
float score=boostAtt.getBoost();
|
||||
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
|
||||
ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), new BytesRef(possibleMatch)),score,startTerm);
|
||||
variantsQ.insertWithOverflow(st);
|
||||
minScore = variantsQ.top().score; // maintain minScore
|
||||
}
|
||||
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
if(numVariants>0)
|
||||
|
@ -247,6 +246,8 @@ public class FuzzyLikeThisQuery extends Query
|
|||
}
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -885,7 +885,7 @@ public final class MoreLikeThis {
|
|||
int tokenCount=0;
|
||||
// for every token
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
tokenCount++;
|
||||
|
@ -906,6 +906,8 @@ public final class MoreLikeThis {
|
|||
cnt.x++;
|
||||
}
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -110,6 +110,11 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
|
||||
int countTokens = 0;
|
||||
try {
|
||||
source.reset();
|
||||
} catch (IOException e1) {
|
||||
throw new RuntimeException(e1);
|
||||
}
|
||||
while (true) {
|
||||
try {
|
||||
if (!source.incrementToken()) break;
|
||||
|
@ -126,6 +131,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
}
|
||||
}
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
|
@ -191,7 +197,11 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
|
||||
List<String> tlist = new ArrayList<String>();
|
||||
CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
|
||||
try {
|
||||
source.reset();
|
||||
} catch (IOException e1) {
|
||||
throw new RuntimeException(e1);
|
||||
}
|
||||
while (true) {
|
||||
try {
|
||||
if (!source.incrementToken()) break;
|
||||
|
@ -202,6 +212,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
|
@ -242,6 +253,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
boolean multipleTokens = false;
|
||||
|
||||
try {
|
||||
source.reset();
|
||||
if (source.incrementToken()) {
|
||||
nextToken = termAtt.toString();
|
||||
}
|
||||
|
@ -251,6 +263,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
|
@ -281,6 +294,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
try {
|
||||
source = getAnalyzer().tokenStream(field, new StringReader(part1));
|
||||
termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
source.reset();
|
||||
multipleTokens = false;
|
||||
|
||||
|
||||
|
@ -292,6 +306,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
// ignore
|
||||
}
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
|
@ -308,6 +323,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
termAtt = source.addAttribute(CharTermAttribute.class);
|
||||
|
||||
try {
|
||||
source.reset();
|
||||
if (source.incrementToken()) {
|
||||
part2 = termAtt.toString();
|
||||
}
|
||||
|
@ -316,6 +332,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
// ignore
|
||||
}
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
|
|
|
@ -123,6 +123,11 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
|
|||
|
||||
TokenStream source = this.analyzer.tokenStream(field, new StringReader(
|
||||
text));
|
||||
try {
|
||||
source.reset();
|
||||
} catch (IOException e1) {
|
||||
throw new RuntimeException(e1);
|
||||
}
|
||||
CachingTokenFilter buffer = new CachingTokenFilter(source);
|
||||
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
|
|
|
@ -118,12 +118,14 @@ public final class SynExpand {
|
|||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||
TokenStream ts = a.tokenStream( field, new StringReader( query));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
if ( already.add( word))
|
||||
top.add( word);
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
final BooleanQuery tmp = new BooleanQuery();
|
||||
|
||||
// [2] form query
|
||||
|
|
|
@ -111,7 +111,6 @@ public class TestSynonymTokenFilter extends BaseTokenStreamTestCase {
|
|||
setPreviousTokenStream(streams);
|
||||
} else {
|
||||
streams.source.reset(reader);
|
||||
streams.result.reset(); // reset the SynonymTokenFilter
|
||||
}
|
||||
return streams.result;
|
||||
}
|
||||
|
|
|
@ -80,9 +80,12 @@ public class LikeThisQueryBuilder implements QueryBuilder {
|
|||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
try
|
||||
{
|
||||
ts.reset();
|
||||
while(ts.incrementToken()) {
|
||||
stopWordsSet.add(termAtt.toString());
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
catch(IOException ioe)
|
||||
{
|
||||
|
|
|
@ -59,11 +59,14 @@ public class SpanOrTermsBuilder extends SpanBuilderBase
|
|||
TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
|
||||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
termAtt.fillBytesRef();
|
||||
SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, new BytesRef(bytes)));
|
||||
clausesList.add(stq);
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
SpanOrQuery soq=new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
|
||||
soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
|
||||
return soq;
|
||||
|
|
|
@ -64,6 +64,7 @@ public class TermsFilterBuilder implements FilterBuilder
|
|||
{
|
||||
Term term = null;
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
termAtt.fillBytesRef();
|
||||
if (term == null)
|
||||
|
@ -76,6 +77,8 @@ public class TermsFilterBuilder implements FilterBuilder
|
|||
}
|
||||
tf.addTerm(term);
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
|
|
|
@ -61,6 +61,7 @@ public class TermsQueryBuilder implements QueryBuilder {
|
|||
TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
Term term = null;
|
||||
BytesRef bytes = termAtt.getBytesRef();
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
termAtt.fillBytesRef();
|
||||
if (term == null)
|
||||
|
@ -73,6 +74,8 @@ public class TermsQueryBuilder implements QueryBuilder {
|
|||
}
|
||||
bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
|
||||
}
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Contributions
|
||||
|
@ -275,7 +275,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="#PDFTextStream -- PDF text and metadata extraction">PDFTextStream -- PDF text and metadata extraction</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="#PJ Classic & PJ Professional - PDF Document Conversion">PJ Classic & PJ Professional - PDF Document Conversion</a>
|
||||
<a href="#PJ Classic & PJ Professional - PDF Document Conversion">PJ Classic & PJ Professional - PDF Document Conversion</a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
|
@ -403,7 +403,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
URL
|
||||
</th>
|
||||
<td>
|
||||
<a href="http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2">
|
||||
<a href="http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2">
|
||||
http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2
|
||||
</a>
|
||||
</td>
|
||||
|
@ -538,7 +538,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</tr>
|
||||
|
||||
</table>
|
||||
<a name="N10124"></a><a name="PJ Classic & PJ Professional - PDF Document Conversion"></a>
|
||||
<a name="N10124"></a><a name="PJ Classic & PJ Professional - PDF Document Conversion"></a>
|
||||
<h3 class="boxed">PJ Classic & PJ Professional - PDF Document Conversion</h3>
|
||||
<table class="ForrestTable" cellspacing="1" cellpadding="4">
|
||||
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Building and Installing the Basic Demo
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Basic Demo Sources Walk-through
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Index File Formats
|
||||
|
@ -425,11 +425,19 @@ document.write("Last Published: " + document.lastModified);
|
|||
<p>
|
||||
In version 3.1, segments records the code version
|
||||
that created them. See LUCENE-2720 for details.
|
||||
|
||||
Additionally segments track explicitly whether or
|
||||
not they have term vectors. See LUCENE-2811 for details.
|
||||
</p>
|
||||
<p>
|
||||
In version 3.2, numeric fields are written as natively
|
||||
to stored fields file, previously they were stored in
|
||||
text format only.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
<a name="N10037"></a><a name="Definitions"></a>
|
||||
<a name="N1003A"></a><a name="Definitions"></a>
|
||||
<h2 class="boxed">Definitions</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
@ -470,7 +478,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
strings, the first naming the field, and the second naming text
|
||||
within the field.
|
||||
</p>
|
||||
<a name="N10057"></a><a name="Inverted Indexing"></a>
|
||||
<a name="N1005A"></a><a name="Inverted Indexing"></a>
|
||||
<h3 class="boxed">Inverted Indexing</h3>
|
||||
<p>
|
||||
The index stores statistics about terms in order
|
||||
|
@ -480,7 +488,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
it. This is the inverse of the natural relationship, in which
|
||||
documents list terms.
|
||||
</p>
|
||||
<a name="N10063"></a><a name="Types of Fields"></a>
|
||||
<a name="N10066"></a><a name="Types of Fields"></a>
|
||||
<h3 class="boxed">Types of Fields</h3>
|
||||
<p>
|
||||
In Lucene, fields may be <i>stored</i>, in which
|
||||
|
@ -494,7 +502,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
to be indexed literally.
|
||||
</p>
|
||||
<p>See the <a href="api/core/org/apache/lucene/document/Field.html">Field</a> java docs for more information on Fields.</p>
|
||||
<a name="N10080"></a><a name="Segments"></a>
|
||||
<a name="N10083"></a><a name="Segments"></a>
|
||||
<h3 class="boxed">Segments</h3>
|
||||
<p>
|
||||
Lucene indexes may be composed of multiple sub-indexes, or
|
||||
|
@ -520,7 +528,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Searches may involve multiple segments and/or multiple indexes, each
|
||||
index potentially composed of a set of segments.
|
||||
</p>
|
||||
<a name="N1009E"></a><a name="Document Numbers"></a>
|
||||
<a name="N100A1"></a><a name="Document Numbers"></a>
|
||||
<h3 class="boxed">Document Numbers</h3>
|
||||
<p>
|
||||
Internally, Lucene refers to documents by an integer <i>document
|
||||
|
@ -575,7 +583,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N100C5"></a><a name="Overview"></a>
|
||||
<a name="N100C8"></a><a name="Overview"></a>
|
||||
<h2 class="boxed">Overview</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
@ -674,7 +682,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10108"></a><a name="File Naming"></a>
|
||||
<a name="N1010B"></a><a name="File Naming"></a>
|
||||
<h2 class="boxed">File Naming</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
@ -701,7 +709,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</p>
|
||||
</div>
|
||||
|
||||
<a name="N10117"></a><a name="file-names"></a>
|
||||
<a name="N1011A"></a><a name="file-names"></a>
|
||||
<h2 class="boxed">Summary of File Extensions</h2>
|
||||
<div class="section">
|
||||
<p>The following table summarizes the names and extensions of the files in Lucene:
|
||||
|
@ -843,10 +851,10 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10201"></a><a name="Primitive Types"></a>
|
||||
<a name="N10204"></a><a name="Primitive Types"></a>
|
||||
<h2 class="boxed">Primitive Types</h2>
|
||||
<div class="section">
|
||||
<a name="N10206"></a><a name="Byte"></a>
|
||||
<a name="N10209"></a><a name="Byte"></a>
|
||||
<h3 class="boxed">Byte</h3>
|
||||
<p>
|
||||
The most primitive type
|
||||
|
@ -854,7 +862,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
other data types are defined as sequences
|
||||
of bytes, so file formats are byte-order independent.
|
||||
</p>
|
||||
<a name="N1020F"></a><a name="UInt32"></a>
|
||||
<a name="N10212"></a><a name="UInt32"></a>
|
||||
<h3 class="boxed">UInt32</h3>
|
||||
<p>
|
||||
32-bit unsigned integers are written as four
|
||||
|
@ -864,7 +872,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
UInt32 --> <Byte><sup>4</sup>
|
||||
|
||||
</p>
|
||||
<a name="N1021E"></a><a name="Uint64"></a>
|
||||
<a name="N10221"></a><a name="Uint64"></a>
|
||||
<h3 class="boxed">Uint64</h3>
|
||||
<p>
|
||||
64-bit unsigned integers are written as eight
|
||||
|
@ -873,7 +881,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<p>UInt64 --> <Byte><sup>8</sup>
|
||||
|
||||
</p>
|
||||
<a name="N1022D"></a><a name="VInt"></a>
|
||||
<a name="N10230"></a><a name="VInt"></a>
|
||||
<h3 class="boxed">VInt</h3>
|
||||
<p>
|
||||
A variable-length format for positive integers is
|
||||
|
@ -1423,13 +1431,13 @@ document.write("Last Published: " + document.lastModified);
|
|||
This provides compression while still being
|
||||
efficient to decode.
|
||||
</p>
|
||||
<a name="N10512"></a><a name="Chars"></a>
|
||||
<a name="N10515"></a><a name="Chars"></a>
|
||||
<h3 class="boxed">Chars</h3>
|
||||
<p>
|
||||
Lucene writes unicode
|
||||
character sequences as UTF-8 encoded bytes.
|
||||
</p>
|
||||
<a name="N1051B"></a><a name="String"></a>
|
||||
<a name="N1051E"></a><a name="String"></a>
|
||||
<h3 class="boxed">String</h3>
|
||||
<p>
|
||||
Lucene writes strings as UTF-8 encoded bytes.
|
||||
|
@ -1442,10 +1450,10 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10528"></a><a name="Compound Types"></a>
|
||||
<a name="N1052B"></a><a name="Compound Types"></a>
|
||||
<h2 class="boxed">Compound Types</h2>
|
||||
<div class="section">
|
||||
<a name="N1052D"></a><a name="MapStringString"></a>
|
||||
<a name="N10530"></a><a name="MapStringString"></a>
|
||||
<h3 class="boxed">Map<String,String></h3>
|
||||
<p>
|
||||
In a couple places Lucene stores a Map
|
||||
|
@ -1458,13 +1466,13 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N1053D"></a><a name="Per-Index Files"></a>
|
||||
<a name="N10540"></a><a name="Per-Index Files"></a>
|
||||
<h2 class="boxed">Per-Index Files</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
The files in this section exist one-per-index.
|
||||
</p>
|
||||
<a name="N10545"></a><a name="Segments File"></a>
|
||||
<a name="N10548"></a><a name="Segments File"></a>
|
||||
<h3 class="boxed">Segments File</h3>
|
||||
<p>
|
||||
The active segments in the index are stored in the
|
||||
|
@ -1508,7 +1516,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<b>3.1</b>
|
||||
Segments --> Format, Version, NameCounter, SegCount, <SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
|
||||
NormGen<sup>NumField</sup>,
|
||||
IsCompoundFile, DeletionCount, HasProx, Diagnostics><sup>SegCount</sup>, CommitUserData, Checksum
|
||||
IsCompoundFile, DeletionCount, HasProx, Diagnostics, HasVectors><sup>SegCount</sup>, CommitUserData, Checksum
|
||||
</p>
|
||||
<p>
|
||||
Format, NameCounter, SegCount, SegSize, NumField,
|
||||
|
@ -1525,7 +1533,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</p>
|
||||
<p>
|
||||
IsCompoundFile, HasSingleNormFile,
|
||||
DocStoreIsCompoundFile, HasProx --> Int8
|
||||
DocStoreIsCompoundFile, HasProx, HasVectors --> Int8
|
||||
</p>
|
||||
<p>
|
||||
CommitUserData --> Map<String,String>
|
||||
|
@ -1634,7 +1642,10 @@ document.write("Last Published: " + document.lastModified);
|
|||
Lucene version, OS, Java version, why the segment
|
||||
was created (merge, flush, addIndexes), etc.
|
||||
</p>
|
||||
<a name="N105CD"></a><a name="Lock File"></a>
|
||||
<p> HasVectors is 1 if this segment stores term vectors,
|
||||
else it's 0.
|
||||
</p>
|
||||
<a name="N105D3"></a><a name="Lock File"></a>
|
||||
<h3 class="boxed">Lock File</h3>
|
||||
<p>
|
||||
The write lock, which is stored in the index
|
||||
|
@ -1648,14 +1659,14 @@ document.write("Last Published: " + document.lastModified);
|
|||
documents). This lock file ensures that only one
|
||||
writer is modifying the index at a time.
|
||||
</p>
|
||||
<a name="N105D6"></a><a name="Deletable File"></a>
|
||||
<a name="N105DC"></a><a name="Deletable File"></a>
|
||||
<h3 class="boxed">Deletable File</h3>
|
||||
<p>
|
||||
A writer dynamically computes
|
||||
the files that are deletable, instead, so no file
|
||||
is written.
|
||||
</p>
|
||||
<a name="N105DF"></a><a name="Compound Files"></a>
|
||||
<a name="N105E5"></a><a name="Compound Files"></a>
|
||||
<h3 class="boxed">Compound Files</h3>
|
||||
<p>Starting with Lucene 1.4 the compound file format became default. This
|
||||
is simply a container for all files described in the next section
|
||||
|
@ -1682,14 +1693,14 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10607"></a><a name="Per-Segment Files"></a>
|
||||
<a name="N1060D"></a><a name="Per-Segment Files"></a>
|
||||
<h2 class="boxed">Per-Segment Files</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
The remaining files are all per-segment, and are
|
||||
thus defined by suffix.
|
||||
</p>
|
||||
<a name="N1060F"></a><a name="Fields"></a>
|
||||
<a name="N10615"></a><a name="Fields"></a>
|
||||
<h3 class="boxed">Fields</h3>
|
||||
<p>
|
||||
|
||||
|
@ -1863,12 +1874,28 @@ document.write("Last Published: " + document.lastModified);
|
|||
(if compression is enabled, the algorithm used is ZLIB),
|
||||
only available for indexes until Lucene version 2.9.x</li>
|
||||
|
||||
<li>4th to 6th bits (mask: 0x7<<3) define the type of a
|
||||
numeric field: <ul>
|
||||
|
||||
<li>all bits in mask are cleared if no numeric field at all</li>
|
||||
|
||||
<li>1<<3: Value is Int</li>
|
||||
|
||||
<li>2<<3: Value is Long</li>
|
||||
|
||||
<li>3<<3: Value is Int as Float (as of Integer.intBitsToFloat)</li>
|
||||
|
||||
<li>4<<3: Value is Long as Double (as of Double.longBitsToDouble)</li>
|
||||
|
||||
</ul>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</p>
|
||||
|
||||
<p>Value -->
|
||||
String | BinaryValue (depending on Bits)
|
||||
String | BinaryValue | Int | Long (depending on Bits)
|
||||
</p>
|
||||
|
||||
<p>BinaryValue -->
|
||||
|
@ -1883,7 +1910,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N106B6"></a><a name="Term Dictionary"></a>
|
||||
<a name="N106D0"></a><a name="Term Dictionary"></a>
|
||||
<h3 class="boxed">Term Dictionary</h3>
|
||||
<p>
|
||||
The term dictionary is represented as two files:
|
||||
|
@ -2075,7 +2102,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N1073A"></a><a name="Frequencies"></a>
|
||||
<a name="N10754"></a><a name="Frequencies"></a>
|
||||
<h3 class="boxed">Frequencies</h3>
|
||||
<p>
|
||||
The .frq file contains the lists of documents
|
||||
|
@ -2203,7 +2230,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
entry in level-1. In the example has entry 15 on level 1 a pointer to entry 15 on level 0 and entry 31 on level 1 a pointer
|
||||
to entry 31 on level 0.
|
||||
</p>
|
||||
<a name="N107C2"></a><a name="Positions"></a>
|
||||
<a name="N107DC"></a><a name="Positions"></a>
|
||||
<h3 class="boxed">Positions</h3>
|
||||
<p>
|
||||
The .prx file contains the lists of positions that
|
||||
|
@ -2273,7 +2300,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Payload. If PayloadLength is not stored, then this Payload has the same
|
||||
length as the Payload at the previous position.
|
||||
</p>
|
||||
<a name="N107FE"></a><a name="Normalization Factors"></a>
|
||||
<a name="N10818"></a><a name="Normalization Factors"></a>
|
||||
<h3 class="boxed">Normalization Factors</h3>
|
||||
<p>There's a single .nrm file containing all norms:
|
||||
</p>
|
||||
|
@ -2353,7 +2380,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</p>
|
||||
<p>Separate norm files are created (when adequate) for both compound and non compound segments.
|
||||
</p>
|
||||
<a name="N1084F"></a><a name="Term Vectors"></a>
|
||||
<a name="N10869"></a><a name="Term Vectors"></a>
|
||||
<h3 class="boxed">Term Vectors</h3>
|
||||
<p>
|
||||
Term Vector support is optional on a field by
|
||||
|
@ -2489,7 +2516,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N108EB"></a><a name="Deleted Documents"></a>
|
||||
<a name="N10905"></a><a name="Deleted Documents"></a>
|
||||
<h3 class="boxed">Deleted Documents</h3>
|
||||
<p>The .del file is
|
||||
optional, and only exists when a segment contains deletions.
|
||||
|
@ -2553,7 +2580,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10925"></a><a name="Limitations"></a>
|
||||
<a name="N1093F"></a><a name="Limitations"></a>
|
||||
<h2 class="boxed">Limitations</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Getting Started Guide
|
||||
|
@ -269,14 +269,12 @@ may wish to skip sections.
|
|||
<li>
|
||||
<a href="demo.html">About the command-line Lucene demo and its usage</a>. This section
|
||||
is intended for anyone who wants to use the command-line Lucene demo.</li>
|
||||
<p></p>
|
||||
|
||||
|
||||
<li>
|
||||
<a href="demo2.html">About the sources and implementation for the command-line Lucene
|
||||
demo</a>. This section walks through the implementation details (sources) of the
|
||||
command-line Lucene demo. This section is intended for developers.</li>
|
||||
<p></p>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>Lucene Java Documentation</title>
|
||||
<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>Site Linkmap Table of Contents</title>
|
||||
<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Lucene Contrib
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Query Parser Syntax
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>
|
||||
Apache Lucene - Scoring
|
||||
|
|
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 4.7 KiB |
Binary file not shown.
After Width: | Height: | Size: 2.2 KiB |
|
@ -3,7 +3,7 @@
|
|||
<head>
|
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta content="Apache Forrest" name="Generator">
|
||||
<meta name="Forrest-version" content="0.8">
|
||||
<meta name="Forrest-version" content="0.9">
|
||||
<meta name="Forrest-skin-name" content="lucene">
|
||||
<title>Apache Lucene - System Requirements</title>
|
||||
<link type="text/css" href="skin/basic.css" rel="stylesheet">
|
||||
|
|
Binary file not shown.
|
@ -131,8 +131,13 @@ public final class Document {
|
|||
/** Returns a field with the given name if any exist in this document, or
|
||||
* null. If multiple fields exists with this name, this method returns the
|
||||
* first value added.
|
||||
* Do not use this method with lazy loaded fields.
|
||||
* Do not use this method with lazy loaded fields or {@link NumericField}.
|
||||
* @deprecated use {@link #getFieldable} instead and cast depending on
|
||||
* data type.
|
||||
* @throws ClassCastException if you try to retrieve a numerical or
|
||||
* lazy loaded field.
|
||||
*/
|
||||
@Deprecated
|
||||
public final Field getField(String name) {
|
||||
return (Field) getFieldable(name);
|
||||
}
|
||||
|
@ -154,6 +159,8 @@ public final class Document {
|
|||
* this document, or null. If multiple fields exist with this name, this
|
||||
* method returns the first value added. If only binary fields with this name
|
||||
* exist, returns null.
|
||||
* For {@link NumericField} it returns the string value of the number. If you want
|
||||
* the actual {@code NumericField} instance back, use {@link #getFieldable}.
|
||||
*/
|
||||
public final String get(String name) {
|
||||
for (Fieldable field : fields) {
|
||||
|
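A short, hedged sketch of the retrieval behaviour described in the hunk above; "price" is an illustrative field name, and the searcher/hit arguments stand in for a prior search.

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.NumericField;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.ScoreDoc;

    // Hypothetical helper class; "price" is an illustrative field name.
    public final class NumericFieldRetrievalExample {
      public static Number loadPrice(IndexSearcher searcher, ScoreDoc hit) throws IOException {
        Document doc = searcher.doc(hit.doc);
        String asString = doc.get("price");                         // string form of the number, e.g. "9.99"
        NumericField nf = (NumericField) doc.getFieldable("price"); // the NumericField instance itself
        return nf.getNumericValue();                                // the stored numeric value
      }
    }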
@ -177,13 +184,18 @@ public final class Document {
|
|||
|
||||
/**
|
||||
* Returns an array of {@link Field}s with the given name.
|
||||
* Do not use with lazy loaded fields.
|
||||
* This method returns an empty array when there are no
|
||||
* matching fields. It never returns null.
|
||||
* Do not use this method with lazy loaded fields or {@link NumericField}.
|
||||
*
|
||||
* @param name the name of the field
|
||||
* @return a <code>Field[]</code> array
|
||||
* @deprecated use {@link #getFieldable} instead and cast depending on
|
||||
* data type.
|
||||
* @throws ClassCastException if you try to retrieve a numerical or
|
||||
* lazy loaded field.
|
||||
*/
|
||||
@Deprecated
|
||||
public final Field[] getFields(String name) {
|
||||
List<Field> result = new ArrayList<Field>();
|
||||
for (Fieldable field : fields) {
|
||||
|
@ -230,6 +242,8 @@ public final class Document {
|
|||
* Returns an array of values of the field specified as the method parameter.
|
||||
* This method returns an empty array when there are no
|
||||
* matching fields. It never returns null.
|
||||
* For {@link NumericField}s it returns the string value of the number. If you want
|
||||
* the actual {@code NumericField} instances back, use {@link #getFieldables}.
|
||||
* @param name the name of the field
|
||||
* @return a <code>String[]</code> of field values
|
||||
*/
|
||||
|
|
|
@ -127,18 +127,18 @@ import org.apache.lucene.search.FieldCache; // javadocs
|
|||
* class is a wrapper around this token stream type for
|
||||
* easier, more intuitive usage.</p>
|
||||
*
|
||||
* <p><b>NOTE:</b> This class is only used during
|
||||
* indexing. When retrieving the stored field value from a
|
||||
* {@link Document} instance after search, you will get a
|
||||
* conventional {@link Fieldable} instance where the numeric
|
||||
* values are returned as {@link String}s (according to
|
||||
* <code>toString(value)</code> of the used data type).
|
||||
*
|
||||
* @since 2.9
|
||||
*/
|
||||
public final class NumericField extends AbstractField {
|
||||
|
||||
private final NumericTokenStream numericTS;
|
||||
/** Data type of the value in {@link NumericField}.
|
||||
* @since 3.2
|
||||
*/
|
||||
public static enum DataType { INT, LONG, FLOAT, DOUBLE }
|
||||
|
||||
private transient NumericTokenStream numericTS;
|
||||
private DataType type;
|
||||
private final int precisionStep;
|
||||
|
||||
/**
|
||||
* Creates a field for numeric values using the default <code>precisionStep</code>
|
||||
|
@ -158,8 +158,8 @@ public final class NumericField extends AbstractField {
|
|||
* a numeric value, before indexing a document containing this field,
|
||||
* set a value using the various set<em>???</em>Value() methods.
|
||||
* @param name the field name
|
||||
* @param store if the field should be stored in plain text form
|
||||
* (according to <code>toString(value)</code> of the used data type)
|
||||
* @param store if the field should be stored, {@link Document#getFieldable}
|
||||
* then returns {@code NumericField} instances on search results.
|
||||
* @param index if the field should be indexed using {@link NumericTokenStream}
|
||||
*/
|
||||
public NumericField(String name, Field.Store store, boolean index) {
|
||||
|
@ -186,19 +186,43 @@ public final class NumericField extends AbstractField {
|
|||
* set a value using the various set<em>???</em>Value() methods.
|
||||
* @param name the field name
|
||||
* @param precisionStep the used <a href="../search/NumericRangeQuery.html#precisionStepDesc">precision step</a>
|
||||
* @param store if the field should be stored in plain text form
|
||||
* (according to <code>toString(value)</code> of the used data type)
|
||||
* @param store if the field should be stored, {@link Document#getFieldable}
|
||||
* then returns {@code NumericField} instances on search results.
|
||||
* @param index if the field should be indexed using {@link NumericTokenStream}
|
||||
*/
|
||||
public NumericField(String name, int precisionStep, Field.Store store, boolean index) {
|
||||
super(name, store, index ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NO, Field.TermVector.NO);
|
||||
this.precisionStep = precisionStep;
|
||||
setOmitTermFreqAndPositions(true);
|
||||
numericTS = new NumericTokenStream(precisionStep);
|
||||
}
|
||||
|
||||
/** Returns a {@link NumericTokenStream} for indexing the numeric value. */
|
||||
public TokenStream tokenStreamValue() {
|
||||
return isIndexed() ? numericTS : null;
|
||||
if (!isIndexed())
|
||||
return null;
|
||||
if (numericTS == null) {
|
||||
// lazy init the TokenStream as it is heavy to instantiate (attributes,...),
|
||||
// if not needed (stored field loading)
|
||||
numericTS = new NumericTokenStream(precisionStep);
|
||||
// initialize value in TokenStream
|
||||
if (fieldsData != null) {
|
||||
assert type != null;
|
||||
final Number val = (Number) fieldsData;
|
||||
switch (type) {
|
||||
case INT:
|
||||
numericTS.setIntValue(val.intValue()); break;
|
||||
case LONG:
|
||||
numericTS.setLongValue(val.longValue()); break;
|
||||
case FLOAT:
|
||||
numericTS.setFloatValue(val.floatValue()); break;
|
||||
case DOUBLE:
|
||||
numericTS.setDoubleValue(val.doubleValue()); break;
|
||||
default:
|
||||
assert false : "Should never get here";
|
||||
}
|
||||
}
|
||||
}
|
||||
return numericTS;
|
||||
}
|
||||
|
||||
/** Returns always <code>null</code> for numeric fields */
|
||||
|
@ -212,7 +236,10 @@ public final class NumericField extends AbstractField {
|
|||
return null;
|
||||
}
|
||||
|
||||
/** Returns the numeric value as a string (how it is stored, when {@link Field.Store#YES} is chosen). */
|
||||
/** Returns the numeric value as a string. This format is also returned if you call {@link Document#get(String)}
|
||||
* on search results. It is recommended to use {@link Document#getFieldable} instead
|
||||
* that returns {@code NumericField} instances. You can then use {@link #getNumericValue}
|
||||
* to return the stored value. */
|
||||
public String stringValue() {
|
||||
return (fieldsData == null) ? null : fieldsData.toString();
|
||||
}
|
||||
|
@ -224,7 +251,14 @@ public final class NumericField extends AbstractField {
|
|||
|
||||
/** Returns the precision step. */
|
||||
public int getPrecisionStep() {
|
||||
return numericTS.getPrecisionStep();
|
||||
return precisionStep;
|
||||
}
|
||||
|
||||
/** Returns the data type of the current value, {@code null} if not yet set.
|
||||
* @since 3.2
|
||||
*/
|
||||
public DataType getDataType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -234,8 +268,9 @@ public final class NumericField extends AbstractField {
|
|||
* <code>document.add(new NumericField(name, precisionStep).setLongValue(value))</code>
|
||||
*/
|
||||
public NumericField setLongValue(final long value) {
|
||||
numericTS.setLongValue(value);
|
||||
if (numericTS != null) numericTS.setLongValue(value);
|
||||
fieldsData = Long.valueOf(value);
|
||||
type = DataType.LONG;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -246,8 +281,9 @@ public final class NumericField extends AbstractField {
|
|||
* <code>document.add(new NumericField(name, precisionStep).setIntValue(value))</code>
|
||||
*/
|
||||
public NumericField setIntValue(final int value) {
|
||||
numericTS.setIntValue(value);
|
||||
if (numericTS != null) numericTS.setIntValue(value);
|
||||
fieldsData = Integer.valueOf(value);
|
||||
type = DataType.INT;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -258,8 +294,9 @@ public final class NumericField extends AbstractField {
|
|||
* <code>document.add(new NumericField(name, precisionStep).setDoubleValue(value))</code>
|
||||
*/
|
||||
public NumericField setDoubleValue(final double value) {
|
||||
numericTS.setDoubleValue(value);
|
||||
if (numericTS != null) numericTS.setDoubleValue(value);
|
||||
fieldsData = Double.valueOf(value);
|
||||
type = DataType.DOUBLE;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -270,8 +307,9 @@ public final class NumericField extends AbstractField {
|
|||
* <code>document.add(new NumericField(name, precisionStep).setFloatValue(value))</code>
|
||||
*/
|
||||
public NumericField setFloatValue(final float value) {
|
||||
numericTS.setFloatValue(value);
|
||||
if (numericTS != null) numericTS.setFloatValue(value);
|
||||
fieldsData = Float.valueOf(value);
|
||||
type = DataType.FLOAT;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
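The constructors and chained set???Value() methods above combine into the usual indexing idiom. A minimal sketch, assuming hypothetical field names, values, and a precision step of 8:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.NumericField;

    // Illustrative only; field names, values and the precision step are made up.
    public final class NumericFieldIndexingExample {
      public static Document exampleDocument() {
        Document doc = new Document();
        doc.add(new NumericField("price", Field.Store.YES, true).setDoubleValue(9.99));
        doc.add(new NumericField("timestamp", 8, Field.Store.NO, true).setLongValue(1305000000000L));
        doc.add(new NumericField("count", Field.Store.YES, true).setIntValue(42));
        doc.add(new NumericField("weight", Field.Store.YES, true).setFloatValue(1.5f));
        return doc;
      }
    }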
|
|
@ -132,9 +132,9 @@ class BufferedDeletesStream {
|
|||
public final long gen;
|
||||
|
||||
// If non-null, contains segments that are 100% deleted
|
||||
public final SegmentInfos allDeleted;
|
||||
public final List<SegmentInfo> allDeleted;
|
||||
|
||||
ApplyDeletesResult(boolean anyDeletes, long gen, SegmentInfos allDeleted) {
|
||||
ApplyDeletesResult(boolean anyDeletes, long gen, List<SegmentInfo> allDeleted) {
|
||||
this.anyDeletes = anyDeletes;
|
||||
this.gen = gen;
|
||||
this.allDeleted = allDeleted;
|
||||
|
@ -164,7 +164,7 @@ class BufferedDeletesStream {
|
|||
/** Resolves the buffered deleted Term/Query/docIDs, into
|
||||
* actual deleted docIDs in the deletedDocs BitVector for
|
||||
* each SegmentReader. */
|
||||
public synchronized ApplyDeletesResult applyDeletes(IndexWriter.ReaderPool readerPool, SegmentInfos infos) throws IOException {
|
||||
public synchronized ApplyDeletesResult applyDeletes(IndexWriter.ReaderPool readerPool, List<SegmentInfo> infos) throws IOException {
|
||||
final long t0 = System.currentTimeMillis();
|
||||
|
||||
if (infos.size() == 0) {
|
||||
|
@ -182,7 +182,7 @@ class BufferedDeletesStream {
|
|||
message("applyDeletes: infos=" + infos + " packetCount=" + deletes.size());
|
||||
}
|
||||
|
||||
SegmentInfos infos2 = new SegmentInfos();
|
||||
List<SegmentInfo> infos2 = new ArrayList<SegmentInfo>();
|
||||
infos2.addAll(infos);
|
||||
Collections.sort(infos2, sortSegInfoByDelGen);
|
||||
|
||||
|
@ -192,7 +192,7 @@ class BufferedDeletesStream {
|
|||
int infosIDX = infos2.size()-1;
|
||||
int delIDX = deletes.size()-1;
|
||||
|
||||
SegmentInfos allDeleted = null;
|
||||
List<SegmentInfo> allDeleted = null;
|
||||
|
||||
while (infosIDX >= 0) {
|
||||
//System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX);
|
||||
|
@ -245,7 +245,7 @@ class BufferedDeletesStream {
|
|||
|
||||
if (segAllDeletes) {
|
||||
if (allDeleted == null) {
|
||||
allDeleted = new SegmentInfos();
|
||||
allDeleted = new ArrayList<SegmentInfo>();
|
||||
}
|
||||
allDeleted.add(info);
|
||||
}
|
||||
|
@ -287,7 +287,7 @@ class BufferedDeletesStream {
|
|||
|
||||
if (segAllDeletes) {
|
||||
if (allDeleted == null) {
|
||||
allDeleted = new SegmentInfos();
|
||||
allDeleted = new ArrayList<SegmentInfo>();
|
||||
}
|
||||
allDeleted.add(info);
|
||||
}
|
||||
|
|
|
@ -46,8 +46,10 @@ import org.apache.lucene.util.IOUtils;
|
|||
* file. The {directory} that follows has that many entries. Each directory entry
|
||||
* contains a long pointer to the start of this file's data section, and a String
|
||||
* with that file's name.
|
||||
*
|
||||
* @lucene.internal
|
||||
*/
|
||||
final class CompoundFileWriter {
|
||||
public final class CompoundFileWriter {
|
||||
|
||||
static final class FileEntry {
|
||||
|
||||
|
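The directory layout described in the comment above can be walked with the plain IndexInput primitives. A rough sketch, assuming the entry count is stored as a VInt ahead of the entries (an assumption, since the header is not shown in this hunk):

    import java.io.IOException;
    import java.util.LinkedHashMap;
    import java.util.Map;
    import org.apache.lucene.store.IndexInput;

    // Hypothetical reader for the directory described above: a count followed by
    // (long data pointer, String file name) pairs. Header details are assumed.
    public final class CompoundDirectoryExample {
      public static Map<String, Long> readDirectory(IndexInput in) throws IOException {
        int count = in.readVInt();                 // "has that many entries"
        Map<String, Long> offsets = new LinkedHashMap<String, Long>();
        for (int i = 0; i < count; i++) {
          long dataOffset = in.readLong();         // pointer to this file's data section
          String fileName = in.readString();       // the file's name
          offsets.put(fileName, dataOffset);
        }
        return offsets;
      }
    }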
@ -137,8 +139,7 @@ final class CompoundFileWriter {
|
|||
|
||||
/** Merge files with the extensions added up to now.
|
||||
* All files with these extensions are combined sequentially into the
|
||||
* compound stream. After successful merge, the source files
|
||||
* are deleted.
|
||||
* compound stream.
|
||||
* @throws IllegalStateException if close() had been called before or
|
||||
* if no file has been added to this object
|
||||
*/
|
||||
|
|
|
@ -135,8 +135,8 @@ public class ConcurrentMergeScheduler extends MergeScheduler {
|
|||
final MergePolicy.OneMerge m1 = t1.getCurrentMerge();
|
||||
final MergePolicy.OneMerge m2 = t2.getCurrentMerge();
|
||||
|
||||
final int c1 = m1 == null ? Integer.MAX_VALUE : m1.segments.totalDocCount();
|
||||
final int c2 = m2 == null ? Integer.MAX_VALUE : m2.segments.totalDocCount();
|
||||
final int c1 = m1 == null ? Integer.MAX_VALUE : m1.totalDocCount;
|
||||
final int c2 = m2 == null ? Integer.MAX_VALUE : m2.totalDocCount;
|
||||
|
||||
return c2 - c1;
|
||||
}
|
||||
|
|
|
@ -263,9 +263,10 @@ final class DocFieldProcessor extends DocConsumer {
|
|||
// enabled; we could save [small amount of] CPU
|
||||
// here.
|
||||
ArrayUtil.quickSort(fields, 0, fieldCount, fieldsComp);
|
||||
|
||||
for(int i=0;i<fieldCount;i++)
|
||||
fields[i].consumer.processFields(fields[i].fields, fields[i].fieldCount);
|
||||
for(int i=0;i<fieldCount;i++) {
|
||||
final DocFieldProcessorPerField perField = fields[i];
|
||||
perField.consumer.processFields(perField.fields, perField.fieldCount);
|
||||
}
|
||||
|
||||
if (docState.maxTermPrefix != null && docState.infoStream != null) {
|
||||
docState.infoStream.println("WARNING: document contains at least one immense term (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'");
|
||||
|
|
|
@ -188,7 +188,7 @@ final class DocumentsWriter {
|
|||
this.infoStream = infoStream;
|
||||
final Iterator<ThreadState> it = perThreadPool.getAllPerThreadsIterator();
|
||||
while (it.hasNext()) {
|
||||
it.next().perThread.docState.infoStream = infoStream;
|
||||
it.next().perThread.setInfoStream(infoStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -63,9 +63,10 @@ import org.apache.lucene.search.Query;
|
|||
*/
|
||||
final class DocumentsWriterDeleteQueue {
|
||||
|
||||
private volatile Node tail;
|
||||
private volatile Node<?> tail;
|
||||
|
||||
private static final AtomicReferenceFieldUpdater<DocumentsWriterDeleteQueue, Node> tailUpdater = AtomicReferenceFieldUpdater
|
||||
@SuppressWarnings("rawtypes")
|
||||
private static final AtomicReferenceFieldUpdater<DocumentsWriterDeleteQueue,Node> tailUpdater = AtomicReferenceFieldUpdater
|
||||
.newUpdater(DocumentsWriterDeleteQueue.class, Node.class, "tail");
|
||||
|
||||
private final DeleteSlice globalSlice;
|
||||
|
@ -90,7 +91,7 @@ final class DocumentsWriterDeleteQueue {
|
|||
* we use a sentinel instance as our initial tail. No slice will ever try to
|
||||
* apply this tail since the head is always omitted.
|
||||
*/
|
||||
tail = new Node(null); // sentinel
|
||||
tail = new Node<Object>(null); // sentinel
|
||||
globalSlice = new DeleteSlice(tail);
|
||||
}
|
||||
|
||||
|
@ -126,14 +127,14 @@ final class DocumentsWriterDeleteQueue {
|
|||
// we can do it just every n times or so?
|
||||
}
|
||||
|
||||
void add(Node item) {
|
||||
void add(Node<?> item) {
|
||||
/*
|
||||
* this non-blocking / 'wait-free' linked list add was inspired by Apache
|
||||
* Harmony's ConcurrentLinkedQueue Implementation.
|
||||
*/
|
||||
while (true) {
|
||||
final Node currentTail = this.tail;
|
||||
final Node tailNext = currentTail.next;
|
||||
final Node<?> currentTail = this.tail;
|
||||
final Node<?> tailNext = currentTail.next;
|
||||
if (tail == currentTail) {
|
||||
if (tailNext != null) {
|
||||
/*
|
||||
|
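The CAS loop in add() above follows the classic non-blocking queue append. A stripped-down, self-contained sketch of the same idea (an illustration, not the Lucene class itself; it uses AtomicReference instead of the field updaters used above):

    import java.util.concurrent.atomic.AtomicReference;

    // Minimal illustration of the wait-free append idea used above.
    final class TinyAppendOnlyQueue<T> {
      static final class Node<T> {
        final T item;
        final AtomicReference<Node<T>> next = new AtomicReference<Node<T>>(null);
        Node(T item) { this.item = item; }
      }

      private final AtomicReference<Node<T>> tail =
          new AtomicReference<Node<T>>(new Node<T>(null)); // sentinel, never applied

      void add(T item) {
        final Node<T> newNode = new Node<T>(item);
        while (true) {
          final Node<T> currentTail = tail.get();
          final Node<T> tailNext = currentTail.next.get();
          if (tailNext != null) {
            // tail is lagging behind; help advance it and retry
            tail.compareAndSet(currentTail, tailNext);
          } else if (currentTail.next.compareAndSet(null, newNode)) {
            // linked the new node; if advancing tail fails, another thread will help
            tail.compareAndSet(currentTail, newNode);
            return;
          }
        }
      }
    }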
@ -196,7 +197,7 @@ final class DocumentsWriterDeleteQueue {
|
|||
* deletes in the queue and reset the global slice to let the GC prune the
|
||||
* queue.
|
||||
*/
|
||||
final Node currentTail = tail; // take the current tail make this local any
|
||||
final Node<?> currentTail = tail; // take the current tail make this local any
|
||||
// Changes after this call are applied later
|
||||
// and not relevant here
|
||||
if (callerSlice != null) {
|
||||
|
@ -232,10 +233,10 @@ final class DocumentsWriterDeleteQueue {
|
|||
|
||||
static class DeleteSlice {
|
||||
// No need to be volatile, slices are thread captive (only accessed by one thread)!
|
||||
Node sliceHead; // we don't apply this one
|
||||
Node sliceTail;
|
||||
Node<?> sliceHead; // we don't apply this one
|
||||
Node<?> sliceTail;
|
||||
|
||||
DeleteSlice(Node currentTail) {
|
||||
DeleteSlice(Node<?> currentTail) {
|
||||
assert currentTail != null;
|
||||
/*
|
||||
* Initially this is a 0 length slice pointing to the 'current' tail of
|
||||
|
@ -256,7 +257,7 @@ final class DocumentsWriterDeleteQueue {
|
|||
* tail in this slice are not equal then there will be at least one more
|
||||
* non-null node in the slice!
|
||||
*/
|
||||
Node current = sliceHead;
|
||||
Node<?> current = sliceHead;
|
||||
do {
|
||||
current = current.next;
|
||||
assert current != null : "slice property violated between the head on the tail must not be a null node";
|
||||
|
@ -290,7 +291,7 @@ final class DocumentsWriterDeleteQueue {
|
|||
void clear() {
|
||||
globalBufferLock.lock();
|
||||
try {
|
||||
final Node currentTail = tail;
|
||||
final Node<?> currentTail = tail;
|
||||
globalSlice.sliceHead = globalSlice.sliceTail = currentTail;
|
||||
globalBufferedDeletes.clear();
|
||||
} finally {
|
||||
|
@ -298,27 +299,28 @@ final class DocumentsWriterDeleteQueue {
|
|||
}
|
||||
}
|
||||
|
||||
private static class Node {
|
||||
volatile Node next;
|
||||
final Object item;
|
||||
private static class Node<T> {
|
||||
volatile Node<?> next;
|
||||
final T item;
|
||||
|
||||
private Node(Object item) {
|
||||
Node(T item) {
|
||||
this.item = item;
|
||||
}
|
||||
|
||||
static final AtomicReferenceFieldUpdater<Node, Node> nextUpdater = AtomicReferenceFieldUpdater
|
||||
@SuppressWarnings("rawtypes")
|
||||
static final AtomicReferenceFieldUpdater<Node,Node> nextUpdater = AtomicReferenceFieldUpdater
|
||||
.newUpdater(Node.class, Node.class, "next");
|
||||
|
||||
void apply(BufferedDeletes bufferedDeletes, int docIDUpto) {
|
||||
assert false : "sentinel item must never be applied";
|
||||
}
|
||||
|
||||
boolean casNext(Node cmp, Node val) {
|
||||
boolean casNext(Node<?> cmp, Node<?> val) {
|
||||
return nextUpdater.compareAndSet(this, cmp, val);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class TermNode extends Node {
|
||||
private static final class TermNode extends Node<Term> {
|
||||
|
||||
TermNode(Term term) {
|
||||
super(term);
|
||||
|
@ -326,33 +328,31 @@ final class DocumentsWriterDeleteQueue {
|
|||
|
||||
@Override
|
||||
void apply(BufferedDeletes bufferedDeletes, int docIDUpto) {
|
||||
bufferedDeletes.addTerm((Term) item, docIDUpto);
|
||||
bufferedDeletes.addTerm(item, docIDUpto);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class QueryArrayNode extends Node {
|
||||
private static final class QueryArrayNode extends Node<Query[]> {
|
||||
QueryArrayNode(Query[] query) {
|
||||
super(query);
|
||||
}
|
||||
|
||||
@Override
|
||||
void apply(BufferedDeletes bufferedDeletes, int docIDUpto) {
|
||||
final Query[] queries = (Query[]) item;
|
||||
for (Query query : queries) {
|
||||
for (Query query : item) {
|
||||
bufferedDeletes.addQuery(query, docIDUpto);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static final class TermArrayNode extends Node {
|
||||
private static final class TermArrayNode extends Node<Term[]> {
|
||||
TermArrayNode(Term[] term) {
|
||||
super(term);
|
||||
}
|
||||
|
||||
@Override
|
||||
void apply(BufferedDeletes bufferedDeletes, int docIDUpto) {
|
||||
final Term[] terms = (Term[]) item;
|
||||
for (Term term : terms) {
|
||||
for (Term term : item) {
|
||||
bufferedDeletes.addTerm(term, docIDUpto);
|
||||
}
|
||||
}
|
||||
|
@ -361,7 +361,7 @@ final class DocumentsWriterDeleteQueue {
|
|||
|
||||
private boolean forceApplyGlobalSlice() {
|
||||
globalBufferLock.lock();
|
||||
final Node currentTail = tail;
|
||||
final Node<?> currentTail = tail;
|
||||
try {
|
||||
if (globalSlice.sliceTail != currentTail) {
|
||||
globalSlice.sliceTail = currentTail;
|
||||
|
|
|
@ -122,13 +122,13 @@ public final class DocumentsWriterFlushControl {
|
|||
// is super important since we can not address more than 2048 MB per DWPT
|
||||
setFlushPending(perThread);
|
||||
if (fullFlush) {
|
||||
DocumentsWriterPerThread toBlock = internalTryCheckOutForFlush(perThread, false);
|
||||
DocumentsWriterPerThread toBlock = internalTryCheckOutForFlush(perThread);
|
||||
assert toBlock != null;
|
||||
blockedFlushes.add(toBlock);
|
||||
}
|
||||
}
|
||||
}
|
||||
final DocumentsWriterPerThread flushingDWPT = tryCheckoutForFlush(perThread, false);
|
||||
final DocumentsWriterPerThread flushingDWPT = tryCheckoutForFlush(perThread);
|
||||
healthiness.updateStalled(this);
|
||||
return flushingDWPT;
|
||||
}
|
||||
|
@ -189,18 +189,15 @@ public final class DocumentsWriterFlushControl {
|
|||
}
|
||||
|
||||
synchronized DocumentsWriterPerThread tryCheckoutForFlush(
|
||||
ThreadState perThread, boolean setPending) {
|
||||
ThreadState perThread) {
|
||||
if (fullFlush) {
|
||||
return null;
|
||||
}
|
||||
return internalTryCheckOutForFlush(perThread, setPending);
|
||||
return internalTryCheckOutForFlush(perThread);
|
||||
}
|
||||
|
||||
private DocumentsWriterPerThread internalTryCheckOutForFlush(
|
||||
ThreadState perThread, boolean setPending) {
|
||||
if (setPending && !perThread.flushPending) {
|
||||
setFlushPending(perThread);
|
||||
}
|
||||
ThreadState perThread) {
|
||||
if (perThread.flushPending) {
|
||||
// We are pending so all memory is already moved to flushBytes
|
||||
if (perThread.tryLock()) {
|
||||
|
@ -245,7 +242,7 @@ public final class DocumentsWriterFlushControl {
|
|||
while (allActiveThreads.hasNext() && numPending > 0) {
|
||||
ThreadState next = allActiveThreads.next();
|
||||
if (next.flushPending) {
|
||||
final DocumentsWriterPerThread dwpt = tryCheckoutForFlush(next, false);
|
||||
final DocumentsWriterPerThread dwpt = tryCheckoutForFlush(next);
|
||||
if (dwpt != null) {
|
||||
return dwpt;
|
||||
}
|
||||
|
@ -330,7 +327,12 @@ public final class DocumentsWriterFlushControl {
|
|||
}
|
||||
if (next.perThread.getNumDocsInRAM() > 0 ) {
|
||||
final DocumentsWriterPerThread dwpt = next.perThread; // just for assert
|
||||
final DocumentsWriterPerThread flushingDWPT = internalTryCheckOutForFlush(next, true);
|
||||
synchronized (this) {
|
||||
if (!next.flushPending) {
|
||||
setFlushPending(next);
|
||||
}
|
||||
}
|
||||
final DocumentsWriterPerThread flushingDWPT = internalTryCheckOutForFlush(next);
|
||||
assert flushingDWPT != null : "DWPT must never be null here since we hold the lock and it holds documents";
|
||||
assert dwpt == flushingDWPT : "flushControl returned different DWPT";
|
||||
toFlush.add(flushingDWPT);
|
||||
|
|
|
@ -163,7 +163,7 @@ public class DocumentsWriterPerThread {
|
|||
boolean hasAborted = false; // True if the last exception throws by #updateDocument was aborting
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
private final PrintStream infoStream;
|
||||
private PrintStream infoStream;
|
||||
private int numDocsInRAM;
|
||||
private int flushedDocCount;
|
||||
DocumentsWriterDeleteQueue deleteQueue;
|
||||
|
@ -235,6 +235,7 @@ public class DocumentsWriterPerThread {
|
|||
// mark document as deleted
|
||||
deleteDocID(docState.docID);
|
||||
numDocsInRAM++;
|
||||
fieldInfos.revertUncommitted();
|
||||
} else {
|
||||
abort();
|
||||
}
|
||||
|
@ -377,15 +378,12 @@ public class DocumentsWriterPerThread {
|
|||
boolean success = false;
|
||||
|
||||
try {
|
||||
|
||||
SegmentInfo newSegment = new SegmentInfo(segment, flushState.numDocs, directory, false, fieldInfos.hasProx(), flushState.segmentCodecs, false, fieldInfos);
|
||||
consumer.flush(flushState);
|
||||
pendingDeletes.terms.clear();
|
||||
newSegment.setHasVectors(flushState.hasVectors);
|
||||
|
||||
final SegmentInfo newSegment = new SegmentInfo(segment, flushState.numDocs, directory, false, flushState.segmentCodecs, fieldInfos.asReadOnly());
|
||||
if (infoStream != null) {
|
||||
message("new segment has " + (flushState.deletedDocs == null ? 0 : flushState.deletedDocs.count()) + " deleted docs");
|
||||
message("new segment has " + (flushState.hasVectors ? "vectors" : "no vectors"));
|
||||
message("new segment has " + (newSegment.getHasVectors() ? "vectors" : "no vectors"));
|
||||
message("flushedFiles=" + newSegment.files());
|
||||
message("flushed codecs=" + newSegment.getSegmentCodecs());
|
||||
}
|
||||
|
@ -435,10 +433,6 @@ public class DocumentsWriterPerThread {
|
|||
return bytesUsed.get() + pendingDeletes.bytesUsed.get();
|
||||
}
|
||||
|
||||
FieldInfos getFieldInfos() {
|
||||
return fieldInfos;
|
||||
}
|
||||
|
||||
void message(String message) {
|
||||
writer.message("DWPT: " + message);
|
||||
}
|
||||
|
@ -498,4 +492,9 @@ public class DocumentsWriterPerThread {
|
|||
assert segment != null;
|
||||
return new PerDocWriteState(infoStream, directory, segment, fieldInfos, bytesUsed, codecId);
|
||||
}
|
||||
|
||||
void setInfoStream(PrintStream infoStream) {
|
||||
this.infoStream = infoStream;
|
||||
docState.infoStream = infoStream;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,7 +22,6 @@ import org.apache.lucene.index.values.Type;
|
|||
/** @lucene.experimental */
|
||||
public final class FieldInfo {
|
||||
public static final int UNASSIGNED_CODEC_ID = -1;
|
||||
|
||||
public final String name;
|
||||
public final int number;
|
||||
|
||||
|
@ -113,7 +112,6 @@ public final class FieldInfo {
|
|||
}
|
||||
assert !this.omitTermFreqAndPositions || !this.storePayloads;
|
||||
}
|
||||
|
||||
void setDocValues(Type v) {
|
||||
if (docValues == null) {
|
||||
docValues = v;
|
||||
|
@ -127,4 +125,29 @@ public final class FieldInfo {
|
|||
public Type getDocValues() {
|
||||
return docValues;
|
||||
}
|
||||
|
||||
private boolean vectorsCommitted;
|
||||
|
||||
/**
|
||||
* Reverts all uncommitted changes on this {@link FieldInfo}
|
||||
* @see #commitVectors()
|
||||
*/
|
||||
void revertUncommitted() {
|
||||
if (storeTermVector && !vectorsCommitted) {
|
||||
storeOffsetWithTermVector = false;
|
||||
storePositionWithTermVector = false;
|
||||
storeTermVector = false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Commits term vector modifications. Changes to term-vectors must be
|
||||
* explicitly committed once the necessary files are created. If those changes
|
||||
* are not committed, a subsequent {@link #revertUncommitted()} will reset
|
||||
* all term-vector flags before the next document.
|
||||
*/
|
||||
void commitVectors() {
|
||||
assert storeTermVector;
|
||||
vectorsCommitted = true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -220,6 +220,10 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40;
|
||||
|
||||
private int format;
|
||||
private boolean hasProx; // only set if readonly
|
||||
private boolean hasVectors; // only set if readonly
|
||||
private long version; // internal use to track changes
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new {@link FieldInfos} instance with a private
|
||||
|
@ -267,7 +271,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
*/
|
||||
public FieldInfos(Directory d, String name) throws IOException {
|
||||
this((FieldNumberBiMap)null, null); // use null here to make this FIs Read-Only
|
||||
IndexInput input = d.openInput(name);
|
||||
final IndexInput input = d.openInput(name);
|
||||
try {
|
||||
read(input, name);
|
||||
} finally {
|
||||
|
@ -303,6 +307,9 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
@Override
|
||||
synchronized public Object clone() {
|
||||
FieldInfos fis = new FieldInfos(globalFieldNumbers, segmentCodecsBuilder);
|
||||
fis.format = format;
|
||||
fis.hasProx = hasProx;
|
||||
fis.hasVectors = hasVectors;
|
||||
for (FieldInfo fi : this) {
|
||||
FieldInfo clone = (FieldInfo) (fi).clone();
|
||||
fis.putInternal(clone);
|
||||
|
@ -312,6 +319,10 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
|
||||
/** Returns true if any fields do not omitTermFreqAndPositions */
|
||||
public boolean hasProx() {
|
||||
if (isReadOnly()) {
|
||||
return hasProx;
|
||||
}
|
||||
// mutable FIs must check!
|
||||
for (FieldInfo fi : this) {
|
||||
if (fi.isIndexed && !fi.omitTermFreqAndPositions) {
|
||||
return true;
|
||||
|
@ -445,6 +456,7 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
if ((fi.isIndexed || fi.hasDocValues()) && fi.getCodecId() == FieldInfo.UNASSIGNED_CODEC_ID) {
|
||||
segmentCodecsBuilder.tryAddAndSet(fi);
|
||||
}
|
||||
version++;
|
||||
return fi;
|
||||
}
|
||||
|
||||
|
@ -514,6 +526,10 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
}
|
||||
|
||||
public boolean hasVectors() {
|
||||
if (isReadOnly()) {
|
||||
return hasVectors;
|
||||
}
|
||||
// mutable FIs must check
|
||||
for (FieldInfo fi : this) {
|
||||
if (fi.storeTermVector) {
|
||||
return true;
|
||||
|
@ -567,6 +583,10 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
return globalFieldNumbers == null;
|
||||
}
|
||||
|
||||
synchronized final long getVersion() {
|
||||
return version;
|
||||
}
|
||||
|
||||
public void write(IndexOutput output) throws IOException {
|
||||
output.writeVInt(FORMAT_CURRENT);
|
||||
output.writeVInt(size());
|
||||
|
@ -658,7 +678,8 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
if (omitTermFreqAndPositions) {
|
||||
storePayloads = false;
|
||||
}
|
||||
|
||||
hasVectors |= storeTermVector;
|
||||
hasProx |= isIndexed && !omitTermFreqAndPositions;
|
||||
Type docValuesType = null;
|
||||
if (format <= FORMAT_INDEX_VALUES) {
|
||||
final byte b = input.readByte();
|
||||
|
@ -706,4 +727,28 @@ public final class FieldInfos implements Iterable<FieldInfo> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reverts all uncommitted changes
|
||||
* @see FieldInfo#revertUncommitted()
|
||||
*/
|
||||
void revertUncommitted() {
|
||||
for (FieldInfo fieldInfo : this) {
|
||||
fieldInfo.revertUncommitted();
|
||||
}
|
||||
}
|
||||
|
||||
final FieldInfos asReadOnly() {
|
||||
if (isReadOnly()) {
|
||||
return this;
|
||||
}
|
||||
final FieldInfos roFis = new FieldInfos((FieldNumberBiMap)null, null);
|
||||
for (FieldInfo fieldInfo : this) {
|
||||
FieldInfo clone = (FieldInfo) (fieldInfo).clone();
|
||||
roFis.putInternal(clone);
|
||||
roFis.hasVectors |= clone.storeTermVector;
|
||||
roFis.hasProx |= clone.isIndexed && !clone.omitTermFreqAndPositions;
|
||||
}
|
||||
return roFis;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -24,10 +24,11 @@ import org.apache.lucene.document.Field;
|
|||
import org.apache.lucene.document.FieldSelector;
|
||||
import org.apache.lucene.document.FieldSelectorResult;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.document.NumericField;
|
||||
import org.apache.lucene.store.AlreadyClosedException;
|
||||
import org.apache.lucene.store.BufferedIndexInput;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.CloseableThreadLocal;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -212,40 +213,39 @@ public final class FieldsReader implements Cloneable {
|
|||
|
||||
Document doc = new Document();
|
||||
int numFields = fieldsStream.readVInt();
|
||||
for (int i = 0; i < numFields; i++) {
|
||||
out: for (int i = 0; i < numFields; i++) {
|
||||
int fieldNumber = fieldsStream.readVInt();
|
||||
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
|
||||
FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
|
||||
|
||||
byte bits = fieldsStream.readByte();
|
||||
assert bits <= FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
|
||||
int bits = fieldsStream.readByte() & 0xFF;
|
||||
assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits);
|
||||
|
||||
boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
|
||||
boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
|
||||
//TODO: Find an alternative approach here if this list continues to grow beyond the
|
||||
//list of 5 or 6 currently here. See Lucene 762 for discussion
|
||||
if (acceptField.equals(FieldSelectorResult.LOAD)) {
|
||||
addField(doc, fi, binary, tokenize);
|
||||
}
|
||||
else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
|
||||
addField(doc, fi, binary, tokenize);
|
||||
break;//Get out of this loop
|
||||
}
|
||||
else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
|
||||
addFieldLazy(doc, fi, binary, tokenize, true);
|
||||
}
|
||||
else if (acceptField.equals(FieldSelectorResult.LATENT)) {
|
||||
addFieldLazy(doc, fi, binary, tokenize, false);
|
||||
}
|
||||
else if (acceptField.equals(FieldSelectorResult.SIZE)){
|
||||
skipField(addFieldSize(doc, fi, binary));
|
||||
}
|
||||
else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
|
||||
addFieldSize(doc, fi, binary);
|
||||
break;
|
||||
}
|
||||
else {
|
||||
skipField();
|
||||
final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK;
|
||||
|
||||
switch (acceptField) {
|
||||
case LOAD:
|
||||
addField(doc, fi, binary, tokenize, numeric);
|
||||
break;
|
||||
case LOAD_AND_BREAK:
|
||||
addField(doc, fi, binary, tokenize, numeric);
|
||||
break out; //Get out of this loop
|
||||
case LAZY_LOAD:
|
||||
addFieldLazy(doc, fi, binary, tokenize, true, numeric);
|
||||
break;
|
||||
case LATENT:
|
||||
addFieldLazy(doc, fi, binary, tokenize, false, numeric);
|
||||
break;
|
||||
case SIZE:
|
||||
skipFieldBytes(addFieldSize(doc, fi, binary, numeric));
|
||||
break;
|
||||
case SIZE_AND_BREAK:
|
||||
addFieldSize(doc, fi, binary, numeric);
|
||||
break out; //Get out of this loop
|
||||
default:
|
||||
skipField(numeric);
|
||||
}
|
||||
}
|
||||
|
||||
|
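The FieldSelectorResult cases handled above map onto the usual field-selector idiom when loading stored documents. A brief sketch, with a hypothetical field name:

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.FieldSelector;
    import org.apache.lucene.document.FieldSelectorResult;
    import org.apache.lucene.index.IndexReader;

    // Illustrative selector: load "title" eagerly, everything else lazily.
    public final class FieldSelectorExample {
      public static Document loadTitleEagerly(IndexReader reader, int docID) throws IOException {
        FieldSelector selector = new FieldSelector() {
          public FieldSelectorResult accept(String fieldName) {
            return "title".equals(fieldName) ? FieldSelectorResult.LOAD
                                             : FieldSelectorResult.LAZY_LOAD;
          }
        };
        return reader.document(docID, selector);
      }
    }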
@ -282,72 +282,121 @@ public final class FieldsReader implements Cloneable {
|
|||
* Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
|
||||
* This will have the most payoff on large fields.
|
||||
*/
|
||||
private void skipField() throws IOException {
|
||||
skipField(fieldsStream.readVInt());
|
||||
private void skipField(int numeric) throws IOException {
|
||||
final int numBytes;
|
||||
switch(numeric) {
|
||||
case 0:
|
||||
numBytes = fieldsStream.readVInt();
|
||||
break;
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_INT:
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
|
||||
numBytes = 4;
|
||||
break;
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
|
||||
numBytes = 8;
|
||||
break;
|
||||
default:
|
||||
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
|
||||
}
|
||||
|
||||
skipFieldBytes(numBytes);
|
||||
}
|
||||
|
||||
private void skipField(int toRead) throws IOException {
|
||||
private void skipFieldBytes(int toRead) throws IOException {
|
||||
fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
|
||||
}
|
||||
|
||||
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize, boolean cacheResult) throws IOException {
|
||||
private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException {
|
||||
assert numeric != 0;
|
||||
switch(numeric) {
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_INT:
|
||||
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readInt());
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
|
||||
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readLong());
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
|
||||
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readInt()));
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
|
||||
return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readLong()));
|
||||
default:
|
||||
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
|
||||
}
|
||||
}
|
||||
|
||||
private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize, boolean cacheResult, int numeric) throws IOException {
|
||||
final AbstractField f;
|
||||
if (binary) {
|
||||
int toRead = fieldsStream.readVInt();
|
||||
long pointer = fieldsStream.getFilePointer();
|
||||
//was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
|
||||
doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, cacheResult));
|
||||
f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, cacheResult);
|
||||
//Need to move the pointer ahead by toRead positions
|
||||
fieldsStream.seek(pointer + toRead);
|
||||
} else if (numeric != 0) {
|
||||
f = loadNumericField(fi, numeric);
|
||||
} else {
|
||||
Field.Store store = Field.Store.YES;
|
||||
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
|
||||
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
|
||||
|
||||
AbstractField f;
|
||||
int length = fieldsStream.readVInt();
|
||||
long pointer = fieldsStream.getFilePointer();
|
||||
//Skip ahead of where we are by the length of what is stored
|
||||
fieldsStream.seek(pointer+length);
|
||||
f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, cacheResult);
|
||||
f.setOmitNorms(fi.omitNorms);
|
||||
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
|
||||
|
||||
doc.add(f);
|
||||
}
|
||||
|
||||
f.setOmitNorms(fi.omitNorms);
|
||||
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
|
||||
doc.add(f);
|
||||
}
|
||||
|
||||
private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws CorruptIndexException, IOException {
|
||||
private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize, int numeric) throws CorruptIndexException, IOException {
|
||||
final AbstractField f;
|
||||
|
||||
if (binary) {
|
||||
int toRead = fieldsStream.readVInt();
|
||||
final byte[] b = new byte[toRead];
|
||||
fieldsStream.readBytes(b, 0, b.length);
|
||||
doc.add(new Field(fi.name, b));
|
||||
f = new Field(fi.name, b);
|
||||
} else if (numeric != 0) {
|
||||
f = loadNumericField(fi, numeric);
|
||||
} else {
|
||||
Field.Store store = Field.Store.YES;
|
||||
Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize);
|
||||
Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector);
|
||||
|
||||
AbstractField f;
|
||||
f = new Field(fi.name, // name
|
||||
false,
|
||||
fieldsStream.readString(), // read value
|
||||
store,
|
||||
index,
|
||||
termVector);
|
||||
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
|
||||
f.setOmitNorms(fi.omitNorms);
|
||||
|
||||
doc.add(f);
|
||||
false,
|
||||
fieldsStream.readString(), // read value
|
||||
Field.Store.YES,
|
||||
index,
|
||||
termVector);
|
||||
}
|
||||
|
||||
f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions);
|
||||
f.setOmitNorms(fi.omitNorms);
|
||||
doc.add(f);
|
||||
}
|
||||
|
||||
// Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
|
||||
// Read just the size -- caller must skip the field content to continue reading fields
|
||||
// Return the size in bytes or chars, depending on field type
|
||||
private int addFieldSize(Document doc, FieldInfo fi, boolean binary) throws IOException {
|
||||
int size = fieldsStream.readVInt(), bytesize = binary ? size : 2*size;
|
||||
private int addFieldSize(Document doc, FieldInfo fi, boolean binary, int numeric) throws IOException {
|
||||
final int bytesize, size;
|
||||
switch(numeric) {
|
||||
case 0:
|
||||
size = fieldsStream.readVInt();
|
||||
bytesize = binary ? size : 2*size;
|
||||
break;
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_INT:
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_FLOAT:
|
||||
size = bytesize = 4;
|
||||
break;
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_LONG:
|
||||
case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE:
|
||||
size = bytesize = 8;
|
||||
break;
|
||||
default:
|
||||
throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric));
|
||||
}
|
||||
byte[] sizebytes = new byte[4];
|
||||
sizebytes[0] = (byte) (bytesize>>>24);
|
||||
sizebytes[1] = (byte) (bytesize>>>16);
|
||||
|
@ -358,7 +407,7 @@ public final class FieldsReader implements Cloneable {
|
|||
}
|
||||
|
||||
/**
|
||||
* A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is
|
||||
* A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
|
||||
* loaded.
|
||||
*/
|
||||
private class LazyField extends AbstractField implements Fieldable {
|
||||
|
|
|
@ -21,22 +21,40 @@ import java.util.List;
|
|||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.document.NumericField;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
final class FieldsWriter {
|
||||
static final byte FIELD_IS_TOKENIZED = 0x1;
|
||||
static final byte FIELD_IS_BINARY = 0x2;
|
||||
static final int FIELD_IS_TOKENIZED = 1 << 0;
|
||||
static final int FIELD_IS_BINARY = 1 << 1;
|
||||
|
||||
// the old bit 1 << 2 was compressed, is now left out
|
||||
|
||||
private static final int _NUMERIC_BIT_SHIFT = 3;
|
||||
static final int FIELD_IS_NUMERIC_MASK = 0x07 << _NUMERIC_BIT_SHIFT;
|
||||
|
||||
static final int FIELD_IS_NUMERIC_INT = 1 << _NUMERIC_BIT_SHIFT;
|
||||
static final int FIELD_IS_NUMERIC_LONG = 2 << _NUMERIC_BIT_SHIFT;
|
||||
static final int FIELD_IS_NUMERIC_FLOAT = 3 << _NUMERIC_BIT_SHIFT;
|
||||
static final int FIELD_IS_NUMERIC_DOUBLE = 4 << _NUMERIC_BIT_SHIFT;
|
||||
// currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT;
|
||||
// currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT;
|
||||
|
||||
// the next possible bits are: 1 << 6; 1 << 7
|
||||
|
||||
// Lucene 3.0: Removal of compressed fields
|
||||
static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2;
|
||||
|
||||
// Lucene 3.2: NumericFields are stored in binary format
|
||||
static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3;
|
||||
|
||||
// NOTE: if you introduce a new format, make it 1 higher
|
||||
// than the current one, and always change this if you
|
||||
// switch to a new format!
|
||||
static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
|
||||
static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS;
|
||||
|
||||
// when removing support for old versions, leave the last supported version here
|
||||
static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS;
|
||||
|
@ -121,13 +139,26 @@ final class FieldsWriter {
|
|||
|
||||
final void writeField(int fieldNumber, Fieldable field) throws IOException {
|
||||
fieldsStream.writeVInt(fieldNumber);
|
||||
byte bits = 0;
|
||||
int bits = 0;
|
||||
if (field.isTokenized())
|
||||
bits |= FieldsWriter.FIELD_IS_TOKENIZED;
|
||||
bits |= FIELD_IS_TOKENIZED;
|
||||
if (field.isBinary())
|
||||
bits |= FieldsWriter.FIELD_IS_BINARY;
|
||||
|
||||
fieldsStream.writeByte(bits);
|
||||
bits |= FIELD_IS_BINARY;
|
||||
if (field instanceof NumericField) {
|
||||
switch (((NumericField) field).getDataType()) {
|
||||
case INT:
|
||||
bits |= FIELD_IS_NUMERIC_INT; break;
|
||||
case LONG:
|
||||
bits |= FIELD_IS_NUMERIC_LONG; break;
|
||||
case FLOAT:
|
||||
bits |= FIELD_IS_NUMERIC_FLOAT; break;
|
||||
case DOUBLE:
|
||||
bits |= FIELD_IS_NUMERIC_DOUBLE; break;
|
||||
default:
|
||||
assert false : "Should never get here";
|
||||
}
|
||||
}
|
||||
fieldsStream.writeByte((byte) bits);
|
||||
|
||||
if (field.isBinary()) {
|
||||
final byte[] data;
|
||||
|
@ -139,8 +170,22 @@ final class FieldsWriter {
|
|||
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, offset, len);
|
||||
}
|
||||
else {
|
||||
} else if (field instanceof NumericField) {
|
||||
final NumericField nf = (NumericField) field;
|
||||
final Number n = nf.getNumericValue();
|
||||
switch (nf.getDataType()) {
|
||||
case INT:
|
||||
fieldsStream.writeInt(n.intValue()); break;
|
||||
case LONG:
|
||||
fieldsStream.writeLong(n.longValue()); break;
|
||||
case FLOAT:
|
||||
fieldsStream.writeInt(Float.floatToIntBits(n.floatValue())); break;
|
||||
case DOUBLE:
|
||||
fieldsStream.writeLong(Double.doubleToLongBits(n.doubleValue())); break;
|
||||
default:
|
||||
assert false : "Should never get here";
|
||||
}
|
||||
} else {
|
||||
fieldsStream.writeString(field.stringValue());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.io.FilenameFilter;
|
|||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
|
@ -196,7 +197,31 @@ final class IndexFileDeleter {
|
|||
}
|
||||
}
|
||||
if (sis != null) {
|
||||
CommitPoint commitPoint = new CommitPoint(commitsToDelete, directory, sis);
|
||||
final SegmentInfos infos = sis;
|
||||
for (SegmentInfo segmentInfo : infos) {
|
||||
try {
|
||||
/*
|
||||
* Force FI to load for each segment since we could see a
|
||||
* segments file and load successfully above if the files are
|
||||
* still referenced when they are deleted and the os doesn't let
|
||||
* you delete them. Yet it's likely that fnm files are removed
|
||||
* while the seg file is still around. Since LUCENE-2984 we need FI
|
||||
* to find out if a seg has vectors and prox so we need those
|
||||
* files to be opened for a commit point.
|
||||
*/
|
||||
segmentInfo.getFieldInfos();
|
||||
} catch (FileNotFoundException e) {
|
||||
refresh(segmentInfo.name);
|
||||
sis = null;
|
||||
if (infoStream != null) {
|
||||
message("init: hit FileNotFoundException when loading commit \"" + fileName + "\"; skipping this commit point");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
if (sis != null) {
|
||||
final CommitPoint commitPoint = new CommitPoint(commitsToDelete, directory, sis);
|
||||
if (sis.getGeneration() == segmentInfos.getGeneration()) {
|
||||
currentCommitPoint = commitPoint;
|
||||
}
|
||||
|
|
|
@ -1428,7 +1428,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
|
|||
cfr = new CompoundFileReader(dir, filename);
|
||||
|
||||
String [] files = cfr.listAll();
|
||||
ArrayUtil.quickSort(files); // sort the array of filename so that the output is more readable
|
||||
ArrayUtil.mergeSort(files); // sort the array of filename so that the output is more readable
|
||||
|
||||
for (int i = 0; i < files.length; ++i) {
|
||||
long len = cfr.fileLength(files[i]);
|
||||
|
|
|
@ -0,0 +1,129 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collection;

/**
 * This is an easy-to-use tool that upgrades all segments of an index from previous Lucene versions
 * to the current segment file format. It can be used from the command line:
 * <pre>
 *   java -cp lucene-core.jar org.apache.lucene.index.IndexUpgrader [-delete-prior-commits] [-verbose] indexDir
 * </pre>
 * Alternatively this class can be instantiated and {@link #upgrade} invoked. It uses {@link UpgradeIndexMergePolicy}
 * and triggers the upgrade via an optimize request to {@link IndexWriter}.
 * <p>This tool keeps only the last commit in an index; for this
 * reason, if the incoming index has more than one commit, the tool
 * refuses to run by default. Specify {@code -delete-prior-commits}
 * to override this, allowing the tool to delete all but the last commit.
 * From Java code this can be enabled by passing {@code true} to
 * {@link #IndexUpgrader(Directory,PrintStream,boolean)}.
 */
public final class IndexUpgrader {

  private static void printUsage() {
    System.err.println("Upgrades an index so all segments created with a previous Lucene version are rewritten.");
    System.err.println("Usage:");
    System.err.println("  java " + IndexUpgrader.class.getName() + " [-delete-prior-commits] [-verbose] indexDir");
    System.err.println("This tool keeps only the last commit in an index; for this");
    System.err.println("reason, if the incoming index has more than one commit, the tool");
    System.err.println("refuses to run by default. Specify -delete-prior-commits to override");
    System.err.println("this, allowing the tool to delete all but the last commit.");
    System.exit(1);
  }

  public static void main(String[] args) throws IOException {
    String dir = null;
    boolean deletePriorCommits = false;
    PrintStream out = null;
    for (String arg : args) {
      if ("-delete-prior-commits".equals(arg)) {
        deletePriorCommits = true;
      } else if ("-verbose".equals(arg)) {
        out = System.out;
      } else if (dir == null) {
        dir = arg;
      } else {
        printUsage();
      }
    }
    if (dir == null) {
      printUsage();
    }

    new IndexUpgrader(FSDirectory.open(new File(dir)), out, deletePriorCommits).upgrade();
  }

  private final Directory dir;
  private final PrintStream infoStream;
  private final IndexWriterConfig iwc;
  private final boolean deletePriorCommits;

  @SuppressWarnings("deprecation")
  public IndexUpgrader(Directory dir) {
    this(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, null), null, false);
  }

  @SuppressWarnings("deprecation")
  public IndexUpgrader(Directory dir, PrintStream infoStream, boolean deletePriorCommits) {
    this(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, null), infoStream, deletePriorCommits);
  }

  public IndexUpgrader(Directory dir, IndexWriterConfig iwc, PrintStream infoStream, boolean deletePriorCommits) {
    this.dir = dir;
    this.iwc = iwc;
    this.infoStream = infoStream;
    this.deletePriorCommits = deletePriorCommits;
  }

  public void upgrade() throws IOException {
    if (!IndexReader.indexExists(dir)) {
      throw new IndexNotFoundException(dir.toString());
    }

    if (!deletePriorCommits) {
      final Collection<IndexCommit> commits = IndexReader.listCommits(dir);
      if (commits.size() > 1) {
        throw new IllegalArgumentException("This tool was invoked to not delete prior commit points, but the following commits were found: " + commits);
      }
    }

    final IndexWriterConfig c = (IndexWriterConfig) iwc.clone();
    c.setMergePolicy(new UpgradeIndexMergePolicy(c.getMergePolicy()));
    c.setIndexDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());

    final IndexWriter w = new IndexWriter(dir, c);
    try {
      w.setInfoStream(infoStream);
      w.message("Upgrading all pre-" + Constants.LUCENE_MAIN_VERSION + " segments of index directory '" + dir + "' to version " + Constants.LUCENE_MAIN_VERSION + "...");
      w.optimize();
      w.message("All segments upgraded to version " + Constants.LUCENE_MAIN_VERSION);
    } finally {
      w.close();
    }
  }

}

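For callers that prefer to drive the upgrade from Java rather than the command line, a minimal sketch using only the constructors shown above could look like the following (the index path and wrapper class name are placeholders, not part of this commit):

import java.io.File;

import org.apache.lucene.index.IndexUpgrader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class UpgradeMyIndex {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at the index to upgrade.
    Directory dir = FSDirectory.open(new File("/path/to/index"));
    try {
      // Verbose output to stdout; refuse to run if the index has more than one commit.
      new IndexUpgrader(dir, System.out, false).upgrade();
    } finally {
      dir.close();
    }
  }
}
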
@ -421,7 +421,7 @@ public class IndexWriter implements Closeable {
|
|||
private final Map<SegmentInfo,SegmentReader> readerMap = new HashMap<SegmentInfo,SegmentReader>();
|
||||
|
||||
/** Forcefully clear changes for the specified segments. This is called on successful merge. */
|
||||
synchronized void clear(SegmentInfos infos) throws IOException {
|
||||
synchronized void clear(List<SegmentInfo> infos) throws IOException {
|
||||
if (infos == null) {
|
||||
for (Map.Entry<SegmentInfo,SegmentReader> ent: readerMap.entrySet()) {
|
||||
ent.getValue().hasChanges = false;
|
||||
|
@ -511,7 +511,7 @@ public class IndexWriter implements Closeable {
|
|||
return false;
|
||||
}
|
||||
|
||||
public synchronized void drop(SegmentInfos infos) throws IOException {
|
||||
public synchronized void drop(List<SegmentInfo> infos) throws IOException {
|
||||
for(SegmentInfo info : infos) {
|
||||
drop(info);
|
||||
}
|
||||
|
@ -2355,7 +2355,7 @@ public class IndexWriter implements Closeable {
|
|||
|
||||
String mergedName = newSegmentName();
|
||||
SegmentMerger merger = new SegmentMerger(directory, config.getTermIndexInterval(),
|
||||
mergedName, null, codecs, payloadProcessorProvider,
|
||||
mergedName, null, payloadProcessorProvider,
|
||||
globalFieldNumberMap.newFieldInfos(SegmentCodecsBuilder.create(codecs)));
|
||||
|
||||
for (IndexReader reader : readers) // add new indexes
|
||||
|
@ -2365,8 +2365,7 @@ public class IndexWriter implements Closeable {
|
|||
|
||||
final FieldInfos fieldInfos = merger.fieldInfos();
|
||||
SegmentInfo info = new SegmentInfo(mergedName, docCount, directory,
|
||||
false, fieldInfos.hasProx(), merger.getSegmentCodecs(),
|
||||
fieldInfos.hasVectors(),
|
||||
false, merger.getSegmentCodecs(),
|
||||
fieldInfos);
|
||||
setDiagnostics(info, "addIndexes(IndexReader...)");
|
||||
|
||||
|
@ -2729,7 +2728,7 @@ public class IndexWriter implements Closeable {
|
|||
|
||||
assert testPoint("startCommitMergeDeletes");
|
||||
|
||||
final SegmentInfos sourceSegments = merge.segments;
|
||||
final List<SegmentInfo> sourceSegments = merge.segments;
|
||||
|
||||
if (infoStream != null)
|
||||
message("commitMergeDeletes " + merge.segString(directory));
|
||||
|
@ -2741,7 +2740,7 @@ public class IndexWriter implements Closeable {
|
|||
long minGen = Long.MAX_VALUE;
|
||||
|
||||
for(int i=0; i < sourceSegments.size(); i++) {
|
||||
SegmentInfo info = sourceSegments.info(i);
|
||||
SegmentInfo info = sourceSegments.get(i);
|
||||
minGen = Math.min(info.getBufferedDeletesGen(), minGen);
|
||||
int docCount = info.docCount;
|
||||
final SegmentReader previousReader = merge.readerClones.get(i);
|
||||
|
@ -3041,7 +3040,16 @@ public class IndexWriter implements Closeable {
|
|||
// is running (while synchronized) to avoid race
|
||||
// condition where two conflicting merges from different
|
||||
// threads, start
|
||||
message("registerMerge merging=" + mergingSegments);
|
||||
if (infoStream != null) {
|
||||
StringBuilder builder = new StringBuilder("registerMerge merging= [");
|
||||
for (SegmentInfo info : mergingSegments) {
|
||||
builder.append(info.name).append(", ");
|
||||
}
|
||||
builder.append("]");
|
||||
// don't call mergingSegments.toString() here: it could lead to a
// ConcurrentModificationException, since the merge updates the segments' FieldInfos
|
||||
message(builder.toString());
|
||||
}
|
||||
for(SegmentInfo info : merge.segments) {
|
||||
message("registerMerge info=" + info);
|
||||
mergingSegments.add(info);
|
||||
|
@ -3094,7 +3102,7 @@ public class IndexWriter implements Closeable {
|
|||
// Bind a new segment name here so even with
|
||||
// ConcurrentMergePolicy we keep deterministic segment
|
||||
// names.
|
||||
merge.info = new SegmentInfo(newSegmentName(), 0, directory, false, false, null, false, globalFieldNumberMap.newFieldInfos(SegmentCodecsBuilder.create(codecs)));
|
||||
merge.info = new SegmentInfo(newSegmentName(), 0, directory, false, null, globalFieldNumberMap.newFieldInfos(SegmentCodecsBuilder.create(codecs)));
|
||||
|
||||
// Lock order: IW -> BD
|
||||
final BufferedDeletesStream.ApplyDeletesResult result = bufferedDeletesStream.applyDeletes(readerPool, merge.segments);
|
||||
|
@ -3133,6 +3141,16 @@ public class IndexWriter implements Closeable {
|
|||
message("merge seg=" + merge.info.name);
|
||||
}
|
||||
|
||||
assert merge.estimatedMergeBytes == 0;
|
||||
for(SegmentInfo info : merge.segments) {
|
||||
if (info.docCount > 0) {
|
||||
final int delCount = numDeletedDocs(info);
|
||||
assert delCount <= info.docCount;
|
||||
final double delRatio = ((double) delCount)/info.docCount;
|
||||
merge.estimatedMergeBytes += info.sizeInBytes(true) * (1.0 - delRatio);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: I think this should no longer be needed (we
|
||||
// now build CFS before adding segment to the infos);
|
||||
// however, on removing it, tests fail for some reason!
|
||||
|
@ -3174,7 +3192,7 @@ public class IndexWriter implements Closeable {
|
|||
// It's possible we are called twice, eg if there was an
|
||||
// exception inside mergeInit
|
||||
if (merge.registerDone) {
|
||||
final SegmentInfos sourceSegments = merge.segments;
|
||||
final List<SegmentInfo> sourceSegments = merge.segments;
|
||||
for(SegmentInfo info : sourceSegments) {
|
||||
mergingSegments.remove(info);
|
||||
}
|
||||
|
@ -3245,21 +3263,17 @@ public class IndexWriter implements Closeable {
|
|||
|
||||
int mergedDocCount = 0;
|
||||
|
||||
SegmentInfos sourceSegments = merge.segments;
|
||||
List<SegmentInfo> sourceSegments = merge.segments;
|
||||
|
||||
SegmentMerger merger = new SegmentMerger(directory, config.getTermIndexInterval(), mergedName, merge,
|
||||
codecs, payloadProcessorProvider,
|
||||
merge.info.getFieldInfos());
|
||||
payloadProcessorProvider, merge.info.getFieldInfos());
|
||||
|
||||
if (infoStream != null) {
|
||||
message("merging " + merge.segString(directory) + " mergeVectors=" + merger.fieldInfos().hasVectors());
|
||||
message("merging " + merge.segString(directory) + " mergeVectors=" + merge.info.getFieldInfos().hasVectors());
|
||||
}
|
||||
|
||||
merge.readers = new ArrayList<SegmentReader>();
|
||||
merge.readerClones = new ArrayList<SegmentReader>();
|
||||
|
||||
merge.estimatedMergeBytes = 0;
|
||||
|
||||
// This is try/finally to make sure merger's readers are
|
||||
// closed:
|
||||
boolean success = false;
|
||||
|
@ -3268,7 +3282,7 @@ public class IndexWriter implements Closeable {
|
|||
int segUpto = 0;
|
||||
while(segUpto < sourceSegments.size()) {
|
||||
|
||||
final SegmentInfo info = sourceSegments.info(segUpto);
|
||||
final SegmentInfo info = sourceSegments.get(segUpto);
|
||||
|
||||
// Hold onto the "live" reader; we will use this to
|
||||
// commit merged deletes
|
||||
|
@ -3277,13 +3291,6 @@ public class IndexWriter implements Closeable {
|
|||
-config.getReaderTermsIndexDivisor());
|
||||
merge.readers.add(reader);
|
||||
|
||||
final int readerMaxDoc = reader.maxDoc();
|
||||
if (readerMaxDoc > 0) {
|
||||
final int delCount = reader.numDeletedDocs();
|
||||
final double delRatio = ((double) delCount)/readerMaxDoc;
|
||||
merge.estimatedMergeBytes += info.sizeInBytes(true) * (1.0 - delRatio);
|
||||
}
|
||||
|
||||
// We clone the segment readers because other
|
||||
// deletes may come in while we're merging so we
|
||||
// need readers that will not change
|
||||
|
@ -3308,8 +3315,6 @@ public class IndexWriter implements Closeable {
|
|||
|
||||
// Record which codec was used to write the segment
|
||||
merge.info.setSegmentCodecs(merger.getSegmentCodecs());
|
||||
// Record if we have merged vectors
|
||||
merge.info.setHasVectors(merger.fieldInfos().hasVectors());
|
||||
|
||||
if (infoStream != null) {
|
||||
message("merge segmentCodecs=" + merger.getSegmentCodecs());
|
||||
|
@ -3323,8 +3328,6 @@ public class IndexWriter implements Closeable {
|
|||
// because codec must know if prox was written for
|
||||
// this segment:
|
||||
//System.out.println("merger set hasProx=" + merger.hasProx() + " seg=" + merge.info.name);
|
||||
merge.info.setHasProx(merger.fieldInfos().hasProx());
|
||||
|
||||
boolean useCompoundFile;
|
||||
synchronized (this) { // Guard segmentInfos
|
||||
useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, merge.info);
|
||||
|
@ -3469,14 +3472,14 @@ public class IndexWriter implements Closeable {
|
|||
}
|
||||
|
||||
/** @lucene.internal */
|
||||
public synchronized String segString(SegmentInfos infos) throws IOException {
|
||||
public synchronized String segString(List<SegmentInfo> infos) throws IOException {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
final int count = infos.size();
|
||||
for(int i = 0; i < count; i++) {
|
||||
if (i > 0) {
|
||||
buffer.append(' ');
|
||||
}
|
||||
buffer.append(segString(infos.info(i)));
|
||||
buffer.append(segString(infos.get(i)));
|
||||
}
|
||||
|
||||
return buffer.toString();
|
||||
|
@ -3531,6 +3534,7 @@ public class IndexWriter implements Closeable {
|
|||
|
||||
// called only from assert
|
||||
private boolean filesExist(SegmentInfos toSync) throws IOException {
|
||||
|
||||
Collection<String> files = toSync.files(directory, false);
|
||||
for(final String fileName: files) {
|
||||
assert directory.fileExists(fileName): "file " + fileName + " does not exist";
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -595,7 +594,7 @@ public abstract class LogMergePolicy extends MergePolicy {
|
|||
} else if (!anyTooLarge) {
|
||||
if (spec == null)
|
||||
spec = new MergeSpecification();
|
||||
final SegmentInfos mergeInfos = new SegmentInfos();
|
||||
final List<SegmentInfo> mergeInfos = new ArrayList<SegmentInfo>();
|
||||
for(int i=start;i<end;i++) {
|
||||
mergeInfos.add(levels.get(i).info);
|
||||
assert infos.contains(levels.get(i).info);
|
||||
|
|
|
@ -32,7 +32,7 @@ final class MergeDocIDRemapper {
|
|||
|
||||
public MergeDocIDRemapper(SegmentInfos infos, int[][] docMaps, int[] delCounts, MergePolicy.OneMerge merge, int mergedDocCount) {
|
||||
this.docMaps = docMaps;
|
||||
SegmentInfo firstSegment = merge.segments.info(0);
|
||||
SegmentInfo firstSegment = merge.segments.get(0);
|
||||
int i = 0;
|
||||
while(true) {
|
||||
SegmentInfo info = infos.info(i);
|
||||
|
@ -45,7 +45,7 @@ final class MergeDocIDRemapper {
|
|||
int numDocs = 0;
|
||||
for(int j=0;j<docMaps.length;i++,j++) {
|
||||
numDocs += infos.info(i).docCount;
|
||||
assert infos.info(i).equals(merge.segments.info(j));
|
||||
assert infos.info(i).equals(merge.segments.get(j));
|
||||
}
|
||||
maxDocID = minDocID + numDocs;
|
||||
|
||||
|
@ -55,7 +55,7 @@ final class MergeDocIDRemapper {
|
|||
starts[0] = minDocID;
|
||||
newStarts[0] = minDocID;
|
||||
for(i=1;i<docMaps.length;i++) {
|
||||
final int lastDocCount = merge.segments.info(i-1).docCount;
|
||||
final int lastDocCount = merge.segments.get(i-1).docCount;
|
||||
starts[i] = starts[i-1] + lastDocCount;
|
||||
newStarts[i] = newStarts[i-1] + lastDocCount - delCounts[i-1];
|
||||
}
|
||||
|
@ -69,7 +69,7 @@ final class MergeDocIDRemapper {
|
|||
// assert docShift > 0;
|
||||
|
||||
// Make sure it all adds up:
|
||||
assert docShift == maxDocID - (newStarts[docMaps.length-1] + merge.segments.info(docMaps.length-1).docCount - delCounts[docMaps.length-1]);
|
||||
assert docShift == maxDocID - (newStarts[docMaps.length-1] + merge.segments.get(docMaps.length-1).docCount - delCounts[docMaps.length-1]);
|
||||
}
|
||||
|
||||
public int remap(int oldDocID) {
|
||||
|
|
|
@ -75,15 +75,21 @@ public abstract class MergePolicy implements java.io.Closeable {
|
|||
long estimatedMergeBytes; // used by IndexWriter
|
||||
List<SegmentReader> readers; // used by IndexWriter
|
||||
List<SegmentReader> readerClones; // used by IndexWriter
|
||||
public final SegmentInfos segments;
|
||||
public final List<SegmentInfo> segments;
|
||||
public final int totalDocCount;
|
||||
boolean aborted;
|
||||
Throwable error;
|
||||
boolean paused;
|
||||
|
||||
public OneMerge(SegmentInfos segments) {
|
||||
public OneMerge(List<SegmentInfo> segments) {
|
||||
if (0 == segments.size())
|
||||
throw new RuntimeException("segments must include at least one segment");
|
||||
this.segments = segments;
|
||||
int count = 0;
|
||||
for(SegmentInfo info : segments) {
|
||||
count += info.docCount;
|
||||
}
|
||||
totalDocCount = count;
|
||||
}
|
||||
|
||||
/** Record that an exception occurred while executing
|
||||
|
@ -147,7 +153,7 @@ public abstract class MergePolicy implements java.io.Closeable {
|
|||
final int numSegments = segments.size();
|
||||
for(int i=0;i<numSegments;i++) {
|
||||
if (i > 0) b.append(' ');
|
||||
b.append(segments.info(i).toString(dir, 0));
|
||||
b.append(segments.get(i).toString(dir, 0));
|
||||
}
|
||||
if (info != null)
|
||||
b.append(" into ").append(info.name);
|
||||
|
|
|
@ -43,7 +43,8 @@ import org.apache.lucene.util.Constants;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public final class SegmentInfo {
|
||||
|
||||
// TODO: remove with hasVector and hasProx
|
||||
private static final int CHECK_FIELDINFO = -2;
|
||||
static final int NO = -1; // e.g. no norms; no deletes;
|
||||
static final int YES = 1; // e.g. have norms; have deletes;
|
||||
static final int WITHOUT_GEN = 0; // a file name that has no GEN in it.
|
||||
|
@ -86,9 +87,11 @@ public final class SegmentInfo {
|
|||
|
||||
private int delCount; // How many deleted docs in this segment
|
||||
|
||||
private boolean hasProx; // True if this segment has any fields with omitTermFreqAndPositions==false
|
||||
//TODO: remove when we don't have to support old indexes anymore that had this field
|
||||
private int hasVectors = CHECK_FIELDINFO;
|
||||
//TODO: remove when we don't have to support old indexes anymore that had this field
|
||||
private int hasProx = CHECK_FIELDINFO; // True if this segment has any fields with omitTermFreqAndPositions==false
|
||||
|
||||
private boolean hasVectors; // True if this segment wrote term vectors
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
|
||||
|
@ -107,8 +110,11 @@ public final class SegmentInfo {
|
|||
// this is never written to/read from the Directory
|
||||
private long bufferedDeletesGen;
|
||||
|
||||
// holds the fieldInfos Version to refresh files() cache if FI has changed
|
||||
private long fieldInfosVersion;
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile,
|
||||
boolean hasProx, SegmentCodecs segmentCodecs, boolean hasVectors, FieldInfos fieldInfos) {
|
||||
SegmentCodecs segmentCodecs, FieldInfos fieldInfos) {
|
||||
this.name = name;
|
||||
this.docCount = docCount;
|
||||
this.dir = dir;
|
||||
|
@ -116,9 +122,7 @@ public final class SegmentInfo {
|
|||
this.isCompoundFile = isCompoundFile;
|
||||
this.docStoreOffset = -1;
|
||||
this.docStoreSegment = name;
|
||||
this.hasProx = hasProx;
|
||||
this.segmentCodecs = segmentCodecs;
|
||||
this.hasVectors = hasVectors;
|
||||
delCount = 0;
|
||||
version = Constants.LUCENE_MAIN_VERSION;
|
||||
this.fieldInfos = fieldInfos;
|
||||
|
@ -213,7 +217,7 @@ public final class SegmentInfo {
|
|||
delCount = input.readInt();
|
||||
assert delCount <= docCount;
|
||||
|
||||
hasProx = input.readByte() == YES;
|
||||
hasProx = input.readByte();
|
||||
|
||||
// System.out.println(Thread.currentThread().getName() + ": si.read hasProx=" + hasProx + " seg=" + name);
|
||||
if (format <= DefaultSegmentInfosWriter.FORMAT_4_0) {
|
||||
|
@ -226,7 +230,7 @@ public final class SegmentInfo {
|
|||
diagnostics = input.readStringStringMap();
|
||||
|
||||
if (format <= DefaultSegmentInfosWriter.FORMAT_HAS_VECTORS) {
|
||||
hasVectors = input.readByte() == 1;
|
||||
hasVectors = input.readByte();
|
||||
} else {
|
||||
final String storesSegment;
|
||||
final String ext;
|
||||
|
@ -247,7 +251,7 @@ public final class SegmentInfo {
|
|||
dirToTest = dir;
|
||||
}
|
||||
try {
|
||||
hasVectors = dirToTest.fileExists(IndexFileNames.segmentFileName(storesSegment, "", IndexFileNames.VECTORS_INDEX_EXTENSION));
|
||||
hasVectors = dirToTest.fileExists(IndexFileNames.segmentFileName(storesSegment, "", IndexFileNames.VECTORS_INDEX_EXTENSION)) ? YES : NO;
|
||||
} finally {
|
||||
if (isCompoundFile) {
|
||||
dirToTest.close();
|
||||
|
@ -311,12 +315,7 @@ public final class SegmentInfo {
|
|||
}
|
||||
|
||||
public boolean getHasVectors() throws IOException {
|
||||
return hasVectors;
|
||||
}
|
||||
|
||||
public void setHasVectors(boolean v) {
|
||||
hasVectors = v;
|
||||
clearFilesCache();
|
||||
return hasVectors == CHECK_FIELDINFO ? getFieldInfos().hasVectors() : hasVectors == YES;
|
||||
}
|
||||
|
||||
public FieldInfos getFieldInfos() throws IOException {
|
||||
|
@ -349,7 +348,7 @@ public final class SegmentInfo {
|
|||
|
||||
@Override
|
||||
public Object clone() {
|
||||
final SegmentInfo si = new SegmentInfo(name, docCount, dir, isCompoundFile, hasProx, segmentCodecs, hasVectors,
|
||||
final SegmentInfo si = new SegmentInfo(name, docCount, dir, isCompoundFile, segmentCodecs,
|
||||
fieldInfos == null ? null : (FieldInfos) fieldInfos.clone());
|
||||
si.docStoreOffset = docStoreOffset;
|
||||
si.docStoreSegment = docStoreSegment;
|
||||
|
@ -364,6 +363,8 @@ public final class SegmentInfo {
|
|||
}
|
||||
}
|
||||
si.version = version;
|
||||
si.hasProx = hasProx;
|
||||
si.hasVectors = hasVectors;
|
||||
return si;
|
||||
}
|
||||
|
||||
|
@ -569,19 +570,14 @@ public final class SegmentInfo {
|
|||
|
||||
output.writeByte((byte) (isCompoundFile ? YES : NO));
|
||||
output.writeInt(delCount);
|
||||
output.writeByte((byte) (hasProx ? 1:0));
|
||||
output.writeByte((byte) (hasProx));
|
||||
segmentCodecs.write(output);
|
||||
output.writeStringStringMap(diagnostics);
|
||||
output.writeByte((byte) (hasVectors ? 1 : 0));
|
||||
output.writeByte((byte) (hasVectors));
|
||||
}
|
||||
|
||||
void setHasProx(boolean hasProx) {
|
||||
this.hasProx = hasProx;
|
||||
clearFilesCache();
|
||||
}
|
||||
|
||||
public boolean getHasProx() {
|
||||
return hasProx;
|
||||
public boolean getHasProx() throws IOException {
|
||||
return hasProx == CHECK_FIELDINFO ? getFieldInfos().hasProx() : hasProx == YES;
|
||||
}
|
||||
|
||||
/** Can only be called once. */
|
||||
|
@ -609,13 +605,14 @@ public final class SegmentInfo {
|
|||
*/
|
||||
|
||||
public List<String> files() throws IOException {
|
||||
|
||||
if (files != null) {
|
||||
final long fisVersion = fieldInfosVersion;
|
||||
if (fisVersion != (fieldInfosVersion = getFieldInfos().getVersion())) {
|
||||
clearFilesCache(); // FIS has modifications - need to recompute
|
||||
} else if (files != null) {
|
||||
// Already cached:
|
||||
return files;
|
||||
}
|
||||
|
||||
Set<String> fileSet = new HashSet<String>();
|
||||
final Set<String> fileSet = new HashSet<String>();
|
||||
|
||||
boolean useCompoundFile = getUseCompoundFile();
|
||||
|
||||
|
@ -637,7 +634,7 @@ public final class SegmentInfo {
|
|||
} else {
|
||||
fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.FIELDS_INDEX_EXTENSION));
|
||||
fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.FIELDS_EXTENSION));
|
||||
if (hasVectors) {
|
||||
if (getHasVectors()) {
|
||||
fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.VECTORS_INDEX_EXTENSION));
|
||||
fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
|
||||
fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.VECTORS_FIELDS_EXTENSION));
|
||||
|
@ -646,7 +643,7 @@ public final class SegmentInfo {
|
|||
} else if (!useCompoundFile) {
|
||||
fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.FIELDS_INDEX_EXTENSION));
|
||||
fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.FIELDS_EXTENSION));
|
||||
if (hasVectors) {
|
||||
if (getHasVectors()) {
|
||||
fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.VECTORS_INDEX_EXTENSION));
|
||||
fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
|
||||
fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.VECTORS_FIELDS_EXTENSION));
|
||||
|
@ -709,8 +706,12 @@ public final class SegmentInfo {
|
|||
if (this.dir != dir) {
|
||||
s.append('x');
|
||||
}
|
||||
if (hasVectors) {
|
||||
s.append('v');
|
||||
try {
|
||||
if (getHasVectors()) {
|
||||
s.append('v');
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
s.append(docCount);
|
||||
|
||||
|
|
|
@ -72,7 +72,7 @@ final class SegmentMerger {
|
|||
|
||||
private PayloadProcessorProvider payloadProcessorProvider;
|
||||
|
||||
SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, CodecProvider codecs, PayloadProcessorProvider payloadProcessorProvider, FieldInfos fieldInfos) {
|
||||
SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, PayloadProcessorProvider payloadProcessorProvider, FieldInfos fieldInfos) {
|
||||
this.payloadProcessorProvider = payloadProcessorProvider;
|
||||
directory = dir;
|
||||
segment = name;
|
||||
|
|
|
@ -32,7 +32,6 @@ public class SegmentWriteState {
|
|||
public final String segmentName;
|
||||
public final FieldInfos fieldInfos;
|
||||
public final int numDocs;
|
||||
public boolean hasVectors;
|
||||
|
||||
// Deletes to apply while we are flushing the segment. A
|
||||
// Term is enrolled in here if it was deleted at one
|
||||
|
|
|
@ -63,7 +63,6 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
|
|||
}
|
||||
|
||||
lastDocID = 0;
|
||||
state.hasVectors = hasVectors;
|
||||
hasVectors = false;
|
||||
}
|
||||
|
||||
|
@ -121,8 +120,7 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
|
|||
fill(docState.docID);
|
||||
|
||||
// Append term vectors to the real outputs:
|
||||
long pointer = tvd.getFilePointer();
|
||||
tvx.writeLong(pointer);
|
||||
tvx.writeLong(tvd.getFilePointer());
|
||||
tvx.writeLong(tvf.getFilePointer());
|
||||
tvd.writeVInt(numVectorFields);
|
||||
if (numVectorFields > 0) {
|
||||
|
@ -136,6 +134,8 @@ final class TermVectorsTermsWriter extends TermsHashConsumer {
|
|||
tvd.writeVLong(pos-lastPos);
|
||||
lastPos = pos;
|
||||
perFields[i].finishDocument();
|
||||
// commit the termVectors once successful - FI will otherwise reset them
|
||||
perFields[i].fieldInfo.commitVectors();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,8 @@ import java.util.Collection;
|
|||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Merges segments of approximately equal size, subject to
|
||||
|
@ -249,7 +251,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
|
||||
final Collection<SegmentInfo> toBeMerged = new HashSet<SegmentInfo>();
|
||||
|
||||
final SegmentInfos infosSorted = new SegmentInfos();
|
||||
final List<SegmentInfo> infosSorted = new ArrayList<SegmentInfo>();
|
||||
infosSorted.addAll(infos);
|
||||
|
||||
Collections.sort(infosSorted, segmentByteSizeDescending);
|
||||
|
@ -277,7 +279,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
// If we have too-large segments, grace them out
|
||||
// of the maxSegmentCount:
|
||||
int tooBigCount = 0;
|
||||
while (tooBigCount < infosSorted.size() && size(infosSorted.info(tooBigCount)) >= maxMergedSegmentBytes/2.0) {
|
||||
while (tooBigCount < infosSorted.size() && size(infosSorted.get(tooBigCount)) >= maxMergedSegmentBytes/2.0) {
|
||||
totIndexBytes -= size(infosSorted.get(tooBigCount));
|
||||
tooBigCount++;
|
||||
}
|
||||
|
@ -310,7 +312,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
// Gather eligible segments for merging, ie segments
|
||||
// not already being merged and not already picked (by
|
||||
// prior iteration of this loop) for merging:
|
||||
final SegmentInfos eligible = new SegmentInfos();
|
||||
final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
|
||||
for(int idx = tooBigCount; idx<infosSorted.size(); idx++) {
|
||||
final SegmentInfo info = infosSorted.get(idx);
|
||||
if (merging.contains(info)) {
|
||||
|
@ -332,7 +334,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
|
||||
// OK we are over budget -- find best merge!
|
||||
MergeScore bestScore = null;
|
||||
SegmentInfos best = null;
|
||||
List<SegmentInfo> best = null;
|
||||
boolean bestTooLarge = false;
|
||||
long bestMergeBytes = 0;
|
||||
|
||||
|
@ -341,10 +343,10 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
|
||||
long totAfterMergeBytes = 0;
|
||||
|
||||
final SegmentInfos candidate = new SegmentInfos();
|
||||
final List<SegmentInfo> candidate = new ArrayList<SegmentInfo>();
|
||||
boolean hitTooLarge = false;
|
||||
for(int idx = startIdx;idx<eligible.size() && candidate.size() < maxMergeAtOnce;idx++) {
|
||||
final SegmentInfo info = eligible.info(idx);
|
||||
final SegmentInfo info = eligible.get(idx);
|
||||
final long segBytes = size(info);
|
||||
|
||||
if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
|
||||
|
@ -398,7 +400,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
}
|
||||
|
||||
/** Expert: scores one merge; subclasses can override. */
|
||||
protected MergeScore score(SegmentInfos candidate, boolean hitTooLarge, long mergingBytes) throws IOException {
|
||||
protected MergeScore score(List<SegmentInfo> candidate, boolean hitTooLarge, long mergingBytes) throws IOException {
|
||||
long totBeforeMergeBytes = 0;
|
||||
long totAfterMergeBytes = 0;
|
||||
long totAfterMergeBytesFloored = 0;
|
||||
|
@ -420,7 +422,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
// over time:
|
||||
skew = 1.0/maxMergeAtOnce;
|
||||
} else {
|
||||
skew = ((double) floorSize(size(candidate.info(0))))/totAfterMergeBytesFloored;
|
||||
skew = ((double) floorSize(size(candidate.get(0))))/totAfterMergeBytesFloored;
|
||||
}
|
||||
|
||||
// Strongly favor merges with less skew (smaller
|
||||
|
@ -458,7 +460,8 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
if (verbose()) {
|
||||
message("findMergesForOptimize maxSegmentCount=" + maxSegmentCount + " infos=" + writer.get().segString(infos) + " segmentsToOptimize=" + segmentsToOptimize);
|
||||
}
|
||||
SegmentInfos eligible = new SegmentInfos();
|
||||
|
||||
List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
|
||||
boolean optimizeMergeRunning = false;
|
||||
final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
|
||||
for(SegmentInfo info : infos) {
|
||||
|
@ -499,7 +502,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
if (spec == null) {
|
||||
spec = new MergeSpecification();
|
||||
}
|
||||
final OneMerge merge = new OneMerge(eligible.range(end-maxMergeAtOnceExplicit, end));
|
||||
final OneMerge merge = new OneMerge(eligible.subList(end-maxMergeAtOnceExplicit, end));
|
||||
if (verbose()) {
|
||||
message("add merge=" + writer.get().segString(merge.segments));
|
||||
}
|
||||
|
@ -510,7 +513,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
if (spec == null && !optimizeMergeRunning) {
|
||||
// Do final merge
|
||||
final int numToMerge = end - maxSegmentCount + 1;
|
||||
final OneMerge merge = new OneMerge(eligible.range(end-numToMerge, end));
|
||||
final OneMerge merge = new OneMerge(eligible.subList(end-numToMerge, end));
|
||||
if (verbose()) {
|
||||
message("add final merge=" + merge.segString(writer.get().getDirectory()));
|
||||
}
|
||||
|
@ -527,7 +530,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
if (verbose()) {
|
||||
message("findMergesToExpungeDeletes infos=" + writer.get().segString(infos) + " expungeDeletesPctAllowed=" + expungeDeletesPctAllowed);
|
||||
}
|
||||
final SegmentInfos eligible = new SegmentInfos();
|
||||
final List<SegmentInfo> eligible = new ArrayList<SegmentInfo>();
|
||||
final Collection<SegmentInfo> merging = writer.get().getMergingSegments();
|
||||
for(SegmentInfo info : infos) {
|
||||
double pctDeletes = 100.*((double) writer.get().numDeletedDocs(info))/info.docCount;
|
||||
|
@ -580,7 +583,7 @@ public class TieredMergePolicy extends MergePolicy {
|
|||
spec = new MergeSpecification();
|
||||
}
|
||||
|
||||
final OneMerge merge = new OneMerge(eligible.range(start, upto));
|
||||
final OneMerge merge = new OneMerge(eligible.subList(start, upto));
|
||||
if (verbose()) {
|
||||
message("add merge=" + writer.get().segString(merge.segments));
|
||||
}
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Constants;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/** This {@link MergePolicy} is used for upgrading all existing segments of
|
||||
* an index when calling {@link IndexWriter#optimize()}.
|
||||
* All other methods delegate to the base {@code MergePolicy} given to the constructor.
|
||||
* This allows for an as-cheap-as-possible upgrade of an older index by only upgrading segments that
* were created by previous Lucene versions. Optimize no longer really optimizes;
* it is just used to "optimize" older segment versions away.
* <p>In general one would use {@link IndexUpgrader}, but for a fully customizable upgrade,
|
||||
* you can use this like any other {@code MergePolicy} and call {@link IndexWriter#optimize()}:
|
||||
* <pre class="prettyprint lang-java">
|
||||
* IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_XX, new KeywordAnalyzer());
|
||||
* iwc.setMergePolicy(new UpgradeIndexMergePolicy(iwc.getMergePolicy()));
|
||||
* IndexWriter w = new IndexWriter(dir, iwc);
|
||||
* w.optimize();
|
||||
* w.close();
|
||||
* </pre>
|
||||
* @lucene.experimental
|
||||
* @see IndexUpgrader
|
||||
*/
|
||||
public class UpgradeIndexMergePolicy extends MergePolicy {
|
||||
|
||||
protected final MergePolicy base;
|
||||
|
||||
/** Wrap the given {@link MergePolicy} and intercept optimize requests to
|
||||
* only upgrade segments written with previous Lucene versions. */
|
||||
public UpgradeIndexMergePolicy(MergePolicy base) {
|
||||
this.base = base;
|
||||
}
|
||||
|
||||
/** Returns true iff the given segment should be upgraded. The default implementation
|
||||
* will return {@code !Constants.LUCENE_MAIN_VERSION.equals(si.getVersion())},
|
||||
* so all segments created with a different version number than this Lucene version will
|
||||
* get upgraded.
|
||||
*/
|
||||
protected boolean shouldUpgradeSegment(SegmentInfo si) {
|
||||
return !Constants.LUCENE_MAIN_VERSION.equals(si.getVersion());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setIndexWriter(IndexWriter writer) {
|
||||
super.setIndexWriter(writer);
|
||||
base.setIndexWriter(writer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MergeSpecification findMerges(SegmentInfos segmentInfos) throws CorruptIndexException, IOException {
|
||||
return base.findMerges(segmentInfos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MergeSpecification findMergesForOptimize(SegmentInfos segmentInfos, int maxSegmentCount, Set<SegmentInfo> segmentsToOptimize) throws CorruptIndexException, IOException {
|
||||
// first find all old segments
|
||||
final HashSet<SegmentInfo> oldSegments = new HashSet<SegmentInfo>();
|
||||
for (final SegmentInfo si : segmentInfos) {
|
||||
if (segmentsToOptimize.contains(si) && shouldUpgradeSegment(si)) {
|
||||
oldSegments.add(si);
|
||||
}
|
||||
}
|
||||
|
||||
if (verbose()) message("findMergesForOptimize: segmentsToUpgrade=" + oldSegments);
|
||||
|
||||
if (oldSegments.isEmpty())
|
||||
return null;
|
||||
|
||||
MergeSpecification spec = base.findMergesForOptimize(segmentInfos, maxSegmentCount, oldSegments);
|
||||
|
||||
if (spec != null) {
|
||||
// remove all segments that are in merge specification from oldSegments,
|
||||
// the resulting set contains all segments that are left over
|
||||
// and will be merged to one additional segment:
|
||||
for (final OneMerge om : spec.merges) {
|
||||
oldSegments.removeAll(om.segments);
|
||||
}
|
||||
}
|
||||
|
||||
if (!oldSegments.isEmpty()) {
|
||||
if (verbose())
|
||||
message("findMergesForOptimize: " + base.getClass().getSimpleName() +
|
||||
" does not want to merge all old segments, merge remaining ones into new segment: " + oldSegments);
|
||||
final List<SegmentInfo> newInfos = new ArrayList<SegmentInfo>();
|
||||
for (final SegmentInfo si : segmentInfos) {
|
||||
if (oldSegments.contains(si)) {
|
||||
newInfos.add(si);
|
||||
}
|
||||
}
|
||||
// add the final merge
|
||||
if (spec == null) {
|
||||
spec = new MergeSpecification();
|
||||
}
|
||||
spec.add(new OneMerge(newInfos));
|
||||
}
|
||||
|
||||
return spec;
|
||||
}
|
||||
|
||||
@Override
|
||||
public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos) throws CorruptIndexException, IOException {
|
||||
return base.findMergesToExpungeDeletes(segmentInfos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean useCompoundFile(SegmentInfos segments, SegmentInfo newSegment) throws IOException {
|
||||
return base.useCompoundFile(segments, newSegment);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
base.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "[" + getClass().getSimpleName() + "->" + base + "]";
|
||||
}
|
||||
|
||||
private boolean verbose() {
|
||||
IndexWriter w = writer.get();
|
||||
return w != null && w.verbose();
|
||||
}
|
||||
|
||||
private void message(String message) {
|
||||
if (verbose())
|
||||
writer.get().message("UPGMP: " + message);
|
||||
}
|
||||
|
||||
}
|
|
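Because shouldUpgradeSegment is protected, the upgrade criterion can be narrowed or widened by subclassing; a hypothetical sketch (the subclass name and version test are illustrative only, not part of this commit):

import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.UpgradeIndexMergePolicy;

public class UpgradeOnly30SegmentsPolicy extends UpgradeIndexMergePolicy {

  public UpgradeOnly30SegmentsPolicy(MergePolicy base) {
    super(base);
  }

  @Override
  protected boolean shouldUpgradeSegment(SegmentInfo si) {
    // Only rewrite segments written by a 3.0.x release; leave newer segments untouched.
    final String version = si.getVersion();
    return version != null && version.startsWith("3.0");
  }
}
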
@ -73,6 +73,11 @@ public class CodecProvider {
|
|||
}
|
||||
}
|
||||
|
||||
/** @lucene.internal */
|
||||
public synchronized Set<String> listAll() {
|
||||
return codecs.keySet();
|
||||
}
|
||||
|
||||
public Collection<String> getAllExtensions() {
|
||||
return knownExtensions;
|
||||
}
|
||||
|
|
|
@ -68,15 +68,8 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public Object clone() {
|
||||
PulsingTermState clone;
|
||||
clone = (PulsingTermState) super.clone();
|
||||
if (postingsSize != -1) {
|
||||
clone.postings = new byte[postingsSize];
|
||||
System.arraycopy(postings, 0, clone.postings, 0, postingsSize);
|
||||
} else {
|
||||
assert wrappedTermState != null;
|
||||
clone.wrappedTermState = (BlockTermState) wrappedTermState.clone();
|
||||
}
|
||||
PulsingTermState clone = new PulsingTermState();
|
||||
clone.copyFrom(this);
|
||||
return clone;
|
||||
}
|
||||
|
||||
|
@ -90,8 +83,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
postings = new byte[ArrayUtil.oversize(other.postingsSize, 1)];
|
||||
}
|
||||
System.arraycopy(other.postings, 0, postings, 0, other.postingsSize);
|
||||
} else {
|
||||
} else if (wrappedTermState != null) {
|
||||
wrappedTermState.copyFrom(other.wrappedTermState);
|
||||
} else {
|
||||
wrappedTermState = (BlockTermState) other.wrappedTermState.clone();
|
||||
}
|
||||
|
||||
// NOTE: we do not copy the
|
||||
|
|
|
@ -85,7 +85,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
public static void files(SegmentInfo segmentInfo, String codecId, Collection<String> files) {
|
||||
public static void files(SegmentInfo segmentInfo, String codecId, Collection<String> files) throws IOException {
|
||||
files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.DOC_EXTENSION));
|
||||
files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.SKIP_EXTENSION));
|
||||
|
||||
|
@ -151,14 +151,8 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
|||
|
||||
@Override
|
||||
public Object clone() {
|
||||
SepTermState other = (SepTermState) super.clone();
|
||||
other.docIndex = (IntIndexInput.Index) docIndex.clone();
|
||||
if (freqIndex != null) {
|
||||
other.freqIndex = (IntIndexInput.Index) freqIndex.clone();
|
||||
}
|
||||
if (posIndex != null) {
|
||||
other.posIndex = (IntIndexInput.Index) posIndex.clone();
|
||||
}
|
||||
SepTermState other = new SepTermState();
|
||||
other.copyFrom(this);
|
||||
return other;
|
||||
}
|
||||
|
||||
|
@ -166,12 +160,28 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
|||
public void copyFrom(TermState _other) {
|
||||
super.copyFrom(_other);
|
||||
SepTermState other = (SepTermState) _other;
|
||||
docIndex.set(other.docIndex);
|
||||
if (freqIndex != null && other.freqIndex != null) {
|
||||
freqIndex.set(other.freqIndex);
|
||||
if (docIndex == null) {
|
||||
docIndex = (IntIndexInput.Index) other.docIndex.clone();
|
||||
} else {
|
||||
docIndex.set(other.docIndex);
|
||||
}
|
||||
if (posIndex != null && other.posIndex != null) {
|
||||
posIndex.set(other.posIndex);
|
||||
if (other.freqIndex != null) {
|
||||
if (freqIndex == null) {
|
||||
freqIndex = (IntIndexInput.Index) other.freqIndex.clone();
|
||||
} else {
|
||||
freqIndex.set(other.freqIndex);
|
||||
}
|
||||
} else {
|
||||
freqIndex = null;
|
||||
}
|
||||
if (other.posIndex != null) {
|
||||
if (posIndex == null) {
|
||||
posIndex = (IntIndexInput.Index) other.posIndex.clone();
|
||||
} else {
|
||||
posIndex.set(other.posIndex);
|
||||
}
|
||||
} else {
|
||||
posIndex = null;
|
||||
}
|
||||
payloadFP = other.payloadFP;
|
||||
skipFP = other.skipFP;
|
||||
|
|
|
@ -806,6 +806,7 @@ public abstract class QueryParserBase {
|
|||
}
|
||||
|
||||
try {
|
||||
source.end();
|
||||
source.close();
|
||||
} catch (IOException ignored) {}
|
||||
|
||||
|
|
|
@ -21,8 +21,6 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
|
||||
final class HitQueue extends PriorityQueue<ScoreDoc> {
|
||||
|
||||
private boolean prePopulate;
|
||||
|
||||
/**
|
||||
* Creates a new instance with <code>size</code> elements. If
|
||||
* <code>prePopulate</code> is set to true, the queue will pre-populate itself
|
||||
|
|
|
@@ -46,8 +46,18 @@ import org.apache.lucene.util.ThreadInterruptedException;
  *
  * <p>Applications usually need only call the inherited
  * {@link #search(Query,int)}
- * or {@link #search(Query,Filter,int)} methods. For performance reasons it is
- * recommended to open only one IndexSearcher and use it for all of your searches.
+ * or {@link #search(Query,Filter,int)} methods. For
+ * performance reasons, if your index is unchanging, you
+ * should share a single IndexSearcher instance across
+ * multiple searches instead of creating a new one
+ * per-search.  If your index has changed and you wish to
+ * see the changes reflected in searching, you should
+ * use {@link IndexReader#reopen} to obtain a new reader and
+ * then create a new IndexSearcher from that.  Also, for
+ * low-latency turnaround it's best to use a near-real-time
+ * reader ({@link IndexReader#open(IndexWriter,boolean)}).
+ * Once you have a new {@link IndexReader}, it's relatively
+ * cheap to create a new IndexSearcher from it.
  *
  * <a name="thread-safety"></a><p><b>NOTE</b>: <code>{@link
  * IndexSearcher}</code> instances are completely

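The reworked javadoc describes the intended usage pattern; in code it amounts to something like this sketch (directory, query, and error handling are placeholders, not part of this commit):

IndexReader reader = IndexReader.open(dir);           // open once
IndexSearcher searcher = new IndexSearcher(reader);   // share across many searches

// ... later, when the index may have changed:
IndexReader newReader = reader.reopen();
if (newReader != reader) {            // reopen() returns the same instance if nothing changed
  searcher.close();
  reader.close();
  reader = newReader;
  searcher = new IndexSearcher(reader);
}
TopDocs hits = searcher.search(query, 10);
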
@ -214,12 +214,12 @@ public class MultiPhraseQuery extends Query {
|
|||
docFreq = reader.docFreq(term.field(), term.bytes());
|
||||
}
|
||||
|
||||
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue());
|
||||
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms[0]);
|
||||
}
|
||||
|
||||
// sort by increasing docFreq order
|
||||
if (slop == 0) {
|
||||
ArrayUtil.quickSort(postingsFreqs);
|
||||
ArrayUtil.mergeSort(postingsFreqs);
|
||||
}
|
||||
|
||||
if (slop == 0) {
|
||||
|
|
|
@ -28,13 +28,15 @@ final class PhrasePositions {
|
|||
int position; // position in doc
|
||||
int count; // remaining pos in this doc
|
||||
int offset; // position in phrase
|
||||
final int ord; // unique across all PhrasePositions instances
|
||||
final DocsAndPositionsEnum postings; // stream of docs & positions
|
||||
PhrasePositions next; // used to make lists
|
||||
boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1)
|
||||
|
||||
PhrasePositions(DocsAndPositionsEnum postings, int o) {
|
||||
PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) {
|
||||
this.postings = postings;
|
||||
offset = o;
|
||||
this.ord = ord;
|
||||
}
|
||||
|
||||
final boolean next() throws IOException { // increments to next doc
|
||||
|
|
|
@ -124,16 +124,48 @@ public class PhraseQuery extends Query {
|
|||
final DocsAndPositionsEnum postings;
|
||||
final int docFreq;
|
||||
final int position;
|
||||
final Term term;
|
||||
|
||||
public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position) {
|
||||
public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term term) {
|
||||
this.postings = postings;
|
||||
this.docFreq = docFreq;
|
||||
this.position = position;
|
||||
this.term = term;
|
||||
}
|
||||
|
||||
public int compareTo(PostingsAndFreq other) {
|
||||
if (docFreq == other.docFreq) {
|
||||
if (position == other.position) {
|
||||
return term.compareTo(other.term);
|
||||
}
|
||||
return position - other.position;
|
||||
}
|
||||
return docFreq - other.docFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + docFreq;
|
||||
result = prime * result + position;
|
||||
result = prime * result + ((term == null) ? 0 : term.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
PostingsAndFreq other = (PostingsAndFreq) obj;
|
||||
if (docFreq != other.docFreq) return false;
|
||||
if (position != other.position) return false;
|
||||
if (term == null) {
|
||||
if (other.term != null) return false;
|
||||
} else if (!term.equals(other.term)) return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private class PhraseWeight extends Weight {
|
||||
|
@ -197,12 +229,12 @@ public class PhraseQuery extends Query {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue());
|
||||
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t);
|
||||
}
|
||||
|
||||
// sort by increasing docFreq order
|
||||
if (slop == 0) {
|
||||
ArrayUtil.quickSort(postingsFreqs);
|
||||
ArrayUtil.mergeSort(postingsFreqs);
|
||||
}
|
||||
|
||||
if (slop == 0) { // optimize exact case
|
||||
|
|
|
@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue<PhrasePositions> {
|
|||
if (pp1.position == pp2.position)
|
||||
// same doc and pp.position, so decide by actual term positions.
|
||||
// rely on: pp.position == tp.position - offset.
|
||||
return pp1.offset < pp2.offset;
|
||||
else
|
||||
if (pp1.offset == pp2.offset) {
|
||||
return pp1.ord < pp2.ord;
|
||||
} else {
|
||||
return pp1.offset < pp2.offset;
|
||||
}
|
||||
else {
|
||||
return pp1.position < pp2.position;
|
||||
else
|
||||
}
|
||||
else {
|
||||
return pp1.doc < pp2.doc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer {
|
|||
// this allows to easily identify a matching (exact) phrase
|
||||
// when all PhrasePositions have exactly the same position.
|
||||
for (int i = 0; i < postings.length; i++) {
|
||||
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position);
|
||||
PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i);
|
||||
if (last != null) { // add next to end of list
|
||||
last.next = pp;
|
||||
} else {
|
||||
|
|
|
@ -134,7 +134,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
final Term placeholderTerm = new Term(query.field);
|
||||
final Q q = getTopLevelQuery();
|
||||
final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]);
|
||||
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
|
||||
ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp);
|
||||
for (final ScoreTerm st : scoreTerms) {
|
||||
final Term term = placeholderTerm.createTerm(st.bytes);
|
||||
assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq();
|
||||
|
|
|
@ -190,7 +190,7 @@ public class NearSpansOrdered extends Spans {
|
|||
|
||||
/** Advance the subSpans to the same document */
|
||||
private boolean toSameDoc() throws IOException {
|
||||
ArrayUtil.quickSort(subSpansByDoc, spanDocComparator);
|
||||
ArrayUtil.mergeSort(subSpansByDoc, spanDocComparator);
|
||||
int firstIndex = 0;
|
||||
int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
|
||||
while (subSpansByDoc[firstIndex].doc() != maxDoc) {
|
||||
|
|
|
@ -62,13 +62,26 @@ public abstract class SorterTemplate {
|
|||
|
||||
/** Sorts via in-place, but unstable, QuickSort algorithm.
|
||||
* For small collections falls back to {@link #insertionSort(int,int)}. */
|
||||
public final void quickSort(int lo, int hi) {
|
||||
public final void quickSort(final int lo, final int hi) {
|
||||
if (hi <= lo) return;
|
||||
// from Integer's Javadocs: ceil(log2(x)) = 32 - numberOfLeadingZeros(x - 1)
|
||||
quickSort(lo, hi, (Integer.SIZE - Integer.numberOfLeadingZeros(hi - lo)) << 1);
|
||||
}
|
||||
|
||||
private void quickSort(int lo, int hi, int maxDepth) {
|
||||
// fall back to insertion when array has short length
|
||||
final int diff = hi - lo;
|
||||
if (diff <= QUICKSORT_THRESHOLD) {
|
||||
insertionSort(lo, hi);
|
||||
return;
|
||||
}
|
||||
|
||||
// fall back to merge sort when recursion depth gets too big
|
||||
if (--maxDepth == 0) {
|
||||
mergeSort(lo, hi);
|
||||
return;
|
||||
}
|
||||
|
||||
final int mid = lo + (diff >>> 1);
|
||||
|
||||
if (compare(lo, mid) > 0) {
|
||||
|
@ -101,8 +114,8 @@ public abstract class SorterTemplate {
|
|||
}
|
||||
}
|
||||
|
||||
quickSort(lo, left);
|
||||
quickSort(left + 1, hi);
|
||||
quickSort(lo, left, maxDepth);
|
||||
quickSort(left + 1, hi, maxDepth);
|
||||
}
|
||||
|
||||
/** Sorts via stable in-place MergeSort algorithm
|
||||
|
|
|
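The reworked quickSort now carries a maxDepth budget of roughly 2*ceil(log2(n)); once the budget is exhausted it falls back to the stable merge sort, which bounds the worst case at O(n log n) (an introsort). The same idea on a plain int array, as an illustrative stand-alone sketch (not Lucene code):

static void introSort(int[] a, int lo, int hi, int maxDepth) {   // sorts a[lo..hi)
  if (hi - lo <= 1) return;
  if (--maxDepth <= 0) {
    java.util.Arrays.sort(a, lo, hi);   // stand-in for the guaranteed O(n log n) fallback
    return;
  }
  final int pivot = a[lo + (hi - lo) / 2];
  int i = lo, j = hi - 1;
  while (i <= j) {                      // standard two-way partition
    while (a[i] < pivot) i++;
    while (a[j] > pivot) j--;
    if (i <= j) { int t = a[i]; a[i++] = a[j]; a[j--] = t; }
  }
  introSort(a, lo, j + 1, maxDepth);
  introSort(a, i, hi, maxDepth);
}

// typical call: introSort(a, 0, a.length, 2 * (32 - Integer.numberOfLeadingZeros(a.length)));
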
@ -261,9 +261,12 @@ public class Builder<T> {
|
|||
add(scratchIntsRef, output);
|
||||
}
|
||||
|
||||
/** It's OK to add the same input twice in a row with
|
||||
* different outputs, as long as outputs impls the merge
|
||||
* method. */
|
||||
public void add(IntsRef input, T output) throws IOException {
|
||||
//System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output));
|
||||
assert lastInput.length == 0 || input.compareTo(lastInput) > 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
|
||||
assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input;
|
||||
assert validOutput(output);
|
||||
|
||||
//System.out.println("\nadd: " + input);
|
||||
|
@ -347,8 +350,15 @@ public class Builder<T> {
|
|||
assert validOutput(output);
|
||||
}
|
||||
|
||||
// push remaining output:
|
||||
frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
|
||||
if (lastInput.length == input.length && prefixLenPlus1 == 1+input.length) {
|
||||
// same input more than 1 time in a row, mapping to
|
||||
// multiple outputs
|
||||
lastNode.output = fst.outputs.merge(lastNode.output, output);
|
||||
} else {
|
||||
// this new arc is private to this new input; set its
|
||||
// arc output to the leftover output:
|
||||
frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output);
|
||||
}
|
||||
|
||||
// save last input
|
||||
lastInput.copy(input);
|
||||
|
|
|
@ -231,10 +231,13 @@ public class FST<T> {
|
|||
}
|
||||
|
||||
void setEmptyOutput(T v) throws IOException {
|
||||
if (emptyOutput != null && !emptyOutput.equals(v)) {
|
||||
throw new IllegalStateException("empty output is already set: " + outputs.outputToString(emptyOutput) + " vs " + outputs.outputToString(v));
|
||||
if (emptyOutput != null) {
|
||||
if (!emptyOutput.equals(v)) {
|
||||
emptyOutput = outputs.merge(emptyOutput, v);
|
||||
}
|
||||
} else {
|
||||
emptyOutput = v;
|
||||
}
|
||||
emptyOutput = v;
|
||||
|
||||
// TODO: this is messy -- replace with sillyBytesWriter; maybe make
|
||||
// bytes private
|
||||
|
@ -446,25 +449,17 @@ public class FST<T> {
|
|||
// reverse bytes in-place; we do this so that the
|
||||
// "BIT_TARGET_NEXT" opto can work, ie, it reads the
|
||||
// node just before the current one
|
||||
final int endAddress = writer.posWrite;
|
||||
final int stopAt = (endAddress - startAddress)/2;
|
||||
int upto = 0;
|
||||
while (upto < stopAt) {
|
||||
final byte b = bytes[startAddress+upto];
|
||||
bytes[startAddress+upto] = bytes[endAddress-upto-1];
|
||||
bytes[endAddress-upto-1] = b;
|
||||
upto++;
|
||||
final int endAddress = lastFrozenNode = writer.posWrite - 1;
|
||||
|
||||
int left = startAddress;
|
||||
int right = endAddress;
|
||||
while (left < right) {
|
||||
final byte b = bytes[left];
|
||||
bytes[left++] = bytes[right];
|
||||
bytes[right--] = b;
|
||||
}
|
||||
|
||||
lastFrozenNode = endAddress - 1;
|
||||
/*
|
||||
System.out.println(" return node addr=" + (endAddress-1));
|
||||
for(int i=endAddress-1;i>=startAddress;i--) {
|
||||
System.out.println(" bytes[" + i + "]=" + bytes[i]);
|
||||
}
|
||||
*/
|
||||
|
||||
return endAddress-1;
|
||||
return endAddress;
|
||||
}
|
||||
|
||||
/** Fills virtual 'start' arc, ie, an empty incoming arc to
|
||||
|
|
|
@ -140,7 +140,7 @@ abstract class FSTEnum<T> {
|
|||
// Arcs are fixed array -- use binary search to find
|
||||
// the target.
|
||||
|
||||
final FST.BytesReader in = fst.getBytesReader(0);
|
||||
final FST<T>.BytesReader in = fst.getBytesReader(0);
|
||||
int low = arc.arcIdx;
|
||||
int high = arc.numArcs-1;
|
||||
int mid = 0;
|
||||
|
@ -278,7 +278,7 @@ abstract class FSTEnum<T> {
|
|||
// Arcs are fixed array -- use binary search to find
|
||||
// the target.
|
||||
|
||||
final FST.BytesReader in = fst.getBytesReader(0);
|
||||
final FST<T>.BytesReader in = fst.getBytesReader(0);
|
||||
int low = arc.arcIdx;
|
||||
int high = arc.numArcs-1;
|
||||
int mid = 0;
|
||||
|
|
|
@ -40,7 +40,7 @@ final class NodeHash<T> {
|
|||
return false;
|
||||
}
|
||||
for(int arcUpto=0;arcUpto<node.numArcs;arcUpto++) {
|
||||
final Builder.Arc arc = node.arcs[arcUpto];
|
||||
final Builder.Arc<T> arc = node.arcs[arcUpto];
|
||||
if (arc.label != scratchArc.label ||
|
||||
!arc.output.equals(scratchArc.output) ||
|
||||
((Builder.CompiledNode) arc.target).address != scratchArc.target ||
|
||||
|
|
|
@ -54,4 +54,8 @@ public abstract class Outputs<T> {
|
|||
public abstract T getNoOutput();
|
||||
|
||||
public abstract String outputToString(T output);
|
||||
|
||||
public T merge(T first, T second) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,7 +43,7 @@ public class PairOutputs<A,B> extends Outputs<PairOutputs.Pair<A,B>> {
|
|||
this.output2 = output2;
|
||||
}
|
||||
|
||||
@Override @SuppressWarnings("unchecked")
|
||||
@Override @SuppressWarnings("rawtypes")
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) {
|
||||
return true;
|
||||
|
|
|
@ -22,14 +22,11 @@ import java.io.IOException;
|
|||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
||||
// TODO: make a sharing and non-sharing variant; eg if you
|
||||
// output docFreq per term the FST will be smaller if you
|
||||
// don't share since they are not "well shared"
|
||||
|
||||
/**
|
||||
* Output is a long, for each input term. NOTE: the
|
||||
* resulting FST is not guaranteed to be minimal! See
|
||||
* {@link Builder}.
|
||||
* {@link Builder}. You cannot store 0 output with this
|
||||
* (that's reserved to mean "no output")!
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
|
|
|
@@ -0,0 +1,224 @@
package org.apache.lucene.util.automaton.fst;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

/**
 * Holds one or two longs for each input term. If it's a
 * single output, Long is returned; else, TwoLongs. Order
 * is preseved in the TwoLongs case, ie .first is the first
 * input/output added to Builder, and .second is the
 * second. You cannot store 0 output with this (that's
 * reserved to mean "no output")!
 *
 * NOTE: the resulting FST is not guaranteed to be minimal!
 * See {@link Builder}.
 *
 * @lucene.experimental
 */

public final class UpToTwoPositiveIntOutputs extends Outputs<Object> {

  public final static class TwoLongs {
    final long first;
    final long second;

    public TwoLongs(long first, long second) {
      this.first = first;
      this.second = second;
      assert first >= 0;
      assert second >= 0;
    }

    @Override
    public String toString() {
      return "TwoLongs:" + first + "," + second;
    }

    @Override
    public boolean equals(Object _other) {
      if (_other instanceof TwoLongs) {
        final TwoLongs other = (TwoLongs) _other;
        return first == other.first && second == other.second;
      } else {
        return false;
      }
    }

    @Override
    public int hashCode() {
      return (int) ((first^(first>>>32)) ^ (second^(second>>32)));
    }
  }

  private final static Long NO_OUTPUT = new Long(0);

  private final boolean doShare;

  private final static UpToTwoPositiveIntOutputs singletonShare = new UpToTwoPositiveIntOutputs(true);
  private final static UpToTwoPositiveIntOutputs singletonNoShare = new UpToTwoPositiveIntOutputs(false);

  private UpToTwoPositiveIntOutputs(boolean doShare) {
    this.doShare = doShare;
  }

  public static UpToTwoPositiveIntOutputs getSingleton(boolean doShare) {
    return doShare ? singletonShare : singletonNoShare;
  }

  public Long get(long v) {
    if (v == 0) {
      return NO_OUTPUT;
    } else {
      return Long.valueOf(v);
    }
  }

  public TwoLongs get(long first, long second) {
    return new TwoLongs(first, second);
  }

  @Override
  public Long common(Object _output1, Object _output2) {
    assert valid(_output1, false);
    assert valid(_output2, false);
    final Long output1 = (Long) _output1;
    final Long output2 = (Long) _output2;
    if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) {
      return NO_OUTPUT;
    } else if (doShare) {
      assert output1 > 0;
      assert output2 > 0;
      return Math.min(output1, output2);
    } else if (output1.equals(output2)) {
      return output1;
    } else {
      return NO_OUTPUT;
    }
  }

  @Override
  public Long subtract(Object _output, Object _inc) {
    assert valid(_output, false);
    assert valid(_inc, false);
    final Long output = (Long) _output;
    final Long inc = (Long) _inc;
    assert output >= inc;

    if (inc == NO_OUTPUT) {
      return output;
    } else if (output.equals(inc)) {
      return NO_OUTPUT;
    } else {
      return output - inc;
    }
  }

  @Override
  public Object add(Object _prefix, Object _output) {
    assert valid(_prefix, false);
    assert valid(_output, true);
    final Long prefix = (Long) _prefix;
    if (_output instanceof Long) {
      final Long output = (Long) _output;
      if (prefix == NO_OUTPUT) {
        return output;
      } else if (output == NO_OUTPUT) {
        return prefix;
      } else {
        return prefix + output;
      }
    } else {
      final TwoLongs output = (TwoLongs) _output;
      final long v = prefix;
      return new TwoLongs(output.first + v, output.second + v);
    }
  }

  @Override
  public void write(Object _output, DataOutput out) throws IOException {
    assert valid(_output, true);
    if (_output instanceof Long) {
      final Long output = (Long) _output;
      out.writeVLong(output<<1);
    } else {
      final TwoLongs output = (TwoLongs) _output;
      out.writeVLong((output.first<<1) | 1);
      out.writeVLong(output.second);
    }
  }

  @Override
  public Object read(DataInput in) throws IOException {
    final long code = in.readVLong();
    if ((code & 1) == 0) {
      // single long
      final long v = code >>> 1;
      if (v == 0) {
        return NO_OUTPUT;
      } else {
        return Long.valueOf(v);
      }
    } else {
      // two longs
      final long first = code >>> 1;
      final long second = in.readVLong();
      return new TwoLongs(first, second);
    }
  }

  private boolean valid(Long o) {
    assert o != null;
    assert o instanceof Long;
    assert o == NO_OUTPUT || o > 0;
    return true;
  }

  // Used only by assert
  private boolean valid(Object _o, boolean allowDouble) {
    if (!allowDouble) {
      assert _o instanceof Long;
      return valid((Long) _o);
    } else if (_o instanceof TwoLongs) {
      return true;
    } else {
      return valid((Long) _o);
    }
  }

  @Override
  public Object getNoOutput() {
    return NO_OUTPUT;
  }

  @Override
  public String outputToString(Object output) {
    return output.toString();
  }

  @Override
  public Object merge(Object first, Object second) {
    assert valid(first, false);
    assert valid(second, false);
    return new TwoLongs((Long) first, (Long) second);
  }
}
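A short usage sketch for the new class may help; it is illustrative only (the demo class and values are not from Lucene's tests), and the Builder/FST wiring that would normally surround it is omitted because those signatures are not part of this diff:

  // Illustrative only: exercising UpToTwoPositiveIntOutputs directly.
  import org.apache.lucene.util.automaton.fst.UpToTwoPositiveIntOutputs;

  public class UpToTwoOutputsDemo {
    public static void main(String[] args) {
      UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true); // sharing variant

      Object single = outputs.get(42);                                 // one value for a term: a Long
      Object both   = outputs.merge(outputs.get(42), outputs.get(17)); // same term added twice: a TwoLongs

      System.out.println(outputs.outputToString(single)); // 42
      System.out.println(outputs.outputToString(both));   // TwoLongs:42,17
    }
  }

On disk, write() steals the low bit of the first vlong as a type flag: a single long L is written as vlong(L<<1) (low bit 0), while TwoLongs(f, s) is written as vlong((f<<1)|1) followed by vlong(s) (low bit 1). read() tests (code & 1) to decide whether a second vlong follows; outputs must stay positive because 0 is reserved to mean "no output".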