diff --git a/README.txt b/README.txt
index 4a997894d2d..da3dcfdba72 100644
--- a/README.txt
+++ b/README.txt
@@ -7,6 +7,7 @@ modules/ is shared code
 To compile the sources run 'ant compile'
 To run all the tests run 'ant test'
 To setup your ide run 'ant idea' or 'ant eclipse'
+For Maven info, see dev-tools/maven/README.maven.
 For more information on how to contribute see:
 http://wiki.apache.org/lucene-java/HowToContribute
diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath
index a5d64f92dcc..0f201536149 100644
--- a/dev-tools/eclipse/dot.classpath
+++ b/dev-tools/eclipse/dot.classpath
@@ -95,7 +95,7 @@
-
+
diff --git a/dev-tools/maven/README.maven b/dev-tools/maven/README.maven
new file mode 100644
index 00000000000..65954abf1d8
--- /dev/null
+++ b/dev-tools/maven/README.maven
@@ -0,0 +1,131 @@
+====================================
+Lucene/Solr Maven build instructions
+====================================
+
+Contents:
+
+A. How to use nightly Jenkins-built Lucene/Solr Maven artifacts
+B. How to generate Lucene Maven artifacts
+C. How to generate Solr Maven artifacts
+D. How to use Maven to build Lucene/Solr
+
+-----
+
+A. How to use nightly Jenkins-built Lucene/Solr Maven artifacts
+
+   The most recently produced nightly Jenkins-built Lucene and Solr Maven
+   artifacts are available in Maven repository layout here:
+
+
+B. How to generate Lucene Maven artifacts
+
+   1. Prerequisites: JDK 1.5+, Ant 1.7.X, and maven-ant-tasks-2.1.1.jar
+
+      In order to generate Maven artifacts for Lucene/Solr, you must first
+      download the Maven ant tasks JAR (maven-ant-tasks-2.1.1.jar) and add
+      it to any one of the following:
+
+      a. Your $HOME/.ant/lib/ directory (C:\Users\username\.ant\lib\ under
+         Windows Vista/7); or
+      b. Your $ANT_HOME/lib/ directory (%ANT_HOME%\lib\ under Windows); or
+      c. Your $CLASSPATH (%CLASSPATH% under Windows); or
+      d. Your ant command line: "-lib /path/to/maven-ant-tasks-2.1.1.jar".
+
+   2. Run the following command from the lucene/ directory:
+
+      ant generate-maven-artifacts
+
+      The above command will create an internal Maven repository under
+      lucene/dist/maven/, including POMs, binary .jars, source .jars,
+      and javadoc .jars, for Lucene Core, for the Lucene test framework,
+      for each contrib, and for each module under the top-level modules/
+      directory.
+
+
+C. How to generate Solr Maven artifacts
+
+   1. Prerequisites: JDK 1.6+; Ant 1.7.X; and maven-ant-tasks-2.1.1.jar
+      (see item B.1. above for where to put the Maven ant tasks jar).
+
+   2. Run the following from the solr/ directory:
+
+      ant generate-maven-artifacts
+
+      The above command will create an internal Maven repository under
+      solr/package/maven/, including POMs, binary .jars, source .jars,
+      and javadoc .jars, for Solr Core, for the Solr test framework,
+      for each contrib, and for the Solr .war (for which there are no
+      source or javadoc .jars).
+
+
+D. How to use Maven to build Lucene/Solr
+
+   In summary, to enable Maven builds, perform the following:
+
+       svn update
+       ant get-maven-poms
+       mvn -N -Pbootstrap install
+
+   The details, followed by some example Maven commands:
+
+   1. Prerequisites: JDK 1.5+ (for Lucene); JDK 1.6+ (for Solr);
+      Maven 2.2.1 or 3.0.X
+
+   2. Make sure your sources are up to date.  If you checked your sources
+      out from the Apache Subversion repository, run "svn update" from the
+      top level.
+
+   3. Copy the Maven POM templates from under dev-tools/maven/ to where
+      they need to go in order to drive the Maven build, using the
+      following command from the top-level directory:
+
+      ant get-maven-poms
+
+      Note that you will need to do this whenever changes to the POM
+      templates are committed.  It's a good idea to follow every
+      "svn update" with "ant get-maven-poms" for this reason.
+
+      The above command copies all of the POM templates from dev-tools/maven/,
+      filling in the project version with the default "X.X-SNAPSHOT".  If you
+      want the POMs and the Maven-built artifacts to have a version other than
+      the default, you can supply an alternate version on the command line
+      with the above command, e.g.:
+
+      ant -Dversion=4.0-my-special-version get-maven-poms
+
+   4. Populate your local repository with .jars & POMs for dependencies that
+      are not available from public Maven repositories (a.k.a. "non-mavenized
+      dependencies"):
+
+      mvn -N -Pbootstrap install
+
+      Note that you will need to do this whenever changes to the non-Mavenized
+      dependencies are committed.  It's a good idea to follow every
+      "svn update" with "ant get-maven-poms" and "mvn -N -Pbootstrap install"
+      for this reason.
+
+
+   Some example Maven commands you can use after you perform the above
+   preparatory steps:
+
+   - Compile, package, and install all artifacts to your local repository:
+
+       mvn install
+
+     After compiling and packaging, but before installing each module's
+     artifact, the above command will also run all the module's tests.
+
+     To compile, package and install all artifacts without running any
+     tests:
+
+       mvn -DskipTests install
+
+   - Run tests:
+
+       mvn test
+
+     To run all test methods defined in a test class:
+
+       mvn -Dtest=TestClassName test
diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template
index 95987f4c141..546040851f8 100644
--- a/dev-tools/maven/pom.xml.template
+++ b/dev-tools/maven/pom.xml.template
@@ -699,7 +699,7 @@
         solr-noggit
         ${project.version}
         jar
-        solr/lib/apache-solr-noggit-r944541.jar
+        solr/lib/apache-solr-noggit-r1099557.jar
diff --git a/dev-tools/maven/solr/contrib/dataimporthandler/src/extras/pom.xml.template b/dev-tools/maven/solr/contrib/dataimporthandler/src/extras/pom.xml.template
index a9ee1f774b3..9c08baab4f1 100644
--- a/dev-tools/maven/solr/contrib/dataimporthandler/src/extras/pom.xml.template
+++ b/dev-tools/maven/solr/contrib/dataimporthandler/src/extras/pom.xml.template
@@ -103,8 +103,8 @@
     ${build-directory}
-    ${build-directory}/extras/classes
-    ${build-directory}/extras/test-classes
+    ${build-directory}/classes
+    ${build-directory}/test-classes
     main/java
     test/java
diff --git a/dev-tools/maven/solr/src/pom.xml.template b/dev-tools/maven/solr/src/pom.xml.template
index 85ddb316d66..b659a01383c 100644
--- a/dev-tools/maven/solr/src/pom.xml.template
+++ b/dev-tools/maven/solr/src/pom.xml.template
@@ -159,7 +159,6 @@
      com.google.guava
      guava
-     test
      junit
diff --git a/dev-tools/maven/solr/src/solrj/pom.xml.template b/dev-tools/maven/solr/src/solrj/pom.xml.template
index 072e1ef5286..3ae76473707 100644
--- a/dev-tools/maven/solr/src/solrj/pom.xml.template
+++ b/dev-tools/maven/solr/src/solrj/pom.xml.template
@@ -85,7 +85,7 @@
    ${build-directory}
-   ${build-directory}
+   ${build-directory}/classes
    .
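For quick reference, the Maven bootstrap workflow added in README.maven above condenses to the
following command sequence, run from the top-level checkout (the -Dversion value is only an
example; omit it to keep the default "X.X-SNAPSHOT"):

    svn update
    ant -Dversion=4.0-my-special-version get-maven-poms
    mvn -N -Pbootstrap install
    mvn -DskipTests install

"mvn install" (with or without -DskipTests) publishes the resulting artifacts to the local
repository, by default ~/.m2/repository.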
diff --git a/dev-tools/scripts/diffSources.py b/dev-tools/scripts/diffSources.py
new file mode 100644
index 00000000000..8318f241cd5
--- /dev/null
+++ b/dev-tools/scripts/diffSources.py
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess
+import sys
+
+# recursive, unified output format, treat missing files as present but empty
+DIFF_FLAGS = '-ruN'
+
+if '-skipWhitespace' in sys.argv:
+  sys.argv.remove('-skipWhitespace')
+  # ignore whitespace-only changes
+  DIFF_FLAGS += 'bBw'
+
+if len(sys.argv) != 3:
+  print
+  print 'Usage: python -u diffSources.py <dir1> <dir2> [-skipWhitespace]'
+  print
+  print '''This tool creates a patch between two directories.
+
+While you could use this to make a committable patch from a branch, that approach loses
+the svn history from the branch (better to use "svn merge --reintegrate", for example).  This
+diff output should not be considered "authoritative" from a merging standpoint as it does
+not reflect what svn will do on merge.
+'''
+  print
+  sys.exit(0)
+
+p = subprocess.Popen(['diff', DIFF_FLAGS, '-x', '.svn', '-x', 'build', sys.argv[1], sys.argv[2]], shell=False, stdout=subprocess.PIPE)
+
+keep = False
+while True:
+  l = p.stdout.readline()
+  if l == '':
+    break
+  if l.endswith('\r\n'):
+    l = l[:-2]
+  elif l.endswith('\n'):
+    l = l[:-1]
+  if l.startswith('diff ') or l.startswith('Binary files '):
+    # l has been lower-cased in the comparisons below, so match 'only in'
+    keep = l.lower().find('/build/') == -1 and (l.lower().startswith('only in') or ((l.lower().endswith('.java') or l.lower().endswith('.txt') or l.lower().endswith('.xml') or l.lower().endswith('.iml')) and l.find('/.svn/') == -1))
+    if keep:
+      print
+      print
+      print l.strip()
+  elif keep:
+    print l
+  elif l.startswith('Only in'):
+    print l.strip()
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 83380041f13..bbd2579bd41 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -472,13 +472,63 @@ Changes in backwards compatibility policy
   a method getHeapArray() was added to retrieve the internal heap array
   as a non-generic Object[].  (Uwe Schindler, Yonik Seeley)
 
+* LUCENE-1076: IndexWriter.setInfoStream now throws IOException
+  (Mike McCandless, Shai Erera)
+
+* LUCENE-3084: MergePolicy.OneMerge.segments was changed from
+  SegmentInfos to a List<SegmentInfo>; this is actually a minor change
+  because SegmentInfos itself extends Vector<SegmentInfo>.  (Uwe
+  Schindler, Mike McCandless)
+
+Changes in runtime behavior
+
+* LUCENE-3065: When a NumericField is retrieved from a Document loaded
+  from IndexReader (or IndexSearcher), it will now come back as
+  NumericField not as a Field with a string-ified version of the
+  numeric value you had indexed.  Note that this only applies for
+  newly-indexed Documents; older indices will still return Field
+  with the string-ified numeric value.
If you call Document.get(), + the value comes still back as String, but Document.getFieldable() + returns NumericField instances. (Uwe Schindler, Ryan McKinley, + Mike McCandless) + +New features + +* LUCENE-3082: Added index upgrade tool oal.index.IndexUpgrader + that allows to upgrade all segments to last recent supported index + format without fully optimizing. (Uwe Schindler, Mike McCandless) + +* LUCENE-1076: Added TieredMergePolicy which is able to merge non-contiguous + segments, which means docIDs no longer necessarily stay "in order". + (Mike McCandless, Shai Erera) + +* LUCENE-3071: Adding ReversePathHierarchyTokenizer, added skip parameter to + PathHierarchyTokenizer (Olivier Favre via ryan) + +API Changes + +* LUCENE-3061: IndexWriter's getNextMerge() and merge(OneMerge) are now public + (though @lucene.experimental), allowing for custom MergeScheduler + implementations. (Shai Erera) + +* LUCENE-3065: Document.getField() was deprecated, as it throws + ClassCastException when loading lazy fields or NumericFields. + (Uwe Schindler, Ryan McKinley, Mike McCandless) + Optimizations * LUCENE-2990: ArrayUtil/CollectionUtil.*Sort() methods now exit early on empty or one-element lists/arrays. (Uwe Schindler) +* LUCENE-2897: Apply deleted terms while flushing a segment. We still + buffer deleted terms to later apply to past segments. (Mike McCandless) + Bug fixes +* LUCENE-2996: addIndexes(IndexReader) did not flush before adding the new + indexes, causing existing deletions to be applied on the incoming indexes as + well. (Shai Erera, Mike McCandless) + * LUCENE-3024: Index with more than 2.1B terms was hitting AIOOBE when seeking TermEnum (eg used by Solr's faceting) (Tom Burton-West, Mike McCandless) @@ -491,6 +541,17 @@ Bug fixes very special use cases of the TokenStream-API, most users would not have recognized it. (Uwe Schindler, Robert Muir) +* LUCENE-3054: PhraseQuery can in some cases stack overflow in + SorterTemplate.quickSort(). This fix also adds an optimization to + PhraseQuery as term with lower doc freq will also have less positions. + (Uwe Schindler, Robert Muir, Otis Gospodnetic) + +Test Cases + +* LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to + stop iterating if at least 'tests.iter.min' ran and a failure occured. + (Shai Erera, Chris Hostetter) + ======================= Lucene 3.1.0 ======================= Changes in backwards compatibility policy @@ -1472,6 +1533,10 @@ Bug fixes that warming is free to do whatever it needs to. (Earwin Burrfoot via Mike McCandless) +* LUCENE-3029: Fix corner case when MultiPhraseQuery is used with zero + position-increment tokens that would sometimes assign different + scores to identical docs. (Mike McCandless) + * LUCENE-2486: Fixed intermittent FileNotFoundException on doc store files when a mergedSegmentWarmer is set on IndexWriter. 
(Mike McCandless) diff --git a/lucene/MIGRATE.txt b/lucene/MIGRATE.txt index 779b6309a44..c2666e06784 100644 --- a/lucene/MIGRATE.txt +++ b/lucene/MIGRATE.txt @@ -312,6 +312,8 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader + - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer + - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils * LUCENE-2514: The option to use a Collator's order (instead of binary order) for sorting and range queries has been moved to contrib/queries. diff --git a/lucene/common-build.xml b/lucene/common-build.xml index f8db3369b21..c6bdb1627aa 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -73,6 +73,7 @@ + @@ -499,6 +500,8 @@ + + diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 46a60c87712..3c553a67694 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -50,6 +50,11 @@ Bug Fixes ======================= Lucene 3.x (not yet released) ======================= +Changes in runtime behavior + + * LUCENE-3086: ItalianAnalyzer now uses ElisionFilter with a set of Italian + contractions by default. (Robert Muir) + Bug Fixes * LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was @@ -183,6 +188,10 @@ Bug fixes * LUCENE-2943: Fix thread-safety issues with ICUCollationKeyFilter. (Robert Muir) + * LUCENE-3087: Highlighter: fix case that was preventing highlighting + of exact phrase when tokens overlap. (Pierre Gossé via Mike + McCandless) + API Changes * LUCENE-2867: Some contrib queryparser methods that receives CharSequence as diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java index 2c2104570e4..3957c46f3df 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -355,6 +355,7 @@ public class Highlighter { try { + tokenStream.end(); tokenStream.close(); } catch (Exception e) diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java index 616d9e26670..536c7e20465 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java @@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermFreqVector; @@ -158,10 +159,13 @@ public class TokenSources { OffsetAttribute offsetAtt; + PositionIncrementAttribute posincAtt; + StoredTokenStream(Token tokens[]) { this.tokens = tokens; termAtt = addAttribute(CharTermAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); + posincAtt = (PositionIncrementAttribute) 
addAttribute(PositionIncrementAttribute.class); } @Override @@ -173,6 +177,10 @@ public class TokenSources { clearAttributes(); termAtt.setEmpty().append(token); offsetAtt.setOffset(token.startOffset(), token.endOffset()); + posincAtt + .setPositionIncrement(currentToken <= 1 + || tokens[currentToken - 1].startOffset() > tokens[currentToken - 2] + .startOffset() ? 1 : 0); return true; } } @@ -180,7 +188,6 @@ public class TokenSources { BytesRef[] terms = tpv.getTerms(); int[] freq = tpv.getTermFrequencies(); int totalTokens = 0; - for (int t = 0; t < freq.length; t++) { totalTokens += freq[t]; } @@ -189,7 +196,8 @@ public class TokenSources { for (int t = 0; t < freq.length; t++) { TermVectorOffsetInfo[] offsets = tpv.getOffsets(t); if (offsets == null) { - throw new IllegalArgumentException("Required TermVector Offset information was not found"); + throw new IllegalArgumentException( + "Required TermVector Offset information was not found"); } int[] pos = null; @@ -205,8 +213,8 @@ public class TokenSources { unsortedTokens = new ArrayList(); } for (int tp = 0; tp < offsets.length; tp++) { - Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), offsets[tp] - .getEndOffset()); + Token token = new Token(terms[t].utf8ToString(), + offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); unsortedTokens.add(token); } } else { @@ -221,8 +229,8 @@ public class TokenSources { // tokens stored with positions - can use this to index straight into // sorted array for (int tp = 0; tp < pos.length; tp++) { - Token token = new Token(terms[t].utf8ToString(), offsets[tp].getStartOffset(), - offsets[tp].getEndOffset()); + Token token = new Token(terms[t].utf8ToString(), + offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); tokensInOriginalOrder[pos[tp]] = token; } } @@ -231,12 +239,11 @@ public class TokenSources { if (unsortedTokens != null) { tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens .size()]); - ArrayUtil.quickSort(tokensInOriginalOrder, new Comparator() { + ArrayUtil.mergeSort(tokensInOriginalOrder, new Comparator() { public int compare(Token t1, Token t2) { - if (t1.startOffset() == t2.startOffset()) - return t1.endOffset() - t2.endOffset(); - else - return t1.startOffset() - t2.startOffset(); + if (t1.startOffset() == t2.startOffset()) return t1.endOffset() + - t2.endOffset(); + else return t1.startOffset() - t2.startOffset(); } }); } diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index cea67428617..b66a7b1ed70 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -1093,6 +1093,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte } public void testMaxSizeHighlight() throws Exception { + final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true); + // we disable MockTokenizer checks because we will forcefully limit the + // tokenstream and call end() before incrementToken() returns false. 
+ analyzer.setEnableChecks(false); TestHighlightRunner helper = new TestHighlightRunner() { @Override @@ -1122,7 +1126,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte public void run() throws Exception { String goodWord = "goodtoken"; CharacterRunAutomaton stopWords = new CharacterRunAutomaton(BasicAutomata.makeString("stoppedtoken")); - + // we disable MockTokenizer checks because we will forcefully limit the + // tokenstream and call end() before incrementToken() returns false. + final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true); + analyzer.setEnableChecks(false); TermQuery query = new TermQuery(new Term("data", goodWord)); String match; @@ -1134,13 +1141,13 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte sb.append("stoppedtoken"); } SimpleHTMLFormatter fm = new SimpleHTMLFormatter(); - Highlighter hg = getHighlighter(query, "data", new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true).tokenStream( + Highlighter hg = getHighlighter(query, "data", analyzer.tokenStream( "data", new StringReader(sb.toString())), fm);// new Highlighter(fm, // new // QueryTermScorer(query)); hg.setTextFragmenter(new NullFragmenter()); hg.setMaxDocCharsToAnalyze(100); - match = hg.getBestFragment(new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true), "data", sb.toString()); + match = hg.getBestFragment(analyzer, "data", sb.toString()); assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg .getMaxDocCharsToAnalyze()); @@ -1151,7 +1158,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte // + whitespace) sb.append(" "); sb.append(goodWord); - match = hg.getBestFragment(new MockAnalyzer(random, MockTokenizer.SIMPLE, true, stopWords, true), "data", sb.toString()); + match = hg.getBestFragment(analyzer, "data", sb.toString()); assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg .getMaxDocCharsToAnalyze()); } @@ -1726,6 +1733,11 @@ final class SynonymAnalyzer extends Analyzer { stream.addAttribute(CharTermAttribute.class); stream.addAttribute(PositionIncrementAttribute.class); stream.addAttribute(OffsetAttribute.class); + try { + stream.reset(); + } catch (IOException e) { + throw new RuntimeException(e); + } return new SynonymTokenizer(stream, synonyms); } } diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java index 45aa3f51425..30dccc4bcc8 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java @@ -28,32 +28,38 @@ import org.apache.lucene.analysis.TokenStream; public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase { public void testFilter() throws Exception { - TokenStream stream = new MockTokenizer(new StringReader( + // we disable MockTokenizer checks because we will forcefully limit the + // tokenstream and call end() before incrementToken() returns false. 
+ MockTokenizer stream = new MockTokenizer(new StringReader( "short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false); + stream.setEnableChecks(false); OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10); assertTokenStreamContents(filter, new String[] {"short", "toolong"}); stream = new MockTokenizer(new StringReader( "short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false); + stream.setEnableChecks(false); filter = new OffsetLimitTokenFilter(stream, 12); assertTokenStreamContents(filter, new String[] {"short", "toolong"}); stream = new MockTokenizer(new StringReader( "short toolong evenmuchlongertext a ab toolong foo"), MockTokenizer.WHITESPACE, false); + stream.setEnableChecks(false); filter = new OffsetLimitTokenFilter(stream, 30); assertTokenStreamContents(filter, new String[] {"short", "toolong", "evenmuchlongertext"}); - + // TODO: This is not actually testing reuse! (reusableTokenStream is not implemented) checkOneTermReuse(new Analyzer() { @Override public TokenStream tokenStream(String fieldName, Reader reader) { - return new OffsetLimitTokenFilter(new MockTokenizer(reader, - MockTokenizer.WHITESPACE, false), 10); + MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + tokenizer.setEnableChecks(false); + return new OffsetLimitTokenFilter(tokenizer, 10); } }, "llenges", "llenges"); } diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java index 572aa219b78..02dd92d40e0 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java @@ -36,7 +36,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; @@ -86,12 +89,12 @@ public class TokenSourcesTest extends LuceneTestCase { public void reset() { this.i = -1; this.tokens = new Token[] { - new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3), - new Token(new char[] { '{', 'f', 'o', 'x', '}' }, 0, 5, 0, 7), - new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7), - new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11), - new Token(new char[] { 'n', 'o', 't' }, 0, 3, 12, 15), - new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) }; + new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3), + new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7), + new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7), + new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11), + new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15), + new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)}; this.tokens[1].setPositionIncrement(0); } } @@ -188,4 +191,97 @@ public class TokenSourcesTest extends LuceneTestCase { } } + public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException, + LockObtainFailedException, IOException, InvalidTokenOffsetsException { + final String TEXT = "the fox did not 
jump"; + final Directory directory = newDirectory(); + final IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer())); + try { + final Document document = new Document(); + document.add(new Field(FIELD, new TokenStreamOverlap(), + TermVector.WITH_OFFSETS)); + indexWriter.addDocument(document); + } finally { + indexWriter.close(); + } + final IndexReader indexReader = IndexReader.open(directory, true); + try { + assertEquals(1, indexReader.numDocs()); + final IndexSearcher indexSearcher = newSearcher(indexReader); + try { + // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1); + // query.add(new SpanTermQuery(new Term(FIELD, "{fox}"))); + // query.add(new SpanTermQuery(new Term(FIELD, "fox"))); + final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD, "the")), + new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true); + + TopDocs hits = indexSearcher.search(phraseQuery, 1); + assertEquals(1, hits.totalHits); + final Highlighter highlighter = new Highlighter( + new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), + new QueryScorer(phraseQuery)); + final TokenStream tokenStream = TokenSources + .getTokenStream( + (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), + false); + assertEquals("the fox did not jump", + highlighter.getBestFragment(tokenStream, TEXT)); + } finally { + indexSearcher.close(); + } + } finally { + indexReader.close(); + directory.close(); + } + } + + public void testOverlapWithPositionsAndOffsetExactPhrase() + throws CorruptIndexException, LockObtainFailedException, IOException, + InvalidTokenOffsetsException { + final String TEXT = "the fox did not jump"; + final Directory directory = newDirectory(); + final IndexWriter indexWriter = new IndexWriter(directory, + newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer())); + try { + final Document document = new Document(); + document.add(new Field(FIELD, new TokenStreamOverlap(), + TermVector.WITH_POSITIONS_OFFSETS)); + indexWriter.addDocument(document); + } finally { + indexWriter.close(); + } + final IndexReader indexReader = IndexReader.open(directory, true); + try { + assertEquals(1, indexReader.numDocs()); + final IndexSearcher indexSearcher = newSearcher(indexReader); + try { + // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1); + // query.add(new SpanTermQuery(new Term(FIELD, "the"))); + // query.add(new SpanTermQuery(new Term(FIELD, "fox"))); + final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { + new SpanTermQuery(new Term(FIELD, "the")), + new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true); + + TopDocs hits = indexSearcher.search(phraseQuery, 1); + assertEquals(1, hits.totalHits); + final Highlighter highlighter = new Highlighter( + new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), + new QueryScorer(phraseQuery)); + final TokenStream tokenStream = TokenSources + .getTokenStream( + (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), + false); + assertEquals("the fox did not jump", + highlighter.getBestFragment(tokenStream, TEXT)); + } finally { + indexSearcher.close(); + } + } finally { + indexReader.close(); + directory.close(); + } + } + } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java index 0e42f56e9cd..a50565cc2ed 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java +++ 
b/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java @@ -192,6 +192,7 @@ public class FuzzyLikeThisQuery extends Query int corpusNumDocs=reader.numDocs(); Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects HashSet processedTerms=new HashSet(); + ts.reset(); while (ts.incrementToken()) { String term = termAtt.toString(); @@ -213,17 +214,15 @@ public class FuzzyLikeThisQuery extends Query BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class); while ((possibleMatch = fe.next()) != null) { - if (possibleMatch!=null) { - numVariants++; - totalVariantDocFreqs+=fe.docFreq(); - float score=boostAtt.getBoost(); - if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ - ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), new BytesRef(possibleMatch)),score,startTerm); - variantsQ.insertWithOverflow(st); - minScore = variantsQ.top().score; // maintain minScore - } - maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); + numVariants++; + totalVariantDocFreqs+=fe.docFreq(); + float score=boostAtt.getBoost(); + if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ + ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), new BytesRef(possibleMatch)),score,startTerm); + variantsQ.insertWithOverflow(st); + minScore = variantsQ.top().score; // maintain minScore } + maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY); } if(numVariants>0) @@ -246,7 +245,9 @@ public class FuzzyLikeThisQuery extends Query } } } - } + } + ts.end(); + ts.close(); } @Override diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java index 3a944197752..c2387557947 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java @@ -885,7 +885,7 @@ public final class MoreLikeThis { int tokenCount=0; // for every token CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - + ts.reset(); while (ts.incrementToken()) { String word = termAtt.toString(); tokenCount++; @@ -906,6 +906,8 @@ public final class MoreLikeThis { cnt.x++; } } + ts.end(); + ts.close(); } diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java index 9d320a8532b..063a826217b 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/analyzing/AnalyzingQueryParser.java @@ -110,6 +110,11 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); int countTokens = 0; + try { + source.reset(); + } catch (IOException e1) { + throw new RuntimeException(e1); + } while (true) { try { if (!source.incrementToken()) break; @@ -126,6 +131,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar } } try { + source.end(); source.close(); } catch (IOException e) { // ignore @@ -191,7 +197,11 @@ public class AnalyzingQueryParser extends 
org.apache.lucene.queryParser.QueryPar TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr)); List tlist = new ArrayList(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); - + try { + source.reset(); + } catch (IOException e1) { + throw new RuntimeException(e1); + } while (true) { try { if (!source.incrementToken()) break; @@ -202,6 +212,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar } try { + source.end(); source.close(); } catch (IOException e) { // ignore @@ -242,6 +253,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar boolean multipleTokens = false; try { + source.reset(); if (source.incrementToken()) { nextToken = termAtt.toString(); } @@ -251,6 +263,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar } try { + source.end(); source.close(); } catch (IOException e) { // ignore @@ -281,6 +294,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar try { source = getAnalyzer().tokenStream(field, new StringReader(part1)); termAtt = source.addAttribute(CharTermAttribute.class); + source.reset(); multipleTokens = false; @@ -292,6 +306,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar // ignore } try { + source.end(); source.close(); } catch (IOException e) { // ignore @@ -308,6 +323,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar termAtt = source.addAttribute(CharTermAttribute.class); try { + source.reset(); if (source.incrementToken()) { part2 = termAtt.toString(); } @@ -316,6 +332,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar // ignore } try { + source.end(); source.close(); } catch (IOException e) { // ignore diff --git a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/processors/AnalyzerQueryNodeProcessor.java b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/processors/AnalyzerQueryNodeProcessor.java index ea995156452..b0f61c543fc 100644 --- a/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/processors/AnalyzerQueryNodeProcessor.java +++ b/lucene/contrib/queryparser/src/java/org/apache/lucene/queryParser/standard/processors/AnalyzerQueryNodeProcessor.java @@ -123,6 +123,11 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl { TokenStream source = this.analyzer.tokenStream(field, new StringReader( text)); + try { + source.reset(); + } catch (IOException e1) { + throw new RuntimeException(e1); + } CachingTokenFilter buffer = new CachingTokenFilter(source); PositionIncrementAttribute posIncrAtt = null; diff --git a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java index 646abf73dbd..871356c5cff 100755 --- a/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java +++ b/lucene/contrib/wordnet/src/java/org/apache/lucene/wordnet/SynExpand.java @@ -118,12 +118,14 @@ public final class SynExpand { // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.tokenStream( field, new StringReader( query)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - + ts.reset(); while (ts.incrementToken()) { String word = termAtt.toString(); if ( already.add( word)) top.add( word); } + ts.end(); + ts.close(); final BooleanQuery tmp = 
new BooleanQuery(); // [2] form query diff --git a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java index 89faf4b83af..6959a3ed0a8 100644 --- a/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java +++ b/lucene/contrib/wordnet/src/test/org/apache/lucene/wordnet/TestSynonymTokenFilter.java @@ -111,7 +111,6 @@ public class TestSynonymTokenFilter extends BaseTokenStreamTestCase { setPreviousTokenStream(streams); } else { streams.source.reset(reader); - streams.result.reset(); // reset the SynonymTokenFilter } return streams.result; } diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java index b96cf7bab4b..7a05ea717ba 100644 --- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java +++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java @@ -80,9 +80,12 @@ public class LikeThisQueryBuilder implements QueryBuilder { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); try { + ts.reset(); while(ts.incrementToken()) { stopWordsSet.add(termAtt.toString()); } + ts.end(); + ts.close(); } catch(IOException ioe) { diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java index 1f8ddaebc97..822c79d598d 100644 --- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java +++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/SpanOrTermsBuilder.java @@ -59,11 +59,14 @@ public class SpanOrTermsBuilder extends SpanBuilderBase TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value)); TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); BytesRef bytes = termAtt.getBytesRef(); + ts.reset(); while (ts.incrementToken()) { termAtt.fillBytesRef(); SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, new BytesRef(bytes))); clausesList.add(stq); } + ts.end(); + ts.close(); SpanOrQuery soq=new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()])); soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f)); return soq; diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java index b1198389ccf..86521ff8042 100644 --- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java +++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsFilterBuilder.java @@ -64,6 +64,7 @@ public class TermsFilterBuilder implements FilterBuilder { Term term = null; BytesRef bytes = termAtt.getBytesRef(); + ts.reset(); while (ts.incrementToken()) { termAtt.fillBytesRef(); if (term == null) @@ -76,6 +77,8 @@ public class TermsFilterBuilder implements FilterBuilder } tf.addTerm(term); } + ts.end(); + ts.close(); } catch (IOException ioe) { diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java 
b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java index 051922e8a28..37ecf63e6b7 100644 --- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java +++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/TermsQueryBuilder.java @@ -61,6 +61,7 @@ public class TermsQueryBuilder implements QueryBuilder { TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); Term term = null; BytesRef bytes = termAtt.getBytesRef(); + ts.reset(); while (ts.incrementToken()) { termAtt.fillBytesRef(); if (term == null) @@ -73,6 +74,8 @@ public class TermsQueryBuilder implements QueryBuilder { } bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD)); } + ts.end(); + ts.close(); } catch (IOException ioe) { diff --git a/lucene/docs/contributions.html b/lucene/docs/contributions.html index cc67944643a..3544745073b 100644 --- a/lucene/docs/contributions.html +++ b/lucene/docs/contributions.html @@ -3,7 +3,7 @@ - + Apache Lucene - Contributions @@ -275,7 +275,7 @@ document.write("Last Published: " + document.lastModified); <a href="#PDFTextStream -- PDF text and metadata extraction">PDFTextStream -- PDF text and metadata extraction</a> </li> <li> -<a href="#PJ Classic & PJ Professional - PDF Document Conversion">PJ Classic & PJ Professional - PDF Document Conversion</a> +<a href="#PJ Classic & PJ Professional - PDF Document Conversion">PJ Classic & PJ Professional - PDF Document Conversion</a> </li> </ul> </li> @@ -403,7 +403,7 @@ document.write("Last Published: " + document.lastModified); URL </th> <td> - <a href="http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2"> + <a href="http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2"> http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2 </a> </td> @@ -538,7 +538,7 @@ document.write("Last Published: " + document.lastModified); </tr> </table> -<a name="N10124"></a><a name="PJ Classic & PJ Professional - PDF Document Conversion"></a> +<a name="N10124"></a><a name="PJ Classic & PJ Professional - PDF Document Conversion"></a> <h3 class="boxed">PJ Classic & PJ Professional - PDF Document Conversion</h3> <table class="ForrestTable" cellspacing="1" cellpadding="4"> diff --git a/lucene/docs/contributions.pdf b/lucene/docs/contributions.pdf index a7937c0b152..352a0ffff4f 100644 Binary files a/lucene/docs/contributions.pdf and b/lucene/docs/contributions.pdf differ diff --git a/lucene/docs/demo.html b/lucene/docs/demo.html index 4c6a0bef351..7ddf92a9edd 100644 --- a/lucene/docs/demo.html +++ b/lucene/docs/demo.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title> Apache Lucene - Building and Installing the Basic Demo diff --git a/lucene/docs/demo.pdf b/lucene/docs/demo.pdf index 122697aa0e9..48fff580508 100644 Binary files a/lucene/docs/demo.pdf and b/lucene/docs/demo.pdf differ diff --git a/lucene/docs/demo2.html b/lucene/docs/demo2.html index 0d4791c634f..f5b568abfee 100644 --- a/lucene/docs/demo2.html +++ b/lucene/docs/demo2.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta 
name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title> Apache Lucene - Basic Demo Sources Walk-through diff --git a/lucene/docs/demo2.pdf b/lucene/docs/demo2.pdf index a90050e29af..6be6790fe31 100644 Binary files a/lucene/docs/demo2.pdf and b/lucene/docs/demo2.pdf differ diff --git a/lucene/docs/fileformats.html b/lucene/docs/fileformats.html index f4528c7d635..a3b5ea75e33 100644 --- a/lucene/docs/fileformats.html +++ b/lucene/docs/fileformats.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title> Apache Lucene - Index File Formats @@ -425,11 +425,19 @@ document.write("Last Published: " + document.lastModified); <p> In version 3.1, segments records the code version that created them. See LUCENE-2720 for details. + + Additionally segments track explicitly whether or + not they have term vectors. See LUCENE-2811 for details. + </p> +<p> + In version 3.2, numeric fields are written as natively + to stored fields file, previously they were stored in + text format only. </p> </div> -<a name="N10037"></a><a name="Definitions"></a> +<a name="N1003A"></a><a name="Definitions"></a> <h2 class="boxed">Definitions</h2> <div class="section"> <p> @@ -470,7 +478,7 @@ document.write("Last Published: " + document.lastModified); strings, the first naming the field, and the second naming text within the field. </p> -<a name="N10057"></a><a name="Inverted Indexing"></a> +<a name="N1005A"></a><a name="Inverted Indexing"></a> <h3 class="boxed">Inverted Indexing</h3> <p> The index stores statistics about terms in order @@ -480,7 +488,7 @@ document.write("Last Published: " + document.lastModified); it. This is the inverse of the natural relationship, in which documents list terms. </p> -<a name="N10063"></a><a name="Types of Fields"></a> +<a name="N10066"></a><a name="Types of Fields"></a> <h3 class="boxed">Types of Fields</h3> <p> In Lucene, fields may be <i>stored</i>, in which @@ -494,7 +502,7 @@ document.write("Last Published: " + document.lastModified); to be indexed literally. </p> <p>See the <a href="api/core/org/apache/lucene/document/Field.html">Field</a> java docs for more information on Fields.</p> -<a name="N10080"></a><a name="Segments"></a> +<a name="N10083"></a><a name="Segments"></a> <h3 class="boxed">Segments</h3> <p> Lucene indexes may be composed of multiple sub-indexes, or @@ -520,7 +528,7 @@ document.write("Last Published: " + document.lastModified); Searches may involve multiple segments and/or multiple indexes, each index potentially composed of a set of segments. 
</p> -<a name="N1009E"></a><a name="Document Numbers"></a> +<a name="N100A1"></a><a name="Document Numbers"></a> <h3 class="boxed">Document Numbers</h3> <p> Internally, Lucene refers to documents by an integer <i>document @@ -575,7 +583,7 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N100C5"></a><a name="Overview"></a> +<a name="N100C8"></a><a name="Overview"></a> <h2 class="boxed">Overview</h2> <div class="section"> <p> @@ -674,7 +682,7 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N10108"></a><a name="File Naming"></a> +<a name="N1010B"></a><a name="File Naming"></a> <h2 class="boxed">File Naming</h2> <div class="section"> <p> @@ -701,7 +709,7 @@ document.write("Last Published: " + document.lastModified); </p> </div> -<a name="N10117"></a><a name="file-names"></a> +<a name="N1011A"></a><a name="file-names"></a> <h2 class="boxed">Summary of File Extensions</h2> <div class="section"> <p>The following table summarizes the names and extensions of the files in Lucene: @@ -843,10 +851,10 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N10201"></a><a name="Primitive Types"></a> +<a name="N10204"></a><a name="Primitive Types"></a> <h2 class="boxed">Primitive Types</h2> <div class="section"> -<a name="N10206"></a><a name="Byte"></a> +<a name="N10209"></a><a name="Byte"></a> <h3 class="boxed">Byte</h3> <p> The most primitive type @@ -854,7 +862,7 @@ document.write("Last Published: " + document.lastModified); other data types are defined as sequences of bytes, so file formats are byte-order independent. </p> -<a name="N1020F"></a><a name="UInt32"></a> +<a name="N10212"></a><a name="UInt32"></a> <h3 class="boxed">UInt32</h3> <p> 32-bit unsigned integers are written as four @@ -864,7 +872,7 @@ document.write("Last Published: " + document.lastModified); UInt32 --> <Byte><sup>4</sup> </p> -<a name="N1021E"></a><a name="Uint64"></a> +<a name="N10221"></a><a name="Uint64"></a> <h3 class="boxed">Uint64</h3> <p> 64-bit unsigned integers are written as eight @@ -873,7 +881,7 @@ document.write("Last Published: " + document.lastModified); <p>UInt64 --> <Byte><sup>8</sup> </p> -<a name="N1022D"></a><a name="VInt"></a> +<a name="N10230"></a><a name="VInt"></a> <h3 class="boxed">VInt</h3> <p> A variable-length format for positive integers is @@ -1423,13 +1431,13 @@ document.write("Last Published: " + document.lastModified); This provides compression while still being efficient to decode. </p> -<a name="N10512"></a><a name="Chars"></a> +<a name="N10515"></a><a name="Chars"></a> <h3 class="boxed">Chars</h3> <p> Lucene writes unicode character sequences as UTF-8 encoded bytes. </p> -<a name="N1051B"></a><a name="String"></a> +<a name="N1051E"></a><a name="String"></a> <h3 class="boxed">String</h3> <p> Lucene writes strings as UTF-8 encoded bytes. 
@@ -1442,10 +1450,10 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N10528"></a><a name="Compound Types"></a> +<a name="N1052B"></a><a name="Compound Types"></a> <h2 class="boxed">Compound Types</h2> <div class="section"> -<a name="N1052D"></a><a name="MapStringString"></a> +<a name="N10530"></a><a name="MapStringString"></a> <h3 class="boxed">Map<String,String></h3> <p> In a couple places Lucene stores a Map @@ -1458,13 +1466,13 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N1053D"></a><a name="Per-Index Files"></a> +<a name="N10540"></a><a name="Per-Index Files"></a> <h2 class="boxed">Per-Index Files</h2> <div class="section"> <p> The files in this section exist one-per-index. </p> -<a name="N10545"></a><a name="Segments File"></a> +<a name="N10548"></a><a name="Segments File"></a> <h3 class="boxed">Segments File</h3> <p> The active segments in the index are stored in the @@ -1508,7 +1516,7 @@ document.write("Last Published: " + document.lastModified); <b>3.1</b> Segments --> Format, Version, NameCounter, SegCount, <SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField, NormGen<sup>NumField</sup>, - IsCompoundFile, DeletionCount, HasProx, Diagnostics><sup>SegCount</sup>, CommitUserData, Checksum + IsCompoundFile, DeletionCount, HasProx, Diagnostics, HasVectors><sup>SegCount</sup>, CommitUserData, Checksum </p> <p> Format, NameCounter, SegCount, SegSize, NumField, @@ -1525,7 +1533,7 @@ document.write("Last Published: " + document.lastModified); </p> <p> IsCompoundFile, HasSingleNormFile, - DocStoreIsCompoundFile, HasProx --> Int8 + DocStoreIsCompoundFile, HasProx, HasVectors --> Int8 </p> <p> CommitUserData --> Map<String,String> @@ -1634,7 +1642,10 @@ document.write("Last Published: " + document.lastModified); Lucene version, OS, Java version, why the segment was created (merge, flush, addIndexes), etc. </p> -<a name="N105CD"></a><a name="Lock File"></a> +<p> HasVectors is 1 if this segment stores term vectors, + else it's 0. + </p> +<a name="N105D3"></a><a name="Lock File"></a> <h3 class="boxed">Lock File</h3> <p> The write lock, which is stored in the index @@ -1648,14 +1659,14 @@ document.write("Last Published: " + document.lastModified); documents). This lock file ensures that only one writer is modifying the index at a time. </p> -<a name="N105D6"></a><a name="Deletable File"></a> +<a name="N105DC"></a><a name="Deletable File"></a> <h3 class="boxed">Deletable File</h3> <p> A writer dynamically computes the files that are deletable, instead, so no file is written. </p> -<a name="N105DF"></a><a name="Compound Files"></a> +<a name="N105E5"></a><a name="Compound Files"></a> <h3 class="boxed">Compound Files</h3> <p>Starting with Lucene 1.4 the compound file format became default. This is simply a container for all files described in the next section @@ -1682,14 +1693,14 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N10607"></a><a name="Per-Segment Files"></a> +<a name="N1060D"></a><a name="Per-Segment Files"></a> <h2 class="boxed">Per-Segment Files</h2> <div class="section"> <p> The remaining files are all per-segment, and are thus defined by suffix. 
</p> -<a name="N1060F"></a><a name="Fields"></a> +<a name="N10615"></a><a name="Fields"></a> <h3 class="boxed">Fields</h3> <p> @@ -1862,13 +1873,29 @@ document.write("Last Published: " + document.lastModified); <li>third bit is one for fields with compression option enabled (if compression is enabled, the algorithm used is ZLIB), only available for indexes until Lucene version 2.9.x</li> + +<li>4th to 6th bits (mask: 0x7<<3) define the type of a + numeric field: <ul> + +<li>all bits in mask are cleared if no numeric field at all</li> + +<li>1<<3: Value is Int</li> + +<li>2<<3: Value is Long</li> + +<li>3<<3: Value is Int as Float (as of Integer.intBitsToFloat)</li> + +<li>4<<3: Value is Long as Double (as of Double.longBitsToDouble)</li> + +</ul> +</li> </ul> </p> <p>Value --> - String | BinaryValue (depending on Bits) + String | BinaryValue | Int | Long (depending on Bits) </p> <p>BinaryValue --> @@ -1883,7 +1910,7 @@ document.write("Last Published: " + document.lastModified); </li> </ol> -<a name="N106B6"></a><a name="Term Dictionary"></a> +<a name="N106D0"></a><a name="Term Dictionary"></a> <h3 class="boxed">Term Dictionary</h3> <p> The term dictionary is represented as two files: @@ -2075,7 +2102,7 @@ document.write("Last Published: " + document.lastModified); </li> </ol> -<a name="N1073A"></a><a name="Frequencies"></a> +<a name="N10754"></a><a name="Frequencies"></a> <h3 class="boxed">Frequencies</h3> <p> The .frq file contains the lists of documents @@ -2203,7 +2230,7 @@ document.write("Last Published: " + document.lastModified); entry in level-1. In the example has entry 15 on level 1 a pointer to entry 15 on level 0 and entry 31 on level 1 a pointer to entry 31 on level 0. </p> -<a name="N107C2"></a><a name="Positions"></a> +<a name="N107DC"></a><a name="Positions"></a> <h3 class="boxed">Positions</h3> <p> The .prx file contains the lists of positions that @@ -2273,7 +2300,7 @@ document.write("Last Published: " + document.lastModified); Payload. If PayloadLength is not stored, then this Payload has the same length as the Payload at the previous position. </p> -<a name="N107FE"></a><a name="Normalization Factors"></a> +<a name="N10818"></a><a name="Normalization Factors"></a> <h3 class="boxed">Normalization Factors</h3> <p>There's a single .nrm file containing all norms: </p> @@ -2353,7 +2380,7 @@ document.write("Last Published: " + document.lastModified); </p> <p>Separate norm files are created (when adequate) for both compound and non compound segments. </p> -<a name="N1084F"></a><a name="Term Vectors"></a> +<a name="N10869"></a><a name="Term Vectors"></a> <h3 class="boxed">Term Vectors</h3> <p> Term Vector support is an optional on a field by @@ -2489,7 +2516,7 @@ document.write("Last Published: " + document.lastModified); </li> </ol> -<a name="N108EB"></a><a name="Deleted Documents"></a> +<a name="N10905"></a><a name="Deleted Documents"></a> <h3 class="boxed">Deleted Documents</h3> <p>The .del file is optional, and only exists when a segment contains deletions. 
@@ -2553,7 +2580,7 @@ document.write("Last Published: " + document.lastModified); </div> -<a name="N10925"></a><a name="Limitations"></a> +<a name="N1093F"></a><a name="Limitations"></a> <h2 class="boxed">Limitations</h2> <div class="section"> <p> diff --git a/lucene/docs/fileformats.pdf b/lucene/docs/fileformats.pdf index 4873eea8bae..8acadfdc1a4 100644 Binary files a/lucene/docs/fileformats.pdf and b/lucene/docs/fileformats.pdf differ diff --git a/lucene/docs/gettingstarted.html b/lucene/docs/gettingstarted.html index b70b2177ad2..29fc8cdc1b8 100644 --- a/lucene/docs/gettingstarted.html +++ b/lucene/docs/gettingstarted.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title> Apache Lucene - Getting Started Guide @@ -268,15 +268,13 @@ may wish to skip sections. <li> <a href="demo.html">About the command-line Lucene demo and its usage</a>. This section - is intended for anyone who wants to use the command-line Lucene demo.</li> -<p></p> + is intended for anyone who wants to use the command-line Lucene demo.</li> <li> <a href="demo2.html">About the sources and implementation for the command-line Lucene demo</a>. This section walks through the implementation details (sources) of the - command-line Lucene demo. This section is intended for developers.</li> -<p></p> + command-line Lucene demo. This section is intended for developers.</li> </ul> </div> diff --git a/lucene/docs/gettingstarted.pdf b/lucene/docs/gettingstarted.pdf index d53b62c45cc..b95fccb9e9e 100644 Binary files a/lucene/docs/gettingstarted.pdf and b/lucene/docs/gettingstarted.pdf differ diff --git a/lucene/docs/index.html b/lucene/docs/index.html index a65f2775bae..7c6f8b7c133 100644 --- a/lucene/docs/index.html +++ b/lucene/docs/index.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title>Lucene Java Documentation diff --git a/lucene/docs/index.pdf b/lucene/docs/index.pdf index 4e1a93a8647..8e10347a7e3 100644 Binary files a/lucene/docs/index.pdf and b/lucene/docs/index.pdf differ diff --git a/lucene/docs/linkmap.html b/lucene/docs/linkmap.html index 22af04471f0..89d0b146fb6 100644 --- a/lucene/docs/linkmap.html +++ b/lucene/docs/linkmap.html @@ -3,7 +3,7 @@ - + Site Linkmap Table of Contents diff --git a/lucene/docs/linkmap.pdf b/lucene/docs/linkmap.pdf index 789b0ce9847..d05bc8e2b31 100644 Binary files a/lucene/docs/linkmap.pdf and b/lucene/docs/linkmap.pdf differ diff --git a/lucene/docs/lucene-contrib/index.html b/lucene/docs/lucene-contrib/index.html index a217a52884b..b5164624c1e 100644 --- a/lucene/docs/lucene-contrib/index.html +++ b/lucene/docs/lucene-contrib/index.html @@ -3,7 +3,7 @@ - + Apache Lucene - Lucene Contrib diff --git a/lucene/docs/lucene-contrib/index.pdf b/lucene/docs/lucene-contrib/index.pdf index d3c4c988f66..1db16b319ee 100644 Binary files a/lucene/docs/lucene-contrib/index.pdf and b/lucene/docs/lucene-contrib/index.pdf differ diff --git a/lucene/docs/queryparsersyntax.html b/lucene/docs/queryparsersyntax.html index 59222f6c2a0..fed92a4fa9c 100644 --- a/lucene/docs/queryparsersyntax.html +++ b/lucene/docs/queryparsersyntax.html @@ -3,7 
+3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title> Apache Lucene - Query Parser Syntax diff --git a/lucene/docs/queryparsersyntax.pdf b/lucene/docs/queryparsersyntax.pdf index f2b045e82f5..79d61659ea2 100644 Binary files a/lucene/docs/queryparsersyntax.pdf and b/lucene/docs/queryparsersyntax.pdf differ diff --git a/lucene/docs/scoring.html b/lucene/docs/scoring.html index c30ec4cd2d0..a0326be5388 100644 --- a/lucene/docs/scoring.html +++ b/lucene/docs/scoring.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title> Apache Lucene - Scoring diff --git a/lucene/docs/scoring.pdf b/lucene/docs/scoring.pdf index 1855c61550e..0fdf7919bd3 100644 Binary files a/lucene/docs/scoring.pdf and b/lucene/docs/scoring.pdf differ diff --git a/lucene/docs/skin/images/apache-thanks.png b/lucene/docs/skin/images/apache-thanks.png new file mode 100644 index 00000000000..c0bea09cc8b Binary files /dev/null and b/lucene/docs/skin/images/apache-thanks.png differ diff --git a/lucene/docs/skin/images/built-with-cocoon.gif b/lucene/docs/skin/images/built-with-cocoon.gif new file mode 100644 index 00000000000..0b38f7857b6 Binary files /dev/null and b/lucene/docs/skin/images/built-with-cocoon.gif differ diff --git a/lucene/docs/systemrequirements.html b/lucene/docs/systemrequirements.html index e468f4c4e88..4025906ff9b 100644 --- a/lucene/docs/systemrequirements.html +++ b/lucene/docs/systemrequirements.html @@ -3,7 +3,7 @@ <head> <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> <meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> +<meta name="Forrest-version" content="0.9"> <meta name="Forrest-skin-name" content="lucene"> <title>Apache Lucene - System Requirements diff --git a/lucene/docs/systemrequirements.pdf b/lucene/docs/systemrequirements.pdf index f6b2924f055..14502241478 100644 Binary files a/lucene/docs/systemrequirements.pdf and b/lucene/docs/systemrequirements.pdf differ diff --git a/lucene/src/java/org/apache/lucene/document/Document.java b/lucene/src/java/org/apache/lucene/document/Document.java index 0343f673e5a..5d8262c8940 100644 --- a/lucene/src/java/org/apache/lucene/document/Document.java +++ b/lucene/src/java/org/apache/lucene/document/Document.java @@ -131,8 +131,13 @@ public final class Document { /** Returns a field with the given name if any exist in this document, or * null. If multiple fields exists with this name, this method returns the * first value added. - * Do not use this method with lazy loaded fields. + * Do not use this method with lazy loaded fields or {@link NumericField}. + * @deprecated use {@link #getFieldable} instead and cast depending on + * data type. + * @throws ClassCastException if you try to retrieve a numerical or + * lazy loaded field. */ + @Deprecated public final Field getField(String name) { return (Field) getFieldable(name); } @@ -154,6 +159,8 @@ public final class Document { * this document, or null. If multiple fields exist with this name, this * method returns the first value added. If only binary fields with this name * exist, returns null. 
+ * For {@link NumericField} it returns the string value of the number. If you want + * the actual {@code NumericField} instance back, use {@link #getFieldable}. */ public final String get(String name) { for (Fieldable field : fields) { @@ -177,13 +184,18 @@ public final class Document { /** * Returns an array of {@link Field}s with the given name. - * Do not use with lazy loaded fields. * This method returns an empty array when there are no * matching fields. It never returns null. + * Do not use this method with lazy loaded fields or {@link NumericField}. * * @param name the name of the field * @return a Field[] array + * @deprecated use {@link #getFieldable} instead and cast depending on + * data type. + * @throws ClassCastException if you try to retrieve a numerical or + * lazy loaded field. */ + @Deprecated public final Field[] getFields(String name) { List result = new ArrayList(); for (Fieldable field : fields) { @@ -230,6 +242,8 @@ public final class Document { * Returns an array of values of the field specified as the method parameter. * This method returns an empty array when there are no * matching fields. It never returns null. + * For {@link NumericField}s it returns the string value of the number. If you want + * the actual {@code NumericField} instances back, use {@link #getFieldables}. * @param name the name of the field * @return a String[] of field values */ diff --git a/lucene/src/java/org/apache/lucene/document/NumericField.java b/lucene/src/java/org/apache/lucene/document/NumericField.java index 6cae722a1d9..3bd46cf0e2f 100644 --- a/lucene/src/java/org/apache/lucene/document/NumericField.java +++ b/lucene/src/java/org/apache/lucene/document/NumericField.java @@ -127,18 +127,18 @@ import org.apache.lucene.search.FieldCache; // javadocs * class is a wrapper around this token stream type for * easier, more intuitive usage.

* - *

NOTE: This class is only used during - * indexing. When retrieving the stored field value from a - * {@link Document} instance after search, you will get a - * conventional {@link Fieldable} instance where the numeric - * values are returned as {@link String}s (according to - * toString(value) of the used data type). - * * @since 2.9 */ public final class NumericField extends AbstractField { - private final NumericTokenStream numericTS; + /** Data type of the value in {@link NumericField}. + * @since 3.2 + */ + public static enum DataType { INT, LONG, FLOAT, DOUBLE } + + private transient NumericTokenStream numericTS; + private DataType type; + private final int precisionStep; /** * Creates a field for numeric values using the default precisionStep @@ -158,8 +158,8 @@ public final class NumericField extends AbstractField { * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * @param name the field name - * @param store if the field should be stored in plain text form - * (according to toString(value) of the used data type) + * @param store if the field should be stored, {@link Document#getFieldable} + * then returns {@code NumericField} instances on search results. * @param index if the field should be indexed using {@link NumericTokenStream} */ public NumericField(String name, Field.Store store, boolean index) { @@ -186,19 +186,43 @@ public final class NumericField extends AbstractField { * set a value using the various set???Value() methods. * @param name the field name * @param precisionStep the used precision step - * @param store if the field should be stored in plain text form - * (according to toString(value) of the used data type) + * @param store if the field should be stored, {@link Document#getFieldable} + * then returns {@code NumericField} instances on search results. * @param index if the field should be indexed using {@link NumericTokenStream} */ public NumericField(String name, int precisionStep, Field.Store store, boolean index) { super(name, store, index ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NO, Field.TermVector.NO); + this.precisionStep = precisionStep; setOmitTermFreqAndPositions(true); - numericTS = new NumericTokenStream(precisionStep); } /** Returns a {@link NumericTokenStream} for indexing the numeric value. */ public TokenStream tokenStreamValue() { - return isIndexed() ? numericTS : null; + if (!isIndexed()) + return null; + if (numericTS == null) { + // lazy init the TokenStream as it is heavy to instantiate (attributes,...), + // if not needed (stored field loading) + numericTS = new NumericTokenStream(precisionStep); + // initialize value in TokenStream + if (fieldsData != null) { + assert type != null; + final Number val = (Number) fieldsData; + switch (type) { + case INT: + numericTS.setIntValue(val.intValue()); break; + case LONG: + numericTS.setLongValue(val.longValue()); break; + case FLOAT: + numericTS.setFloatValue(val.floatValue()); break; + case DOUBLE: + numericTS.setDoubleValue(val.doubleValue()); break; + default: + assert false : "Should never get here"; + } + } + } + return numericTS; } /** Returns always null for numeric fields */ @@ -212,7 +236,10 @@ public final class NumericField extends AbstractField { return null; } - /** Returns the numeric value as a string (how it is stored, when {@link Field.Store#YES} is chosen). */ + /** Returns the numeric value as a string. This format is also returned if you call {@link Document#get(String)} + * on search results. 
It is recommended to use {@link Document#getFieldable} instead + * that returns {@code NumericField} instances. You can then use {@link #getNumericValue} + * to return the stored value. */ public String stringValue() { return (fieldsData == null) ? null : fieldsData.toString(); } @@ -224,7 +251,14 @@ public final class NumericField extends AbstractField { /** Returns the precision step. */ public int getPrecisionStep() { - return numericTS.getPrecisionStep(); + return precisionStep; + } + + /** Returns the data type of the current value, {@code null} if not yet set. + * @since 3.2 + */ + public DataType getDataType() { + return type; } /** @@ -234,8 +268,9 @@ public final class NumericField extends AbstractField { * document.add(new NumericField(name, precisionStep).setLongValue(value)) */ public NumericField setLongValue(final long value) { - numericTS.setLongValue(value); + if (numericTS != null) numericTS.setLongValue(value); fieldsData = Long.valueOf(value); + type = DataType.LONG; return this; } @@ -246,8 +281,9 @@ public final class NumericField extends AbstractField { * document.add(new NumericField(name, precisionStep).setIntValue(value)) */ public NumericField setIntValue(final int value) { - numericTS.setIntValue(value); + if (numericTS != null) numericTS.setIntValue(value); fieldsData = Integer.valueOf(value); + type = DataType.INT; return this; } @@ -258,8 +294,9 @@ public final class NumericField extends AbstractField { * document.add(new NumericField(name, precisionStep).setDoubleValue(value)) */ public NumericField setDoubleValue(final double value) { - numericTS.setDoubleValue(value); + if (numericTS != null) numericTS.setDoubleValue(value); fieldsData = Double.valueOf(value); + type = DataType.DOUBLE; return this; } @@ -270,8 +307,9 @@ public final class NumericField extends AbstractField { * document.add(new NumericField(name, precisionStep).setFloatValue(value)) */ public NumericField setFloatValue(final float value) { - numericTS.setFloatValue(value); + if (numericTS != null) numericTS.setFloatValue(value); fieldsData = Float.valueOf(value); + type = DataType.FLOAT; return this; } diff --git a/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java b/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java index 11e55734046..745117daec0 100644 --- a/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java +++ b/lucene/src/java/org/apache/lucene/index/BufferedDeletesStream.java @@ -132,9 +132,9 @@ class BufferedDeletesStream { public final long gen; // If non-null, contains segments that are 100% deleted - public final SegmentInfos allDeleted; + public final List allDeleted; - ApplyDeletesResult(boolean anyDeletes, long gen, SegmentInfos allDeleted) { + ApplyDeletesResult(boolean anyDeletes, long gen, List allDeleted) { this.anyDeletes = anyDeletes; this.gen = gen; this.allDeleted = allDeleted; @@ -164,7 +164,7 @@ class BufferedDeletesStream { /** Resolves the buffered deleted Term/Query/docIDs, into * actual deleted docIDs in the deletedDocs BitVector for * each SegmentReader. 
*/ - public synchronized ApplyDeletesResult applyDeletes(IndexWriter.ReaderPool readerPool, SegmentInfos infos) throws IOException { + public synchronized ApplyDeletesResult applyDeletes(IndexWriter.ReaderPool readerPool, List infos) throws IOException { final long t0 = System.currentTimeMillis(); if (infos.size() == 0) { @@ -182,7 +182,7 @@ class BufferedDeletesStream { message("applyDeletes: infos=" + infos + " packetCount=" + deletes.size()); } - SegmentInfos infos2 = new SegmentInfos(); + List infos2 = new ArrayList(); infos2.addAll(infos); Collections.sort(infos2, sortSegInfoByDelGen); @@ -192,7 +192,7 @@ class BufferedDeletesStream { int infosIDX = infos2.size()-1; int delIDX = deletes.size()-1; - SegmentInfos allDeleted = null; + List allDeleted = null; while (infosIDX >= 0) { //System.out.println("BD: cycle delIDX=" + delIDX + " infoIDX=" + infosIDX); @@ -245,7 +245,7 @@ class BufferedDeletesStream { if (segAllDeletes) { if (allDeleted == null) { - allDeleted = new SegmentInfos(); + allDeleted = new ArrayList(); } allDeleted.add(info); } @@ -287,7 +287,7 @@ class BufferedDeletesStream { if (segAllDeletes) { if (allDeleted == null) { - allDeleted = new SegmentInfos(); + allDeleted = new ArrayList(); } allDeleted.add(info); } diff --git a/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java b/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java index c80a8343b16..f08da5c0b01 100644 --- a/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java +++ b/lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java @@ -46,8 +46,10 @@ import org.apache.lucene.util.IOUtils; * file. The {directory} that follows has that many entries. Each directory entry * contains a long pointer to the start of this file's data section, and a String * with that file's name. + * + * @lucene.internal */ -final class CompoundFileWriter { +public final class CompoundFileWriter { static final class FileEntry { @@ -137,8 +139,7 @@ final class CompoundFileWriter { /** Merge files with the extensions added up to now. * All files with these extensions are combined sequentially into the - * compound stream. After successful merge, the source files - * are deleted. + * compound stream. * @throws IllegalStateException if close() had been called before or * if no file has been added to this object */ diff --git a/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java b/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java index b9cafc7c5c2..bc29b35c241 100644 --- a/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java +++ b/lucene/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java @@ -135,8 +135,8 @@ public class ConcurrentMergeScheduler extends MergeScheduler { final MergePolicy.OneMerge m1 = t1.getCurrentMerge(); final MergePolicy.OneMerge m2 = t2.getCurrentMerge(); - final int c1 = m1 == null ? Integer.MAX_VALUE : m1.segments.totalDocCount(); - final int c2 = m2 == null ? Integer.MAX_VALUE : m2.segments.totalDocCount(); + final int c1 = m1 == null ? Integer.MAX_VALUE : m1.totalDocCount; + final int c2 = m2 == null ? 
Integer.MAX_VALUE : m2.totalDocCount; return c2 - c1; } diff --git a/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java b/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java index 154b4884b8d..53765f84f1d 100644 --- a/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java +++ b/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java @@ -263,9 +263,10 @@ final class DocFieldProcessor extends DocConsumer { // enabled; we could save [small amount of] CPU // here. ArrayUtil.quickSort(fields, 0, fieldCount, fieldsComp); - - for(int i=0;i it = perThreadPool.getAllPerThreadsIterator(); while (it.hasNext()) { - it.next().perThread.docState.infoStream = infoStream; + it.next().perThread.setInfoStream(infoStream); } } diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java index 486c12659f7..85d2550a066 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriterDeleteQueue.java @@ -63,9 +63,10 @@ import org.apache.lucene.search.Query; */ final class DocumentsWriterDeleteQueue { - private volatile Node tail; + private volatile Node tail; - private static final AtomicReferenceFieldUpdater tailUpdater = AtomicReferenceFieldUpdater + @SuppressWarnings("rawtypes") + private static final AtomicReferenceFieldUpdater tailUpdater = AtomicReferenceFieldUpdater .newUpdater(DocumentsWriterDeleteQueue.class, Node.class, "tail"); private final DeleteSlice globalSlice; @@ -90,7 +91,7 @@ final class DocumentsWriterDeleteQueue { * we use a sentinel instance as our initial tail. No slice will ever try to * apply this tail since the head is always omitted. */ - tail = new Node(null); // sentinel + tail = new Node(null); // sentinel globalSlice = new DeleteSlice(tail); } @@ -126,14 +127,14 @@ final class DocumentsWriterDeleteQueue { // we can do it just every n times or so? } - void add(Node item) { + void add(Node item) { /* * this non-blocking / 'wait-free' linked list add was inspired by Apache * Harmony's ConcurrentLinkedQueue Implementation. */ while (true) { - final Node currentTail = this.tail; - final Node tailNext = currentTail.next; + final Node currentTail = this.tail; + final Node tailNext = currentTail.next; if (tail == currentTail) { if (tailNext != null) { /* @@ -196,7 +197,7 @@ final class DocumentsWriterDeleteQueue { * deletes in the queue and reset the global slice to let the GC prune the * queue. */ - final Node currentTail = tail; // take the current tail make this local any + final Node currentTail = tail; // take the current tail make this local any // Changes after this call are applied later // and not relevant here if (callerSlice != null) { @@ -232,10 +233,10 @@ final class DocumentsWriterDeleteQueue { static class DeleteSlice { // No need to be volatile, slices are thread captive (only accessed by one thread)! - Node sliceHead; // we don't apply this one - Node sliceTail; + Node sliceHead; // we don't apply this one + Node sliceTail; - DeleteSlice(Node currentTail) { + DeleteSlice(Node currentTail) { assert currentTail != null; /* * Initially this is a 0 length slice pointing to the 'current' tail of @@ -256,7 +257,7 @@ final class DocumentsWriterDeleteQueue { * tail in this slice are not equal then there will be at least one more * non-null node in the slice! 
*/ - Node current = sliceHead; + Node current = sliceHead; do { current = current.next; assert current != null : "slice property violated between the head on the tail must not be a null node"; @@ -290,7 +291,7 @@ final class DocumentsWriterDeleteQueue { void clear() { globalBufferLock.lock(); try { - final Node currentTail = tail; + final Node currentTail = tail; globalSlice.sliceHead = globalSlice.sliceTail = currentTail; globalBufferedDeletes.clear(); } finally { @@ -298,27 +299,28 @@ final class DocumentsWriterDeleteQueue { } } - private static class Node { - volatile Node next; - final Object item; + private static class Node { + volatile Node next; + final T item; - private Node(Object item) { + Node(T item) { this.item = item; } - static final AtomicReferenceFieldUpdater nextUpdater = AtomicReferenceFieldUpdater + @SuppressWarnings("rawtypes") + static final AtomicReferenceFieldUpdater nextUpdater = AtomicReferenceFieldUpdater .newUpdater(Node.class, Node.class, "next"); void apply(BufferedDeletes bufferedDeletes, int docIDUpto) { assert false : "sentinel item must never be applied"; } - boolean casNext(Node cmp, Node val) { + boolean casNext(Node cmp, Node val) { return nextUpdater.compareAndSet(this, cmp, val); } } - private static final class TermNode extends Node { + private static final class TermNode extends Node { TermNode(Term term) { super(term); @@ -326,33 +328,31 @@ final class DocumentsWriterDeleteQueue { @Override void apply(BufferedDeletes bufferedDeletes, int docIDUpto) { - bufferedDeletes.addTerm((Term) item, docIDUpto); + bufferedDeletes.addTerm(item, docIDUpto); } } - private static final class QueryArrayNode extends Node { + private static final class QueryArrayNode extends Node { QueryArrayNode(Query[] query) { super(query); } @Override void apply(BufferedDeletes bufferedDeletes, int docIDUpto) { - final Query[] queries = (Query[]) item; - for (Query query : queries) { + for (Query query : item) { bufferedDeletes.addQuery(query, docIDUpto); } } } - private static final class TermArrayNode extends Node { + private static final class TermArrayNode extends Node { TermArrayNode(Term[] term) { super(term); } @Override void apply(BufferedDeletes bufferedDeletes, int docIDUpto) { - final Term[] terms = (Term[]) item; - for (Term term : terms) { + for (Term term : item) { bufferedDeletes.addTerm(term, docIDUpto); } } @@ -361,7 +361,7 @@ final class DocumentsWriterDeleteQueue { private boolean forceApplyGlobalSlice() { globalBufferLock.lock(); - final Node currentTail = tail; + final Node currentTail = tail; try { if (globalSlice.sliceTail != currentTail) { globalSlice.sliceTail = currentTail; diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java index 443df5139ca..932b3d29b36 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java @@ -122,13 +122,13 @@ public final class DocumentsWriterFlushControl { // is super important since we can not address more than 2048 MB per DWPT setFlushPending(perThread); if (fullFlush) { - DocumentsWriterPerThread toBlock = internalTryCheckOutForFlush(perThread, false); + DocumentsWriterPerThread toBlock = internalTryCheckOutForFlush(perThread); assert toBlock != null; blockedFlushes.add(toBlock); } } } - final DocumentsWriterPerThread flushingDWPT = tryCheckoutForFlush(perThread, false); + final DocumentsWriterPerThread 
flushingDWPT = tryCheckoutForFlush(perThread); healthiness.updateStalled(this); return flushingDWPT; } @@ -189,18 +189,15 @@ public final class DocumentsWriterFlushControl { } synchronized DocumentsWriterPerThread tryCheckoutForFlush( - ThreadState perThread, boolean setPending) { + ThreadState perThread) { if (fullFlush) { return null; } - return internalTryCheckOutForFlush(perThread, setPending); + return internalTryCheckOutForFlush(perThread); } private DocumentsWriterPerThread internalTryCheckOutForFlush( - ThreadState perThread, boolean setPending) { - if (setPending && !perThread.flushPending) { - setFlushPending(perThread); - } + ThreadState perThread) { if (perThread.flushPending) { // We are pending so all memory is already moved to flushBytes if (perThread.tryLock()) { @@ -245,7 +242,7 @@ public final class DocumentsWriterFlushControl { while (allActiveThreads.hasNext() && numPending > 0) { ThreadState next = allActiveThreads.next(); if (next.flushPending) { - final DocumentsWriterPerThread dwpt = tryCheckoutForFlush(next, false); + final DocumentsWriterPerThread dwpt = tryCheckoutForFlush(next); if (dwpt != null) { return dwpt; } @@ -330,7 +327,12 @@ public final class DocumentsWriterFlushControl { } if (next.perThread.getNumDocsInRAM() > 0 ) { final DocumentsWriterPerThread dwpt = next.perThread; // just for assert - final DocumentsWriterPerThread flushingDWPT = internalTryCheckOutForFlush(next, true); + synchronized (this) { + if (!next.flushPending) { + setFlushPending(next); + } + } + final DocumentsWriterPerThread flushingDWPT = internalTryCheckOutForFlush(next); assert flushingDWPT != null : "DWPT must never be null here since we hold the lock and it holds documents"; assert dwpt == flushingDWPT : "flushControl returned different DWPT"; toFlush.add(flushingDWPT); diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java index e943055bc37..4f14fd8f341 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java @@ -163,7 +163,7 @@ public class DocumentsWriterPerThread { boolean hasAborted = false; // True if the last exception throws by #updateDocument was aborting private FieldInfos fieldInfos; - private final PrintStream infoStream; + private PrintStream infoStream; private int numDocsInRAM; private int flushedDocCount; DocumentsWriterDeleteQueue deleteQueue; @@ -235,6 +235,7 @@ public class DocumentsWriterPerThread { // mark document as deleted deleteDocID(docState.docID); numDocsInRAM++; + fieldInfos.revertUncommitted(); } else { abort(); } @@ -377,15 +378,12 @@ public class DocumentsWriterPerThread { boolean success = false; try { - - SegmentInfo newSegment = new SegmentInfo(segment, flushState.numDocs, directory, false, fieldInfos.hasProx(), flushState.segmentCodecs, false, fieldInfos); consumer.flush(flushState); pendingDeletes.terms.clear(); - newSegment.setHasVectors(flushState.hasVectors); - + final SegmentInfo newSegment = new SegmentInfo(segment, flushState.numDocs, directory, false, flushState.segmentCodecs, fieldInfos.asReadOnly()); if (infoStream != null) { message("new segment has " + (flushState.deletedDocs == null ? 0 : flushState.deletedDocs.count()) + " deleted docs"); - message("new segment has " + (flushState.hasVectors ? "vectors" : "no vectors")); + message("new segment has " + (newSegment.getHasVectors() ? 
"vectors" : "no vectors")); message("flushedFiles=" + newSegment.files()); message("flushed codecs=" + newSegment.getSegmentCodecs()); } @@ -435,10 +433,6 @@ public class DocumentsWriterPerThread { return bytesUsed.get() + pendingDeletes.bytesUsed.get(); } - FieldInfos getFieldInfos() { - return fieldInfos; - } - void message(String message) { writer.message("DWPT: " + message); } @@ -498,4 +492,9 @@ public class DocumentsWriterPerThread { assert segment != null; return new PerDocWriteState(infoStream, directory, segment, fieldInfos, bytesUsed, codecId); } + + void setInfoStream(PrintStream infoStream) { + this.infoStream = infoStream; + docState.infoStream = infoStream; + } } diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/src/java/org/apache/lucene/index/FieldInfo.java index 3aba2850b42..26b8d30a3ea 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfo.java @@ -22,7 +22,6 @@ import org.apache.lucene.index.values.Type; /** @lucene.experimental */ public final class FieldInfo { public static final int UNASSIGNED_CODEC_ID = -1; - public final String name; public final int number; @@ -113,7 +112,6 @@ public final class FieldInfo { } assert !this.omitTermFreqAndPositions || !this.storePayloads; } - void setDocValues(Type v) { if (docValues == null) { docValues = v; @@ -127,4 +125,29 @@ public final class FieldInfo { public Type getDocValues() { return docValues; } + + private boolean vectorsCommitted; + + /** + * Reverts all uncommitted changes on this {@link FieldInfo} + * @see #commitVectors() + */ + void revertUncommitted() { + if (storeTermVector && !vectorsCommitted) { + storeOffsetWithTermVector = false; + storePositionWithTermVector = false; + storeTermVector = false; + } + } + + /** + * Commits term vector modifications. Changes to term-vectors must be + * explicitly committed once the necessary files are created. If those changes + * are not committed subsequent {@link #revertUncommitted()} will reset the + * all term-vector flags before the next document. 
+ */ + void commitVectors() { + assert storeTermVector; + vectorsCommitted = true; + } } diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index c62649a6bf1..422560ea057 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -220,6 +220,10 @@ public final class FieldInfos implements Iterable { static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40; private int format; + private boolean hasProx; // only set if readonly + private boolean hasVectors; // only set if readonly + private long version; // internal use to track changes + /** * Creates a new {@link FieldInfos} instance with a private @@ -267,7 +271,7 @@ public final class FieldInfos implements Iterable { */ public FieldInfos(Directory d, String name) throws IOException { this((FieldNumberBiMap)null, null); // use null here to make this FIs Read-Only - IndexInput input = d.openInput(name); + final IndexInput input = d.openInput(name); try { read(input, name); } finally { @@ -303,6 +307,9 @@ public final class FieldInfos implements Iterable { @Override synchronized public Object clone() { FieldInfos fis = new FieldInfos(globalFieldNumbers, segmentCodecsBuilder); + fis.format = format; + fis.hasProx = hasProx; + fis.hasVectors = hasVectors; for (FieldInfo fi : this) { FieldInfo clone = (FieldInfo) (fi).clone(); fis.putInternal(clone); @@ -312,6 +319,10 @@ public final class FieldInfos implements Iterable { /** Returns true if any fields do not omitTermFreqAndPositions */ public boolean hasProx() { + if (isReadOnly()) { + return hasProx; + } + // mutable FIs must check! for (FieldInfo fi : this) { if (fi.isIndexed && !fi.omitTermFreqAndPositions) { return true; @@ -445,6 +456,7 @@ public final class FieldInfos implements Iterable { if ((fi.isIndexed || fi.hasDocValues()) && fi.getCodecId() == FieldInfo.UNASSIGNED_CODEC_ID) { segmentCodecsBuilder.tryAddAndSet(fi); } + version++; return fi; } @@ -514,6 +526,10 @@ public final class FieldInfos implements Iterable { } public boolean hasVectors() { + if (isReadOnly()) { + return hasVectors; + } + // mutable FIs must check for (FieldInfo fi : this) { if (fi.storeTermVector) { return true; @@ -566,6 +582,10 @@ public final class FieldInfos implements Iterable { public final boolean isReadOnly() { return globalFieldNumbers == null; } + + synchronized final long getVersion() { + return version; + } public void write(IndexOutput output) throws IOException { output.writeVInt(FORMAT_CURRENT); @@ -658,7 +678,8 @@ public final class FieldInfos implements Iterable { if (omitTermFreqAndPositions) { storePayloads = false; } - + hasVectors |= storeTermVector; + hasProx |= isIndexed && !omitTermFreqAndPositions; Type docValuesType = null; if (format <= FORMAT_INDEX_VALUES) { final byte b = input.readByte(); @@ -705,5 +726,29 @@ public final class FieldInfos implements Iterable { throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length()); } } + + /** + * Reverts all uncommitted changes + * @see FieldInfo#revertUncommitted() + */ + void revertUncommitted() { + for (FieldInfo fieldInfo : this) { + fieldInfo.revertUncommitted(); + } + } + + final FieldInfos asReadOnly() { + if (isReadOnly()) { + return this; + } + final FieldInfos roFis = new FieldInfos((FieldNumberBiMap)null, null); + for (FieldInfo fieldInfo : this) { + FieldInfo clone = (FieldInfo) 
(fieldInfo).clone(); + roFis.putInternal(clone); + roFis.hasVectors |= clone.storeTermVector; + roFis.hasProx |= clone.isIndexed && !clone.omitTermFreqAndPositions; + } + return roFis; + } } diff --git a/lucene/src/java/org/apache/lucene/index/FieldsReader.java b/lucene/src/java/org/apache/lucene/index/FieldsReader.java index 76c0ed23552..e135d6d2870 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/FieldsReader.java @@ -24,10 +24,11 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.IndexInput; +import org.apache.lucene.document.NumericField; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedIndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.CloseableThreadLocal; import java.io.IOException; @@ -212,40 +213,39 @@ public final class FieldsReader implements Cloneable { Document doc = new Document(); int numFields = fieldsStream.readVInt(); - for (int i = 0; i < numFields; i++) { + out: for (int i = 0; i < numFields; i++) { int fieldNumber = fieldsStream.readVInt(); FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); - byte bits = fieldsStream.readByte(); - assert bits <= FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; + int bits = fieldsStream.readByte() & 0xFF; + assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; - //TODO: Find an alternative approach here if this list continues to grow beyond the - //list of 5 or 6 currently here. 
See Lucene 762 for discussion - if (acceptField.equals(FieldSelectorResult.LOAD)) { - addField(doc, fi, binary, tokenize); - } - else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ - addField(doc, fi, binary, tokenize); - break;//Get out of this loop - } - else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { - addFieldLazy(doc, fi, binary, tokenize, true); - } - else if (acceptField.equals(FieldSelectorResult.LATENT)) { - addFieldLazy(doc, fi, binary, tokenize, false); - } - else if (acceptField.equals(FieldSelectorResult.SIZE)){ - skipField(addFieldSize(doc, fi, binary)); - } - else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ - addFieldSize(doc, fi, binary); - break; - } - else { - skipField(); + final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK; + + switch (acceptField) { + case LOAD: + addField(doc, fi, binary, tokenize, numeric); + break; + case LOAD_AND_BREAK: + addField(doc, fi, binary, tokenize, numeric); + break out; //Get out of this loop + case LAZY_LOAD: + addFieldLazy(doc, fi, binary, tokenize, true, numeric); + break; + case LATENT: + addFieldLazy(doc, fi, binary, tokenize, false, numeric); + break; + case SIZE: + skipFieldBytes(addFieldSize(doc, fi, binary, numeric)); + break; + case SIZE_AND_BREAK: + addFieldSize(doc, fi, binary, numeric); + break out; //Get out of this loop + default: + skipField(numeric); } } @@ -282,72 +282,121 @@ public final class FieldsReader implements Cloneable { * Skip the field. We still have to read some of the information about the field, but can skip past the actual content. * This will have the most payoff on large fields. */ - private void skipField() throws IOException { - skipField(fieldsStream.readVInt()); + private void skipField(int numeric) throws IOException { + final int numBytes; + switch(numeric) { + case 0: + numBytes = fieldsStream.readVInt(); + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + numBytes = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + numBytes = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + + skipFieldBytes(numBytes); } - private void skipField(int toRead) throws IOException { + private void skipFieldBytes(int toRead) throws IOException { fieldsStream.seek(fieldsStream.getFilePointer() + toRead); } - private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize, boolean cacheResult) throws IOException { + private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException { + assert numeric != 0; + switch(numeric) { + case FieldsWriter.FIELD_IS_NUMERIC_INT: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readInt()); + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readLong()); + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readInt())); + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readLong())); + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } + + private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize, boolean 
cacheResult, int numeric) throws IOException { + final AbstractField f; if (binary) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); - //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); - doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, cacheResult)); + f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, cacheResult); //Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); + } else if (numeric != 0) { + f = loadNumericField(fi, numeric); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); - AbstractField f; int length = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); //Skip ahead of where we are by the length of what is stored fieldsStream.seek(pointer+length); f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, cacheResult); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - - doc.add(f); } - + + f.setOmitNorms(fi.omitNorms); + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + doc.add(f); } - private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws CorruptIndexException, IOException { + private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize, int numeric) throws CorruptIndexException, IOException { + final AbstractField f; if (binary) { int toRead = fieldsStream.readVInt(); final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); - doc.add(new Field(fi.name, b)); + f = new Field(fi.name, b); + } else if (numeric != 0) { + f = loadNumericField(fi, numeric); } else { - Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); - - AbstractField f; f = new Field(fi.name, // name - false, - fieldsStream.readString(), // read value - store, - index, - termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); - - doc.add(f); + false, + fieldsStream.readString(), // read value + Field.Store.YES, + index, + termVector); } + + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + f.setOmitNorms(fi.omitNorms); + doc.add(f); } // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes) // Read just the size -- caller must skip the field content to continue reading fields // Return the size in bytes or chars, depending on field type - private int addFieldSize(Document doc, FieldInfo fi, boolean binary) throws IOException { - int size = fieldsStream.readVInt(), bytesize = binary ? size : 2*size; + private int addFieldSize(Document doc, FieldInfo fi, boolean binary, int numeric) throws IOException { + final int bytesize, size; + switch(numeric) { + case 0: + size = fieldsStream.readVInt(); + bytesize = binary ? 
size : 2*size; + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + size = bytesize = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + size = bytesize = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } byte[] sizebytes = new byte[4]; sizebytes[0] = (byte) (bytesize>>>24); sizebytes[1] = (byte) (bytesize>>>16); @@ -358,7 +407,7 @@ public final class FieldsReader implements Cloneable { } /** - * A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is + * A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is * loaded. */ private class LazyField extends AbstractField implements Fieldable { diff --git a/lucene/src/java/org/apache/lucene/index/FieldsWriter.java b/lucene/src/java/org/apache/lucene/index/FieldsWriter.java index 303aa912bc3..9efd909574e 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/FieldsWriter.java @@ -21,22 +21,40 @@ import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; final class FieldsWriter { - static final byte FIELD_IS_TOKENIZED = 0x1; - static final byte FIELD_IS_BINARY = 0x2; + static final int FIELD_IS_TOKENIZED = 1 << 0; + static final int FIELD_IS_BINARY = 1 << 1; + // the old bit 1 << 2 was compressed, is now left out + + private static final int _NUMERIC_BIT_SHIFT = 3; + static final int FIELD_IS_NUMERIC_MASK = 0x07 << _NUMERIC_BIT_SHIFT; + + static final int FIELD_IS_NUMERIC_INT = 1 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_LONG = 2 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_FLOAT = 3 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_DOUBLE = 4 << _NUMERIC_BIT_SHIFT; + // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT; + // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT; + + // the next possible bits are: 1 << 6; 1 << 7 + // Lucene 3.0: Removal of compressed fields static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; + // Lucene 3.2: NumericFields are stored in binary format + static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + // NOTE: if you introduce a new format, make it 1 higher // than the current one, and always change this if you // switch to a new format! 
- static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; + static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; // when removing support for old versions, leave the last supported version here static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; @@ -121,13 +139,26 @@ final class FieldsWriter { final void writeField(int fieldNumber, Fieldable field) throws IOException { fieldsStream.writeVInt(fieldNumber); - byte bits = 0; + int bits = 0; if (field.isTokenized()) - bits |= FieldsWriter.FIELD_IS_TOKENIZED; + bits |= FIELD_IS_TOKENIZED; if (field.isBinary()) - bits |= FieldsWriter.FIELD_IS_BINARY; - - fieldsStream.writeByte(bits); + bits |= FIELD_IS_BINARY; + if (field instanceof NumericField) { + switch (((NumericField) field).getDataType()) { + case INT: + bits |= FIELD_IS_NUMERIC_INT; break; + case LONG: + bits |= FIELD_IS_NUMERIC_LONG; break; + case FLOAT: + bits |= FIELD_IS_NUMERIC_FLOAT; break; + case DOUBLE: + bits |= FIELD_IS_NUMERIC_DOUBLE; break; + default: + assert false : "Should never get here"; + } + } + fieldsStream.writeByte((byte) bits); if (field.isBinary()) { final byte[] data; @@ -139,8 +170,22 @@ final class FieldsWriter { fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, offset, len); - } - else { + } else if (field instanceof NumericField) { + final NumericField nf = (NumericField) field; + final Number n = nf.getNumericValue(); + switch (nf.getDataType()) { + case INT: + fieldsStream.writeInt(n.intValue()); break; + case LONG: + fieldsStream.writeLong(n.longValue()); break; + case FLOAT: + fieldsStream.writeInt(Float.floatToIntBits(n.floatValue())); break; + case DOUBLE: + fieldsStream.writeLong(Double.doubleToLongBits(n.doubleValue())); break; + default: + assert false : "Should never get here"; + } + } else { fieldsStream.writeString(field.stringValue()); } } diff --git a/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java b/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java index ecf41bacabc..c4559870cfb 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java +++ b/lucene/src/java/org/apache/lucene/index/IndexFileDeleter.java @@ -22,6 +22,7 @@ import java.io.FilenameFilter; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Date; @@ -196,7 +197,31 @@ final class IndexFileDeleter { } } if (sis != null) { - CommitPoint commitPoint = new CommitPoint(commitsToDelete, directory, sis); + final SegmentInfos infos = sis; + for (SegmentInfo segmentInfo : infos) { + try { + /* + * Force FI to load for each segment since we could see a + * segments file and load successfully above if the files are + * still referenced when they are deleted and the os doesn't let + * you delete them. Yet its likely that fnm files are removed + * while seg file is still around Since LUCENE-2984 we need FI + * to find out if a seg has vectors and prox so we need those + * files to be opened for a commit point. 
+ */ + segmentInfo.getFieldInfos(); + } catch (FileNotFoundException e) { + refresh(segmentInfo.name); + sis = null; + if (infoStream != null) { + message("init: hit FileNotFoundException when loading commit \"" + fileName + "\"; skipping this commit point"); + } + } + } + + } + if (sis != null) { + final CommitPoint commitPoint = new CommitPoint(commitsToDelete, directory, sis); if (sis.getGeneration() == segmentInfos.getGeneration()) { currentCommitPoint = commitPoint; } diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index 984f77b7117..ed7d472c33f 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -1428,7 +1428,7 @@ public abstract class IndexReader implements Cloneable,Closeable { cfr = new CompoundFileReader(dir, filename); String [] files = cfr.listAll(); - ArrayUtil.quickSort(files); // sort the array of filename so that the output is more readable + ArrayUtil.mergeSort(files); // sort the array of filename so that the output is more readable for (int i = 0; i < files.length; ++i) { long len = cfr.fileLength(files[i]); diff --git a/lucene/src/java/org/apache/lucene/index/IndexUpgrader.java b/lucene/src/java/org/apache/lucene/index/IndexUpgrader.java new file mode 100644 index 00000000000..e53dae99a2c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/IndexUpgrader.java @@ -0,0 +1,129 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Constants; +import org.apache.lucene.util.Version; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Collection; + +/** + * This is an easy-to-use tool that upgrades all segments of an index from previous Lucene versions + * to the current segment file format. It can be used from command line: + *
+  *  java -cp lucene-core.jar org.apache.lucene.index.IndexUpgrader [-delete-prior-commits] [-verbose] indexDir
+  * 
+ * Alternatively, this class can be instantiated and {@link #upgrade} invoked. It uses {@link UpgradeIndexMergePolicy} + * and triggers the upgrade via an optimize request to {@link IndexWriter}. + *

This tool keeps only the last commit in an index; for this + * reason, if the incoming index has more than one commit, the tool + * refuses to run by default. Specify {@code -delete-prior-commits} + * to override this, allowing the tool to delete all but the last commit. + * From Java code this can be enabled by passing {@code true} to + * {@link #IndexUpgrader(Directory,PrintStream,boolean)}. + */ +public final class IndexUpgrader { + + private static void printUsage() { + System.err.println("Upgrades an index so all segments created with a previous Lucene version are rewritten."); + System.err.println("Usage:"); + System.err.println(" java " + IndexUpgrader.class.getName() + " [-delete-prior-commits] [-verbose] indexDir"); + System.err.println("This tool keeps only the last commit in an index; for this"); + System.err.println("reason, if the incoming index has more than one commit, the tool"); + System.err.println("refuses to run by default. Specify -delete-prior-commits to override"); + System.err.println("this, allowing the tool to delete all but the last commit."); + System.exit(1); + } + + public static void main(String[] args) throws IOException { + String dir = null; + boolean deletePriorCommits = false; + PrintStream out = null; + for (String arg : args) { + if ("-delete-prior-commits".equals(arg)) { + deletePriorCommits = true; + } else if ("-verbose".equals(arg)) { + out = System.out; + } else if (dir == null) { + dir = arg; + } else { + printUsage(); + } + } + if (dir == null) { + printUsage(); + } + + new IndexUpgrader(FSDirectory.open(new File(dir)), out, deletePriorCommits).upgrade(); + } + + private final Directory dir; + private final PrintStream infoStream; + private final IndexWriterConfig iwc; + private final boolean deletePriorCommits; + + @SuppressWarnings("deprecation") + public IndexUpgrader(Directory dir) { + this(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, null), null, false); + } + + @SuppressWarnings("deprecation") + public IndexUpgrader(Directory dir, PrintStream infoStream, boolean deletePriorCommits) { + this(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, null), infoStream, deletePriorCommits); + } + + public IndexUpgrader(Directory dir, IndexWriterConfig iwc, PrintStream infoStream, boolean deletePriorCommits) { + this.dir = dir; + this.iwc = iwc; + this.infoStream = infoStream; + this.deletePriorCommits = deletePriorCommits; + } + + public void upgrade() throws IOException { + if (!IndexReader.indexExists(dir)) { + throw new IndexNotFoundException(dir.toString()); + } + + if (!deletePriorCommits) { + final Collection commits = IndexReader.listCommits(dir); + if (commits.size() > 1) { + throw new IllegalArgumentException("This tool was invoked to not delete prior commit points, but the following commits were found: " + commits); + } + } + + final IndexWriterConfig c = (IndexWriterConfig) iwc.clone(); + c.setMergePolicy(new UpgradeIndexMergePolicy(c.getMergePolicy())); + c.setIndexDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()); + + final IndexWriter w = new IndexWriter(dir, c); + try { + w.setInfoStream(infoStream); + w.message("Upgrading all pre-" + Constants.LUCENE_MAIN_VERSION + " segments of index directory '" + dir + "' to version " + Constants.LUCENE_MAIN_VERSION + "..."); + w.optimize(); + w.message("All segments upgraded to version " + Constants.LUCENE_MAIN_VERSION); + } finally { + w.close(); + } + } + +} diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/src/java/org/apache/lucene/index/IndexWriter.java 
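For reference, a minimal sketch of driving the new IndexUpgrader tool from Java, equivalent to the command line shown in its javadoc above. The snippet is not part of the patch: the index path is a placeholder, and it only uses the IndexUpgrader(Directory, PrintStream, boolean) constructor and upgrade() method added in this diff.

    import java.io.File;
    import org.apache.lucene.index.IndexUpgrader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class UpgradeIndexExample {
      public static void main(String[] args) throws Exception {
        // open the index directory; the path is a placeholder
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        // print progress to System.out; false = keep prior commits, so the tool
        // refuses to run if the index contains more than one commit
        new IndexUpgrader(dir, System.out, false).upgrade();
      }
    }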
index 826049c997f..2a6d4ae9519 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/src/java/org/apache/lucene/index/IndexWriter.java @@ -421,7 +421,7 @@ public class IndexWriter implements Closeable { private final Map readerMap = new HashMap(); /** Forcefully clear changes for the specified segments. This is called on successful merge. */ - synchronized void clear(SegmentInfos infos) throws IOException { + synchronized void clear(List infos) throws IOException { if (infos == null) { for (Map.Entry ent: readerMap.entrySet()) { ent.getValue().hasChanges = false; @@ -511,7 +511,7 @@ public class IndexWriter implements Closeable { return false; } - public synchronized void drop(SegmentInfos infos) throws IOException { + public synchronized void drop(List infos) throws IOException { for(SegmentInfo info : infos) { drop(info); } @@ -2355,7 +2355,7 @@ public class IndexWriter implements Closeable { String mergedName = newSegmentName(); SegmentMerger merger = new SegmentMerger(directory, config.getTermIndexInterval(), - mergedName, null, codecs, payloadProcessorProvider, + mergedName, null, payloadProcessorProvider, globalFieldNumberMap.newFieldInfos(SegmentCodecsBuilder.create(codecs))); for (IndexReader reader : readers) // add new indexes @@ -2365,8 +2365,7 @@ public class IndexWriter implements Closeable { final FieldInfos fieldInfos = merger.fieldInfos(); SegmentInfo info = new SegmentInfo(mergedName, docCount, directory, - false, fieldInfos.hasProx(), merger.getSegmentCodecs(), - fieldInfos.hasVectors(), + false, merger.getSegmentCodecs(), fieldInfos); setDiagnostics(info, "addIndexes(IndexReader...)"); @@ -2729,7 +2728,7 @@ public class IndexWriter implements Closeable { assert testPoint("startCommitMergeDeletes"); - final SegmentInfos sourceSegments = merge.segments; + final List sourceSegments = merge.segments; if (infoStream != null) message("commitMergeDeletes " + merge.segString(directory)); @@ -2741,7 +2740,7 @@ public class IndexWriter implements Closeable { long minGen = Long.MAX_VALUE; for(int i=0; i < sourceSegments.size(); i++) { - SegmentInfo info = sourceSegments.info(i); + SegmentInfo info = sourceSegments.get(i); minGen = Math.min(info.getBufferedDeletesGen(), minGen); int docCount = info.docCount; final SegmentReader previousReader = merge.readerClones.get(i); @@ -3041,7 +3040,16 @@ public class IndexWriter implements Closeable { // is running (while synchronized) to avoid race // condition where two conflicting merges from different // threads, start - message("registerMerge merging=" + mergingSegments); + if (infoStream != null) { + StringBuilder builder = new StringBuilder("registerMerge merging= ["); + for (SegmentInfo info : mergingSegments) { + builder.append(info.name).append(", "); + } + builder.append("]"); + // don't call mergingSegments.toString() could lead to ConcurrentModException + // since merge updates the segments FieldInfos + message(builder.toString()); + } for(SegmentInfo info : merge.segments) { message("registerMerge info=" + info); mergingSegments.add(info); @@ -3094,7 +3102,7 @@ public class IndexWriter implements Closeable { // Bind a new segment name here so even with // ConcurrentMergePolicy we keep deterministic segment // names. 
- merge.info = new SegmentInfo(newSegmentName(), 0, directory, false, false, null, false, globalFieldNumberMap.newFieldInfos(SegmentCodecsBuilder.create(codecs))); + merge.info = new SegmentInfo(newSegmentName(), 0, directory, false, null, globalFieldNumberMap.newFieldInfos(SegmentCodecsBuilder.create(codecs))); // Lock order: IW -> BD final BufferedDeletesStream.ApplyDeletesResult result = bufferedDeletesStream.applyDeletes(readerPool, merge.segments); @@ -3133,6 +3141,16 @@ public class IndexWriter implements Closeable { message("merge seg=" + merge.info.name); } + assert merge.estimatedMergeBytes == 0; + for(SegmentInfo info : merge.segments) { + if (info.docCount > 0) { + final int delCount = numDeletedDocs(info); + assert delCount <= info.docCount; + final double delRatio = ((double) delCount)/info.docCount; + merge.estimatedMergeBytes += info.sizeInBytes(true) * (1.0 - delRatio); + } + } + // TODO: I think this should no longer be needed (we // now build CFS before adding segment to the infos); // however, on removing it, tests fail for some reason! @@ -3174,7 +3192,7 @@ public class IndexWriter implements Closeable { // It's possible we are called twice, eg if there was an // exception inside mergeInit if (merge.registerDone) { - final SegmentInfos sourceSegments = merge.segments; + final List sourceSegments = merge.segments; for(SegmentInfo info : sourceSegments) { mergingSegments.remove(info); } @@ -3245,21 +3263,17 @@ public class IndexWriter implements Closeable { int mergedDocCount = 0; - SegmentInfos sourceSegments = merge.segments; + List sourceSegments = merge.segments; SegmentMerger merger = new SegmentMerger(directory, config.getTermIndexInterval(), mergedName, merge, - codecs, payloadProcessorProvider, - merge.info.getFieldInfos()); + payloadProcessorProvider, merge.info.getFieldInfos()); if (infoStream != null) { - message("merging " + merge.segString(directory) + " mergeVectors=" + merger.fieldInfos().hasVectors()); + message("merging " + merge.segString(directory) + " mergeVectors=" + merge.info.getFieldInfos().hasVectors()); } merge.readers = new ArrayList(); merge.readerClones = new ArrayList(); - - merge.estimatedMergeBytes = 0; - // This is try/finally to make sure merger's readers are // closed: boolean success = false; @@ -3268,7 +3282,7 @@ public class IndexWriter implements Closeable { int segUpto = 0; while(segUpto < sourceSegments.size()) { - final SegmentInfo info = sourceSegments.info(segUpto); + final SegmentInfo info = sourceSegments.get(segUpto); // Hold onto the "live" reader; we will use this to // commit merged deletes @@ -3277,13 +3291,6 @@ public class IndexWriter implements Closeable { -config.getReaderTermsIndexDivisor()); merge.readers.add(reader); - final int readerMaxDoc = reader.maxDoc(); - if (readerMaxDoc > 0) { - final int delCount = reader.numDeletedDocs(); - final double delRatio = ((double) delCount)/readerMaxDoc; - merge.estimatedMergeBytes += info.sizeInBytes(true) * (1.0 - delRatio); - } - // We clone the segment readers because other // deletes may come in while we're merging so we // need readers that will not change @@ -3308,8 +3315,6 @@ public class IndexWriter implements Closeable { // Record which codec was used to write the segment merge.info.setSegmentCodecs(merger.getSegmentCodecs()); - // Record if we have merged vectors - merge.info.setHasVectors(merger.fieldInfos().hasVectors()); if (infoStream != null) { message("merge segmentCodecs=" + merger.getSegmentCodecs()); @@ -3323,13 +3328,11 @@ public class IndexWriter 
implements Closeable { // because codec must know if prox was written for // this segment: //System.out.println("merger set hasProx=" + merger.hasProx() + " seg=" + merge.info.name); - merge.info.setHasProx(merger.fieldInfos().hasProx()); - boolean useCompoundFile; synchronized (this) { // Guard segmentInfos useCompoundFile = mergePolicy.useCompoundFile(segmentInfos, merge.info); } - + if (useCompoundFile) { success = false; final String compoundFileName = IndexFileNames.segmentFileName(mergedName, "", IndexFileNames.COMPOUND_FILE_EXTENSION); @@ -3469,14 +3472,14 @@ public class IndexWriter implements Closeable { } /** @lucene.internal */ - public synchronized String segString(SegmentInfos infos) throws IOException { + public synchronized String segString(List infos) throws IOException { StringBuilder buffer = new StringBuilder(); final int count = infos.size(); for(int i = 0; i < count; i++) { if (i > 0) { buffer.append(' '); } - buffer.append(segString(infos.info(i))); + buffer.append(segString(infos.get(i))); } return buffer.toString(); @@ -3531,6 +3534,7 @@ public class IndexWriter implements Closeable { // called only from assert private boolean filesExist(SegmentInfos toSync) throws IOException { + Collection files = toSync.files(directory, false); for(final String fileName: files) { assert directory.fileExists(fileName): "file " + fileName + " does not exist"; diff --git a/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java b/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java index 1be4f26b77f..fc419bd2f7c 100644 --- a/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java +++ b/lucene/src/java/org/apache/lucene/index/LogMergePolicy.java @@ -20,7 +20,6 @@ package org.apache.lucene.index; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.Comparator; import java.util.List; import java.util.Set; @@ -595,7 +594,7 @@ public abstract class LogMergePolicy extends MergePolicy { } else if (!anyTooLarge) { if (spec == null) spec = new MergeSpecification(); - final SegmentInfos mergeInfos = new SegmentInfos(); + final List mergeInfos = new ArrayList(); for(int i=start;i 0; // Make sure it all adds up: - assert docShift == maxDocID - (newStarts[docMaps.length-1] + merge.segments.info(docMaps.length-1).docCount - delCounts[docMaps.length-1]); + assert docShift == maxDocID - (newStarts[docMaps.length-1] + merge.segments.get(docMaps.length-1).docCount - delCounts[docMaps.length-1]); } public int remap(int oldDocID) { diff --git a/lucene/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/src/java/org/apache/lucene/index/MergePolicy.java index 31289bd18d6..bbced4e9cef 100644 --- a/lucene/src/java/org/apache/lucene/index/MergePolicy.java +++ b/lucene/src/java/org/apache/lucene/index/MergePolicy.java @@ -75,15 +75,21 @@ public abstract class MergePolicy implements java.io.Closeable { long estimatedMergeBytes; // used by IndexWriter List readers; // used by IndexWriter List readerClones; // used by IndexWriter - public final SegmentInfos segments; + public final List segments; + public final int totalDocCount; boolean aborted; Throwable error; boolean paused; - public OneMerge(SegmentInfos segments) { + public OneMerge(List segments) { if (0 == segments.size()) throw new RuntimeException("segments must include at least one segment"); this.segments = segments; + int count = 0; + for(SegmentInfo info : segments) { + count += info.docCount; + } + totalDocCount = count; } /** Record that an exception occurred while executing 
@@ -147,7 +153,7 @@ public abstract class MergePolicy implements java.io.Closeable { final int numSegments = segments.size(); for(int i=0;i 0) b.append(' '); - b.append(segments.info(i).toString(dir, 0)); + b.append(segments.get(i).toString(dir, 0)); } if (info != null) b.append(" into ").append(info.name); diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java index f7999da4219..15c400e6c87 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java @@ -43,7 +43,8 @@ import org.apache.lucene.util.Constants; * @lucene.experimental */ public final class SegmentInfo { - + // TODO: remove with hasVector and hasProx + private static final int CHECK_FIELDINFO = -2; static final int NO = -1; // e.g. no norms; no deletes; static final int YES = 1; // e.g. have norms; have deletes; static final int WITHOUT_GEN = 0; // a file name that has no GEN in it. @@ -85,10 +86,12 @@ public final class SegmentInfo { private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx) private int delCount; // How many deleted docs in this segment + + //TODO: remove when we don't have to support old indexes anymore that had this field + private int hasVectors = CHECK_FIELDINFO; + //TODO: remove when we don't have to support old indexes anymore that had this field + private int hasProx = CHECK_FIELDINFO; // True if this segment has any fields with omitTermFreqAndPositions==false - private boolean hasProx; // True if this segment has any fields with omitTermFreqAndPositions==false - - private boolean hasVectors; // True if this segment wrote term vectors private FieldInfos fieldInfos; @@ -106,9 +109,12 @@ public final class SegmentInfo { // NOTE: only used in-RAM by IW to track buffered deletes; // this is never written to/read from the Directory private long bufferedDeletesGen; - + + // holds the fieldInfos Version to refresh files() cache if FI has changed + private long fieldInfosVersion; + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, - boolean hasProx, SegmentCodecs segmentCodecs, boolean hasVectors, FieldInfos fieldInfos) { + SegmentCodecs segmentCodecs, FieldInfos fieldInfos) { this.name = name; this.docCount = docCount; this.dir = dir; @@ -116,9 +122,7 @@ public final class SegmentInfo { this.isCompoundFile = isCompoundFile; this.docStoreOffset = -1; this.docStoreSegment = name; - this.hasProx = hasProx; this.segmentCodecs = segmentCodecs; - this.hasVectors = hasVectors; delCount = 0; version = Constants.LUCENE_MAIN_VERSION; this.fieldInfos = fieldInfos; @@ -213,7 +217,7 @@ public final class SegmentInfo { delCount = input.readInt(); assert delCount <= docCount; - hasProx = input.readByte() == YES; + hasProx = input.readByte(); // System.out.println(Thread.currentThread().getName() + ": si.read hasProx=" + hasProx + " seg=" + name); if (format <= DefaultSegmentInfosWriter.FORMAT_4_0) { @@ -226,7 +230,7 @@ public final class SegmentInfo { diagnostics = input.readStringStringMap(); if (format <= DefaultSegmentInfosWriter.FORMAT_HAS_VECTORS) { - hasVectors = input.readByte() == 1; + hasVectors = input.readByte(); } else { final String storesSegment; final String ext; @@ -247,7 +251,7 @@ public final class SegmentInfo { dirToTest = dir; } try { - hasVectors = dirToTest.fileExists(IndexFileNames.segmentFileName(storesSegment, "", IndexFileNames.VECTORS_INDEX_EXTENSION)); + hasVectors = 
dirToTest.fileExists(IndexFileNames.segmentFileName(storesSegment, "", IndexFileNames.VECTORS_INDEX_EXTENSION)) ? YES : NO; } finally { if (isCompoundFile) { dirToTest.close(); @@ -311,14 +315,9 @@ public final class SegmentInfo { } public boolean getHasVectors() throws IOException { - return hasVectors; + return hasVectors == CHECK_FIELDINFO ? getFieldInfos().hasVectors() : hasVectors == YES; } - - public void setHasVectors(boolean v) { - hasVectors = v; - clearFilesCache(); - } - + public FieldInfos getFieldInfos() throws IOException { loadFieldInfos(dir, true); return fieldInfos; @@ -349,7 +348,7 @@ public final class SegmentInfo { @Override public Object clone() { - final SegmentInfo si = new SegmentInfo(name, docCount, dir, isCompoundFile, hasProx, segmentCodecs, hasVectors, + final SegmentInfo si = new SegmentInfo(name, docCount, dir, isCompoundFile, segmentCodecs, fieldInfos == null ? null : (FieldInfos) fieldInfos.clone()); si.docStoreOffset = docStoreOffset; si.docStoreSegment = docStoreSegment; @@ -364,6 +363,8 @@ public final class SegmentInfo { } } si.version = version; + si.hasProx = hasProx; + si.hasVectors = hasVectors; return si; } @@ -569,19 +570,14 @@ public final class SegmentInfo { output.writeByte((byte) (isCompoundFile ? YES : NO)); output.writeInt(delCount); - output.writeByte((byte) (hasProx ? 1:0)); + output.writeByte((byte) (hasProx)); segmentCodecs.write(output); output.writeStringStringMap(diagnostics); - output.writeByte((byte) (hasVectors ? 1 : 0)); + output.writeByte((byte) (hasVectors)); } - void setHasProx(boolean hasProx) { - this.hasProx = hasProx; - clearFilesCache(); - } - - public boolean getHasProx() { - return hasProx; + public boolean getHasProx() throws IOException { + return hasProx == CHECK_FIELDINFO ? getFieldInfos().hasProx() : hasProx == YES; } /** Can only be called once. 
*/ @@ -609,13 +605,14 @@ public final class SegmentInfo { */ public List files() throws IOException { - - if (files != null) { + final long fisVersion = fieldInfosVersion; + if (fisVersion != (fieldInfosVersion = getFieldInfos().getVersion())) { + clearFilesCache(); // FIS has modifications - need to recompute + } else if (files != null) { // Already cached: return files; } - - Set fileSet = new HashSet(); + final Set fileSet = new HashSet(); boolean useCompoundFile = getUseCompoundFile(); @@ -637,7 +634,7 @@ public final class SegmentInfo { } else { fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.FIELDS_INDEX_EXTENSION)); fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.FIELDS_EXTENSION)); - if (hasVectors) { + if (getHasVectors()) { fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.VECTORS_INDEX_EXTENSION)); fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION)); fileSet.add(IndexFileNames.segmentFileName(docStoreSegment, "", IndexFileNames.VECTORS_FIELDS_EXTENSION)); @@ -646,7 +643,7 @@ public final class SegmentInfo { } else if (!useCompoundFile) { fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.FIELDS_INDEX_EXTENSION)); fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.FIELDS_EXTENSION)); - if (hasVectors) { + if (getHasVectors()) { fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.VECTORS_INDEX_EXTENSION)); fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION)); fileSet.add(IndexFileNames.segmentFileName(name, "", IndexFileNames.VECTORS_FIELDS_EXTENSION)); @@ -709,8 +706,12 @@ public final class SegmentInfo { if (this.dir != dir) { s.append('x'); } - if (hasVectors) { - s.append('v'); + try { + if (getHasVectors()) { + s.append('v'); + } + } catch (IOException e) { + throw new RuntimeException(e); } s.append(docCount); diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java index 46c050e3588..4523d821286 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java @@ -72,7 +72,7 @@ final class SegmentMerger { private PayloadProcessorProvider payloadProcessorProvider; - SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, CodecProvider codecs, PayloadProcessorProvider payloadProcessorProvider, FieldInfos fieldInfos) { + SegmentMerger(Directory dir, int termIndexInterval, String name, MergePolicy.OneMerge merge, PayloadProcessorProvider payloadProcessorProvider, FieldInfos fieldInfos) { this.payloadProcessorProvider = payloadProcessorProvider; directory = dir; segment = name; diff --git a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java index c29add9bd93..79c2638add4 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java @@ -32,7 +32,6 @@ public class SegmentWriteState { public final String segmentName; public final FieldInfos fieldInfos; public final int numDocs; - public boolean hasVectors; // Deletes to apply while we are flushing the segment. 
A // Term is enrolled in here if it was deleted at one diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java index da43f3ad311..fa956dda190 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java @@ -63,7 +63,6 @@ final class TermVectorsTermsWriter extends TermsHashConsumer { } lastDocID = 0; - state.hasVectors = hasVectors; hasVectors = false; } @@ -121,8 +120,7 @@ final class TermVectorsTermsWriter extends TermsHashConsumer { fill(docState.docID); // Append term vectors to the real outputs: - long pointer = tvd.getFilePointer(); - tvx.writeLong(pointer); + tvx.writeLong(tvd.getFilePointer()); tvx.writeLong(tvf.getFilePointer()); tvd.writeVInt(numVectorFields); if (numVectorFields > 0) { @@ -136,6 +134,8 @@ final class TermVectorsTermsWriter extends TermsHashConsumer { tvd.writeVLong(pos-lastPos); lastPos = pos; perFields[i].finishDocument(); + // commit the termVectors once successful success - FI will otherwise reset them + perFields[i].fieldInfo.commitVectors(); } } diff --git a/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java index a070ce0f8c4..e69f612553d 100644 --- a/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java +++ b/lucene/src/java/org/apache/lucene/index/TieredMergePolicy.java @@ -23,6 +23,8 @@ import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.Comparator; +import java.util.List; +import java.util.ArrayList; /** * Merges segments of approximately equal size, subject to @@ -249,7 +251,7 @@ public class TieredMergePolicy extends MergePolicy { final Collection merging = writer.get().getMergingSegments(); final Collection toBeMerged = new HashSet(); - final SegmentInfos infosSorted = new SegmentInfos(); + final List infosSorted = new ArrayList(); infosSorted.addAll(infos); Collections.sort(infosSorted, segmentByteSizeDescending); @@ -277,7 +279,7 @@ public class TieredMergePolicy extends MergePolicy { // If we have too-large segments, grace them out // of the maxSegmentCount: int tooBigCount = 0; - while (tooBigCount < infosSorted.size() && size(infosSorted.info(tooBigCount)) >= maxMergedSegmentBytes/2.0) { + while (tooBigCount < infosSorted.size() && size(infosSorted.get(tooBigCount)) >= maxMergedSegmentBytes/2.0) { totIndexBytes -= size(infosSorted.get(tooBigCount)); tooBigCount++; } @@ -310,7 +312,7 @@ public class TieredMergePolicy extends MergePolicy { // Gather eligible segments for merging, ie segments // not already being merged and not already picked (by // prior iteration of this loop) for merging: - final SegmentInfos eligible = new SegmentInfos(); + final List eligible = new ArrayList(); for(int idx = tooBigCount; idx best = null; boolean bestTooLarge = false; long bestMergeBytes = 0; @@ -341,10 +343,10 @@ public class TieredMergePolicy extends MergePolicy { long totAfterMergeBytes = 0; - final SegmentInfos candidate = new SegmentInfos(); + final List candidate = new ArrayList(); boolean hitTooLarge = false; for(int idx = startIdx;idx maxMergedSegmentBytes) { @@ -398,7 +400,7 @@ public class TieredMergePolicy extends MergePolicy { } /** Expert: scores one merge; subclasses can override. 
*/ - protected MergeScore score(SegmentInfos candidate, boolean hitTooLarge, long mergingBytes) throws IOException { + protected MergeScore score(List candidate, boolean hitTooLarge, long mergingBytes) throws IOException { long totBeforeMergeBytes = 0; long totAfterMergeBytes = 0; long totAfterMergeBytesFloored = 0; @@ -420,7 +422,7 @@ public class TieredMergePolicy extends MergePolicy { // over time: skew = 1.0/maxMergeAtOnce; } else { - skew = ((double) floorSize(size(candidate.info(0))))/totAfterMergeBytesFloored; + skew = ((double) floorSize(size(candidate.get(0))))/totAfterMergeBytesFloored; } // Strongly favor merges with less skew (smaller @@ -458,7 +460,8 @@ public class TieredMergePolicy extends MergePolicy { if (verbose()) { message("findMergesForOptimize maxSegmentCount=" + maxSegmentCount + " infos=" + writer.get().segString(infos) + " segmentsToOptimize=" + segmentsToOptimize); } - SegmentInfos eligible = new SegmentInfos(); + + List eligible = new ArrayList(); boolean optimizeMergeRunning = false; final Collection merging = writer.get().getMergingSegments(); for(SegmentInfo info : infos) { @@ -499,7 +502,7 @@ public class TieredMergePolicy extends MergePolicy { if (spec == null) { spec = new MergeSpecification(); } - final OneMerge merge = new OneMerge(eligible.range(end-maxMergeAtOnceExplicit, end)); + final OneMerge merge = new OneMerge(eligible.subList(end-maxMergeAtOnceExplicit, end)); if (verbose()) { message("add merge=" + writer.get().segString(merge.segments)); } @@ -510,7 +513,7 @@ public class TieredMergePolicy extends MergePolicy { if (spec == null && !optimizeMergeRunning) { // Do final merge final int numToMerge = end - maxSegmentCount + 1; - final OneMerge merge = new OneMerge(eligible.range(end-numToMerge, end)); + final OneMerge merge = new OneMerge(eligible.subList(end-numToMerge, end)); if (verbose()) { message("add final merge=" + merge.segString(writer.get().getDirectory())); } @@ -527,7 +530,7 @@ public class TieredMergePolicy extends MergePolicy { if (verbose()) { message("findMergesToExpungeDeletes infos=" + writer.get().segString(infos) + " expungeDeletesPctAllowed=" + expungeDeletesPctAllowed); } - final SegmentInfos eligible = new SegmentInfos(); + final List eligible = new ArrayList(); final Collection merging = writer.get().getMergingSegments(); for(SegmentInfo info : infos) { double pctDeletes = 100.*((double) writer.get().numDeletedDocs(info))/info.docCount; @@ -580,7 +583,7 @@ public class TieredMergePolicy extends MergePolicy { spec = new MergeSpecification(); } - final OneMerge merge = new OneMerge(eligible.range(start, upto)); + final OneMerge merge = new OneMerge(eligible.subList(start, upto)); if (verbose()) { message("add merge=" + writer.get().segString(merge.segments)); } diff --git a/lucene/src/java/org/apache/lucene/index/UpgradeIndexMergePolicy.java b/lucene/src/java/org/apache/lucene/index/UpgradeIndexMergePolicy.java new file mode 100644 index 00000000000..7e57888461d --- /dev/null +++ b/lucene/src/java/org/apache/lucene/index/UpgradeIndexMergePolicy.java @@ -0,0 +1,152 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Constants; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** This {@link MergePolicy} is used for upgrading all existing segments of + * an index when calling {@link IndexWriter#optimize()}. + * All other methods delegate to the base {@code MergePolicy} given to the constructor. + * This allows for an as-cheap-as-possible upgrade of an older index by only upgrading segments that + * were created by previous Lucene versions. Optimize no longer really optimizes; + * it is just used to "optimize" older segment versions away. + *

In general one would use {@link IndexUpgrader}, but for a fully customizable upgrade, + * you can use this like any other {@code MergePolicy} and call {@link IndexWriter#optimize()}: + *

+  *  IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_XX, new KeywordAnalyzer());
+  *  iwc.setMergePolicy(new UpgradeIndexMergePolicy(iwc.getMergePolicy()));
+  *  IndexWriter w = new IndexWriter(dir, iwc);
+  *  w.optimize();
+  *  w.close();
+  * 
+ * @lucene.experimental + * @see IndexUpgrader + */ +public class UpgradeIndexMergePolicy extends MergePolicy { + + protected final MergePolicy base; + + /** Wrap the given {@link MergePolicy} and intercept optimize requests to + * only upgrade segments written with previous Lucene versions. */ + public UpgradeIndexMergePolicy(MergePolicy base) { + this.base = base; + } + + /** Returns if the given segment should be upgraded. The default implementation + * will return {@code !Constants.LUCENE_MAIN_VERSION.equals(si.getVersion())}, + * so all segments created with a different version number than this Lucene version will + * get upgraded. + */ + protected boolean shouldUpgradeSegment(SegmentInfo si) { + return !Constants.LUCENE_MAIN_VERSION.equals(si.getVersion()); + } + + @Override + public void setIndexWriter(IndexWriter writer) { + super.setIndexWriter(writer); + base.setIndexWriter(writer); + } + + @Override + public MergeSpecification findMerges(SegmentInfos segmentInfos) throws CorruptIndexException, IOException { + return base.findMerges(segmentInfos); + } + + @Override + public MergeSpecification findMergesForOptimize(SegmentInfos segmentInfos, int maxSegmentCount, Set segmentsToOptimize) throws CorruptIndexException, IOException { + // first find all old segments + final HashSet oldSegments = new HashSet(); + for (final SegmentInfo si : segmentInfos) { + if (segmentsToOptimize.contains(si) && shouldUpgradeSegment(si)) { + oldSegments.add(si); + } + } + + if (verbose()) message("findMergesForOptimize: segmentsToUpgrade=" + oldSegments); + + if (oldSegments.isEmpty()) + return null; + + MergeSpecification spec = base.findMergesForOptimize(segmentInfos, maxSegmentCount, oldSegments); + + if (spec != null) { + // remove all segments that are in merge specification from oldSegments, + // the resulting set contains all segments that are left over + // and will be merged to one additional segment: + for (final OneMerge om : spec.merges) { + oldSegments.removeAll(om.segments); + } + } + + if (!oldSegments.isEmpty()) { + if (verbose()) + message("findMergesForOptimize: " + base.getClass().getSimpleName() + + " does not want to merge all old segments, merge remaining ones into new segment: " + oldSegments); + final List newInfos = new ArrayList(); + for (final SegmentInfo si : segmentInfos) { + if (oldSegments.contains(si)) { + newInfos.add(si); + } + } + // add the final merge + if (spec == null) { + spec = new MergeSpecification(); + } + spec.add(new OneMerge(newInfos)); + } + + return spec; + } + + @Override + public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos) throws CorruptIndexException, IOException { + return base.findMergesToExpungeDeletes(segmentInfos); + } + + @Override + public boolean useCompoundFile(SegmentInfos segments, SegmentInfo newSegment) throws IOException { + return base.useCompoundFile(segments, newSegment); + } + + @Override + public void close() { + base.close(); + } + + @Override + public String toString() { + return "[" + getClass().getSimpleName() + "->" + base + "]"; + } + + private boolean verbose() { + IndexWriter w = writer.get(); + return w != null && w.verbose(); + } + + private void message(String message) { + if (verbose()) + writer.get().message("UPGMP: " + message); + } + +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java index bd9046bf69a..590ef0eadeb 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java 
+++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -72,6 +72,11 @@ public class CodecProvider { } } } + + /** @lucene.internal */ + public synchronized Set listAll() { + return codecs.keySet(); + } public Collection getAllExtensions() { return knownExtensions; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java index 4b42caa244b..9acb75e1d85 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java @@ -68,15 +68,8 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { @Override public Object clone() { - PulsingTermState clone; - clone = (PulsingTermState) super.clone(); - if (postingsSize != -1) { - clone.postings = new byte[postingsSize]; - System.arraycopy(postings, 0, clone.postings, 0, postingsSize); - } else { - assert wrappedTermState != null; - clone.wrappedTermState = (BlockTermState) wrappedTermState.clone(); - } + PulsingTermState clone = new PulsingTermState(); + clone.copyFrom(this); return clone; } @@ -90,8 +83,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { postings = new byte[ArrayUtil.oversize(other.postingsSize, 1)]; } System.arraycopy(other.postings, 0, postings, 0, other.postingsSize); - } else { + } else if (wrappedTermState != null) { wrappedTermState.copyFrom(other.wrappedTermState); + } else { + wrappedTermState = (BlockTermState) other.wrappedTermState.clone(); } // NOTE: we do not copy the diff --git a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java index 289df17ac9d..4d25e7afd5b 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/sep/SepPostingsReaderImpl.java @@ -85,7 +85,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { } } - public static void files(SegmentInfo segmentInfo, String codecId, Collection files) { + public static void files(SegmentInfo segmentInfo, String codecId, Collection files) throws IOException { files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.DOC_EXTENSION)); files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, SepPostingsWriterImpl.SKIP_EXTENSION)); @@ -151,14 +151,8 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { @Override public Object clone() { - SepTermState other = (SepTermState) super.clone(); - other.docIndex = (IntIndexInput.Index) docIndex.clone(); - if (freqIndex != null) { - other.freqIndex = (IntIndexInput.Index) freqIndex.clone(); - } - if (posIndex != null) { - other.posIndex = (IntIndexInput.Index) posIndex.clone(); - } + SepTermState other = new SepTermState(); + other.copyFrom(this); return other; } @@ -166,12 +160,28 @@ public class SepPostingsReaderImpl extends PostingsReaderBase { public void copyFrom(TermState _other) { super.copyFrom(_other); SepTermState other = (SepTermState) _other; - docIndex.set(other.docIndex); - if (freqIndex != null && other.freqIndex != null) { - freqIndex.set(other.freqIndex); + if (docIndex == null) { + docIndex = (IntIndexInput.Index) other.docIndex.clone(); + } else { + docIndex.set(other.docIndex); } - if (posIndex != null && other.posIndex != null) { - 
posIndex.set(other.posIndex); + if (other.freqIndex != null) { + if (freqIndex == null) { + freqIndex = (IntIndexInput.Index) other.freqIndex.clone(); + } else { + freqIndex.set(other.freqIndex); + } + } else { + freqIndex = null; + } + if (other.posIndex != null) { + if (posIndex == null) { + posIndex = (IntIndexInput.Index) other.posIndex.clone(); + } else { + posIndex.set(other.posIndex); + } + } else { + posIndex = null; } payloadFP = other.payloadFP; skipFP = other.skipFP; diff --git a/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java b/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java index eaf6d3f0126..58c77fd2897 100644 --- a/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java +++ b/lucene/src/java/org/apache/lucene/queryParser/QueryParserBase.java @@ -806,6 +806,7 @@ public abstract class QueryParserBase { } try { + source.end(); source.close(); } catch (IOException ignored) {} diff --git a/lucene/src/java/org/apache/lucene/search/HitQueue.java b/lucene/src/java/org/apache/lucene/search/HitQueue.java index 15e2052568c..e5ce5bcbc7a 100644 --- a/lucene/src/java/org/apache/lucene/search/HitQueue.java +++ b/lucene/src/java/org/apache/lucene/search/HitQueue.java @@ -21,8 +21,6 @@ import org.apache.lucene.util.PriorityQueue; final class HitQueue extends PriorityQueue { - private boolean prePopulate; - /** * Creates a new instance with size elements. If * prePopulate is set to true, the queue will pre-populate itself diff --git a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java index f199edc92c6..8429ec0c9af 100644 --- a/lucene/src/java/org/apache/lucene/search/IndexSearcher.java +++ b/lucene/src/java/org/apache/lucene/search/IndexSearcher.java @@ -46,8 +46,18 @@ import org.apache.lucene.util.ThreadInterruptedException; * *

Applications usually need only call the inherited * {@link #search(Query,int)} - * or {@link #search(Query,Filter,int)} methods. For performance reasons it is - * recommended to open only one IndexSearcher and use it for all of your searches. + * or {@link #search(Query,Filter,int)} methods. For + * performance reasons, if your index is unchanging, you + * should share a single IndexSearcher instance across + * multiple searches instead of creating a new one + * per-search. If your index has changed and you wish to + * see the changes reflected in searching, you should + * use {@link IndexReader#reopen} to obtain a new reader and + * then create a new IndexSearcher from that. Also, for + * low-latency turnaround it's best to use a near-real-time + * reader ({@link IndexReader#open(IndexWriter,boolean)}). + * Once you have a new {@link IndexReader}, it's relatively + * cheap to create a new IndexSearcher from it. * *
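A minimal sketch of the reuse pattern described above, assuming the pre-4.0 IndexReader.reopen() API (later releases use IndexReader.openIfChanged) and a placeholder Directory named dir:

    // Open once and share this searcher across searches of an unchanging index.
    IndexReader reader = IndexReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    // When the index may have changed, refresh the reader and rebuild the searcher.
    IndexReader newReader = reader.reopen();      // returns the same instance if nothing changed
    if (newReader != reader) {
      reader.close();
      reader = newReader;
      searcher = new IndexSearcher(reader);       // cheap once the new reader is open
    }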

NOTE: {@link * IndexSearcher} instances are completely diff --git a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java index 7cb6994ccaa..fc9598078d8 100644 --- a/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -214,12 +214,12 @@ public class MultiPhraseQuery extends Query { docFreq = reader.docFreq(term.field(), term.bytes()); } - postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue()); + postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms[0]); } // sort by increasing docFreq order if (slop == 0) { - ArrayUtil.quickSort(postingsFreqs); + ArrayUtil.mergeSort(postingsFreqs); } if (slop == 0) { diff --git a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java index 303cbd166b1..00c638965cc 100644 --- a/lucene/src/java/org/apache/lucene/search/PhrasePositions.java +++ b/lucene/src/java/org/apache/lucene/search/PhrasePositions.java @@ -28,13 +28,15 @@ final class PhrasePositions { int position; // position in doc int count; // remaining pos in this doc int offset; // position in phrase + final int ord; // unique across all PhrasePositions instances final DocsAndPositionsEnum postings; // stream of docs & positions PhrasePositions next; // used to make lists boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1) - PhrasePositions(DocsAndPositionsEnum postings, int o) { + PhrasePositions(DocsAndPositionsEnum postings, int o, int ord) { this.postings = postings; offset = o; + this.ord = ord; } final boolean next() throws IOException { // increments to next doc diff --git a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java index 2c8d977fa82..70adec70f7f 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseQuery.java @@ -124,16 +124,48 @@ public class PhraseQuery extends Query { final DocsAndPositionsEnum postings; final int docFreq; final int position; + final Term term; - public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position) { + public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position, Term term) { this.postings = postings; this.docFreq = docFreq; this.position = position; + this.term = term; } public int compareTo(PostingsAndFreq other) { + if (docFreq == other.docFreq) { + if (position == other.position) { + return term.compareTo(other.term); + } + return position - other.position; + } return docFreq - other.docFreq; } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + docFreq; + result = prime * result + position; + result = prime * result + ((term == null) ? 
0 : term.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null) return false; + if (getClass() != obj.getClass()) return false; + PostingsAndFreq other = (PostingsAndFreq) obj; + if (docFreq != other.docFreq) return false; + if (position != other.position) return false; + if (term == null) { + if (other.term != null) return false; + } else if (!term.equals(other.term)) return false; + return true; + } } private class PhraseWeight extends Weight { @@ -197,12 +229,12 @@ public class PhraseQuery extends Query { return null; } } - postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue()); + postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t); } // sort by increasing docFreq order if (slop == 0) { - ArrayUtil.quickSort(postingsFreqs); + ArrayUtil.mergeSort(postingsFreqs); } if (slop == 0) { // optimize exact case diff --git a/lucene/src/java/org/apache/lucene/search/PhraseQueue.java b/lucene/src/java/org/apache/lucene/search/PhraseQueue.java index 5b19567c59c..bac0a971d7d 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseQueue.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseQueue.java @@ -30,10 +30,16 @@ final class PhraseQueue extends PriorityQueue { if (pp1.position == pp2.position) // same doc and pp.position, so decide by actual term positions. // rely on: pp.position == tp.position - offset. - return pp1.offset < pp2.offset; - else + if (pp1.offset == pp2.offset) { + return pp1.ord < pp2.ord; + } else { + return pp1.offset < pp2.offset; + } + else { return pp1.position < pp2.position; - else + } + else { return pp1.doc < pp2.doc; + } } } diff --git a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java index 1fedc2eb3ee..da84dbcca42 100644 --- a/lucene/src/java/org/apache/lucene/search/PhraseScorer.java +++ b/lucene/src/java/org/apache/lucene/search/PhraseScorer.java @@ -55,7 +55,7 @@ abstract class PhraseScorer extends Scorer { // this allows to easily identify a matching (exact) phrase // when all PhrasePositions have exactly the same position. 
for (int i = 0; i < postings.length; i++) { - PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position); + PhrasePositions pp = new PhrasePositions(postings[i].postings, postings[i].position, i); if (last != null) { // add next to end of list last.next = pp; } else { diff --git a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java index 472e99de705..24356e27bcf 100644 --- a/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java +++ b/lucene/src/java/org/apache/lucene/search/TopTermsRewrite.java @@ -134,7 +134,7 @@ public abstract class TopTermsRewrite extends TermCollectingRew final Term placeholderTerm = new Term(query.field); final Q q = getTopLevelQuery(); final ScoreTerm[] scoreTerms = stQueue.toArray(new ScoreTerm[stQueue.size()]); - ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp); + ArrayUtil.mergeSort(scoreTerms, scoreTermSortByTermComp); for (final ScoreTerm st : scoreTerms) { final Term term = placeholderTerm.createTerm(st.bytes); assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq(); diff --git a/lucene/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java b/lucene/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java index 2bc9f87d27f..0eae1582573 100644 --- a/lucene/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java +++ b/lucene/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java @@ -190,7 +190,7 @@ public class NearSpansOrdered extends Spans { /** Advance the subSpans to the same document */ private boolean toSameDoc() throws IOException { - ArrayUtil.quickSort(subSpansByDoc, spanDocComparator); + ArrayUtil.mergeSort(subSpansByDoc, spanDocComparator); int firstIndex = 0; int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc(); while (subSpansByDoc[firstIndex].doc() != maxDoc) { diff --git a/lucene/src/java/org/apache/lucene/util/SorterTemplate.java b/lucene/src/java/org/apache/lucene/util/SorterTemplate.java index b0e558c1c20..1ce4619984f 100644 --- a/lucene/src/java/org/apache/lucene/util/SorterTemplate.java +++ b/lucene/src/java/org/apache/lucene/util/SorterTemplate.java @@ -62,13 +62,26 @@ public abstract class SorterTemplate { /** Sorts via in-place, but unstable, QuickSort algorithm. * For small collections falls back to {@link #insertionSort(int,int)}. 
*/ - public final void quickSort(int lo, int hi) { + public final void quickSort(final int lo, final int hi) { + if (hi <= lo) return; + // from Integer's Javadocs: ceil(log2(x)) = 32 - numberOfLeadingZeros(x - 1) + quickSort(lo, hi, (Integer.SIZE - Integer.numberOfLeadingZeros(hi - lo)) << 1); + } + + private void quickSort(int lo, int hi, int maxDepth) { + // fall back to insertion when array has short length final int diff = hi - lo; if (diff <= QUICKSORT_THRESHOLD) { insertionSort(lo, hi); return; } + // fall back to merge sort when recursion depth gets too big + if (--maxDepth == 0) { + mergeSort(lo, hi); + return; + } + final int mid = lo + (diff >>> 1); if (compare(lo, mid) > 0) { @@ -101,8 +114,8 @@ public abstract class SorterTemplate { } } - quickSort(lo, left); - quickSort(left + 1, hi); + quickSort(lo, left, maxDepth); + quickSort(left + 1, hi, maxDepth); } /** Sorts via stable in-place MergeSort algorithm diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java index fed8cd21098..19949170936 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java @@ -261,9 +261,12 @@ public class Builder { add(scratchIntsRef, output); } + /** It's OK to add the same input twice in a row with + * different outputs, as long as outputs impls the merge + * method. */ public void add(IntsRef input, T output) throws IOException { //System.out.println("\nFST ADD: input=" + input + " output=" + fst.outputs.outputToString(output)); - assert lastInput.length == 0 || input.compareTo(lastInput) > 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input; + assert lastInput.length == 0 || input.compareTo(lastInput) >= 0: "inputs are added out of order lastInput=" + lastInput + " vs input=" + input; assert validOutput(output); //System.out.println("\nadd: " + input); @@ -347,8 +350,15 @@ public class Builder { assert validOutput(output); } - // push remaining output: - frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output); + if (lastInput.length == input.length && prefixLenPlus1 == 1+input.length) { + // same input more than 1 time in a row, mapping to + // multiple outputs + lastNode.output = fst.outputs.merge(lastNode.output, output); + } else { + // this new arc is private to this new input; set its + // arc output to the leftover output: + frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output); + } // save last input lastInput.copy(input); diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java index dde66270873..dbce4c011c3 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java @@ -231,10 +231,13 @@ public class FST { } void setEmptyOutput(T v) throws IOException { - if (emptyOutput != null && !emptyOutput.equals(v)) { - throw new IllegalStateException("empty output is already set: " + outputs.outputToString(emptyOutput) + " vs " + outputs.outputToString(v)); + if (emptyOutput != null) { + if (!emptyOutput.equals(v)) { + emptyOutput = outputs.merge(emptyOutput, v); + } + } else { + emptyOutput = v; } - emptyOutput = v; // TODO: this is messy -- replace with sillyBytesWriter; maybe make // bytes private @@ -446,25 +449,17 @@ public class FST { // reverse 
bytes in-place; we do this so that the // "BIT_TARGET_NEXT" opto can work, ie, it reads the // node just before the current one - final int endAddress = writer.posWrite; - final int stopAt = (endAddress - startAddress)/2; - int upto = 0; - while (upto < stopAt) { - final byte b = bytes[startAddress+upto]; - bytes[startAddress+upto] = bytes[endAddress-upto-1]; - bytes[endAddress-upto-1] = b; - upto++; + final int endAddress = lastFrozenNode = writer.posWrite - 1; + + int left = startAddress; + int right = endAddress; + while (left < right) { + final byte b = bytes[left]; + bytes[left++] = bytes[right]; + bytes[right--] = b; } - lastFrozenNode = endAddress - 1; - /* - System.out.println(" return node addr=" + (endAddress-1)); - for(int i=endAddress-1;i>=startAddress;i--) { - System.out.println(" bytes[" + i + "]=" + bytes[i]); - } - */ - - return endAddress-1; + return endAddress; } /** Fills virtual 'start' arc, ie, an empty incoming arc to diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java index 77484164c01..db1b7ddee12 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FSTEnum.java @@ -140,7 +140,7 @@ abstract class FSTEnum { // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -278,7 +278,7 @@ abstract class FSTEnum { // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(0); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java index 02719d81a6e..dde6409fc9a 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/NodeHash.java @@ -40,7 +40,7 @@ final class NodeHash { return false; } for(int arcUpto=0;arcUpto arc = node.arcs[arcUpto]; if (arc.label != scratchArc.label || !arc.output.equals(scratchArc.output) || ((Builder.CompiledNode) arc.target).address != scratchArc.target || diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java index 18f4dc29432..66efc3ff008 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Outputs.java @@ -54,4 +54,8 @@ public abstract class Outputs { public abstract T getNoOutput(); public abstract String outputToString(T output); + + public T merge(T first, T second) { + throw new UnsupportedOperationException(); + } } diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java index fc8aa6691f3..7b6ead92a91 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/PairOutputs.java @@ -43,7 +43,7 @@ public class PairOutputs extends Outputs> { this.output2 = output2; } - @Override @SuppressWarnings("unchecked") + @Override @SuppressWarnings("rawtypes") public boolean equals(Object other) { if (other == this) 
{ return true; diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java index ba17fe99dee..984324e07ce 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/PositiveIntOutputs.java @@ -22,14 +22,11 @@ import java.io.IOException; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; -// TODO: make a sharing and non-sharing variant; eg if you -// output docFreq per term the FST will be smaller if you -// don't share since they are not "well shared" - /** * Output is a long, for each input term. NOTE: the * resulting FST is not guaranteed to be minimal! See - * {@link Builder}. + * {@link Builder}. You cannot store 0 output with this + * (that's reserved to mean "no output")! * @lucene.experimental */ diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/UpToTwoPositiveIntOutputs.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/UpToTwoPositiveIntOutputs.java new file mode 100644 index 00000000000..0c388d28710 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/UpToTwoPositiveIntOutputs.java @@ -0,0 +1,224 @@ +package org.apache.lucene.util.automaton.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +/** + * Holds one or two longs for each input term. If it's a + * single output, Long is returned; else, TwoLongs. Order + * is preseved in the TwoLongs case, ie .first is the first + * input/output added to Builder, and .second is the + * second. You cannot store 0 output with this (that's + * reserved to mean "no output")! + * + * NOTE: the resulting FST is not guaranteed to be minimal! + * See {@link Builder}. 
+ * + * @lucene.experimental + */ + +public final class UpToTwoPositiveIntOutputs extends Outputs { + + public final static class TwoLongs { + final long first; + final long second; + + public TwoLongs(long first, long second) { + this.first = first; + this.second = second; + assert first >= 0; + assert second >= 0; + } + + @Override + public String toString() { + return "TwoLongs:" + first + "," + second; + } + + @Override + public boolean equals(Object _other) { + if (_other instanceof TwoLongs) { + final TwoLongs other = (TwoLongs) _other; + return first == other.first && second == other.second; + } else { + return false; + } + } + + @Override + public int hashCode() { + return (int) ((first^(first>>>32)) ^ (second^(second>>32))); + } + } + + private final static Long NO_OUTPUT = new Long(0); + + private final boolean doShare; + + private final static UpToTwoPositiveIntOutputs singletonShare = new UpToTwoPositiveIntOutputs(true); + private final static UpToTwoPositiveIntOutputs singletonNoShare = new UpToTwoPositiveIntOutputs(false); + + private UpToTwoPositiveIntOutputs(boolean doShare) { + this.doShare = doShare; + } + + public static UpToTwoPositiveIntOutputs getSingleton(boolean doShare) { + return doShare ? singletonShare : singletonNoShare; + } + + public Long get(long v) { + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } + + public TwoLongs get(long first, long second) { + return new TwoLongs(first, second); + } + + @Override + public Long common(Object _output1, Object _output2) { + assert valid(_output1, false); + assert valid(_output2, false); + final Long output1 = (Long) _output1; + final Long output2 = (Long) _output2; + if (output1 == NO_OUTPUT || output2 == NO_OUTPUT) { + return NO_OUTPUT; + } else if (doShare) { + assert output1 > 0; + assert output2 > 0; + return Math.min(output1, output2); + } else if (output1.equals(output2)) { + return output1; + } else { + return NO_OUTPUT; + } + } + + @Override + public Long subtract(Object _output, Object _inc) { + assert valid(_output, false); + assert valid(_inc, false); + final Long output = (Long) _output; + final Long inc = (Long) _inc; + assert output >= inc; + + if (inc == NO_OUTPUT) { + return output; + } else if (output.equals(inc)) { + return NO_OUTPUT; + } else { + return output - inc; + } + } + + @Override + public Object add(Object _prefix, Object _output) { + assert valid(_prefix, false); + assert valid(_output, true); + final Long prefix = (Long) _prefix; + if (_output instanceof Long) { + final Long output = (Long) _output; + if (prefix == NO_OUTPUT) { + return output; + } else if (output == NO_OUTPUT) { + return prefix; + } else { + return prefix + output; + } + } else { + final TwoLongs output = (TwoLongs) _output; + final long v = prefix; + return new TwoLongs(output.first + v, output.second + v); + } + } + + @Override + public void write(Object _output, DataOutput out) throws IOException { + assert valid(_output, true); + if (_output instanceof Long) { + final Long output = (Long) _output; + out.writeVLong(output<<1); + } else { + final TwoLongs output = (TwoLongs) _output; + out.writeVLong((output.first<<1) | 1); + out.writeVLong(output.second); + } + } + + @Override + public Object read(DataInput in) throws IOException { + final long code = in.readVLong(); + if ((code & 1) == 0) { + // single long + final long v = code >>> 1; + if (v == 0) { + return NO_OUTPUT; + } else { + return Long.valueOf(v); + } + } else { + // two longs + final long first = code >>> 1; + final long 
second = in.readVLong(); + return new TwoLongs(first, second); + } + } + + private boolean valid(Long o) { + assert o != null; + assert o instanceof Long; + assert o == NO_OUTPUT || o > 0; + return true; + } + + // Used only by assert + private boolean valid(Object _o, boolean allowDouble) { + if (!allowDouble) { + assert _o instanceof Long; + return valid((Long) _o); + } else if (_o instanceof TwoLongs) { + return true; + } else { + return valid((Long) _o); + } + } + + @Override + public Object getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(Object output) { + return output.toString(); + } + + @Override + public Object merge(Object first, Object second) { + assert valid(first, false); + assert valid(second, false); + return new TwoLongs((Long) first, (Long) second); + } +} diff --git a/lucene/src/site/src/documentation/content/xdocs/fileformats.xml b/lucene/src/site/src/documentation/content/xdocs/fileformats.xml index eacbc16c3e8..228e18a2b62 100644 --- a/lucene/src/site/src/documentation/content/xdocs/fileformats.xml +++ b/lucene/src/site/src/documentation/content/xdocs/fileformats.xml @@ -90,6 +90,14 @@

In version 3.1, segments records the code version that created them. See LUCENE-2720 for details. + + Additionally, segments explicitly track whether or + not they have term vectors. See LUCENE-2811 for details. +

+

In version 3.2, numeric fields are written natively + to the stored fields file; previously they were stored in + text format only.
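As an illustration of what this means on the indexing side, a NumericField added as below is now stored in its native binary form rather than as text. This is only a sketch against the 3.x/4.0-dev field API; writer is assumed to be an open IndexWriter:

    Document doc = new Document();
    // Store the value so it lands in the stored fields file; index it as a trie field as usual.
    doc.add(new NumericField("price", Field.Store.YES, true).setIntValue(42));
    writer.addDocument(doc);   // the stored value is written natively (as an int), not as a string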

@@ -935,7 +943,7 @@ 3.1 Segments --> Format, Version, NameCounter, SegCount, <SegVersion, SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField, NormGenNumField, - IsCompoundFile, DeletionCount, HasProx, Diagnostics>SegCount, CommitUserData, Checksum + IsCompoundFile, DeletionCount, HasProx, Diagnostics, HasVectors>SegCount, CommitUserData, Checksum

@@ -957,7 +965,7 @@

IsCompoundFile, HasSingleNormFile, - DocStoreIsCompoundFile, HasProx --> Int8 + DocStoreIsCompoundFile, HasProx, HasVectors --> Int8

@@ -1083,6 +1091,10 @@ Lucene version, OS, Java version, why the segment was created (merge, flush, addIndexes), etc.

+ +

HasVectors is 1 if this segment stores term vectors, + else it's 0. +

@@ -1293,10 +1305,18 @@
  • third bit is one for fields with compression option enabled (if compression is enabled, the algorithm used is ZLIB), only available for indexes until Lucene version 2.9.x
  • +
  • 4th to 6th bits (mask: 0x7<<3) define the type of a + numeric field (decoded in the sketch after this list):
      +
    • all bits in mask are cleared if no numeric field at all
    • +
    • 1<<3: Value is Int
    • +
    • 2<<3: Value is Long
    • +
    • 3<<3: Value is Int as Float (as of Float.intBitsToFloat)
    • +
    • 4<<3: Value is Long as Double (as of Double.longBitsToDouble)
    • +
  • Value --> - String | BinaryValue (depending on Bits) + String | BinaryValue | Int | Long (depending on Bits)
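The numeric-field bits listed above can be decoded roughly as follows; this is an illustrative sketch only, and the variable names are not taken from the Lucene source:

    int numericType = (bits & (0x7 << 3)) >>> 3;   // isolate the 4th to 6th bits
    switch (numericType) {
      case 0: /* not numeric: Value is String or BinaryValue */      break;
      case 1: /* Value is an Int */                                   break;
      case 2: /* Value is a Long */                                   break;
      case 3: /* Int holding a Float (Float.intBitsToFloat) */        break;
      case 4: /* Long holding a Double (Double.longBitsToDouble) */   break;
    }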

    BinaryValue --> ValueSize, <Byte>^ValueSize diff --git a/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml b/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml index 4dde0f34ecb..7ab6441214e 100644 --- a/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml +++ b/lucene/src/site/src/documentation/content/xdocs/gettingstarted.xml @@ -28,11 +28,11 @@ may wish to skip sections.

    diff --git a/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index 4d4141c6ab3..c5bb9f26448 100644 --- a/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/src/test-framework/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -262,6 +262,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { tokens.add(termAtt.toString()); // TODO: we could collect offsets etc here for better checking that reset() really works. } + ts.end(); ts.close(); // verify reusing is "reproducable" and also get the normal tokenstream sanity checks if (!tokens.isEmpty()) diff --git a/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java b/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java index ae889c1c3b1..3818d071f99 100644 --- a/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java +++ b/lucene/src/test-framework/org/apache/lucene/analysis/MockAnalyzer.java @@ -36,6 +36,7 @@ public final class MockAnalyzer extends Analyzer { private int positionIncrementGap; private final Random random; private Map previousMappings = new HashMap(); + private boolean enableChecks = true; /** * Creates a new MockAnalyzer. @@ -75,6 +76,7 @@ public final class MockAnalyzer extends Analyzer { @Override public TokenStream tokenStream(String fieldName, Reader reader) { MockTokenizer tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase); + tokenizer.setEnableChecks(enableChecks); TokenFilter filt = new MockTokenFilter(tokenizer, filter, enablePositionIncrements); filt = maybePayload(filt, fieldName); return filt; @@ -98,13 +100,13 @@ public final class MockAnalyzer extends Analyzer { if (saved == null) { saved = new SavedStreams(); saved.tokenizer = new MockTokenizer(reader, runAutomaton, lowerCase); + saved.tokenizer.setEnableChecks(enableChecks); saved.filter = new MockTokenFilter(saved.tokenizer, filter, enablePositionIncrements); saved.filter = maybePayload(saved.filter, fieldName); map.put(fieldName, saved); return saved.filter; } else { saved.tokenizer.reset(reader); - saved.filter.reset(); return saved.filter; } } @@ -139,4 +141,12 @@ public final class MockAnalyzer extends Analyzer { public int getPositionIncrementGap(String fieldName){ return positionIncrementGap; } + + /** + * Toggle consumer workflow checking: if your test consumes tokenstreams normally you + * should leave this enabled. 
+ */ + public void setEnableChecks(boolean enableChecks) { + this.enableChecks = enableChecks; + } } diff --git a/lucene/src/test-framework/org/apache/lucene/analysis/MockPayloadAnalyzer.java b/lucene/src/test-framework/org/apache/lucene/analysis/MockPayloadAnalyzer.java index 63d99af28c6..fe64ad8884e 100644 --- a/lucene/src/test-framework/org/apache/lucene/analysis/MockPayloadAnalyzer.java +++ b/lucene/src/test-framework/org/apache/lucene/analysis/MockPayloadAnalyzer.java @@ -86,6 +86,7 @@ final class MockPayloadFilter extends TokenFilter { @Override public void reset() throws IOException { + super.reset(); i = 0; pos = 0; } diff --git a/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java b/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java index 6e4f30b3968..15e501f0f41 100644 --- a/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java +++ b/lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java @@ -20,14 +20,15 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; -import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; /** * Automaton-based tokenizer for testing. Optionally lowercases. */ -public class MockTokenizer extends CharTokenizer { +public class MockTokenizer extends Tokenizer { /** Acts Similar to WhitespaceTokenizer */ public static final CharacterRunAutomaton WHITESPACE = new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton()); @@ -45,21 +46,88 @@ public class MockTokenizer extends CharTokenizer { private final boolean lowerCase; private int state; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + int off = 0; + + // TODO: "register" with LuceneTestCase to ensure all streams are closed() ? + // currently, we can only check that the lifecycle is correct if someone is reusing, + // but not for "one-offs". 
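
    For context, the State enum that follows encodes the standard TokenStream consumer workflow that these checks enforce. A minimal sketch of that workflow (placeholder names, not part of this patch):

      void consumeAllTokens(Analyzer analyzer, Reader reader) throws IOException {
        TokenStream ts = analyzer.tokenStream("field", reader);  // SETREADER
        ts.reset();                                               // RESET
        while (ts.incrementToken()) {                             // INCREMENT while tokens remain
          // read attributes here, e.g. CharTermAttribute
        }                                                         // returns false -> INCREMENT_FALSE
        ts.end();                                                 // END: record final offset state
        ts.close();                                               // CLOSE: release resources
      }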
+ private static enum State { + SETREADER, // consumer set a reader input either via ctor or via reset(Reader) + RESET, // consumer has called reset() + INCREMENT, // consumer is consuming, has called incrementToken() == true + INCREMENT_FALSE, // consumer has called incrementToken() which returned false + END, // consumer has called end() to perform end of stream operations + CLOSE // consumer has called close() to release any resources + }; + + private State streamState = State.CLOSE; + private boolean enableChecks = true; + public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) { - super(LuceneTestCase.TEST_VERSION_CURRENT, factory, input); + super(factory, input); this.runAutomaton = runAutomaton; this.lowerCase = lowerCase; this.state = runAutomaton.getInitialState(); + this.streamState = State.SETREADER; } public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) { - super(LuceneTestCase.TEST_VERSION_CURRENT, input); + super(input); this.runAutomaton = runAutomaton; this.lowerCase = lowerCase; this.state = runAutomaton.getInitialState(); + this.streamState = State.SETREADER; } @Override + public final boolean incrementToken() throws IOException { + assert !enableChecks || (streamState == State.RESET || streamState == State.INCREMENT) + : "incrementToken() called while in wrong state: " + streamState; + clearAttributes(); + for (;;) { + int startOffset = off; + int cp = readCodePoint(); + if (cp < 0) { + break; + } else if (isTokenChar(cp)) { + int endOffset; + do { + char chars[] = Character.toChars(normalize(cp)); + for (int i = 0; i < chars.length; i++) + termAtt.append(chars[i]); + endOffset = off; + cp = readCodePoint(); + } while (cp >= 0 && isTokenChar(cp)); + offsetAtt.setOffset(startOffset, endOffset); + streamState = State.INCREMENT; + return true; + } + } + streamState = State.INCREMENT_FALSE; + return false; + } + + protected int readCodePoint() throws IOException { + int ch = input.read(); + if (ch < 0) { + return ch; + } else { + assert !Character.isLowSurrogate((char) ch); + off++; + if (Character.isHighSurrogate((char) ch)) { + int ch2 = input.read(); + if (ch2 >= 0) { + off++; + assert Character.isLowSurrogate((char) ch2); + return Character.toCodePoint((char) ch, (char) ch2); + } + } + return ch; + } + } + protected boolean isTokenChar(int c) { state = runAutomaton.step(state, c); if (state < 0) { @@ -70,7 +138,6 @@ public class MockTokenizer extends CharTokenizer { } } - @Override protected int normalize(int c) { return lowerCase ? Character.toLowerCase(c) : c; } @@ -79,5 +146,43 @@ public class MockTokenizer extends CharTokenizer { public void reset() throws IOException { super.reset(); state = runAutomaton.getInitialState(); + off = 0; + assert !enableChecks || streamState != State.RESET : "double reset()"; + streamState = State.RESET; + } + + @Override + public void close() throws IOException { + super.close(); + // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close() + // these tests should disable this check, by default we check the normal workflow. + // TODO: investigate the CachingTokenFilter "double-close"... 
for now we ignore this + assert !enableChecks || streamState == State.END || streamState == State.CLOSE : "close() called in wrong state: " + streamState; + streamState = State.CLOSE; + } + + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + assert !enableChecks || streamState == State.CLOSE : "setReader() called in wrong state: " + streamState; + streamState = State.SETREADER; + } + + @Override + public void end() throws IOException { + int finalOffset = correctOffset(off); + offsetAtt.setOffset(finalOffset, finalOffset); + // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false. + // these tests should disable this check (in general you should consume the entire stream) + assert !enableChecks || streamState == State.INCREMENT_FALSE : "end() called before incrementToken() returned false!"; + streamState = State.END; + } + + /** + * Toggle consumer workflow checking: if your test consumes tokenstreams normally you + * should leave this enabled. + */ + public void setEnableChecks(boolean enableChecks) { + this.enableChecks = enableChecks; } } diff --git a/lucene/src/test-framework/org/apache/lucene/index/MockRandomMergePolicy.java b/lucene/src/test-framework/org/apache/lucene/index/MockRandomMergePolicy.java index e8bc977931b..0cc621aff6d 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/MockRandomMergePolicy.java +++ b/lucene/src/test-framework/org/apache/lucene/index/MockRandomMergePolicy.java @@ -18,7 +18,9 @@ package org.apache.lucene.index; */ import java.io.IOException; +import java.util.ArrayList; import java.util.Collections; +import java.util.List; import java.util.Random; import java.util.Set; @@ -58,21 +60,36 @@ public class MockRandomMergePolicy extends MergePolicy { SegmentInfos segmentInfos, int maxSegmentCount, Set segmentsToOptimize) throws CorruptIndexException, IOException { - //System.out.println("MRMP: findMergesForOptimize sis=" + segmentInfos); + final List eligibleSegments = new ArrayList(); + for(SegmentInfo info : segmentInfos) { + if (segmentsToOptimize.contains(info)) { + eligibleSegments.add(info); + } + } + + //System.out.println("MRMP: findMergesForOptimize sis=" + segmentInfos + " eligible=" + eligibleSegments); MergeSpecification mergeSpec = null; - if (segmentInfos.size() > 1 || (segmentInfos.size() == 1 && segmentInfos.info(0).hasDeletions())) { + if (eligibleSegments.size() > 1 || (eligibleSegments.size() == 1 && eligibleSegments.get(0).hasDeletions())) { mergeSpec = new MergeSpecification(); - SegmentInfos segmentInfos2 = new SegmentInfos(); - segmentInfos2.addAll(segmentInfos); - Collections.shuffle(segmentInfos2, random); + // Already shuffled having come out of a set but + // shuffle again for good measure: + Collections.shuffle(eligibleSegments, random); int upto = 0; - while(upto < segmentInfos.size()) { - int max = Math.min(10, segmentInfos.size()-upto); + while(upto < eligibleSegments.size()) { + int max = Math.min(10, eligibleSegments.size()-upto); int inc = max <= 2 ? 
max : _TestUtil.nextInt(random, 2, max); - mergeSpec.add(new OneMerge(segmentInfos2.range(upto, upto+inc))); + mergeSpec.add(new OneMerge(eligibleSegments.subList(upto, upto+inc))); upto += inc; } } + + if (mergeSpec != null) { + for(OneMerge merge : mergeSpec.merges) { + for(SegmentInfo info : merge.segments) { + assert segmentsToOptimize.contains(info); + } + } + } return mergeSpec; } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java index c09a48a6e3b..68ecf499740 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mockrandom/MockRandomCodec.java @@ -146,6 +146,9 @@ public class MockRandomCodec extends Codec { out.close(); final Random random = new Random(seed); + + random.nextInt(); // consume a random for buffersize + PostingsWriterBase postingsWriter; if (random.nextBoolean()) { @@ -244,16 +247,22 @@ public class MockRandomCodec extends Codec { in.close(); final Random random = new Random(seed); + + int readBufferSize = _TestUtil.nextInt(random, 1, 4096); + if (LuceneTestCase.VERBOSE) { + System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize); + } + PostingsReaderBase postingsReader; if (random.nextBoolean()) { postingsReader = new SepPostingsReaderImpl(state.dir, state.segmentInfo, - state.readBufferSize, new MockIntStreamFactory(random), state.codecId); + readBufferSize, new MockIntStreamFactory(random), state.codecId); } else { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading Standard postings"); } - postingsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId); + postingsReader = new StandardPostingsReader(state.dir, state.segmentInfo, readBufferSize, state.codecId); } if (random.nextBoolean()) { @@ -318,7 +327,7 @@ public class MockRandomCodec extends Codec { state.fieldInfos, state.segmentInfo.name, postingsReader, - state.readBufferSize, + readBufferSize, termsCacheSize, state.codecId); success = true; diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java index ca08b6e84ba..4331457bdca 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/mocksep/MockSepCodec.java @@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.codecs.Codec; -import org.apache.lucene.index.codecs.DocValuesConsumer; import org.apache.lucene.index.codecs.DefaultDocValuesProducer; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; diff --git a/lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java b/lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java index 17b62a6f94d..c7b0d036dd1 100644 --- a/lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java +++ b/lucene/src/test-framework/org/apache/lucene/store/MockDirectoryWrapper.java @@ -71,6 +71,7 @@ public class MockDirectoryWrapper extends Directory { Set openFilesForWrite = new HashSet(); volatile boolean crashed; private 
ThrottledIndexOutput throttledOutput; + private Throttling throttling = Throttling.SOMETIMES; // use this for tracking files for crash. // additionally: provides debugging information in case you leave one open @@ -104,6 +105,8 @@ public class MockDirectoryWrapper extends Directory { // called from different threads; else test failures may // not be reproducible from the original seed this.randomState = new Random(random.nextInt()); + this.throttledOutput = new ThrottledIndexOutput(ThrottledIndexOutput + .mBitsToBytes(40 + randomState.nextInt(10)), 5 + randomState.nextInt(5), null); init(); } @@ -117,8 +120,17 @@ public class MockDirectoryWrapper extends Directory { preventDoubleWrite = value; } - public void setThrottledIndexOutput(ThrottledIndexOutput throttledOutput) { - this.throttledOutput = throttledOutput; + public static enum Throttling { + /** always emulate a slow hard disk. could be very slow! */ + ALWAYS, + /** sometimes (2% of the time) emulate a slow hard disk. */ + SOMETIMES, + /** never throttle output */ + NEVER + }; + + public void setThrottling(Throttling throttling) { + this.throttling = throttling; } @Override @@ -354,7 +366,17 @@ public class MockDirectoryWrapper extends Directory { IndexOutput io = new MockIndexOutputWrapper(this, delegate.createOutput(name), name); openFileHandles.put(io, new RuntimeException("unclosed IndexOutput")); openFilesForWrite.add(name); - return throttledOutput == null ? io : throttledOutput.newFromDelegate(io); + + // throttling REALLY slows down tests, so don't do it very often for SOMETIMES. + if (throttling == Throttling.ALWAYS || + (throttling == Throttling.SOMETIMES && randomState.nextInt(50) == 0)) { + if (LuceneTestCase.VERBOSE) { + System.out.println("MockDirectoryWrapper: throttling indexOutput"); + } + return throttledOutput.newFromDelegate(io); + } else { + return io; + } } @Override diff --git a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java index 5888a1c008c..7e95cffacd9 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test-framework/org/apache/lucene/util/LuceneTestCase.java @@ -137,6 +137,8 @@ public abstract class LuceneTestCase extends Assert { // tests) /** Gets the codec to run tests with. 
*/ public static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField"); + /** Gets the codecprovider to run tests with */ + public static final String TEST_CODECPROVIDER = System.getProperty("tests.codecprovider", "random"); /** Gets the locale to run tests with */ public static final String TEST_LOCALE = System.getProperty("tests.locale", "random"); /** Gets the timezone to run tests with */ @@ -329,15 +331,38 @@ public abstract class LuceneTestCase extends Assert { tempDirs.clear(); stores = Collections.synchronizedMap(new IdentityHashMap()); savedCodecProvider = CodecProvider.getDefault(); - if ("randomPerField".equals(TEST_CODEC)) { - if (random.nextInt(4) == 0) { // preflex-only setup - codec = installTestCodecs("PreFlex", CodecProvider.getDefault()); - } else { // per-field setup - CodecProvider.setDefault(new RandomCodecProvider(random)); + if ("random".equals(TEST_CODECPROVIDER)) { + if ("randomPerField".equals(TEST_CODEC)) { + if (random.nextInt(4) == 0) { // preflex-only setup + codec = installTestCodecs("PreFlex", CodecProvider.getDefault()); + } else { // per-field setup + CodecProvider.setDefault(new RandomCodecProvider(random)); + codec = installTestCodecs(TEST_CODEC, CodecProvider.getDefault()); + } + } else { // ordinary setup codec = installTestCodecs(TEST_CODEC, CodecProvider.getDefault()); } - } else { // ordinary setup - codec = installTestCodecs(TEST_CODEC, CodecProvider.getDefault()); + } else { + // someone specified their own codecprovider by class + try { + Class cpClazz = Class.forName(TEST_CODECPROVIDER).asSubclass(CodecProvider.class); + CodecProvider cp = cpClazz.newInstance(); + String codecName; + if (TEST_CODEC.startsWith("random")) { // TODO: somehow do random per-field?! + Set codecSet = cp.listAll(); + String availableCodecs[] = codecSet.toArray(new String[codecSet.size()]); + codecName = availableCodecs[random.nextInt(availableCodecs.length)]; + } else { + codecName = TEST_CODEC; + } + + codec = cp.lookup(codecName); + cp.setDefaultFieldCodec(codecName); + CodecProvider.setDefault(cp); + } catch (Exception e) { + System.err.println("Could not instantiate CodecProvider: " + TEST_CODECPROVIDER); + throw new RuntimeException(e); + } } savedLocale = Locale.getDefault(); locale = TEST_LOCALE.equals("random") ? 
randomLocale(random) : localeForName(TEST_LOCALE); @@ -360,16 +385,13 @@ public abstract class LuceneTestCase extends Assert { String codecDescription; CodecProvider cp = CodecProvider.getDefault(); - if ("randomPerField".equals(TEST_CODEC)) { - if (cp instanceof RandomCodecProvider) - codecDescription = cp.toString(); - else - codecDescription = "PreFlex"; + if ("randomPerField".equals(TEST_CODEC) && cp instanceof RandomCodecProvider) { + codecDescription = cp.toString(); } else { codecDescription = codec.toString(); } - if (CodecProvider.getDefault() == savedCodecProvider) + if ("random".equals(TEST_CODECPROVIDER) && CodecProvider.getDefault() == savedCodecProvider) removeTestCodecs(codec, CodecProvider.getDefault()); CodecProvider.setDefault(savedCodecProvider); Locale.setDefault(savedLocale); diff --git a/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java b/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java index e5ec6fad862..4df9f1f456f 100644 --- a/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java +++ b/lucene/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java @@ -107,6 +107,7 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase { // consume } stream.end(); + stream.close(); assertAnalyzesToReuse(analyzer, testString, new String[] { "t" }); } diff --git a/lucene/src/test/org/apache/lucene/index/Test2BTerms.java b/lucene/src/test/org/apache/lucene/index/Test2BTerms.java index 25cf0c4d987..6fffc48664a 100644 --- a/lucene/src/test/org/apache/lucene/index/Test2BTerms.java +++ b/lucene/src/test/org/apache/lucene/index/Test2BTerms.java @@ -153,7 +153,8 @@ public class Test2BTerms extends LuceneTestCase { List savedTerms = null; - Directory dir = newFSDirectory(_TestUtil.getTempDir("2BTerms")); + MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BTerms")); + dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER); //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex")); if (true) { diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index efee37fce32..b83f7369e50 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.Arrays; +import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -41,10 +42,12 @@ import org.apache.lucene.search.Similarity; import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.Constants; /* Verify we can read the pre-4.0 file format, do searches @@ -63,26 +66,27 @@ public class TestBackwardsCompatibility extends LuceneTestCase { // oldNames array. 
/* - public void testCreatePreLocklessCFS() throws IOException { - createIndex("index.cfs", true); - } - - public void testCreatePreLocklessNoCFS() throws IOException { - createIndex("index.nocfs", false); - } - */ - -/* public void testCreateCFS() throws IOException { - String dirName = "testindex.cfs"; - createIndex(dirName, true); - rmDir(dirName); + createIndex("index.cfs", true, false); } public void testCreateNoCFS() throws IOException { - String dirName = "testindex.nocfs"; - createIndex(dirName, true); - rmDir(dirName); + createIndex("index.nocfs", false, false); + } + */ + +/* + // These are only needed for the special upgrade test to verify + // that also optimized indexes are correctly upgraded by IndexUpgrader. + // You don't need them to be build for non-3.1 (the test is happy with just one + // "old" segment format, version is unimportant: + + public void testCreateOptimizedCFS() throws IOException { + createIndex("index.optimized.cfs", true, true); + } + + public void testCreateOptimizedNoCFS() throws IOException { + createIndex("index.optimized.nocfs", false, true); } */ @@ -90,6 +94,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { "30.nocfs", "31.cfs", "31.nocfs", + "32.cfs", + "32.nocfs", }; final String[] unsupportedNames = {"19.cfs", @@ -108,6 +114,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase { "29.nocfs", }; + final String[] oldOptimizedNames = {"31.optimized.cfs", + "31.optimized.nocfs", + }; + /** This test checks that *only* IndexFormatTooOldExceptions are throws when you open and operate on too old indexes! */ public void testUnsupportedOldIndexes() throws Exception { for(int i=0;i names = new ArrayList(oldNames.length + oldOptimizedNames.length); + names.addAll(Arrays.asList(oldNames)); + names.addAll(Arrays.asList(oldOptimizedNames)); + for(String name : names) { + if (VERBOSE) { + System.out.println("testUpgradeOldIndex: index=" +name); + } + File oldIndxeDir = _TestUtil.getTempDir(name); + _TestUtil.unzip(getDataFile("index." + name + ".zip"), oldIndxeDir); + Directory dir = newFSDirectory(oldIndxeDir); + + new IndexUpgrader(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null), VERBOSE ? System.out : null, false) + .upgrade(); + + checkAllSegmentsUpgraded(dir); + + _TestUtil.checkIndex(dir); + + dir.close(); + _TestUtil.rmDir(oldIndxeDir); + } + } + + public void testUpgradeOldOptimizedIndexWithAdditions() throws Exception { + for (String name : oldOptimizedNames) { + if (VERBOSE) { + System.out.println("testUpgradeOldOptimizedIndexWithAdditions: index=" +name); + } + File oldIndxeDir = _TestUtil.getTempDir(name); + _TestUtil.unzip(getDataFile("index." + name + ".zip"), oldIndxeDir); + Directory dir = newFSDirectory(oldIndxeDir); + + assertEquals("Original index must be optimized", 1, getNumberOfSegments(dir)); + + // create a bunch of dummy segments + int id = 40; + RAMDirectory ramDir = new RAMDirectory(); + for (int i = 0; i < 3; i++) { + // only use Log- or TieredMergePolicy, to make document addition predictable and not suddenly merge: + MergePolicy mp = random.nextBoolean() ? 
newLogMergePolicy() : newTieredMergePolicy(); + IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) + .setMergePolicy(mp); + IndexWriter w = new IndexWriter(ramDir, iwc); + // add few more docs: + for(int j = 0; j < RANDOM_MULTIPLIER * random.nextInt(30); j++) { + addDoc(w, id++); + } + w.close(false); + } + + // add dummy segments (which are all in current version) to optimized index + MergePolicy mp = random.nextBoolean() ? newLogMergePolicy() : newTieredMergePolicy(); + IndexWriterConfig iwc = new IndexWriterConfig(TEST_VERSION_CURRENT, null) + .setMergePolicy(mp); + IndexWriter w = new IndexWriter(dir, iwc); + w.setInfoStream(VERBOSE ? System.out : null); + w.addIndexes(ramDir); + w.close(false); + + // determine count of segments in modified index + final int origSegCount = getNumberOfSegments(dir); + + new IndexUpgrader(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, null), VERBOSE ? System.out : null, false) + .upgrade(); + + final int segCount = checkAllSegmentsUpgraded(dir); + assertEquals("Index must still contain the same number of segments, as only one segment was upgraded and nothing else merged", + origSegCount, segCount); + + dir.close(); + _TestUtil.rmDir(oldIndxeDir); + } + } } diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index 44b84b504ad..040e9d035f6 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -241,8 +241,7 @@ public class TestCodecs extends LuceneTestCase { final Directory dir = newDirectory(); FieldInfos clonedFieldInfos = (FieldInfos) fieldInfos.clone(); this.write(fieldInfos, dir, fields, true); - final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, false, true, clonedFieldInfos.buildSegmentCodecs(false), clonedFieldInfos.hasVectors(), clonedFieldInfos); - si.setHasProx(false); + final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, false, clonedFieldInfos.buildSegmentCodecs(false), clonedFieldInfos); final FieldsProducer reader = si.getSegmentCodecs().codec().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 64, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR)); @@ -294,7 +293,7 @@ public class TestCodecs extends LuceneTestCase { FieldInfos clonedFieldInfos = (FieldInfos) fieldInfos.clone(); this.write(fieldInfos, dir, fields, false); - final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, false, true, clonedFieldInfos.buildSegmentCodecs(false), clonedFieldInfos.hasVectors(), clonedFieldInfos); + final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, false, clonedFieldInfos.buildSegmentCodecs(false), clonedFieldInfos); if (VERBOSE) { System.out.println("TEST: now read postings"); diff --git a/lucene/src/test/org/apache/lucene/index/TestDoc.java b/lucene/src/test/org/apache/lucene/index/TestDoc.java index 874df62c91a..9352f9174c9 100644 --- a/lucene/src/test/org/apache/lucene/index/TestDoc.java +++ b/lucene/src/test/org/apache/lucene/index/TestDoc.java @@ -196,7 +196,7 @@ public class TestDoc extends LuceneTestCase { SegmentReader r1 = SegmentReader.get(true, si1, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); SegmentReader r2 = SegmentReader.get(true, si2, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); - SegmentMerger merger = new SegmentMerger(si1.dir, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, merged, null, CodecProvider.getDefault(), null, new FieldInfos()); + SegmentMerger merger = new SegmentMerger(si1.dir, 
IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, merged, null, null, new FieldInfos()); merger.add(r1); merger.add(r2); @@ -205,8 +205,7 @@ public class TestDoc extends LuceneTestCase { r2.close(); final FieldInfos fieldInfos = merger.fieldInfos(); final SegmentInfo info = new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, - false, fieldInfos.hasProx(), merger.getSegmentCodecs(), - fieldInfos.hasVectors(), fieldInfos); + false, merger.getSegmentCodecs(), fieldInfos); if (useCompoundFile) { Collection filesToDelete = merger.createCompoundFile(merged + ".cfs", info); diff --git a/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java index 26b1717072f..75a9be9cc0e 100644 --- a/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -24,12 +24,14 @@ import java.util.*; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.LoadFirstFieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; @@ -511,4 +513,69 @@ public class TestFieldsReader extends LuceneTestCase { } } + + public void testNumericField() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, dir); + final int numDocs = _TestUtil.nextInt(random, 500, 1000) * RANDOM_MULTIPLIER; + final Number[] answers = new Number[numDocs]; + final NumericField.DataType[] typeAnswers = new NumericField.DataType[numDocs]; + for(int id=0;id 0); + reader.close(); + SegmentInfos sis = new SegmentInfos(); + sis.read(dir); + for (SegmentInfo segmentInfo : sis) { + assertFalse(segmentInfo.getHasVectors()); + } + dir.close(); + + } + } + } + + private static class FailOnTermVectors extends MockDirectoryWrapper.Failure { + + private static final String INIT_STAGE = "initTermVectorsWriter"; + private static final String AFTER_INIT_STAGE = "finishDocument"; + private static final String EXC_MSG = "FOTV"; + private final String stage; + + public FailOnTermVectors(String stage) { + this.stage = stage; + } + + @Override + public void eval(MockDirectoryWrapper dir) throws IOException { + StackTraceElement[] trace = new Exception().getStackTrace(); + boolean failOnInit = false; + boolean failOnfinish = false; + for (int i = 0; i < trace.length; i++) { + if ("org.apache.lucene.index.TermVectorsTermsWriter".equals(trace[i].getClassName()) && stage.equals(trace[i].getMethodName())) + failOnInit = true; + if ("org.apache.lucene.index.TermVectorsTermsWriter".equals(trace[i].getClassName()) && stage.equals(trace[i].getMethodName())) + failOnfinish = true; + } + + if (failOnInit) { + throw new RuntimeException(EXC_MSG + " fail on init"); + } else if (failOnfinish) { + throw new RuntimeException(EXC_MSG + " fail on finishDoc"); + } + } + } } diff --git a/lucene/src/test/org/apache/lucene/index/TestLongPostings.java b/lucene/src/test/org/apache/lucene/index/TestLongPostings.java index 
fbe6fa1e0f4..1d745d7d8f0 100644 --- a/lucene/src/test/org/apache/lucene/index/TestLongPostings.java +++ b/lucene/src/test/org/apache/lucene/index/TestLongPostings.java @@ -49,6 +49,7 @@ public class TestLongPostings extends LuceneTestCase { final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class); final BytesRef termBytes = termAtt.getBytesRef(); int count = 0; + ts.reset(); while(ts.incrementToken()) { termAtt.fillBytesRef(); if (count == 0 && !termBytes.utf8ToString().equals(s)) { diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java b/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java index d161e130ccb..7961601c013 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -73,15 +73,15 @@ public class TestSegmentMerger extends LuceneTestCase { } public void testMerge() throws IOException { - SegmentMerger merger = new SegmentMerger(mergedDir, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, mergedSegment, null, CodecProvider.getDefault(), null, new FieldInfos()); + SegmentMerger merger = new SegmentMerger(mergedDir, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, mergedSegment, null, null, new FieldInfos()); merger.add(reader1); merger.add(reader2); int docsMerged = merger.merge(); assertTrue(docsMerged == 2); final FieldInfos fieldInfos = merger.fieldInfos(); //Should be able to open a new SegmentReader against the new directory - SegmentReader mergedReader = SegmentReader.get(false, mergedDir, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, fieldInfos.hasProx(), - merger.getSegmentCodecs(), fieldInfos.hasVectors(), fieldInfos), + SegmentReader mergedReader = SegmentReader.get(false, mergedDir, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, + merger.getSegmentCodecs(), fieldInfos), BufferedIndexInput.BUFFER_SIZE, true, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); assertTrue(mergedReader != null); assertTrue(mergedReader.numDocs() == 2); diff --git a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java index a0fbe6d1f55..8eb6224805f 100644 --- a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java +++ b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java @@ -616,7 +616,7 @@ public class TestStressIndexing2 extends LuceneTestCase { } for(int i=start;i lastDoc = docs.get(r.nextInt(docs.size())); diff --git a/lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java b/lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java index 5be8753df0b..67d9333d6e1 100644 --- a/lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java +++ b/lucene/src/test/org/apache/lucene/search/TestTimeLimitingCollector.java @@ -75,7 +75,7 @@ public class TestTimeLimitingCollector extends LuceneTestCase { "blueberry pizza", }; directory = newDirectory(); - RandomIndexWriter iw = new RandomIndexWriter(random, directory); + RandomIndexWriter iw = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); for (int i=0; i(random, dir, inputMode, pairs, outputs).doTest(); } + + // Up to two positive ints, shared, generally but not + // monotonically increasing + { + if (VERBOSE) { + System.out.println("TEST: now test UpToTwoPositiveIntOutputs"); + } + final UpToTwoPositiveIntOutputs outputs = 
UpToTwoPositiveIntOutputs.getSingleton(true); + final List> pairs = new ArrayList>(terms.length); + long lastOutput = 0; + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } } private static class FSTTester { @@ -328,11 +358,13 @@ public class TestFSTs extends LuceneTestCase { // no pruning doTest(0, 0); - // simple pruning - doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0); - - // leafy pruning - doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size())); + if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { + // simple pruning + doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0); + + // leafy pruning + doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size())); + } } // runs the term, returning the output, or null if term @@ -421,7 +453,14 @@ public class TestFSTs extends LuceneTestCase { prune1==0 && prune2==0, outputs); for(InputOutput pair : pairs) { - builder.add(pair.input, pair.output); + if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { + final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs; + final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output; + ((Builder) builder).add(pair.input, (Object) _outputs.get(twoLongs.first)); + ((Builder) builder).add(pair.input, (Object) _outputs.get(twoLongs.second)); + } else { + builder.add(pair.input, pair.output); + } } FST fst = builder.finish(); diff --git a/modules/analysis/CHANGES.txt b/modules/analysis/CHANGES.txt index b636dc7cc20..e2b616e110f 100644 --- a/modules/analysis/CHANGES.txt +++ b/modules/analysis/CHANGES.txt @@ -83,6 +83,8 @@ New Features - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader + - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer + - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils * SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of /something, /something/something, /something/something/else. 
(Ryan McKinley, Koji Sekiguchi) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java index 26f06d3ffa0..fff6148d19a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java @@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ar; import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java index a9853386d53..3bf349719d7 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core; import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java index c10972b701b..0e1c7e616fb 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java @@ -22,7 +22,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java index a65d90b4ffa..ecdf550eb85 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core; import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java index ce2bc6abd7c..31cfa1e00dc 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java @@ -20,7 +20,7 @@ package 
org.apache.lucene.analysis.core; import java.io.Reader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java index 85ce28efc99..357350cef38 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java @@ -19,7 +19,7 @@ package org.apache.lucene.analysis.core; import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java index 4bf4f049dee..01004c68ca5 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java @@ -19,8 +19,8 @@ package org.apache.lucene.analysis.core; import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java index 1fed10384da..01c537b85cb 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java @@ -21,7 +21,7 @@ import java.io.IOException; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; /** diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java index b43a5c3b0dc..507a114336a 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilter.java @@ -31,8 +31,6 @@ import org.apache.lucene.util.Version; /** * Removes elisions from a {@link TokenStream}. For example, "l'avion" (the plane) will be * tokenized as "avion" (plane). - *

    - * Note that {@link StandardTokenizer} sees " ' " as a space, and cuts it out. * * @see Elision in Wikipedia */ diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java index f89b07a3cbc..2e4c6e43e3f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java @@ -19,7 +19,7 @@ package org.apache.lucene.analysis.in; import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java index adb51f29d44..bd8cc47a40f 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java @@ -19,11 +19,13 @@ package org.apache.lucene.analysis.it; import java.io.IOException; import java.io.Reader; +import java.util.Arrays; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.fr.ElisionFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -38,6 +40,14 @@ import org.tartarus.snowball.ext.ItalianStemmer; /** * {@link Analyzer} for Italian. + *

    + * + *

    You must specify the required {@link Version} + * compatibility when creating ItalianAnalyzer: + *

    */ public final class ItalianAnalyzer extends StopwordAnalyzerBase { private final Set stemExclusionSet; @@ -45,6 +55,13 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { /** File containing default Italian stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "italian_stop.txt"; + private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet( + new CharArraySet(Version.LUCENE_CURRENT, + Arrays.asList( + "c", "l", "all", "dall", "dell", "nell", "sull", "coll", "pell", + "gl", "agl", "dagl", "degl", "negl", "sugl", "un", "m", "t", "s", "v", "d" + ), true)); + /** * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. @@ -112,7 +129,7 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { * @return A * {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with - * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} + * {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter}, {@link StopFilter} * , {@link KeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @@ -121,6 +138,9 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase { Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); + if (matchVersion.onOrAfter(Version.LUCENE_32)) { + result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES); + } result = new LowerCaseFilter(matchVersion, result); result = new StopFilter(matchVersion, result, stopwords); if(!stemExclusionSet.isEmpty()) diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java index b0cd8d60cfc..608c386625d 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java @@ -25,57 +25,71 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** - * + * * Take something like: - * + * *
    - *  /soemthing/something/else
    + *  /something/something/else
      * 
    - * + * * and make: - * + * *
    - *  /soemthing
    - *  /soemthing/something
    - *  /soemthing/something/else
    + *  /something
    + *  /something/something
    + *  /something/something/else
      * 
    - * */ public class PathHierarchyTokenizer extends Tokenizer { public PathHierarchyTokenizer(Reader input) { - this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER); + this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP); + } + + public PathHierarchyTokenizer(Reader input, int skip) { + this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip); } public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) { - this(input, bufferSize, delimiter, delimiter); + this(input, bufferSize, delimiter, delimiter, DEFAULT_SKIP); } public PathHierarchyTokenizer(Reader input, char delimiter, char replacement) { - this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement); + this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP); } - public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) { + public PathHierarchyTokenizer(Reader input, char delimiter, char replacement, int skip) { + this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip); + } + + public PathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) { super(input); termAtt.resizeBuffer(bufferSize); + this.delimiter = delimiter; this.replacement = replacement; - endDelimiter = false; + this.skip = skip; resultToken = new StringBuilder(bufferSize); } - + private static final int DEFAULT_BUFFER_SIZE = 1024; public static final char DEFAULT_DELIMITER = '/'; + public static final int DEFAULT_SKIP = 0; + private final char delimiter; private final char replacement; - + private final int skip; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + private int startPosition = 0; private int finalOffset = 0; - private boolean endDelimiter; + private int skipped = 0; + private boolean endDelimiter = false; private StringBuilder resultToken; + @Override public final boolean incrementToken() throws IOException { clearAttributes(); @@ -97,43 +111,69 @@ public class PathHierarchyTokenizer extends Tokenizer { while (true) { int c = input.read(); - if( c < 0 ) { - length += resultToken.length(); - termAtt.setLength(length); - finalOffset = correctOffset(length); - offsetAtt.setOffset(correctOffset(0), finalOffset); - if( added ){ - resultToken.setLength(0); - resultToken.append(termAtt.buffer(), 0, length); - } - return added; - } - added = true; - if( c == delimiter ) { - if( length > 0 ){ - endDelimiter = true; - break; + if( c < 0 ){ + if( skipped > skip ) { + length += resultToken.length(); + termAtt.setLength(length); + finalOffset = correctOffset(startPosition + length); + offsetAtt.setOffset(correctOffset(startPosition), finalOffset); + if( added ){ + resultToken.setLength(0); + resultToken.append(termAtt.buffer(), 0, length); + } + return added; } else{ - termAtt.append(replacement); + finalOffset = correctOffset(startPosition + length); + return false; + } + } + if( !added ){ + added = true; + skipped++; + if( skipped > skip ){ + termAtt.append(c == delimiter ? 
replacement : (char)c); length++; } + else { + startPosition++; + } } else { - termAtt.append((char)c); - length++; + if( c == delimiter ){ + if( skipped > skip ){ + endDelimiter = true; + break; + } + skipped++; + if( skipped > skip ){ + termAtt.append(replacement); + length++; + } + else { + startPosition++; + } + } + else { + if( skipped > skip ){ + termAtt.append((char)c); + length++; + } + else { + startPosition++; + } + } } } - length += resultToken.length(); termAtt.setLength(length); - finalOffset = correctOffset(length); - offsetAtt.setOffset(correctOffset(0), finalOffset); + finalOffset = correctOffset(startPosition + length); + offsetAtt.setOffset(correctOffset(startPosition), finalOffset); resultToken.setLength(0); resultToken.append(termAtt.buffer(), 0, length); return true; } - + @Override public final void end() { // set final offset @@ -146,5 +186,6 @@ public class PathHierarchyTokenizer extends Tokenizer { resultToken.setLength(0); finalOffset = 0; endDelimiter = false; + skipped = 0; } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java new file mode 100644 index 00000000000..07aa11fbbaf --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java @@ -0,0 +1,173 @@ +package org.apache.lucene.analysis.path; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * + * Take something like: + * + *
    + * www.site.co.uk
    + * 
    + * + * and make: + * + *
    + * www.site.co.uk
    + * site.co.uk
    + * co.uk
    + * uk
    + * 
    + * + */ +public class ReversePathHierarchyTokenizer extends Tokenizer { + + public ReversePathHierarchyTokenizer(Reader input) { + this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP); + } + + public ReversePathHierarchyTokenizer(Reader input, int skip) { + this(input, DEFAULT_BUFFER_SIZE, DEFAULT_DELIMITER, DEFAULT_DELIMITER, skip); + } + + public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter) { + this(input, bufferSize, delimiter, delimiter, DEFAULT_SKIP); + } + + public ReversePathHierarchyTokenizer(Reader input, char delimiter, char replacement) { + this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, DEFAULT_SKIP); + } + + public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement) { + this(input, bufferSize, delimiter, replacement, DEFAULT_SKIP); + } + + public ReversePathHierarchyTokenizer(Reader input, char delimiter, int skip) { + this(input, DEFAULT_BUFFER_SIZE, delimiter, delimiter, skip); + } + + public ReversePathHierarchyTokenizer(Reader input, char delimiter, char replacement, int skip) { + this(input, DEFAULT_BUFFER_SIZE, delimiter, replacement, skip); + } + + public ReversePathHierarchyTokenizer(Reader input, int bufferSize, char delimiter, char replacement, int skip) { + super(input); + termAtt.resizeBuffer(bufferSize); + this.delimiter = delimiter; + this.replacement = replacement; + this.skip = skip; + resultToken = new StringBuilder(bufferSize); + resultTokenBuffer = new char[bufferSize]; + delimiterPositions = new ArrayList(bufferSize/10); + } + + private static final int DEFAULT_BUFFER_SIZE = 1024; + public static final char DEFAULT_DELIMITER = '/'; + public static final int DEFAULT_SKIP = 0; + + private final char delimiter; + private final char replacement; + private final int skip; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class); + + private int endPosition = 0; + private int finalOffset = 0; + private int skipped = 0; + private StringBuilder resultToken; + + private List delimiterPositions; + private int delimitersCount = -1; + private char[] resultTokenBuffer; + + @Override + public final boolean incrementToken() throws IOException { + clearAttributes(); + if(delimitersCount == -1){ + int length = 0; + delimiterPositions.add(0); + while (true) { + int c = input.read(); + if( c < 0 ) { + break; + } + length++; + if( c == delimiter ) { + delimiterPositions.add(length); + resultToken.append(replacement); + } + else{ + resultToken.append((char)c); + } + } + delimitersCount = delimiterPositions.size(); + if( delimiterPositions.get(delimitersCount-1) < length ){ + delimiterPositions.add(length); + delimitersCount++; + } + if( resultTokenBuffer.length < resultToken.length() ){ + resultTokenBuffer = new char[resultToken.length()]; + } + resultToken.getChars(0, resultToken.length(), resultTokenBuffer, 0); + resultToken.setLength(0); + endPosition = delimiterPositions.get(delimitersCount-1 - skip); + finalOffset = correctOffset(length); + posAtt.setPositionIncrement(1); + } + else{ + posAtt.setPositionIncrement(0); + } + + while( skipped < delimitersCount-skip-1 ){ + int start = delimiterPositions.get(skipped); + termAtt.copyBuffer(resultTokenBuffer, start, endPosition - start); + offsetAtt.setOffset(correctOffset(start), correctOffset(endPosition)); 
+ skipped++; + return true; + } + + return false; + } + + @Override + public final void end() { + // set final offset + offsetAtt.setOffset(finalOffset, finalOffset); + } + + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + resultToken.setLength(0); + finalOffset = 0; + skipped = 0; + delimitersCount = -1; + delimiterPositions.clear(); + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java index e5426d775b2..088b8025064 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java @@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ru; */ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; // for javadocs +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs import org.apache.lucene.util.AttributeSource; diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java index cd52e392070..552ea3fd3dd 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java @@ -24,7 +24,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; diff --git a/lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java similarity index 97% rename from lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java rename to modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java index 3055d19e5b2..5d91a3a3fe1 100644 --- a/lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,12 +20,13 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; -import org.apache.lucene.util.CharacterUtils.CharacterBuffer; +import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer; /** * An abstract base class for simple, character-oriented tokenizers. 
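
    For reference, a minimal sketch of a CharTokenizer subclass written against the relocated org.apache.lucene.analysis.util.CharTokenizer; the class name and token rule here are made up for illustration:

      import java.io.Reader;
      import org.apache.lucene.analysis.util.CharTokenizer;
      import org.apache.lucene.util.Version;

      /** Emits runs of letters and digits, lowercasing them as it tokenizes. */
      public final class LetterOrDigitTokenizer extends CharTokenizer {
        public LetterOrDigitTokenizer(Version matchVersion, Reader in) {
          super(matchVersion, in);
        }
        @Override
        protected boolean isTokenChar(int c) {
          return Character.isLetterOrDigit(c);   // token characters: letters and digits
        }
        @Override
        protected int normalize(int c) {
          return Character.toLowerCase(c);       // lowercase each code point
        }
      }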
diff --git a/lucene/src/java/org/apache/lucene/util/CharacterUtils.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java similarity index 99% rename from lucene/src/java/org/apache/lucene/util/CharacterUtils.java rename to modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java index 8f5a8af9ede..fe622788198 100644 --- a/lucene/src/java/org/apache/lucene/util/CharacterUtils.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java @@ -1,8 +1,10 @@ -package org.apache.lucene.util; +package org.apache.lucene.analysis.util; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.util.Version; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java index ae4bf2f2d24..83d7a863b35 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianAnalyzer.java @@ -23,6 +23,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; public class TestItalianAnalyzer extends BaseTokenStreamTestCase { /** This test fails with NPE when the @@ -55,4 +56,18 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase { public void testRandomStrings() throws Exception { checkRandomData(random, new ItalianAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER); } + + /** test that the elisionfilter is working */ + public void testContractions() throws IOException { + Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT); + assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" }); + assertAnalyzesTo(a, "l'Italiano", new String[] { "ital" }); + } + + /** test that we don't enable this before 3.2*/ + public void testContractionsBackwards() throws IOException { + Analyzer a = new ItalianAnalyzer(Version.LUCENE_31); + assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" }); + assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" }); + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java index cb0adc9e474..9cc50735965 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java @@ -127,4 +127,70 @@ public class TestPathHierarchyTokenizer extends BaseTokenStreamTestCase { new int[]{1, 0, 0, 0}, path.length()); } + + public void testBasicSkip() throws Exception { + String path = "/a/b/c"; + PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/b", "/b/c"}, + new int[]{2, 2}, + new int[]{4, 6}, + new int[]{1, 0}, + path.length()); + } + + public void testEndOfDelimiterSkip() throws Exception { + String path = "/a/b/c/"; + PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/b", "/b/c", "/b/c/"}, + new int[]{2, 
2, 2}, + new int[]{4, 6, 7}, + new int[]{1, 0, 0}, + path.length()); + } + + public void testStartOfCharSkip() throws Exception { + String path = "a/b/c"; + PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/b", "/b/c"}, + new int[]{1, 1}, + new int[]{3, 5}, + new int[]{1, 0}, + path.length()); + } + + public void testStartOfCharEndOfDelimiterSkip() throws Exception { + String path = "a/b/c/"; + PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/b", "/b/c", "/b/c/"}, + new int[]{1, 1, 1}, + new int[]{3, 5, 6}, + new int[]{1, 0, 0}, + path.length()); + } + + public void testOnlyDelimiterSkip() throws Exception { + String path = "/"; + PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{}, + new int[]{}, + new int[]{}, + new int[]{}, + path.length()); + } + + public void testOnlyDelimitersSkip() throws Exception { + String path = "//"; + PathHierarchyTokenizer t = new PathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/"}, + new int[]{1}, + new int[]{2}, + new int[]{1}, + path.length()); + } } diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java new file mode 100644 index 00000000000..a881be03ea3 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/path/TestReversePathHierarchyTokenizer.java @@ -0,0 +1,157 @@ +package org.apache.lucene.analysis.path; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; + +public class TestReversePathHierarchyTokenizer extends BaseTokenStreamTestCase { + + public void testBasicReverse() throws Exception { + String path = "/a/b/c"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) ); + assertTokenStreamContents(t, + new String[]{"/a/b/c", "a/b/c", "b/c", "c"}, + new int[]{0, 1, 3, 5}, + new int[]{6, 6, 6, 6}, + new int[]{1, 0, 0, 0}, + path.length()); + } + + public void testEndOfDelimiterReverse() throws Exception { + String path = "/a/b/c/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) ); + assertTokenStreamContents(t, + new String[]{"/a/b/c/", "a/b/c/", "b/c/", "c/"}, + new int[]{0, 1, 3, 5}, + new int[]{7, 7, 7, 7}, + new int[]{1, 0, 0, 0}, + path.length()); + } + + public void testStartOfCharReverse() throws Exception { + String path = "a/b/c"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) ); + assertTokenStreamContents(t, + new String[]{"a/b/c", "b/c", "c"}, + new int[]{0, 2, 4}, + new int[]{5, 5, 5}, + new int[]{1, 0, 0}, + path.length()); + } + + public void testStartOfCharEndOfDelimiterReverse() throws Exception { + String path = "a/b/c/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) ); + assertTokenStreamContents(t, + new String[]{"a/b/c/", "b/c/", "c/"}, + new int[]{0, 2, 4}, + new int[]{6, 6, 6}, + new int[]{1, 0, 0}, + path.length()); + } + + public void testOnlyDelimiterReverse() throws Exception { + String path = "/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) ); + assertTokenStreamContents(t, + new String[]{"/"}, + new int[]{0}, + new int[]{1}, + new int[]{1}, + path.length()); + } + + public void testOnlyDelimitersReverse() throws Exception { + String path = "//"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path) ); + assertTokenStreamContents(t, + new String[]{"//", "/"}, + new int[]{0, 1}, + new int[]{2, 2}, + new int[]{1, 0}, + path.length()); + } + + public void testEndOfDelimiterReverseSkip() throws Exception { + String path = "/a/b/c/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/a/b/", "a/b/", "b/"}, + new int[]{0, 1, 3}, + new int[]{5, 5, 5}, + new int[]{1, 0, 0}, + path.length()); + } + + public void testStartOfCharReverseSkip() throws Exception { + String path = "a/b/c"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"a/b/", "b/"}, + new int[]{0, 2}, + new int[]{4, 4}, + new int[]{1, 0}, + path.length()); + } + + public void testStartOfCharEndOfDelimiterReverseSkip() throws Exception { + String path = "a/b/c/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"a/b/", "b/"}, + new int[]{0, 2}, + new int[]{4, 4}, + new int[]{1, 0}, + path.length()); + } + + public void testOnlyDelimiterReverseSkip() throws Exception { + String path = "/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{}, + new int[]{}, + new int[]{}, + new int[]{}, + path.length()); + } + + public 
void testOnlyDelimitersReverseSkip() throws Exception { + String path = "//"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 1 ); + assertTokenStreamContents(t, + new String[]{"/"}, + new int[]{0}, + new int[]{1}, + new int[]{1}, + path.length()); + } + + public void testReverseSkip2() throws Exception { + String path = "/a/b/c/"; + ReversePathHierarchyTokenizer t = new ReversePathHierarchyTokenizer( new StringReader(path), 2 ); + assertTokenStreamContents(t, + new String[]{"/a/", "a/"}, + new int[]{0, 1}, + new int[]{3, 3}, + new int[]{1, 0}, + path.length()); + } +} diff --git a/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java similarity index 82% rename from lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java rename to modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java index adb902d95f6..f129596df92 100644 --- a/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,6 +20,10 @@ package org.apache.lucene.analysis; import java.io.IOException; import java.io.StringReader; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseTokenizer; + /** * Testcase for {@link CharTokenizer} subclasses @@ -42,7 +46,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { } // internal buffer size is 1024 make sure we have a surrogate pair right at the border builder.insert(1023, "\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString())); assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); } @@ -59,7 +63,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { builder.append("a"); } builder.append("\ud801\udc1cabc"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString())); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); } } @@ -73,7 +77,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { for (int i = 0; i < 255; i++) { builder.append("A"); } - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); } @@ -87,7 +91,7 @@ public class TestCharTokenizers extends BaseTokenStreamTestCase { builder.append("A"); } builder.append("\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new 
StringReader(builder.toString() + builder.toString())); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); } } diff --git a/lucene/src/test/org/apache/lucene/util/TestCharacterUtils.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java similarity index 97% rename from lucene/src/test/org/apache/lucene/util/TestCharacterUtils.java rename to modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java index 69393bca871..4e9fdbf6c24 100644 --- a/lucene/src/test/org/apache/lucene/util/TestCharacterUtils.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java @@ -1,4 +1,4 @@ -package org.apache.lucene.util; +package org.apache.lucene.analysis.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,7 +21,9 @@ import java.io.IOException; import java.io.Reader; import java.io.StringReader; -import org.apache.lucene.util.CharacterUtils.CharacterBuffer; +import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; import org.junit.Test; /** diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 0ed4698c902..42555a66b8a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -198,6 +198,9 @@ Bug Fixes initialization if the schema.xml contains an analyzer configuration for a fieldType that does not use TextField. (hossman) +* SOLR-2467: Fix initialization so any errors + are logged properly. (hossman) + Other Changes ---------------------- @@ -267,6 +270,12 @@ Detailed Change List New Features ---------------------- +* SOLR-2496: Add ability to specify overwrite and commitWithin as request + parameters (e.g. specified in the URL) when using the JSON update format, + and added a simplified format for specifying multiple documents. + Example: [{"id":"doc1"},{"id":"doc2"}] + (yonik) + Optimizations ---------------------- @@ -309,6 +318,13 @@ Bug Fixes did not clear all attributes so they displayed incorrect attribute values for tokens in later filter stages. (uschindler, rmuir, yonik) +* SOLR-2493: SolrQueryParser was fixed to not parse the SolrConfig DOM tree on each + instantiation which is a huge slowdown. (Stephane Bailliez via uschindler) + +* SOLR-2495: The JSON parser could hang on corrupted input and could fail + to detect numbers that were too large to fit in a long. (yonik) + + Other Changes ---------------------- diff --git a/solr/README.txt b/solr/README.txt index 0a9e939128d..f9a72c0f187 100644 --- a/solr/README.txt +++ b/solr/README.txt @@ -64,18 +64,18 @@ docs/api/index.html Instructions for Building Apache Solr from Source ------------------------------------------------- -1. Download the Java SE 6 JDK (Java Development Kit) or later from http://java.sun.com. - You will need the JDK installed, and the %JAVA_HOME%\bin directory included - on your command path. To test this, issue a "java -version" command from your - shell and verify that the Java version is 1.6 or later. +1. Download the Java SE 6 JDK (Java Development Kit) or later from http://java.sun.com/ + You will need the JDK installed, and the $JAVA_HOME/bin (Windows: %JAVA_HOME%\bin) + folder included on your command path. To test this, issue a "java -version" command + from your shell (command prompt) and verify that the Java version is 1.6 or later. -2. 
Download the Apache Ant binary distribution (1.7.0 or greater) from http://ant.apache.org. - You will need Ant installed and the %ANT_HOME%\bin directory included on your - command path. To test this, issue a "ant -version" command from your - shell and verify that Ant is available. +2. Download the Apache Ant binary distribution (1.7.0 or greater) from http://ant.apache.org/ + You will need Ant installed and the $ANT_HOME/bin (Windows: %ANT_HOME%\bin) folder + included on your command path. To test this, issue a "ant -version" command from your + shell (command prompt) and verify that Ant is available. -3. Download the Apache Solr distribution, linked from the above - web site. Expand the distribution to a folder of your choice, e.g. c:\solr. +3. Download the Apache Solr distribution, linked from the above web site. + Unzip the distribution to a folder of your choice, e.g. C:\solr or ~/solr Alternately, you can obtain a copy of the latest Apache Solr source code directly from the Subversion repository: diff --git a/solr/build.xml b/solr/build.xml index 8c68ca3464e..de7ef217bee 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -450,6 +450,7 @@ > + @@ -1020,7 +1021,7 @@ jar.file="lib/commons-csv-1.0-SNAPSHOT-r966014.jar" /> + jar.file="lib/apache-solr-noggit-r1099557.jar" /> diff --git a/solr/common-build.xml b/solr/common-build.xml index 861ff237062..a57b4074e25 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -61,6 +61,7 @@ + diff --git a/solr/contrib/analysis-extras/build.xml b/solr/contrib/analysis-extras/build.xml index 9cc5aa217bc..6ec8ecdbaa8 100644 --- a/solr/contrib/analysis-extras/build.xml +++ b/solr/contrib/analysis-extras/build.xml @@ -146,6 +146,7 @@ > + diff --git a/solr/contrib/clustering/build.xml b/solr/contrib/clustering/build.xml index aee297e3b8f..9a0c67eaa2f 100644 --- a/solr/contrib/clustering/build.xml +++ b/solr/contrib/clustering/build.xml @@ -118,6 +118,7 @@ > + diff --git a/solr/contrib/dataimporthandler/build.xml b/solr/contrib/dataimporthandler/build.xml index bd6ea50a2e2..79a0524fcc7 100644 --- a/solr/contrib/dataimporthandler/build.xml +++ b/solr/contrib/dataimporthandler/build.xml @@ -171,6 +171,7 @@ + @@ -231,6 +232,7 @@ > + diff --git a/solr/contrib/extraction/build.xml b/solr/contrib/extraction/build.xml index 01aa60e7485..50dcb4983d5 100644 --- a/solr/contrib/extraction/build.xml +++ b/solr/contrib/extraction/build.xml @@ -115,6 +115,7 @@ > + diff --git a/solr/contrib/uima/CHANGES.txt b/solr/contrib/uima/CHANGES.txt index a31054a05b5..6e97c775acb 100644 --- a/solr/contrib/uima/CHANGES.txt +++ b/solr/contrib/uima/CHANGES.txt @@ -28,6 +28,11 @@ Upgrading from Solr 3.1 It should move to UIMAUpdateRequestProcessorFactory setting. See contrib/uima/README.txt for more details. (SOLR-2436) +New Features +---------------------- + +* SOLR-2503: extend mapping function to map feature value to dynamicField. 
(koji) + Test Cases: ---------------------- diff --git a/solr/contrib/uima/README.txt b/solr/contrib/uima/README.txt index a8ef9cd5598..2e21536d3a5 100644 --- a/solr/contrib/uima/README.txt +++ b/solr/contrib/uima/README.txt @@ -37,20 +37,26 @@ To start using Solr UIMA Metadata Extraction Library you should go through the f - - org.apache.uima.alchemy.ts.concept.ConceptFS - text - concept + + org.apache.uima.alchemy.ts.concept.ConceptFS + + text + concept + - - org.apache.uima.alchemy.ts.language.LanguageFS - language - language + + org.apache.uima.alchemy.ts.language.LanguageFS + + language + language + - - org.apache.uima.SentenceAnnotation - coveredText - sentence + + org.apache.uima.SentenceAnnotation + + coveredText + sentence + diff --git a/solr/contrib/uima/build.xml b/solr/contrib/uima/build.xml index 631f8a8c413..16c7de67844 100644 --- a/solr/contrib/uima/build.xml +++ b/solr/contrib/uima/build.xml @@ -114,6 +114,7 @@ > + diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java index 22357262ba3..68c9e1bac0a 100644 --- a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java @@ -30,14 +30,14 @@ public class SolrUIMAConfiguration { private boolean fieldsMerging; - private Map> typesFeaturesFieldsMapping; + private Map> typesFeaturesFieldsMapping; private String aePath; private Map runtimeParameters; public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging, - Map> typesFeaturesFieldsMapping, + Map> typesFeaturesFieldsMapping, Map runtimeParameters) { this.aePath = aePath; this.fieldsToAnalyze = fieldsToAnalyze; @@ -54,7 +54,7 @@ public class SolrUIMAConfiguration { return fieldsMerging; } - public Map> getTypesFeaturesFieldsMapping() { + public Map> getTypesFeaturesFieldsMapping() { return typesFeaturesFieldsMapping; } @@ -65,4 +65,39 @@ public class SolrUIMAConfiguration { public Map getRuntimeParameters() { return runtimeParameters; } + + static final class MapField { + + private String fieldName, fieldNameFeature; + private boolean prefix; // valid if dynamicField == true + // false: *_s, true: s_* + + MapField(String fieldName, String fieldNameFeature){ + this.fieldName = fieldName; + this.fieldNameFeature = fieldNameFeature; + if(fieldNameFeature != null){ + if(fieldName.startsWith("*")){ + prefix = false; + this.fieldName = fieldName.substring(1); + } + else if(fieldName.endsWith("*")){ + prefix = true; + this.fieldName = fieldName.substring(0, fieldName.length() - 1); + } + else + throw new RuntimeException("static field name cannot be used for dynamicField"); + } + } + + String getFieldNameFeature(){ + return fieldNameFeature; + } + + String getFieldName(String featureValue){ + if(fieldNameFeature != null){ + return prefix ? 
fieldName + featureValue : featureValue + fieldName; + } + return fieldName; + } + } } diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java index 00e6aca3288..fc225d1deba 100644 --- a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Map; import org.apache.solr.common.util.NamedList; +import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField; /** * Read configuration for Solr-UIMA integration @@ -62,18 +63,31 @@ public class SolrUIMAConfigurationReader { } @SuppressWarnings("rawtypes") - private Map> readTypesFeaturesFieldsMapping() { - Map> map = new HashMap>(); + private Map> readTypesFeaturesFieldsMapping() { + Map> map = new HashMap>(); NamedList fieldMappings = (NamedList) args.get("fieldMappings"); /* iterate over UIMA types */ for (int i = 0; i < fieldMappings.size(); i++) { - NamedList mapping = (NamedList) fieldMappings.get("mapping", i); - String typeName = (String) mapping.get("type"); - String featureName = (String) mapping.get("feature"); - String mappedFieldName = (String) mapping.get("field"); - Map subMap = new HashMap(); - subMap.put(featureName, mappedFieldName); + NamedList type = (NamedList) fieldMappings.get("type", i); + String typeName = (String)type.get("name"); + + Map subMap = new HashMap(); + /* iterate over mapping definitions */ + for(int j = 0; j < type.size() - 1; j++){ + NamedList mapping = (NamedList) type.get("mapping", j + 1); + String featureName = (String) mapping.get("feature"); + String fieldNameFeature = null; + String mappedFieldName = (String) mapping.get("field"); + if(mappedFieldName == null){ + fieldNameFeature = (String) mapping.get("fieldNameFeature"); + mappedFieldName = (String) mapping.get("dynamicField"); + } + if(mappedFieldName == null) + throw new RuntimeException("either of field or dynamicField should be defined for feature " + featureName); + MapField mapField = new MapField(mappedFieldName, fieldNameFeature); + subMap.put(featureName, mapField); + } map.put(typeName, subMap); } return map; diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java index 29e7b5c2926..6d8cdc50c0d 100644 --- a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java @@ -20,6 +20,7 @@ package org.apache.solr.uima.processor; import java.util.Map; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; @@ -53,7 +54,7 @@ public class UIMAToSolrMapper { * name of UIMA type to map * @param featureFieldsmapping */ - public void map(String typeName, Map featureFieldsmapping) { + public void map(String typeName, Map featureFieldsmapping) { try { FeatureStructure fsMock = (FeatureStructure) Class.forName(typeName).getConstructor( JCas.class).newInstance(cas); @@ -62,7 +63,11 @@ public class UIMAToSolrMapper { .hasNext();) { FeatureStructure fs = iterator.next(); for (String 
featureName : featureFieldsmapping.keySet()) { - String fieldName = featureFieldsmapping.get(featureName); + MapField mapField = featureFieldsmapping.get(featureName); + String fieldNameFeature = mapField.getFieldNameFeature(); + String fieldNameFeatureValue = fieldNameFeature == null ? null : + fs.getFeatureValueAsString(type.getFeatureByBaseName(fieldNameFeature)); + String fieldName = mapField.getFieldName(fieldNameFeatureValue); log.info(new StringBuffer("mapping ").append(typeName).append("@").append(featureName) .append(" to ").append(fieldName).toString()); String featureValue = null; diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java index 8b3cb547d67..9950838569c 100644 --- a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java @@ -22,6 +22,7 @@ import java.util.Map; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.core.SolrCore; +import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField; import org.apache.solr.uima.processor.ae.AEProvider; import org.apache.solr.uima.processor.ae.AEProviderFactory; import org.apache.solr.update.AddUpdateCommand; @@ -39,7 +40,7 @@ import org.apache.uima.resource.ResourceInitializationException; */ public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor { - private SolrUIMAConfiguration solrUIMAConfiguration; + SolrUIMAConfiguration solrUIMAConfiguration; private AEProvider aeProvider; @@ -69,7 +70,7 @@ public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor { UIMAToSolrMapper uimaToSolrMapper = new UIMAToSolrMapper(solrInputDocument, jcas); /* get field mapping from config */ - Map> typesAndFeaturesFieldsMap = solrUIMAConfiguration + Map> typesAndFeaturesFieldsMap = solrUIMAConfiguration .getTypesFeaturesFieldsMapping(); /* map type features on fields */ for (String typeFQN : typesAndFeaturesFieldsMap.keySet()) { diff --git a/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java b/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java index 392afcf1ffc..c7275829171 100644 --- a/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java +++ b/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java @@ -33,6 +33,8 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.handler.XmlUpdateRequestHandler; import org.apache.solr.request.SolrQueryRequestBase; import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField; +import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.update.processor.UpdateRequestProcessorChain; import org.junit.Before; import org.junit.BeforeClass; @@ -66,6 +68,26 @@ public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 { UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained .getFactories()[0]; assertNotNull(factory); + UpdateRequestProcessor processor = factory.getInstance(req(), null, null); + assertTrue(processor instanceof UIMAUpdateRequestProcessor); + } + + @Test + public void testMultiMap() { + SolrCore core = h.getCore(); + UpdateRequestProcessorChain 
chained = core.getUpdateProcessingChain("uima-multi-map"); + assertNotNull(chained); + UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained + .getFactories()[0]; + assertNotNull(factory); + UpdateRequestProcessor processor = factory.getInstance(req(), null, null); + assertTrue(processor instanceof UIMAUpdateRequestProcessor); + SolrUIMAConfiguration conf = ((UIMAUpdateRequestProcessor)processor).solrUIMAConfiguration; + Map> map = conf.getTypesFeaturesFieldsMapping(); + Map subMap = map.get("a-type-which-can-have-multiple-features"); + assertEquals(2, subMap.size()); + assertEquals("1", subMap.get("A").getFieldName(null)); + assertEquals("2", subMap.get("B").getFieldName(null)); } @Test @@ -83,7 +105,7 @@ public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 { assertU(commit()); assertQ(req("sentence:*"), "//*[@numFound='1']"); assertQ(req("sentiment:*"), "//*[@numFound='0']"); - assertQ(req("entity:Prague"), "//*[@numFound='1']"); + assertQ(req("OTHER_sm:Prague"), "//*[@numFound='1']"); } @Test @@ -103,7 +125,7 @@ public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 { assertQ(req("sentence:*"), "//*[@numFound='2']"); assertQ(req("sentiment:positive"), "//*[@numFound='1']"); - assertQ(req("entity:Apache"), "//*[@numFound='2']"); + assertQ(req("ORGANIZATION_sm:Apache"), "//*[@numFound='2']"); } private void addDoc(String doc) throws Exception { diff --git a/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java b/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java index 6c3941ac49e..e59da1228fe 100644 --- a/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java +++ b/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java @@ -34,6 +34,12 @@ public class DummyEntityAnnotator extends JCasAnnotator_ImplBase{ EntityAnnotation entityAnnotation = new EntityAnnotation(jcas); entityAnnotation.setBegin(annotation.getBegin()); entityAnnotation.setEnd(annotation.getEnd()); + String entityString = annotation.getCoveredText(); + entityAnnotation.setEntity(entityString); + String name = "OTHER"; // "OTHER" makes no sense. In practice, "PERSON", "COUNTRY", "E-MAIL", etc. 
+ if(entityString.equals("Apache")) + name = "ORGANIZATION"; + entityAnnotation.setName(name); entityAnnotation.addToIndexes(); } } diff --git a/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation.java b/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation.java index f48e5bc0912..ed597514a71 100644 --- a/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation.java +++ b/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation.java @@ -1,6 +1,6 @@ -/* First created by JCasGen Fri Mar 04 12:48:08 CET 2011 */ +/* First created by JCasGen Sat May 07 22:33:38 JST 2011 */ package org.apache.solr.uima.ts; import org.apache.uima.jcas.JCas; @@ -11,8 +11,8 @@ import org.apache.uima.jcas.tcas.Annotation; /** - * Updated by JCasGen Fri Mar 04 12:50:14 CET 2011 - * XML source: /Users/tommasoteofili/Documents/workspaces/lucene_workspace/lucene_dev/solr/contrib/uima/src/test/resources/DummyEntityAEDescriptor.xml + * Updated by JCasGen Sat May 07 22:33:38 JST 2011 + * XML source: /Users/koji/Documents/workspace/DummyEntityAnnotator/desc/DummyEntityAEDescriptor.xml * @generated */ public class EntityAnnotation extends Annotation { /** @generated @@ -57,6 +57,42 @@ public class EntityAnnotation extends Annotation { @generated modifiable */ private void readObject() {} -} + + + //*--------------* + //* Feature: name + + /** getter for name - gets + * @generated */ + public String getName() { + if (EntityAnnotation_Type.featOkTst && ((EntityAnnotation_Type)jcasType).casFeat_name == null) + jcasType.jcas.throwFeatMissing("name", "org.apache.solr.uima.ts.EntityAnnotation"); + return jcasType.ll_cas.ll_getStringValue(addr, ((EntityAnnotation_Type)jcasType).casFeatCode_name);} + + /** setter for name - sets + * @generated */ + public void setName(String v) { + if (EntityAnnotation_Type.featOkTst && ((EntityAnnotation_Type)jcasType).casFeat_name == null) + jcasType.jcas.throwFeatMissing("name", "org.apache.solr.uima.ts.EntityAnnotation"); + jcasType.ll_cas.ll_setStringValue(addr, ((EntityAnnotation_Type)jcasType).casFeatCode_name, v);} + + + //*--------------* + //* Feature: entity + + /** getter for entity - gets + * @generated */ + public String getEntity() { + if (EntityAnnotation_Type.featOkTst && ((EntityAnnotation_Type)jcasType).casFeat_entity == null) + jcasType.jcas.throwFeatMissing("entity", "org.apache.solr.uima.ts.EntityAnnotation"); + return jcasType.ll_cas.ll_getStringValue(addr, ((EntityAnnotation_Type)jcasType).casFeatCode_entity);} + + /** setter for entity - sets + * @generated */ + public void setEntity(String v) { + if (EntityAnnotation_Type.featOkTst && ((EntityAnnotation_Type)jcasType).casFeat_entity == null) + jcasType.jcas.throwFeatMissing("entity", "org.apache.solr.uima.ts.EntityAnnotation"); + jcasType.ll_cas.ll_setStringValue(addr, ((EntityAnnotation_Type)jcasType).casFeatCode_entity, v);} + } \ No newline at end of file diff --git a/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation_Type.java b/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation_Type.java index f7bb572f7aa..5be6a1a6020 100644 --- a/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation_Type.java +++ b/solr/contrib/uima/src/test/java/org/apache/solr/uima/ts/EntityAnnotation_Type.java @@ -1,5 +1,5 @@ -/* First created by JCasGen Fri Mar 04 12:48:08 CET 2011 */ +/* First created by JCasGen Sat May 07 22:33:38 JST 2011 */ package org.apache.solr.uima.ts; import org.apache.uima.jcas.JCas; @@ 
-9,10 +9,12 @@ import org.apache.uima.cas.impl.FSGenerator; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.impl.TypeImpl; import org.apache.uima.cas.Type; +import org.apache.uima.cas.impl.FeatureImpl; +import org.apache.uima.cas.Feature; import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Fri Mar 04 12:50:14 CET 2011 + * Updated by JCasGen Sat May 07 22:33:38 JST 2011 * @generated */ public class EntityAnnotation_Type extends Annotation_Type { /** @generated */ @@ -38,6 +40,42 @@ public class EntityAnnotation_Type extends Annotation_Type { /** @generated @modifiable */ public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.solr.uima.ts.EntityAnnotation"); + + /** @generated */ + final Feature casFeat_name; + /** @generated */ + final int casFeatCode_name; + /** @generated */ + public String getName(int addr) { + if (featOkTst && casFeat_name == null) + jcas.throwFeatMissing("name", "org.apache.solr.uima.ts.EntityAnnotation"); + return ll_cas.ll_getStringValue(addr, casFeatCode_name); + } + /** @generated */ + public void setName(int addr, String v) { + if (featOkTst && casFeat_name == null) + jcas.throwFeatMissing("name", "org.apache.solr.uima.ts.EntityAnnotation"); + ll_cas.ll_setStringValue(addr, casFeatCode_name, v);} + + + + /** @generated */ + final Feature casFeat_entity; + /** @generated */ + final int casFeatCode_entity; + /** @generated */ + public String getEntity(int addr) { + if (featOkTst && casFeat_entity == null) + jcas.throwFeatMissing("entity", "org.apache.solr.uima.ts.EntityAnnotation"); + return ll_cas.ll_getStringValue(addr, casFeatCode_entity); + } + /** @generated */ + public void setEntity(int addr, String v) { + if (featOkTst && casFeat_entity == null) + jcas.throwFeatMissing("entity", "org.apache.solr.uima.ts.EntityAnnotation"); + ll_cas.ll_setStringValue(addr, casFeatCode_entity, v);} + + @@ -47,6 +85,14 @@ public class EntityAnnotation_Type extends Annotation_Type { super(jcas, casType); casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); + + casFeat_name = jcas.getRequiredFeatureDE(casType, "name", "uima.cas.String", featOkTst); + casFeatCode_name = (null == casFeat_name) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_name).getCode(); + + + casFeat_entity = jcas.getRequiredFeatureDE(casType, "entity", "uima.cas.String", featOkTst); + casFeatCode_entity = (null == casFeat_entity) ? 
JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_entity).getCode(); + } } diff --git a/solr/contrib/uima/src/test/resources/DummyEntityAEDescriptor.xml b/solr/contrib/uima/src/test/resources/DummyEntityAEDescriptor.xml index 61f1d8c8046..33f05e50e39 100644 --- a/solr/contrib/uima/src/test/resources/DummyEntityAEDescriptor.xml +++ b/solr/contrib/uima/src/test/resources/DummyEntityAEDescriptor.xml @@ -32,6 +32,18 @@ org.apache.solr.uima.ts.EntityAnnotation uima.tcas.Annotation + + + name + + uima.cas.String + + + entity + + uima.cas.String + + diff --git a/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml b/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml index 6df09b51320..85d15ef77f1 100644 --- a/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml @@ -597,6 +597,7 @@ stored="true" multiValued="true"/> --> + + samsung electronics hard drive 7200RPM, 8MB cache, IDE Ultra ATA-133 @@ -36,6 +38,8 @@ 6H500F0 Maxtor DiamondMax 11 - hard drive - 500 GB - SATA-300 Maxtor Corp. + + maxtor electronics hard drive SATA 3.0Gb/s, NCQ diff --git a/solr/example/exampledocs/ipod_other.xml b/solr/example/exampledocs/ipod_other.xml index f259e9e7b43..7756c9fc805 100644 --- a/solr/example/exampledocs/ipod_other.xml +++ b/solr/example/exampledocs/ipod_other.xml @@ -21,6 +21,8 @@ F8V7067-APL-KIT Belkin Mobile Power Cord for iPod w/ Dock Belkin + + belkin electronics connector car power adapter, white @@ -37,6 +39,8 @@ IW-02 iPod & iPod Mini USB 2.0 Cable Belkin + + belkin electronics connector car power adapter for iPod, white diff --git a/solr/example/exampledocs/ipod_video.xml b/solr/example/exampledocs/ipod_video.xml index 7895860ea19..1ca5f6f5c21 100644 --- a/solr/example/exampledocs/ipod_video.xml +++ b/solr/example/exampledocs/ipod_video.xml @@ -19,6 +19,8 @@ MA147LL/A Apple 60 GB iPod with Video Playback Black Apple Computer Inc. + + apple electronics music iTunes, Podcasts, Audiobooks diff --git a/solr/example/exampledocs/manufacturers.xml b/solr/example/exampledocs/manufacturers.xml new file mode 100644 index 00000000000..e3121d5db1f --- /dev/null +++ b/solr/example/exampledocs/manufacturers.xml @@ -0,0 +1,75 @@ + + + + + adata + A-Data Technology + 46221 Landing Parkway Fremont, CA 94538 + + + apple + Apple + 1 Infinite Way, Cupertino CA + + + asus + ASUS Computer + 800 Corporate Way Fremont, CA 94539 + + + ati + ATI Technologies + 33 Commerce Valley Drive East Thornhill, ON L3T 7N6 Canada + + + belkin + Belkin + 12045 E. Waterfront Drive Playa Vista, CA 90094 + + + canon + Canon, Inc. + One Canon Plaza Lake Success, NY 11042 + + + corsair + Corsair Microsystems + 46221 Landing Parkway Fremont, CA 94538 + + + dell + Dell, Inc. + One Dell Way Round Rock, Texas 78682 + + + maxtor + Maxtor Corporation + 920 Disc Drive Scotts Valley, CA 95066 + + + samsung + Samsung Electronics Co. Ltd. + 105 Challenger Rd. Ridgefield Park, NJ 07660-0511 + + + viewsonic + ViewSonic Corp + 381 Brea Canyon Road Walnut, CA 91789-0708 + + + diff --git a/solr/example/exampledocs/mem.xml b/solr/example/exampledocs/mem.xml index 1ca858d4a6b..0b89d6785c2 100644 --- a/solr/example/exampledocs/mem.xml +++ b/solr/example/exampledocs/mem.xml @@ -20,6 +20,8 @@ TWINX2048-3200PRO CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail Corsair Microsystems Inc. 
+ + corsair electronics memory CAS latency 2, 2-3-3-6 timing, 2.75v, unbuffered, heat-spreader @@ -38,6 +40,8 @@ VS1GB400C3 CORSAIR ValueSelect 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - Retail Corsair Microsystems Inc. + + corsair electronics memory 74.99 @@ -54,6 +58,8 @@ VDBDB1A16 A-DATA V-Series 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - OEM A-DATA Technology Inc. + + corsair electronics memory CAS latency 3, 2.7v diff --git a/solr/example/exampledocs/monitor.xml b/solr/example/exampledocs/monitor.xml index 035f61891da..db986fa0b7f 100644 --- a/solr/example/exampledocs/monitor.xml +++ b/solr/example/exampledocs/monitor.xml @@ -19,6 +19,8 @@ 3007WFP Dell Widescreen UltraSharp 3007WFP Dell, Inc. + + dell electronics monitor 30" TFT active matrix LCD, 2560 x 1600, .25mm dot pitch, 700:1 contrast diff --git a/solr/example/exampledocs/monitor2.xml b/solr/example/exampledocs/monitor2.xml index 09cc778c3da..79b99494319 100644 --- a/solr/example/exampledocs/monitor2.xml +++ b/solr/example/exampledocs/monitor2.xml @@ -19,6 +19,8 @@ VA902B ViewSonic VA902B - flat panel display - TFT - 19" ViewSonic Corp. + + viewsonic electronics monitor 19" TFT active matrix LCD, 8ms response time, 1280 x 1024 native resolution diff --git a/solr/example/exampledocs/mp500.xml b/solr/example/exampledocs/mp500.xml index 890cd4aadfb..bab401a289b 100644 --- a/solr/example/exampledocs/mp500.xml +++ b/solr/example/exampledocs/mp500.xml @@ -19,6 +19,8 @@ 0579B002 Canon PIXMA MP500 All-In-One Photo Printer Canon Inc. + + canon electronics multifunction printer printer diff --git a/solr/example/exampledocs/sd500.xml b/solr/example/exampledocs/sd500.xml index ff700025da8..145c6fd5de6 100644 --- a/solr/example/exampledocs/sd500.xml +++ b/solr/example/exampledocs/sd500.xml @@ -19,6 +19,8 @@ 9885A004 Canon PowerShot SD500 Canon Inc. + + canon electronics camera 3x zoop, 7.1 megapixel Digital ELPH diff --git a/solr/example/exampledocs/vidcard.xml b/solr/example/exampledocs/vidcard.xml index 9cd3fd1c79c..10b8121fdb1 100644 --- a/solr/example/exampledocs/vidcard.xml +++ b/solr/example/exampledocs/vidcard.xml @@ -19,7 +19,10 @@ EN7800GTX/2DHTV/256M ASUS Extreme N7800GTX/2DHTV (256 MB) + ASUS Computer Inc. + + asus electronics graphics card NVIDIA GeForce 7800 GTX GPU/VPU clocked at 486MHz @@ -39,6 +42,8 @@ 100-435805 ATI Radeon X1900 XTX 512 MB PCIE Video Card ATI Technologies + + ati electronics graphics card ATI RADEON X1900 GPU/VPU clocked at 650MHz diff --git a/solr/example/solr/conf/velocity/doc.vm b/solr/example/solr/conf/velocity/doc.vm index de3ad49aae2..91246389177 100644 --- a/solr/example/solr/conf/velocity/doc.vm +++ b/solr/example/solr/conf/velocity/doc.vm @@ -26,4 +26,17 @@ #if($params.getBool("debugQuery",false))
    toggle explain
    $response.getExplainMap().get($doc.getFirstValue('id'))
    + toggle all fields + + #foreach($fieldname in $doc.fieldNames) +
    + $fieldname : + + #foreach($value in $doc.getFieldValues($fieldname)) + $value + #end + + #end +
    +
    #end \ No newline at end of file diff --git a/solr/example/solr/conf/velocity/footer.vm b/solr/example/solr/conf/velocity/footer.vm index 79c8f820afc..b55e8a5a618 100644 --- a/solr/example/solr/conf/velocity/footer.vm +++ b/solr/example/solr/conf/velocity/footer.vm @@ -4,7 +4,7 @@ #if($request.params.get('debugQuery')) disable debug #else - enable debug + enable debug #end #if($annotate) disable annotation diff --git a/solr/lib/apache-solr-noggit-pom.xml.template b/solr/lib/apache-solr-noggit-pom.xml.template index 1596274c115..85b85a4e7d1 100644 --- a/solr/lib/apache-solr-noggit-pom.xml.template +++ b/solr/lib/apache-solr-noggit-pom.xml.template @@ -31,6 +31,6 @@ solr-noggit Solr Specific Noggit @version@ - Solr Specific Noggit r944541 + Solr Specific Noggit r1099557 jar diff --git a/solr/lib/apache-solr-noggit-r1099557.jar b/solr/lib/apache-solr-noggit-r1099557.jar new file mode 100644 index 00000000000..9fb87b9f301 --- /dev/null +++ b/solr/lib/apache-solr-noggit-r1099557.jar @@ -0,0 +1,2 @@ +AnyObjectId[5c4007c7e74af85d823243153d308f80e084eff0] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/lib/apache-solr-noggit-r944541.jar b/solr/lib/apache-solr-noggit-r944541.jar deleted file mode 100755 index e0624dd525f..00000000000 --- a/solr/lib/apache-solr-noggit-r944541.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[9b434f5760dd0d78350bdf8237273c0d5db0174e] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/solr/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java index c93db874ec7..35ad58d4f9f 100644 --- a/solr/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java @@ -21,6 +21,7 @@ import java.util.Map; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.path.PathHierarchyTokenizer; +import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; /** @@ -37,6 +38,8 @@ public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory { private char delimiter; private char replacement; + private boolean reverse = false; + private int skip = PathHierarchyTokenizer.DEFAULT_SKIP; /** * Require a configured pattern @@ -70,10 +73,23 @@ public class PathHierarchyTokenizerFactory extends BaseTokenizerFactory { else{ replacement = delimiter; } + + v = args.get( "reverse" ); + if( v != null ){ + reverse = "true".equals( v ); + } + + v = args.get( "skip" ); + if( v != null ){ + skip = Integer.parseInt( v ); + } } public Tokenizer create(Reader input) { - return new PathHierarchyTokenizer(input, delimiter, replacement); + if( reverse ) { + return new ReversePathHierarchyTokenizer(input, delimiter, replacement, skip); + } + return new PathHierarchyTokenizer(input, delimiter, replacement, skip); } } diff --git a/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java index 571bdcf7f8d..162913e7899 100644 --- a/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java @@ -61,6 +61,16 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso tokFactory = loadTokenizerFactory( loader, tf, args ); } + Iterable wlist=loadRules( synonyms, loader ); + + synMap = new SynonymMap(ignoreCase); + 
parseRules(wlist, synMap, "=>", ",", expand,tokFactory); + } + + /** + * @return a list of all rules + */ + protected Iterable loadRules( String synonyms, ResourceLoader loader ) { List wlist=null; try { File synonymFile = new File(synonyms); @@ -77,13 +87,12 @@ public class SynonymFilterFactory extends BaseTokenFilterFactory implements Reso } catch (IOException e) { throw new RuntimeException(e); } - synMap = new SynonymMap(ignoreCase); - parseRules(wlist, synMap, "=>", ",", expand,tokFactory); + return wlist; } private SynonymMap synMap; - static void parseRules(List rules, SynonymMap map, String mappingSep, + static void parseRules(Iterable rules, SynonymMap map, String mappingSep, String synSep, boolean expansion, TokenizerFactory tokFactory) { int count=0; for (String rule : rules) { diff --git a/solr/src/java/org/apache/solr/handler/JsonLoader.java b/solr/src/java/org/apache/solr/handler/JsonLoader.java index c233ce634e4..34118a07402 100644 --- a/solr/src/java/org/apache/solr/handler/JsonLoader.java +++ b/solr/src/java/org/apache/solr/handler/JsonLoader.java @@ -23,6 +23,7 @@ import java.util.Stack; import org.apache.commons.io.IOUtils; import org.apache.noggit.JSONParser; +import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; import org.apache.solr.common.util.ContentStream; @@ -43,10 +44,18 @@ import org.slf4j.LoggerFactory; class JsonLoader extends ContentStreamLoader { final static Logger log = LoggerFactory.getLogger( JsonLoader.class ); - protected UpdateRequestProcessor processor; + protected final UpdateRequestProcessor processor; + protected final SolrQueryRequest req; + protected JSONParser parser; + protected final int commitWithin; + protected final boolean overwrite; - public JsonLoader(UpdateRequestProcessor processor) { + public JsonLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { this.processor = processor; + this.req = req; + + commitWithin = req.getParams().getInt(XmlUpdateRequestHandler.COMMIT_WITHIN, -1); + overwrite = req.getParams().getBool(XmlUpdateRequestHandler.OVERWRITE, true); } @Override @@ -55,14 +64,14 @@ class JsonLoader extends ContentStreamLoader { Reader reader = null; try { reader = stream.getReader(); - if (XmlUpdateRequestHandler.log.isTraceEnabled()) { + if (log.isTraceEnabled()) { String body = IOUtils.toString(reader); - XmlUpdateRequestHandler.log.trace("body", body); + log.trace("body", body); reader = new StringReader(body); } - JSONParser parser = new JSONParser(reader); - this.processUpdate(req, processor, parser); + parser = new JSONParser(reader); + this.processUpdate(); } finally { IOUtils.closeQuietly(reader); @@ -70,39 +79,50 @@ class JsonLoader extends ContentStreamLoader { } @SuppressWarnings("fallthrough") - void processUpdate(SolrQueryRequest req, UpdateRequestProcessor processor, JSONParser parser) throws IOException + void processUpdate() throws IOException { int ev = parser.nextEvent(); while( ev != JSONParser.EOF ) { switch( ev ) { + case JSONParser.ARRAY_START: + handleAdds(); + break; + case JSONParser.STRING: if( parser.wasKey() ) { String v = parser.getString(); if( v.equals( XmlUpdateRequestHandler.ADD ) ) { - processor.processAdd( parseAdd(req, parser ) ); + int ev2 = parser.nextEvent(); + if (ev2 == JSONParser.OBJECT_START) { + processor.processAdd( parseAdd() ); + } else if (ev2 == JSONParser.ARRAY_START) { + handleAdds(); + } else { + assertEvent(ev2, JSONParser.OBJECT_START); + } } else if( v.equals( 
XmlUpdateRequestHandler.COMMIT ) ) { CommitUpdateCommand cmd = new CommitUpdateCommand(req, false ); cmd.waitFlush = cmd.waitSearcher = true; - parseCommitOptions( parser, cmd ); + parseCommitOptions( cmd ); processor.processCommit( cmd ); } else if( v.equals( XmlUpdateRequestHandler.OPTIMIZE ) ) { CommitUpdateCommand cmd = new CommitUpdateCommand(req, true ); cmd.waitFlush = cmd.waitSearcher = true; - parseCommitOptions( parser, cmd ); + parseCommitOptions( cmd ); processor.processCommit( cmd ); } else if( v.equals( XmlUpdateRequestHandler.DELETE ) ) { - processor.processDelete( parseDelete(req, parser ) ); + processor.processDelete( parseDelete() ); } else if( v.equals( XmlUpdateRequestHandler.ROLLBACK ) ) { - processor.processRollback( parseRollback(req, parser ) ); + processor.processRollback( parseRollback() ); } else { - throw new IOException( "Unknown command: "+v+" ["+parser.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown command: "+v+" ["+parser.getPosition()+"]" ); } break; } @@ -117,12 +137,11 @@ class JsonLoader extends ContentStreamLoader { case JSONParser.OBJECT_START: case JSONParser.OBJECT_END: - case JSONParser.ARRAY_START: case JSONParser.ARRAY_END: break; default: - System.out.println("UNKNOWN_EVENT_ID:"+ev); + log.info("Noggit UNKNOWN_EVENT_ID:"+ev); break; } // read the next event @@ -130,187 +149,211 @@ class JsonLoader extends ContentStreamLoader { } } - DeleteUpdateCommand parseDelete(SolrQueryRequest req, JSONParser js) throws IOException { - assertNextEvent( js, JSONParser.OBJECT_START ); + DeleteUpdateCommand parseDelete() throws IOException { + assertNextEvent( JSONParser.OBJECT_START ); DeleteUpdateCommand cmd = new DeleteUpdateCommand(req); - + while( true ) { - int ev = js.nextEvent(); + int ev = parser.nextEvent(); if( ev == JSONParser.STRING ) { - String key = js.getString(); - if( js.wasKey() ) { + String key = parser.getString(); + if( parser.wasKey() ) { if( "id".equals( key ) ) { - cmd.id = js.getString(); + cmd.id = parser.getString(); } else if( "query".equals(key) ) { - cmd.query = js.getString(); + cmd.query = parser.getString(); } else { - throw new IOException( "Unknown key: "+key+" ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown key: "+key+" ["+parser.getPosition()+"]" ); } } else { - throw new IOException( + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "invalid string: " + key - +" at ["+js.getPosition()+"]" ); + +" at ["+parser.getPosition()+"]" ); } } else if( ev == JSONParser.OBJECT_END ) { if( cmd.id == null && cmd.query == null ) { - throw new IOException( "Missing id or query for delete ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Missing id or query for delete ["+parser.getPosition()+"]" ); } return cmd; } else { - throw new IOException( + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Got: "+JSONParser.getEventString( ev ) - +" at ["+js.getPosition()+"]" ); + +" at ["+parser.getPosition()+"]" ); } } } - RollbackUpdateCommand parseRollback(SolrQueryRequest req, JSONParser js) throws IOException { - assertNextEvent( js, JSONParser.OBJECT_START ); - assertNextEvent( js, JSONParser.OBJECT_END ); + RollbackUpdateCommand parseRollback() throws IOException { + assertNextEvent( JSONParser.OBJECT_START ); + assertNextEvent( JSONParser.OBJECT_END ); return new RollbackUpdateCommand(req); } - void parseCommitOptions( JSONParser js, CommitUpdateCommand cmd ) throws IOException + 
void parseCommitOptions(CommitUpdateCommand cmd ) throws IOException { - assertNextEvent( js, JSONParser.OBJECT_START ); + assertNextEvent( JSONParser.OBJECT_START ); while( true ) { - int ev = js.nextEvent(); + int ev = parser.nextEvent(); if( ev == JSONParser.STRING ) { - String key = js.getString(); - if( js.wasKey() ) { + String key = parser.getString(); + if( parser.wasKey() ) { if( XmlUpdateRequestHandler.WAIT_SEARCHER.equals( key ) ) { - cmd.waitSearcher = js.getBoolean(); + cmd.waitSearcher = parser.getBoolean(); } else if( XmlUpdateRequestHandler.WAIT_FLUSH.equals( key ) ) { - cmd.waitFlush = js.getBoolean(); + cmd.waitFlush = parser.getBoolean(); } else { - throw new IOException( "Unknown key: "+key+" ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown key: "+key+" ["+parser.getPosition()+"]" ); } } else { - throw new IOException( + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "invalid string: " + key - +" at ["+js.getPosition()+"]" ); + +" at ["+parser.getPosition()+"]" ); } } else if( ev == JSONParser.OBJECT_END ) { return; } else { - throw new IOException( + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Got: "+JSONParser.getEventString( ev ) - +" at ["+js.getPosition()+"]" ); + +" at ["+parser.getPosition()+"]" ); } } } - AddUpdateCommand parseAdd(SolrQueryRequest req, JSONParser js ) throws IOException + AddUpdateCommand parseAdd() throws IOException { - assertNextEvent( js, JSONParser.OBJECT_START ); AddUpdateCommand cmd = new AddUpdateCommand(req); + cmd.commitWithin = commitWithin; + cmd.overwrite = overwrite; + float boost = 1.0f; while( true ) { - int ev = js.nextEvent(); + int ev = parser.nextEvent(); if( ev == JSONParser.STRING ) { - if( js.wasKey() ) { - String key = js.getString(); + if( parser.wasKey() ) { + String key = parser.getString(); if( "doc".equals( key ) ) { if( cmd.solrDoc != null ) { - throw new IOException( "multiple docs in same add command" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "multiple docs in same add command" ); } - ev = assertNextEvent( js, JSONParser.OBJECT_START ); - cmd.solrDoc = parseDoc( ev, js ); + ev = assertNextEvent( JSONParser.OBJECT_START ); + cmd.solrDoc = parseDoc( ev ); } else if( XmlUpdateRequestHandler.OVERWRITE.equals( key ) ) { - cmd.overwrite = js.getBoolean(); // reads next boolean + cmd.overwrite = parser.getBoolean(); // reads next boolean } else if( XmlUpdateRequestHandler.COMMIT_WITHIN.equals( key ) ) { - cmd.commitWithin = (int)js.getLong(); + cmd.commitWithin = (int)parser.getLong(); } else if( "boost".equals( key ) ) { - boost = Float.parseFloat( js.getNumberChars().toString() ); + boost = Float.parseFloat( parser.getNumberChars().toString() ); } else { - throw new IOException( "Unknown key: "+key+" ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown key: "+key+" ["+parser.getPosition()+"]" ); } } else { - throw new IOException( + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Should be a key " - +" at ["+js.getPosition()+"]" ); + +" at ["+parser.getPosition()+"]" ); } } else if( ev == JSONParser.OBJECT_END ) { if( cmd.solrDoc == null ) { - throw new IOException("missing solr document. "+js.getPosition() ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,"missing solr document. 
"+parser.getPosition() ); } cmd.solrDoc.setDocumentBoost( boost ); return cmd; } else { - throw new IOException( + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Got: "+JSONParser.getEventString( ev ) - +" at ["+js.getPosition()+"]" ); + +" at ["+parser.getPosition()+"]" ); } } } - - int assertNextEvent( JSONParser parser, int ev ) throws IOException + + + void handleAdds() throws IOException + { + while( true ) { + AddUpdateCommand cmd = new AddUpdateCommand(req); + cmd.commitWithin = commitWithin; + cmd.overwrite = overwrite; + + int ev = parser.nextEvent(); + if (ev == JSONParser.ARRAY_END) break; + + assertEvent(ev, JSONParser.OBJECT_START); + cmd.solrDoc = parseDoc(ev); + processor.processAdd(cmd); + } + } + + + int assertNextEvent(int expected ) throws IOException { int got = parser.nextEvent(); - if( ev != got ) { - throw new IOException( - "Expected: "+JSONParser.getEventString( ev ) - +" but got "+JSONParser.getEventString( got ) - +" at ["+parser.getPosition()+"]" ); - } + assertEvent(got, expected); return got; } + + void assertEvent(int ev, int expected) { + if( ev != expected ) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Expected: "+JSONParser.getEventString( expected ) + +" but got "+JSONParser.getEventString( ev ) + +" at ["+parser.getPosition()+"]" ); + } + } - SolrInputDocument parseDoc( int ev, JSONParser js ) throws IOException + SolrInputDocument parseDoc(int ev) throws IOException { Stack stack = new Stack(); Object obj = null; boolean inArray = false; if( ev != JSONParser.OBJECT_START ) { - throw new IOException( "object should already be started" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "object should already be started" ); } while( true ) { - //System.out.println( ev + "["+JSONParser.getEventString(ev)+"] "+js.wasKey() ); //+ js.getString() ); + //System.out.println( ev + "["+JSONParser.getEventString(ev)+"] "+parser.wasKey() ); //+ parser.getString() ); switch (ev) { case JSONParser.STRING: - if( js.wasKey() ) { + if( parser.wasKey() ) { obj = stack.peek(); - String v = js.getString(); + String v = parser.getString(); if( obj instanceof SolrInputField ) { SolrInputField field = (SolrInputField)obj; if( "boost".equals( v ) ) { - ev = js.nextEvent(); + ev = parser.nextEvent(); if( ev != JSONParser.NUMBER && ev != JSONParser.LONG && ev != JSONParser.BIGNUMBER ) { - throw new IOException( "boost should have number! "+JSONParser.getEventString(ev) ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "boost should have number! "+JSONParser.getEventString(ev) ); } - field.setBoost( Float.valueOf( js.getNumberChars().toString() ) ); + field.setBoost( Float.valueOf( parser.getNumberChars().toString() ) ); } else if( "value".equals( v ) ) { // nothing special... 
stack.push( field ); // so it can be popped } else { - throw new IOException( "invalid key: "+v + " ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "invalid key: "+v + " ["+ parser.getPosition()+"]" ); } } else if( obj instanceof SolrInputDocument ) { @@ -323,22 +366,22 @@ class JsonLoader extends ContentStreamLoader { stack.push( f ); } else { - throw new IOException( "hymmm ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "hymmm ["+ parser.getPosition()+"]" ); } } else { - addValToField(stack, js.getString(), inArray, js); + addValToField(stack, parser.getString(), inArray, parser); } break; case JSONParser.LONG: case JSONParser.NUMBER: case JSONParser.BIGNUMBER: - addValToField(stack, js.getNumberChars().toString(), inArray, js); + addValToField(stack, parser.getNumberChars().toString(), inArray, parser); break; case JSONParser.BOOLEAN: - addValToField(stack, js.getBoolean(),inArray, js); + addValToField(stack, parser.getBoolean(),inArray, parser); break; case JSONParser.OBJECT_START: @@ -351,7 +394,7 @@ class JsonLoader extends ContentStreamLoader { // should alreay be pushed... } else { - throw new IOException( "should not start new object with: "+obj + " ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "should not start new object with: "+obj + " ["+ parser.getPosition()+"]" ); } } break; @@ -365,7 +408,7 @@ class JsonLoader extends ContentStreamLoader { // should already be pushed... } else { - throw new IOException( "should not start new object with: "+obj + " ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "should not start new object with: "+obj + " ["+ parser.getPosition()+"]" ); } break; @@ -383,18 +426,18 @@ class JsonLoader extends ContentStreamLoader { break; } - ev = js.nextEvent(); + ev = parser.nextEvent(); if( ev == JSONParser.EOF ) { - throw new IOException( "should finish doc first!" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "should finish doc first!" 
); } } } - static void addValToField( Stack stack, Object val, boolean inArray, JSONParser js ) throws IOException + static void addValToField( Stack stack, Object val, boolean inArray, JSONParser parser ) throws IOException { Object obj = stack.peek(); if( !(obj instanceof SolrInputField) ) { - throw new IOException( "hymmm ["+js.getPosition()+"]" ); + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "hymmm ["+parser.getPosition()+"]" ); } SolrInputField f = inArray diff --git a/solr/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java b/solr/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java index 9f36c37d785..213089d0943 100644 --- a/solr/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java +++ b/solr/src/java/org/apache/solr/handler/JsonUpdateRequestHandler.java @@ -37,7 +37,7 @@ public class JsonUpdateRequestHandler extends ContentStreamHandlerBase { @Override protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) { - return new JsonLoader(processor); + return new JsonLoader(req, processor); } //////////////////////// SolrInfoMBeans methods ////////////////////// diff --git a/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java b/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java index 56b9d4826f2..24c5256576b 100644 --- a/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/TermVectorComponent.java @@ -208,7 +208,7 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar if (keyField != null) { Document document = reader.document(docId, fieldSelector); - Fieldable uniqId = document.getField(uniqFieldName); + Fieldable uniqId = document.getFieldable(uniqFieldName); String uniqVal = null; if (uniqId != null) { uniqVal = keyField.getType().storedToReadable(uniqId); diff --git a/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java b/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java index fbb1489dba4..b0be39fd0b1 100644 --- a/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java +++ b/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java @@ -401,13 +401,24 @@ public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInf private void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries, int docId, Document doc, String fieldName ) throws IOException { + final SolrIndexSearcher searcher = req.getSearcher(); + final IndexSchema schema = searcher.getSchema(); + + // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) - + // so we disable them until fixed (see LUCENE-3080)! + // BEGIN: Hack + final SchemaField schemaField = schema.getFieldOrNull(fieldName); + if (schemaField != null && ( + (schemaField.getType() instanceof org.apache.solr.schema.TrieField) || + (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField) + )) return; + // END: Hack + SolrParams params = req.getParams(); String[] docTexts = doc.getValues(fieldName); // according to Document javadoc, doc.getValues() never returns null. 
check empty instead of null if (docTexts.length == 0) return; - SolrIndexSearcher searcher = req.getSearcher(); - IndexSchema schema = searcher.getSchema(); TokenStream tstream = null; int numFragments = getMaxSnippets(fieldName, params); boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params); diff --git a/solr/src/java/org/apache/solr/response/BaseResponseWriter.java b/solr/src/java/org/apache/solr/response/BaseResponseWriter.java deleted file mode 100644 index 696903b198a..00000000000 --- a/solr/src/java/org/apache/solr/response/BaseResponseWriter.java +++ /dev/null @@ -1,319 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.response; - -import org.apache.solr.common.util.NamedList; -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.params.CommonParams; -import org.apache.solr.request.SolrQueryRequest; -import org.apache.solr.search.DocList; -import org.apache.solr.search.ReturnFields; -import org.apache.solr.search.SolrIndexSearcher; -import org.apache.solr.search.DocIterator; -import org.apache.solr.schema.FieldType; -import org.apache.solr.schema.IndexSchema; -import org.apache.solr.schema.SchemaField; - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Fieldable; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.List; -import java.util.ArrayList; - -/** - * THIS HAS NO TESTS and is not used anywhere.... no idea how or if it should work... - * - * I think we should drop it - along with {@link GenericBinaryResponseWriter} and {@link GenericBinaryResponseWriter} - * - * unless I'm missing something (ryan, March 2011) - * - * - * This class serves as a basis from which {@link QueryResponseWriter}s can be - * developed. The class provides a single method - * {@link #write(SingleResponseWriter, SolrQueryRequest, SolrQueryResponse)} - * that allows users to implement a {@link SingleResponseWriter} sub-class which - * defines how to output {@link SolrInputDocument}s or a - * {@link SolrDocumentList}. - * - * @version $Id$ - * @since 1.5 - * - */ -public abstract class BaseResponseWriter { - - private static final Logger LOG = LoggerFactory - .getLogger(BaseResponseWriter.class); - - - /** - * - * The main method that allows users to write {@link SingleResponseWriter}s - * and provide them as the initial parameter responseWriter to - * this method which defines how output should be generated. - * - * @param responseWriter - * The user-provided {@link SingleResponseWriter} implementation. - * @param request - * The provided {@link SolrQueryRequest}. 
- * @param response - * The provided {@link SolrQueryResponse}. - * @throws IOException - * If any error occurs. - */ - public void write(SingleResponseWriter responseWriter, - SolrQueryRequest request, SolrQueryResponse response) throws IOException { - responseWriter.start(); - NamedList nl = response.getValues(); - for (int i = 0; i < nl.size(); i++) { - String name = nl.getName(i); - Object val = nl.getVal(i); - if ("responseHeader".equals(name)) { - Boolean omitHeader = request.getParams().getBool(CommonParams.OMIT_HEADER); - if (omitHeader == null || !omitHeader) responseWriter.writeResponseHeader((NamedList) val); - } else if (val instanceof SolrDocumentList) { - SolrDocumentList list = (SolrDocumentList) val; - DocListInfo info = new DocListInfo((int)list.getNumFound(), list.size(), (int)list.getStart(), list.getMaxScore()); - if (responseWriter.isStreamingDocs()) { - responseWriter.startDocumentList(name,info); - for (SolrDocument solrDocument : list) - responseWriter.writeDoc(solrDocument); - responseWriter.endDocumentList(); - } else { - responseWriter.writeAllDocs(info, list); - } - } else if (val instanceof DocList) { - DocList docList = (DocList) val; - int sz = docList.size(); - IdxInfo idxInfo = new IdxInfo(request.getSchema(), request - .getSearcher(), response.getReturnFields()); - DocListInfo info = new DocListInfo(docList.matches(), docList.size(),docList.offset(), - docList.maxScore()); - DocIterator iterator = docList.iterator(); - if (responseWriter.isStreamingDocs()) { - responseWriter.startDocumentList(name,info); - for (int j = 0; j < sz; j++) { - SolrDocument sdoc = getDoc(iterator.nextDoc(), idxInfo); - responseWriter.writeDoc(sdoc); - } - } else { - ArrayList list = new ArrayList(docList - .size()); - for (int j = 0; j < sz; j++) { - SolrDocument sdoc = getDoc(iterator.nextDoc(), idxInfo); - list.add(sdoc); - } - responseWriter.writeAllDocs(info, list); - } - - } else { - responseWriter.writeOther(name, val); - - } - } - responseWriter.end(); - - } - - /**No ops implementation so that the implementing classes do not have to do it - */ - public void init(NamedList args){} - - private static class IdxInfo { - IndexSchema schema; - SolrIndexSearcher searcher; - ReturnFields returnFields; - - private IdxInfo(IndexSchema schema, SolrIndexSearcher searcher, - ReturnFields returnFields) { - this.schema = schema; - this.searcher = searcher; - this.returnFields = returnFields; - } - } - - private static SolrDocument getDoc(int id, IdxInfo info) throws IOException { - Document doc = info.searcher.doc(id); - SolrDocument solrDoc = new SolrDocument(); - for (Fieldable f : doc.getFields()) { - String fieldName = f.name(); - if (info.returnFields != null && !info.returnFields.wantsField(fieldName)) - continue; - SchemaField sf = info.schema.getFieldOrNull(fieldName); - FieldType ft = null; - if (sf != null) ft = sf.getType(); - Object val = null; - if (ft == null) { // handle fields not in the schema - if (f.isBinary()) - val = f.getBinaryValue(); - else - val = f.stringValue(); - } else { - try { - if (BinaryResponseWriter.KNOWN_TYPES.contains(ft.getClass())) { - val = ft.toObject(f); - } else { - val = ft.toExternal(f); - } - } catch (Exception e) { - // There is a chance of the underlying field not really matching the - // actual field type . 
So ,it can throw exception - LOG.warn("Error reading a field from document : " + solrDoc, e); - // if it happens log it and continue - continue; - } - } - if (sf != null && sf.multiValued() && !solrDoc.containsKey(fieldName)) { - ArrayList l = new ArrayList(); - l.add(val); - solrDoc.addField(fieldName, l); - } else { - solrDoc.addField(fieldName, val); - } - } - - return solrDoc; - } - - public static class DocListInfo { - public final int numFound; - public final int start ; - public Float maxScore = null; - public final int size; - - public DocListInfo(int numFound, int sz,int start, Float maxScore) { - this.numFound = numFound; - size = sz; - this.start = start; - this.maxScore = maxScore; - } - } - - /** - * - * Users wanting to define custom {@link QueryResponseWriter}s that deal with - * {@link SolrInputDocument}s and {@link SolrDocumentList} should override the - * methods for this class. All the methods are w/o body because the user is left - * to choose which all methods are required for his purpose - */ - public static abstract class SingleResponseWriter { - - /** - * This method is called at the start of the {@link QueryResponseWriter} - * output. Override this method if you want to provide a header for your - * output, e.g., XML headers, etc. - * - * @throws IOException - * if any error occurs. - */ - public void start() throws IOException { } - - /** - * This method is called at the start of processing a - * {@link SolrDocumentList}. Those that override this method are provided - * with {@link DocListInfo} object to use to inspect the output - * {@link SolrDocumentList}. - * - * @param info Information about the {@link SolrDocumentList} to output. - */ - public void startDocumentList(String name, DocListInfo info) throws IOException { } - - /** - * This method writes out a {@link SolrDocument}, on a doc-by-doc basis. - * This method is only called when {@link #isStreamingDocs()} returns true. - * - * @param solrDocument - * The doc-by-doc {@link SolrDocument} to transform into output as - * part of this {@link QueryResponseWriter}. - */ - public void writeDoc(SolrDocument solrDocument) throws IOException { } - - /** - * This method is called at the end of outputting a {@link SolrDocumentList} - * or on a doc-by-doc {@link SolrDocument} basis. - */ - public void endDocumentList() throws IOException { } - /** - * This method defines how to output the {@link SolrQueryResponse} header - * which is provided as a {@link NamedList} parameter. - * - * @param responseHeader - * The response header to output. - */ - public void writeResponseHeader(NamedList responseHeader) throws IOException { } - - /** - * This method is called at the end of the {@link QueryResponseWriter} - * lifecycle. Implement this method to add a footer to your output, e.g., in - * the case of XML, the outer tag for your tag set, etc. - * - * @throws IOException - * If any error occurs. - */ - public void end() throws IOException { } - - /** - * Define this method to control how output is written by this - * {@link QueryResponseWriter} if the output is not a - * {@link SolrInputDocument} or a {@link SolrDocumentList}. - * - * @param name - * The name of the object to output. - * @param other - * The object to output. - * @throws IOException - * If any error occurs. 
- */ - public void writeOther(String name, Object other) throws IOException { } - - /** - * Overriding this method to return false forces all - * {@link SolrInputDocument}s to be spit out as a {@link SolrDocumentList} - * so they can be processed as a whole, rather than on a doc-by-doc basis. - * If set to false, this method calls - * {@link #writeAllDocs(BaseResponseWriter.DocListInfo, List)}, else if set to true, then this - * method forces calling {@link #writeDoc(SolrDocument)} on a doc-by-doc - * basis. one - * - * @return True to force {@link #writeDoc(SolrDocument)} to be called, False - * to force {@link #writeAllDocs(BaseResponseWriter.DocListInfo, List)} to be called. - */ - public boolean isStreamingDocs() { return true; } - - /** - * Writes out all {@link SolrInputDocument}s . This is invoked only if - * {@link #isStreamingDocs()} returns false. - * - * @param info - * Information about the {@link List} of {@link SolrDocument}s to - * output. - * @param allDocs - * A {@link List} of {@link SolrDocument}s to output. - * @throws IOException - * If any error occurs. - */ - public void writeAllDocs(DocListInfo info, List allDocs) throws IOException { } - - } - -} diff --git a/solr/src/java/org/apache/solr/response/GenericBinaryResponseWriter.java b/solr/src/java/org/apache/solr/response/GenericBinaryResponseWriter.java deleted file mode 100644 index 1ce707ce4e8..00000000000 --- a/solr/src/java/org/apache/solr/response/GenericBinaryResponseWriter.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.response; - -import java.io.OutputStream; -import java.io.IOException; -import java.io.Writer; - -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.request.SolrQueryRequest; - -import org.apache.solr.response.BaseResponseWriter.SingleResponseWriter; // javadocs - -/** - * - * - * A generic {@link QueryResponseWriter} implementation that requires a user to - * implement the - * {@link #getSingleResponseWriter(OutputStream, SolrQueryRequest, SolrQueryResponse)} - * that defines a {@link SingleResponseWriter} to handle the binary output. - * - * @since 1.5 - * @version $Id$ - * - */ -public abstract class GenericBinaryResponseWriter extends BaseResponseWriter - implements BinaryQueryResponseWriter { - - /** - * - * Writes the binary output data using the {@link SingleResponseWriter} - * provided by a call to - * {@link #getSingleResponseWriter(OutputStream, SolrQueryRequest, SolrQueryResponse)} - * . - * - * @param out - * The {@link OutputStream} to write the binary data to. - * @param request - * The provided {@link SolrQueryRequest}. - * @param response - * The provided {@link SolrQueryResponse}. 
- */ - public void write(OutputStream out, SolrQueryRequest request, - SolrQueryResponse response) throws IOException { - super.write(getSingleResponseWriter(out, request, response), request, - response); - } - - /** - * Users of this class should implement this method to define a - * {@link SingleResponseWriter} responsible for writing the binary output - * given a {@link SolrDocumentList} or doc-by-doc, given a - * {@link SolrInputDocument}. - * - * @param out - * The {@link OutputStream} to write the binary data response to. - * @param request - * The provided {@link SolrQueryRequest}. - * @param response - * The provided {@link SolrQueryResponse}. - * @return A {@link SingleResponseWriter} that will be used to generate the - * response output from this {@link QueryResponseWriter}. - */ - public abstract SingleResponseWriter getSingleResponseWriter( - OutputStream out, SolrQueryRequest request, SolrQueryResponse response); - - /**Just to throw Exception So that the eimplementing classes do not have to do the same - */ - public void write(Writer writer, SolrQueryRequest request, SolrQueryResponse response) throws IOException { - throw new RuntimeException("This is a binary writer , Cannot write to a characterstream"); - } -} diff --git a/solr/src/java/org/apache/solr/response/GenericTextResponseWriter.java b/solr/src/java/org/apache/solr/response/GenericTextResponseWriter.java deleted file mode 100644 index 0b911b9f0af..00000000000 --- a/solr/src/java/org/apache/solr/response/GenericTextResponseWriter.java +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.response; - -import java.io.Writer; -import java.io.IOException; - -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.request.SolrQueryRequest; - -import org.apache.solr.response.BaseResponseWriter.SingleResponseWriter; // javadocs - -/** - * - * - * A generic {@link QueryResponseWriter} implementation that requires a user to - * implement the - * {@link #getSingleResponseWriter(Writer, SolrQueryRequest, SolrQueryResponse)} - * that defines a {@link SingleResponseWriter} to handle plain ol' text output. - * - * @since 1.5 - * @version $Id$ - * - */ -public abstract class GenericTextResponseWriter extends BaseResponseWriter - implements QueryResponseWriter { - - /** - * - * Writes text output using the {@link SingleResponseWriter} provided by a - * call to - * {@link #getSingleResponseWriter(Writer, SolrQueryRequest, SolrQueryResponse)} - * . - * - * @param writer - * The {@link Writer} to write the text output to. - * @param request - * The provided {@link SolrQueryRequest}. - * @param response - * The provided {@link SolrQueryResponse}. 
- */ - public void write(Writer writer, SolrQueryRequest request, - SolrQueryResponse response) throws IOException { - super.write(getSingleResponseWriter(writer, request, response), request, - response); - } - - /** - * Users of this class should implement this method to define a - * {@link SingleResponseWriter} responsible for writing text output given a - * {@link SolrDocumentList} or doc-by-doc, given a {@link SolrInputDocument}. - * - * @param writer - * The {@link Writer} to write the text data response to. - * @param request - * The provided {@link SolrQueryRequest}. - * @param response - * The provided {@link SolrQueryResponse}. - * @return A {@link SingleResponseWriter} that will be used to generate the - * response output from this {@link QueryResponseWriter}. - */ - protected abstract SingleResponseWriter getSingleResponseWriter( - Writer writer, SolrQueryRequest request, SolrQueryResponse response); -} diff --git a/solr/src/java/org/apache/solr/schema/IndexSchema.java b/solr/src/java/org/apache/solr/schema/IndexSchema.java index b6b7b6768f5..818f8d85a56 100644 --- a/solr/src/java/org/apache/solr/schema/IndexSchema.java +++ b/solr/src/java/org/apache/solr/schema/IndexSchema.java @@ -797,19 +797,23 @@ public final class IndexSchema { NamedNodeMap attrs = node.getAttributes(); String analyzerName = DOMUtil.getAttr(attrs,"class"); if (analyzerName != null) { - // No need to be core-aware as Analyzers are not in the core-aware list - final Class clazz = loader.findClass(analyzerName).asSubclass(Analyzer.class); try { + // No need to be core-aware as Analyzers are not in the core-aware list + final Class clazz = loader.findClass + (analyzerName).asSubclass(Analyzer.class); + try { - // first try to use a ctor with version parameter (needed for many new Analyzers that have no default one anymore) + // first try to use a ctor with version parameter + // (needed for many new Analyzers that have no default one anymore) Constructor cnstr = clazz.getConstructor(Version.class); final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM); final Version luceneMatchVersion = (matchVersionStr == null) ? 
solrConfig.luceneMatchVersion : Config.parseLuceneVersionString(matchVersionStr); if (luceneMatchVersion == null) { - throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, - "Configuration Error: Analyzer '" + clazz.getName() + - "' needs a 'luceneMatchVersion' parameter"); + throw new SolrException + ( SolrException.ErrorCode.SERVER_ERROR, + "Configuration Error: Analyzer '" + clazz.getName() + + "' needs a 'luceneMatchVersion' parameter"); } return cnstr.newInstance(luceneMatchVersion); } catch (NoSuchMethodException nsme) { @@ -817,8 +821,9 @@ public final class IndexSchema { return clazz.newInstance(); } } catch (Exception e) { + log.error("Cannot load analyzer: "+analyzerName, e); throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, - "Cannot load analyzer: "+analyzerName ); + "Cannot load analyzer: "+analyzerName, e ); } } diff --git a/solr/src/java/org/apache/solr/schema/SchemaField.java b/solr/src/java/org/apache/solr/schema/SchemaField.java index 41ad8e051c1..bb2d3e7a523 100644 --- a/solr/src/java/org/apache/solr/schema/SchemaField.java +++ b/solr/src/java/org/apache/solr/schema/SchemaField.java @@ -19,7 +19,6 @@ package org.apache.solr.schema; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; import org.apache.lucene.search.SortField; import org.apache.solr.search.QParser; diff --git a/solr/src/java/org/apache/solr/schema/TrieDateField.java b/solr/src/java/org/apache/solr/schema/TrieDateField.java index 7e3b30d240f..8d58fa55213 100755 --- a/solr/src/java/org/apache/solr/schema/TrieDateField.java +++ b/solr/src/java/org/apache/solr/schema/TrieDateField.java @@ -18,210 +18,125 @@ package org.apache.solr.schema; import org.apache.noggit.CharArr; -import org.apache.solr.common.SolrException; -import org.apache.solr.analysis.CharFilterFactory; -import org.apache.solr.analysis.TokenFilterFactory; -import org.apache.solr.analysis.TokenizerChain; -import org.apache.solr.analysis.TrieTokenizerFactory; -import org.apache.solr.search.function.*; +import org.apache.solr.search.function.ValueSource; import org.apache.solr.search.QParser; import org.apache.solr.response.TextResponseWriter; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.document.Field; import org.apache.lucene.search.SortField; -import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.Query; import org.apache.lucene.search.NumericRangeQuery; -import org.apache.lucene.search.cache.CachedArrayCreator; -import org.apache.lucene.search.cache.LongValuesCreator; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.NumericUtils; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.NumericTokenStream; import java.util.Map; import java.util.Date; import java.io.IOException; public class TrieDateField extends DateField { - protected int precisionStepArg = TrieField.DEFAULT_PRECISION_STEP; // the one passed in or defaulted - protected int precisionStep = precisionStepArg; // normalized + + final TrieField wrappedField = new TrieField() {{ + type = TrieTypes.DATE; + }}; @Override protected void init(IndexSchema schema, Map args) { - String p = args.remove("precisionStep"); - if (p != null) { - precisionStepArg = Integer.parseInt(p); - } - // normalize the precisionStep - precisionStep = precisionStepArg; - if (precisionStep<=0 || precisionStep>=64) precisionStep=Integer.MAX_VALUE; - - 
CharFilterFactory[] filterFactories = new CharFilterFactory[0]; - TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0]; - analyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(TrieField.TrieTypes.DATE, precisionStep), tokenFilterFactories); - // for query time we only need one token, so we use the biggest possible precisionStep: - queryAnalyzer = new TokenizerChain(filterFactories, new TrieTokenizerFactory(TrieField.TrieTypes.DATE, Integer.MAX_VALUE), tokenFilterFactories); + wrappedField.init(schema, args); + analyzer = wrappedField.analyzer; + queryAnalyzer = wrappedField.queryAnalyzer; } @Override public Date toObject(Fieldable f) { - byte[] arr = f.getBinaryValue(); - if (arr==null) throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,TrieField.badFieldString(f)); - return new Date(TrieFieldHelper.toLong(arr)); + return (Date) wrappedField.toObject(f); } @Override public Object toObject(SchemaField sf, BytesRef term) { - return new Date(NumericUtils.prefixCodedToLong(term)); + return wrappedField.toObject(sf, term); } @Override public SortField getSortField(SchemaField field, boolean top) { - field.checkSortability(); - - int flags = CachedArrayCreator.CACHE_VALUES_AND_BITS; - boolean sortMissingLast = field.sortMissingLast(); - boolean sortMissingFirst = field.sortMissingFirst(); - - Object missingValue = null; - if( sortMissingLast ) { - missingValue = top ? Long.MIN_VALUE : Long.MAX_VALUE; - } else if( sortMissingFirst ) { - missingValue = top ? Long.MAX_VALUE : Long.MIN_VALUE; - } - return new SortField(new LongValuesCreator(field.getName(), FieldCache.NUMERIC_UTILS_LONG_PARSER, flags), top).setMissingValue(missingValue); + return wrappedField.getSortField(field, top); } @Override public ValueSource getValueSource(SchemaField field, QParser parser) { - field.checkFieldCacheSource(parser); - return new TrieDateFieldSource( new LongValuesCreator( field.getName(), FieldCache.NUMERIC_UTILS_LONG_PARSER, CachedArrayCreator.CACHE_VALUES_AND_BITS )); - } - - @Override - public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException { - byte[] arr = f.getBinaryValue(); - if (arr==null) { - writer.writeStr(name, TrieField.badFieldString(f),true); - return; - } - - writer.writeDate(name,new Date(TrieFieldHelper.toLong(arr))); - } - - @Override - public boolean isTokenized() { - return true; + return wrappedField.getValueSource(field, parser); } /** * @return the precisionStep used to index values into the field */ public int getPrecisionStep() { - return precisionStepArg; + return wrappedField.getPrecisionStep(); } + @Override + public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException { + wrappedField.write(writer, name, f); + } + + @Override + public boolean isTokenized() { + return wrappedField.isTokenized(); + } + + @Override + public boolean multiValuedFieldCache() { + return wrappedField.multiValuedFieldCache(); + } @Override public String storedToReadable(Fieldable f) { - return toExternal(f); + return wrappedField.storedToReadable(f); } @Override public String readableToIndexed(String val) { - // TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts! 
- BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); - NumericUtils.longToPrefixCoded(super.parseMath(null, val).getTime(), 0, bytes); - return bytes.utf8ToString(); + return wrappedField.readableToIndexed(val); } @Override public String toInternal(String val) { - return readableToIndexed(val); + return wrappedField.toInternal(val); } @Override public String toExternal(Fieldable f) { - byte[] arr = f.getBinaryValue(); - if (arr==null) return TrieField.badFieldString(f); - return super.toExternal(new Date(TrieFieldHelper.toLong(arr))); + return wrappedField.toExternal(f); } @Override public String indexedToReadable(String _indexedForm) { - final BytesRef indexedForm = new BytesRef(_indexedForm); - return super.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ); + return wrappedField.indexedToReadable(_indexedForm); } @Override public void indexedToReadable(BytesRef input, CharArr out) { - String ext = super.toExternal( new Date(NumericUtils.prefixCodedToLong(input)) ); - out.write(ext); + wrappedField.indexedToReadable(input, out); } @Override public String storedToIndexed(Fieldable f) { - // TODO: optimize to remove redundant string conversion - return readableToIndexed(storedToReadable(f)); + return wrappedField.storedToIndexed(f); } @Override public Fieldable createField(SchemaField field, Object value, float boost) { - boolean indexed = field.indexed(); - boolean stored = field.stored(); - - if (!indexed && !stored) { - if (log.isTraceEnabled()) - log.trace("Ignoring unindexed/unstored field: " + field); - return null; - } - - int ps = precisionStep; - - byte[] arr=null; - TokenStream ts=null; - - long time = (value instanceof Date) - ? ((Date)value).getTime() - : super.parseMath(null, value.toString()).getTime(); - - if (stored) arr = TrieFieldHelper.toArr(time); - if (indexed) ts = new NumericTokenStream(ps).setLongValue(time); - - Field f; - if (stored) { - f = new Field(field.getName(), arr); - if (indexed) f.setTokenStream(ts); - } else { - f = new Field(field.getName(), ts); - } - - // term vectors aren't supported - - f.setOmitNorms(field.omitNorms()); - f.setOmitTermFreqAndPositions(field.omitTf()); - f.setBoost(boost); - return f; + return wrappedField.createField(field, value, boost); } @Override public Query getRangeQuery(QParser parser, SchemaField field, String min, String max, boolean minInclusive, boolean maxInclusive) { - return getRangeQuery(parser, field, - min==null ? null : super.parseMath(null,min), - max==null ? null : super.parseMath(null,max), - minInclusive, maxInclusive); + return wrappedField.getRangeQuery(parser, field, min, max, minInclusive, maxInclusive); } @Override public Query getRangeQuery(QParser parser, SchemaField sf, Date min, Date max, boolean minInclusive, boolean maxInclusive) { - int ps = precisionStep; - Query query = NumericRangeQuery.newLongRange(sf.getName(), ps, + return NumericRangeQuery.newLongRange(sf.getName(), wrappedField.precisionStep, min == null ? null : min.getTime(), max == null ? 
null : max.getTime(), minInclusive, maxInclusive); - - return query; } } diff --git a/solr/src/java/org/apache/solr/schema/TrieField.java b/solr/src/java/org/apache/solr/schema/TrieField.java index e670ba0e338..eb78e1bbfd8 100644 --- a/solr/src/java/org/apache/solr/schema/TrieField.java +++ b/solr/src/java/org/apache/solr/schema/TrieField.java @@ -17,6 +17,8 @@ package org.apache.solr.schema; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.search.*; import org.apache.lucene.search.cache.CachedArrayCreator; import org.apache.lucene.search.cache.DoubleValuesCreator; @@ -40,17 +42,17 @@ import java.util.Map; import java.util.Date; /** - * Provides field types to support for Lucene's Trie Range Queries. + * Provides field types to support for Lucene's {@link NumericField}. * See {@link org.apache.lucene.search.NumericRangeQuery} for more details. * It supports integer, float, long, double and date types. *
<p/>
    * For each number being added to this field, multiple terms are generated as per the algorithm described in the above - * link. The possible number of terms increases dramatically with higher precision steps (factor 2^precisionStep). For + * link. The possible number of terms increases dramatically with lower precision steps. For * the fast range search to work, trie fields must be indexed. *
<p/>
    * Trie fields are sortable in numerical order and can be used in function queries. *
<p/>
    - * Note that if you use a precisionStep of 32 for int/float and 64 for long/double, then multiple terms will not be + * Note that if you use a precisionStep of 32 for int/float and 64 for long/double/date, then multiple terms will not be * generated, range search will be no faster than any other number field, but sorting will still be possible. * * @version $Id$ @@ -101,21 +103,28 @@ public class TrieField extends FieldType { @Override public Object toObject(Fieldable f) { - byte[] arr = f.getBinaryValue(); - if (arr==null) return badFieldString(f); - switch (type) { - case INTEGER: - return TrieFieldHelper.toInt(arr); - case FLOAT: - return TrieFieldHelper.toFloat(arr); - case LONG: - return TrieFieldHelper.toLong(arr); - case DOUBLE: - return TrieFieldHelper.toDouble(arr); - case DATE: - return new Date(TrieFieldHelper.toLong(arr)); - default: - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name()); + if (f instanceof NumericField) { + final Number val = ((NumericField) f).getNumericValue(); + if (val==null) return badFieldString(f); + return (type == TrieTypes.DATE) ? new Date(val.longValue()) : val; + } else { + // the following code is "deprecated" and only to support pre-3.2 indexes using the old BinaryField encoding: + final byte[] arr = f.getBinaryValue(); + if (arr==null) return badFieldString(f); + switch (type) { + case INTEGER: + return toInt(arr); + case FLOAT: + return Float.intBitsToFloat(toInt(arr)); + case LONG: + return toLong(arr); + case DOUBLE: + return Double.longBitsToDouble(toLong(arr)); + case DATE: + return new Date(toLong(arr)); + default: + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name()); + } } } @@ -198,30 +207,7 @@ public class TrieField extends FieldType { @Override public void write(TextResponseWriter writer, String name, Fieldable f) throws IOException { - byte[] arr = f.getBinaryValue(); - if (arr==null) { - writer.writeStr(name, badFieldString(f),true); - return; - } - switch (type) { - case INTEGER: - writer.writeInt(name,TrieFieldHelper.toInt(arr)); - break; - case FLOAT: - writer.writeFloat(name,TrieFieldHelper.toFloat(arr)); - break; - case LONG: - writer.writeLong(name,TrieFieldHelper.toLong(arr)); - break; - case DOUBLE: - writer.writeDouble(name,TrieFieldHelper.toDouble(arr)); - break; - case DATE: - writer.writeDate(name,new Date(TrieFieldHelper.toLong(arr))); - break; - default: - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name()); - } + writer.writeVal(name, toObject(f)); } @Override @@ -290,6 +276,17 @@ public class TrieField extends FieldType { return query; } + @Deprecated + static int toInt(byte[] arr) { + return (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); + } + + @Deprecated + static long toLong(byte[] arr) { + int high = (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); + int low = (arr[4]<<24) | ((arr[5]&0xff)<<16) | ((arr[6]&0xff)<<8) | (arr[7]&0xff); + return (((long)high)<<32) | (low&0x0ffffffffL); + } @Override public String storedToReadable(Fieldable f) { @@ -341,22 +338,9 @@ public class TrieField extends FieldType { @Override public String toExternal(Fieldable f) { - byte[] arr = f.getBinaryValue(); - if (arr==null) return badFieldString(f); - switch (type) { - case INTEGER: - return Integer.toString(TrieFieldHelper.toInt(arr)); - case FLOAT: - return Float.toString(TrieFieldHelper.toFloat(arr)); - case LONG: - 
return Long.toString(TrieFieldHelper.toLong(arr)); - case DOUBLE: - return Double.toString(TrieFieldHelper.toDouble(arr)); - case DATE: - return dateField.formatDate(new Date(TrieFieldHelper.toLong(arr))); - default: - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name()); - } + return (type == TrieTypes.DATE) + ? dateField.toExternal((Date) toObject(f)) + : toObject(f).toString(); } @Override @@ -372,7 +356,7 @@ public class TrieField extends FieldType { case DOUBLE: return Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) ); case DATE: - return dateField.formatDate( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ); + return dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ); default: throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type); } @@ -397,7 +381,7 @@ public class TrieField extends FieldType { s = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) ); break; case DATE: - s = dateField.formatDate( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ); + s = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ); break; default: throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type); @@ -426,59 +410,117 @@ public class TrieField extends FieldType { @Override public String storedToIndexed(Fieldable f) { - // TODO: optimize to remove redundant string conversion - return readableToIndexed(storedToReadable(f)); + final BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG); + if (f instanceof NumericField) { + final Number val = ((NumericField) f).getNumericValue(); + if (val==null) + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field contents: "+f.name()); + switch (type) { + case INTEGER: + NumericUtils.intToPrefixCoded(val.intValue(), 0, bytes); + break; + case FLOAT: + NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(val.floatValue()), 0, bytes); + break; + case LONG: //fallthrough! + case DATE: + NumericUtils.longToPrefixCoded(val.longValue(), 0, bytes); + break; + case DOUBLE: + NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(val.doubleValue()), 0, bytes); + break; + default: + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name()); + } + } else { + // the following code is "deprecated" and only to support pre-3.2 indexes using the old BinaryField encoding: + final byte[] arr = f.getBinaryValue(); + if (arr==null) + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field contents: "+f.name()); + switch (type) { + case INTEGER: + NumericUtils.intToPrefixCoded(toInt(arr), 0, bytes); + break; + case FLOAT: { + // WARNING: Code Duplication! Keep in sync with o.a.l.util.NumericUtils! + // copied from NumericUtils to not convert to/from float two times + // code in next 2 lines is identical to: int v = NumericUtils.floatToSortableInt(Float.intBitsToFloat(toInt(arr))); + int v = toInt(arr); + if (v<0) v ^= 0x7fffffff; + NumericUtils.intToPrefixCoded(v, 0, bytes); + break; + } + case LONG: //fallthrough! + case DATE: + NumericUtils.longToPrefixCoded(toLong(arr), 0, bytes); + break; + case DOUBLE: { + // WARNING: Code Duplication! Keep in sync with o.a.l.util.NumericUtils! 
+ // copied from NumericUtils to not convert to/from double two times + // code in next 2 lines is identical to: long v = NumericUtils.doubleToSortableLong(Double.longBitsToDouble(toLong(arr))); + long v = toLong(arr); + if (v<0) v ^= 0x7fffffffffffffffL; + NumericUtils.longToPrefixCoded(v, 0, bytes); + break; + } + default: + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name()); + } + } + return bytes.utf8ToString(); } @Override public Fieldable createField(SchemaField field, Object value, float boost) { - TrieFieldHelper.FieldInfo info = new TrieFieldHelper.FieldInfo(); - info.index = field.indexed(); - info.store = field.stored(); - info.precisionStep = precisionStep; - info.omitNorms = field.omitNorms(); - info.omitTF = field.omitTf(); - - if (!info.index && !info.store) { + boolean indexed = field.indexed(); + boolean stored = field.stored(); + + if (!indexed && !stored) { if (log.isTraceEnabled()) log.trace("Ignoring unindexed/unstored field: " + field); return null; } + final NumericField f = new NumericField(field.getName(), precisionStep, stored ? Field.Store.YES : Field.Store.NO, indexed); switch (type) { case INTEGER: int i = (value instanceof Number) ? ((Number)value).intValue() : Integer.parseInt(value.toString()); - return TrieFieldHelper.createIntField(field.getName(), i, info, boost); - + f.setIntValue(i); + break; case FLOAT: - float f = (value instanceof Number) + float fl = (value instanceof Number) ? ((Number)value).floatValue() : Float.parseFloat(value.toString()); - return TrieFieldHelper.createFloatField(field.getName(), f, info, boost); - + f.setFloatValue(fl); + break; case LONG: long l = (value instanceof Number) ? ((Number)value).longValue() : Long.parseLong(value.toString()); - return TrieFieldHelper.createLongField(field.getName(), l, info, boost); - + f.setLongValue(l); + break; case DOUBLE: double d = (value instanceof Number) ? ((Number)value).doubleValue() : Double.parseDouble(value.toString()); - return TrieFieldHelper.createDoubleField(field.getName(), d, info, boost); - + f.setDoubleValue(d); + break; case DATE: Date date = (value instanceof Date) ? ((Date)value) : dateField.parseMath(null, value.toString()); - return TrieFieldHelper.createDateField(field.getName(), date, info, boost); - + f.setLongValue(date.getTime()); + break; default: throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type); } + + f.setOmitNorms(field.omitNorms()); + f.setOmitTermFreqAndPositions(field.omitTf()); + f.setBoost(boost); + return f; } public enum TrieTypes { @@ -498,14 +540,12 @@ public class TrieField extends FieldType { * that indexes multiple precisions per value. 
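   * (For non-trie fields, or when only a single precision per value is indexed, i.e. precisionStep is Integer.MAX_VALUE, this returns null.)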
*/ public static String getMainValuePrefix(FieldType ft) { - if (ft instanceof TrieDateField) { - int step = ((TrieDateField)ft).getPrecisionStep(); - if (step <= 0 || step >=64) return null; - return LONG_PREFIX; - } else if (ft instanceof TrieField) { - TrieField trie = (TrieField)ft; - if (trie.precisionStep == Integer.MAX_VALUE) return null; - + if (ft instanceof TrieDateField) + ft = ((TrieDateField) ft).wrappedField; + if (ft instanceof TrieField) { + final TrieField trie = (TrieField)ft; + if (trie.precisionStep == Integer.MAX_VALUE) + return null; switch (trie.type) { case INTEGER: case FLOAT: diff --git a/solr/src/java/org/apache/solr/schema/TrieFieldHelper.java b/solr/src/java/org/apache/solr/schema/TrieFieldHelper.java deleted file mode 100644 index c40ecd87a78..00000000000 --- a/solr/src/java/org/apache/solr/schema/TrieFieldHelper.java +++ /dev/null @@ -1,166 +0,0 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.schema; - -import java.util.Date; - -import org.apache.lucene.analysis.NumericTokenStream; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.Fieldable; - -/** - * Helper class to make TrieFields compatible with ones written in solr - * - * TODO -- Something like this should be in in lucene - * see: LUCENE-3001 - */ -public class TrieFieldHelper { - - private TrieFieldHelper() {} - - public static class FieldInfo { - public int precisionStep = 8; // same as solr default - public boolean store = true; - public boolean index = true; - public boolean omitNorms = true; - public boolean omitTF = true; - } - - //---------------------------------------------- - // Create Field - //---------------------------------------------- - - private static Fieldable createField(String name, byte[] arr, TokenStream ts, FieldInfo info, float boost) { - - Field f; - if (info.store) { - f = new Field(name, arr); - if (info.index) f.setTokenStream(ts); - } else { - f = new Field(name, ts); - } - - // term vectors aren't supported - f.setOmitNorms(info.omitNorms); - f.setOmitTermFreqAndPositions(info.omitTF); - f.setBoost(boost); - return f; - } - - public static Fieldable createIntField(String name, int value, FieldInfo info, float boost) { - - byte[] arr=null; - TokenStream ts=null; - - if (info.store) arr = TrieFieldHelper.toArr(value); - if (info.index) ts = new NumericTokenStream(info.precisionStep).setIntValue(value); - - return createField(name, arr, ts, info, boost); - } - - public static Fieldable createFloatField(String name, float value, FieldInfo info, float boost) { - - byte[] arr=null; - TokenStream ts=null; - - if (info.store) arr = TrieFieldHelper.toArr(value); - if (info.index) ts = new NumericTokenStream(info.precisionStep).setFloatValue(value); - - return createField(name, arr, ts, info, boost); - } - - public static Fieldable createLongField(String name, long value, FieldInfo info, float boost) { - - 
byte[] arr=null; - TokenStream ts=null; - - if (info.store) arr = TrieFieldHelper.toArr(value); - if (info.index) ts = new NumericTokenStream(info.precisionStep).setLongValue(value); - - return createField(name, arr, ts, info, boost); - } - - public static Fieldable createDoubleField(String name, double value, FieldInfo info, float boost) { - - byte[] arr=null; - TokenStream ts=null; - - if (info.store) arr = TrieFieldHelper.toArr(value); - if (info.index) ts = new NumericTokenStream(info.precisionStep).setDoubleValue(value); - - return createField(name, arr, ts, info, boost); - } - - public static Fieldable createDateField(String name, Date value, FieldInfo info, float boost) { - // TODO, make sure the date is within long range! - return createLongField(name, value.getTime(), info, boost); - } - - - //---------------------------------------------- - // number <=> byte[] - //---------------------------------------------- - - public static int toInt(byte[] arr) { - return (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); - } - - public static long toLong(byte[] arr) { - int high = (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); - int low = (arr[4]<<24) | ((arr[5]&0xff)<<16) | ((arr[6]&0xff)<<8) | (arr[7]&0xff); - return (((long)high)<<32) | (low&0x0ffffffffL); - } - - public static float toFloat(byte[] arr) { - return Float.intBitsToFloat(toInt(arr)); - } - - public static double toDouble(byte[] arr) { - return Double.longBitsToDouble(toLong(arr)); - } - - public static byte[] toArr(int val) { - byte[] arr = new byte[4]; - arr[0] = (byte)(val>>>24); - arr[1] = (byte)(val>>>16); - arr[2] = (byte)(val>>>8); - arr[3] = (byte)(val); - return arr; - } - - public static byte[] toArr(long val) { - byte[] arr = new byte[8]; - arr[0] = (byte)(val>>>56); - arr[1] = (byte)(val>>>48); - arr[2] = (byte)(val>>>40); - arr[3] = (byte)(val>>>32); - arr[4] = (byte)(val>>>24); - arr[5] = (byte)(val>>>16); - arr[6] = (byte)(val>>>8); - arr[7] = (byte)(val); - return arr; - } - - public static byte[] toArr(float val) { - return toArr(Float.floatToRawIntBits(val)); - } - - public static byte[] toArr(double val) { - return toArr(Double.doubleToRawLongBits(val)); - } -} diff --git a/solr/src/java/org/apache/solr/search/SolrQueryParser.java b/solr/src/java/org/apache/solr/search/SolrQueryParser.java index 34192b8fd8b..80db3314c7d 100644 --- a/solr/src/java/org/apache/solr/search/SolrQueryParser.java +++ b/solr/src/java/org/apache/solr/search/SolrQueryParser.java @@ -67,7 +67,7 @@ public class SolrQueryParser extends QueryParser { } public SolrQueryParser(QParser parser, String defaultField, Analyzer analyzer) { - super(parser.getReq().getCore().getSolrConfig().getLuceneVersion("luceneMatchVersion", Version.LUCENE_30), defaultField, analyzer); + super(parser.getReq().getCore().getSolrConfig().luceneMatchVersion, defaultField, analyzer); this.schema = parser.getReq().getSchema(); this.parser = parser; this.defaultField = defaultField; diff --git a/solr/src/java/org/apache/solr/update/AddUpdateCommand.java b/solr/src/java/org/apache/solr/update/AddUpdateCommand.java index 84632ee0b39..6a02010bf43 100644 --- a/solr/src/java/org/apache/solr/update/AddUpdateCommand.java +++ b/solr/src/java/org/apache/solr/update/AddUpdateCommand.java @@ -18,7 +18,7 @@ package org.apache.solr.update; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; +import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.Term; import 
org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; @@ -74,7 +74,7 @@ public class AddUpdateCommand extends UpdateCommand { if (sf != null) { if (doc != null) { schema.getUniqueKeyField(); - Field storedId = doc.getField(sf.getName()); + Fieldable storedId = doc.getFieldable(sf.getName()); indexedId = sf.getType().storedToIndexed(storedId); } if (solrDoc != null) { diff --git a/solr/src/java/org/apache/solr/update/DocumentBuilder.java b/solr/src/java/org/apache/solr/update/DocumentBuilder.java index e78e18d3340..a7b80e0ded8 100644 --- a/solr/src/java/org/apache/solr/update/DocumentBuilder.java +++ b/solr/src/java/org/apache/solr/update/DocumentBuilder.java @@ -159,7 +159,7 @@ public class DocumentBuilder { // default value are defacto 'required' fields. List missingFields = null; for (SchemaField field : schema.getRequiredFields()) { - if (doc.getField(field.getName() ) == null) { + if (doc.getFieldable(field.getName() ) == null) { if (field.getDefaultValue() != null) { addField(doc, field, field.getDefaultValue(), 1.0f); } else { @@ -313,7 +313,7 @@ public class DocumentBuilder { // Now validate required fields or add default values // fields with default values are defacto 'required' for (SchemaField field : schema.getRequiredFields()) { - if (out.getField(field.getName() ) == null) { + if (out.getFieldable(field.getName() ) == null) { if (field.getDefaultValue() != null) { addField(out, field, field.getDefaultValue(), 1.0f); } @@ -339,8 +339,7 @@ public class DocumentBuilder { */ public SolrDocument loadStoredFields( SolrDocument doc, Document luceneDoc ) { - for( Object f : luceneDoc.getFields() ) { - Fieldable field = (Fieldable)f; + for( Fieldable field : luceneDoc.getFields() ) { if( field.isStored() ) { SchemaField sf = schema.getField( field.name() ); if( !schema.isCopyFieldTarget( sf ) ) { diff --git a/solr/src/java/org/apache/solr/update/UpdateHandler.java b/solr/src/java/org/apache/solr/update/UpdateHandler.java index e7332349dfd..cd13a4935ab 100644 --- a/solr/src/java/org/apache/solr/update/UpdateHandler.java +++ b/solr/src/java/org/apache/solr/update/UpdateHandler.java @@ -21,7 +21,6 @@ package org.apache.solr.update; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; import org.apache.lucene.search.Collector; import org.apache.lucene.search.Scorer; @@ -125,7 +124,7 @@ public abstract class UpdateHandler implements SolrInfoMBean { protected final String getIndexedIdOptional(Document doc) { if (idField == null) return null; - Field f = doc.getField(idField.getName()); + Fieldable f = doc.getFieldable(idField.getName()); if (f == null) return null; return idFieldType.storedToIndexed(f); } diff --git a/solr/src/test/org/apache/solr/BasicFunctionalityTest.java b/solr/src/test/org/apache/solr/BasicFunctionalityTest.java index f19d9b2b8c0..3b12f7978e4 100644 --- a/solr/src/test/org/apache/solr/BasicFunctionalityTest.java +++ b/solr/src/test/org/apache/solr/BasicFunctionalityTest.java @@ -561,7 +561,7 @@ public class BasicFunctionalityTest extends SolrTestCaseJ4 { DocList dl = ((ResultContext) rsp.getValues().get("response")).docs; org.apache.lucene.document.Document d = req.getSearcher().doc(dl.iterator().nextDoc()); - // ensure field is not lazy + // ensure field is not lazy, only works for Non-Numeric fields currently (if you change schema behind test, this may fail) 
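    // (trie-based numeric fields are now created as NumericField rather than Field, see TrieField.createField above,
    //  so this instanceof check is only meaningful for non-numeric stored fields)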
     assertTrue( d.getFieldable("test_hlt") instanceof Field );
     assertTrue( d.getFieldable("title") instanceof Field );
     req.close();
diff --git a/solr/src/test/org/apache/solr/analysis/TestSynonymMap.java b/solr/src/test/org/apache/solr/analysis/TestSynonymMap.java
index e00cd35c426..d3a6ee77873 100644
--- a/solr/src/test/org/apache/solr/analysis/TestSynonymMap.java
+++ b/solr/src/test/org/apache/solr/analysis/TestSynonymMap.java
@@ -17,6 +17,8 @@
 
 package org.apache.solr.analysis;
 
+import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -25,6 +27,8 @@ import java.util.Map;
 
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.solr.common.ResourceLoader;
+
 
 public class TestSynonymMap extends LuceneTestCase {
@@ -257,6 +261,43 @@ public class TestSynonymMap extends LuceneTestCase {
 
     assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" );
   }
+
+  public void testLoadRules() throws Exception {
+    Map args = new HashMap();
+    args.put( "synonyms", "something.txt" );
+    SynonymFilterFactory ff = new SynonymFilterFactory();
+    ff.init(args);
+    ff.inform( new ResourceLoader() {
+      @Override
+      public List getLines(String resource) throws IOException {
+        if( !"something.txt".equals(resource) ) {
+          throw new RuntimeException( "should not get a different resource" );
+        }
+        List rules = new ArrayList();
+        rules.add( "a,b" );
+        return rules;
+      }
+
+      @Override
+      public Object newInstance(String cname, String... subpackages) {
+        throw new RuntimeException("stub");
+      }
+
+      @Override
+      public InputStream openResource(String resource) throws IOException {
+        throw new RuntimeException("stub");
+      }
+    });
+
+    SynonymMap synMap = ff.getSynonymMap();
+    assertEquals( 2, synMap.submap.size() );
+    assertTokIncludes( synMap, "a", "a" );
+    assertTokIncludes( synMap, "a", "b" );
+    assertTokIncludes( synMap, "b", "a" );
+    assertTokIncludes( synMap, "b", "b" );
+  }
+
+
   private void assertTokIncludes( SynonymMap map, String src, String exp ) throws Exception {
     Token[] tokens = map.submap.get( src ).synonyms;
     boolean inc = false;
diff --git a/solr/src/test/org/apache/solr/handler/JsonLoaderTest.java b/solr/src/test/org/apache/solr/handler/JsonLoaderTest.java
index e6635475356..5deec94f01b 100644
--- a/solr/src/test/org/apache/solr/handler/JsonLoaderTest.java
+++ b/solr/src/test/org/apache/solr/handler/JsonLoaderTest.java
@@ -26,7 +26,9 @@ import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.SolrInputField;
+import org.apache.solr.common.util.ContentStreamBase;
 import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.update.CommitUpdateCommand;
 import org.apache.solr.update.DeleteUpdateCommand;
@@ -81,13 +83,11 @@
   public void testParsing() throws Exception
   {
     SolrQueryRequest req = req();
-    Reader reader = new StringReader(input);
-
+    SolrQueryResponse rsp = new SolrQueryResponse();
     BufferingRequestProcessor p = new BufferingRequestProcessor(null);
-    JsonLoader loader = new JsonLoader( p );
-
-    loader.processUpdate(req, p, new JSONParser(reader) );
-
+    JsonLoader loader = new JsonLoader( req, p );
+    loader.load(req, rsp, new ContentStreamBase.StringStream(input));
+
     assertEquals( 2, p.addCommands.size() );
 
     AddUpdateCommand add = p.addCommands.get(0);
@@ -133,8 +133,67 @@ public class JsonLoaderTest extends SolrTestCaseJ4 {
     req.close();
   }
+
+
+  public void testSimpleFormat() throws Exception
+  {
+    String str = "[{'id':'1'},{'id':'2'}]".replace('\'', '"');
+    SolrQueryRequest req = req("commitWithin","100", "overwrite","false");
+    SolrQueryResponse rsp = new SolrQueryResponse();
+    BufferingRequestProcessor p = new BufferingRequestProcessor(null);
+    JsonLoader loader = new JsonLoader( req, p );
+    loader.load(req, rsp, new ContentStreamBase.StringStream(str));
+
+    assertEquals( 2, p.addCommands.size() );
+
+    AddUpdateCommand add = p.addCommands.get(0);
+    SolrInputDocument d = add.solrDoc;
+    SolrInputField f = d.getField( "id" );
+    assertEquals("1", f.getValue());
+    assertEquals(add.commitWithin, 100);
+    assertEquals(add.overwrite, false);
+
+    add = p.addCommands.get(1);
+    d = add.solrDoc;
+    f = d.getField( "id" );
+    assertEquals("2", f.getValue());
+    assertEquals(add.commitWithin, 100);
+    assertEquals(add.overwrite, false);
+
+    req.close();
+  }
+
+  public void testSimpleFormatInAdd() throws Exception
+  {
+    String str = "{'add':[{'id':'1'},{'id':'2'}]}".replace('\'', '"');
+    SolrQueryRequest req = req();
+    SolrQueryResponse rsp = new SolrQueryResponse();
+    BufferingRequestProcessor p = new BufferingRequestProcessor(null);
+    JsonLoader loader = new JsonLoader( req, p );
+    loader.load(req, rsp, new ContentStreamBase.StringStream(str));
+
+    assertEquals( 2, p.addCommands.size() );
+
+    AddUpdateCommand add = p.addCommands.get(0);
+    SolrInputDocument d = add.solrDoc;
+    SolrInputField f = d.getField( "id" );
+    assertEquals("1", f.getValue());
+    assertEquals(add.commitWithin, -1);
+    assertEquals(add.overwrite, true);
+
+    add = p.addCommands.get(1);
+    d = add.solrDoc;
+    f = d.getField( "id" );
+    assertEquals("2", f.getValue());
+    assertEquals(add.commitWithin, -1);
+    assertEquals(add.overwrite, true);
+
+    req.close();
+  }
+
 }
+
 class BufferingRequestProcessor extends UpdateRequestProcessor {
   List addCommands = new ArrayList();
diff --git a/solr/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java b/solr/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
index 6dbae21f244..c7d8a392201 100644
--- a/solr/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
+++ b/solr/src/test/org/apache/solr/handler/MoreLikeThisHandlerTest.java
@@ -79,7 +79,7 @@ public class MoreLikeThisHandlerTest extends SolrTestCaseJ4 {
 
     params.set(CommonParams.Q, "id:42");
     params.set(MoreLikeThisParams.MLT, "true");
-    params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword,foo_ti");
+    params.set(MoreLikeThisParams.SIMILARITY_FIELDS, "name,subword");
     params.set(MoreLikeThisParams.INTERESTING_TERMS, "details");
     params.set(MoreLikeThisParams.MIN_TERM_FREQ,"1");
     params.set(MoreLikeThisParams.MIN_DOC_FREQ,"1");
diff --git a/solr/src/test/org/apache/solr/update/DocumentBuilderTest.java b/solr/src/test/org/apache/solr/update/DocumentBuilderTest.java
index 4a4df13b0f7..991295d1339 100644
--- a/solr/src/test/org/apache/solr/update/DocumentBuilderTest.java
+++ b/solr/src/test/org/apache/solr/update/DocumentBuilderTest.java
@@ -109,8 +109,8 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
 
     doc.addField( "home", "2.2,3.3", 1.0f );
     Document out = DocumentBuilder.toDocument( doc, core.getSchema() );
     assertNotNull( out.get( "home" ) );//contains the stored value and term vector, if there is one
-    assertNotNull( out.getField( "home_0" + FieldType.POLY_FIELD_SEPARATOR + "double" ) );
-    assertNotNull( out.getField( "home_1" + FieldType.POLY_FIELD_SEPARATOR + "double" ) );
+    assertNotNull( out.getFieldable( "home_0" + FieldType.POLY_FIELD_SEPARATOR + "double" ) );
+    assertNotNull( out.getFieldable( "home_1" + FieldType.POLY_FIELD_SEPARATOR + "double" ) );
   }
 }