From 32a0f402d66a1da33498a12d98ab366579f40f1c Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Mon, 23 Jul 2012 17:33:24 +0000 Subject: [PATCH 1/6] SOLR-3623: Fixed inconsistent treatment of third-party dependencies for solr contribs analysis-extras & uima git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364728 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 2 + solr/common-build.xml | 53 ++++++++-------- solr/contrib/analysis-extras/README.txt | 7 ++- solr/contrib/analysis-extras/build.xml | 19 +++--- solr/contrib/analysis-extras/ivy.xml | 3 + .../lib/morfologik-fsa-1.5.3.jar.sha1 | 1 + .../lib/morfologik-fsa-LICENSE-BSD.txt | 29 +++++++++ .../lib/morfologik-fsa-NOTICE.txt | 2 + .../lib/morfologik-polish-1.5.3.jar.sha1 | 1 + .../lib/morfologik-polish-LICENSE-BSD.txt | 62 +++++++++++++++++++ .../lib/morfologik-polish-NOTICE.txt | 6 ++ .../lib/morfologik-stemming-1.5.3.jar.sha1 | 1 + .../lib/morfologik-stemming-LICENSE-BSD.txt | 29 +++++++++ .../lib/morfologik-stemming-NOTICE.txt | 2 + solr/contrib/uima/README.txt | 1 + 15 files changed, 182 insertions(+), 36 deletions(-) create mode 100644 solr/contrib/analysis-extras/lib/morfologik-fsa-1.5.3.jar.sha1 create mode 100644 solr/contrib/analysis-extras/lib/morfologik-fsa-LICENSE-BSD.txt create mode 100644 solr/contrib/analysis-extras/lib/morfologik-fsa-NOTICE.txt create mode 100644 solr/contrib/analysis-extras/lib/morfologik-polish-1.5.3.jar.sha1 create mode 100644 solr/contrib/analysis-extras/lib/morfologik-polish-LICENSE-BSD.txt create mode 100644 solr/contrib/analysis-extras/lib/morfologik-polish-NOTICE.txt create mode 100644 solr/contrib/analysis-extras/lib/morfologik-stemming-1.5.3.jar.sha1 create mode 100644 solr/contrib/analysis-extras/lib/morfologik-stemming-LICENSE-BSD.txt create mode 100644 solr/contrib/analysis-extras/lib/morfologik-stemming-NOTICE.txt diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 2508771e4df..9900f4a37c6 100644 --- a/solr/CHANGES.txt +++ 
b/solr/CHANGES.txt @@ -131,6 +131,8 @@ Bug Fixes * SOLR-3663: There are a couple of bugs in the sync process when a leader goes down and a new leader is elected. (Mark Miller) +* SOLR-3623: Fixed inconsistent treatment of third-party dependencies for + solr contribs analysis-extras & uima (hossman) Other Changes ---------------------- diff --git a/solr/common-build.xml b/solr/common-build.xml index e97a2aaa8a6..20170152fac 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -70,21 +70,32 @@ --> + + + + + + + + + + + + + + + + - - - - - - - - - - - - + @@ -125,7 +136,7 @@ @@ -137,19 +148,11 @@ + + - - - - - - - - - - - - diff --git a/solr/contrib/analysis-extras/README.txt b/solr/contrib/analysis-extras/README.txt index c9329438fe7..c7cfbc37f60 100644 --- a/solr/contrib/analysis-extras/README.txt +++ b/solr/contrib/analysis-extras/README.txt @@ -9,8 +9,11 @@ Relies upon the following lucene components (in lucene-libs/): * lucene-analyzers-icu-X.Y.jar * lucene-analyzers-smartcn-X.Y.jar * lucene-analyzers-stempel-X.Y.jar - -And the ICU library (in lib/): + * lucene-analyzers-morfologik-X.Y.jar + * lucene-analyzers-smartcn-X.Y.jar + +And the following third-party library (in lib/): * icu4j-X.Y.jar + * morfologik-*.jar diff --git a/solr/contrib/analysis-extras/build.xml b/solr/contrib/analysis-extras/build.xml index 93519b05d54..398b7201ea6 100644 --- a/solr/contrib/analysis-extras/build.xml +++ b/solr/contrib/analysis-extras/build.xml @@ -24,13 +24,17 @@ + + + + + + + - - - - - + + @@ -38,10 +42,7 @@ depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, jar-analyzers-morfologik"> - - - - + diff --git a/solr/contrib/analysis-extras/ivy.xml b/solr/contrib/analysis-extras/ivy.xml index 62fcffbab96..6329c419bb1 100644 --- a/solr/contrib/analysis-extras/ivy.xml +++ b/solr/contrib/analysis-extras/ivy.xml @@ -20,6 +20,9 @@ + + + diff --git a/solr/contrib/analysis-extras/lib/morfologik-fsa-1.5.3.jar.sha1 
b/solr/contrib/analysis-extras/lib/morfologik-fsa-1.5.3.jar.sha1 new file mode 100644 index 00000000000..3d3b86d5f8c --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-fsa-1.5.3.jar.sha1 @@ -0,0 +1 @@ +d1f729cd3019e6d86485226202f84458141a5688 diff --git a/solr/contrib/analysis-extras/lib/morfologik-fsa-LICENSE-BSD.txt b/solr/contrib/analysis-extras/lib/morfologik-fsa-LICENSE-BSD.txt new file mode 100644 index 00000000000..f97fb7dfe38 --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-fsa-LICENSE-BSD.txt @@ -0,0 +1,29 @@ + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/solr/contrib/analysis-extras/lib/morfologik-fsa-NOTICE.txt b/solr/contrib/analysis-extras/lib/morfologik-fsa-NOTICE.txt new file mode 100644 index 00000000000..18ba2f3e39c --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-fsa-NOTICE.txt @@ -0,0 +1,2 @@ +This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski +(http://morfologik.blogspot.com/). diff --git a/solr/contrib/analysis-extras/lib/morfologik-polish-1.5.3.jar.sha1 b/solr/contrib/analysis-extras/lib/morfologik-polish-1.5.3.jar.sha1 new file mode 100644 index 00000000000..6eb48a47896 --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-polish-1.5.3.jar.sha1 @@ -0,0 +1 @@ +8217b6f7ad018ceda0e824b2e60340000da4397a diff --git a/solr/contrib/analysis-extras/lib/morfologik-polish-LICENSE-BSD.txt b/solr/contrib/analysis-extras/lib/morfologik-polish-LICENSE-BSD.txt new file mode 100644 index 00000000000..04ffd07ece9 --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-polish-LICENSE-BSD.txt @@ -0,0 +1,62 @@ +BSD-licensed dictionary of Polish (Morfologik) + +Copyright (c) 2012, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-- + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, + Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/solr/contrib/analysis-extras/lib/morfologik-polish-NOTICE.txt b/solr/contrib/analysis-extras/lib/morfologik-polish-NOTICE.txt new file mode 100644 index 00000000000..a8a3aa11a3d --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-polish-NOTICE.txt @@ -0,0 +1,6 @@ + +This product includes data from BSD-licensed dictionary of Polish (Morfologik) +(http://morfologik.blogspot.com/) + +This product includes data from BSD-licensed dictionary of Polish (SGJP) +(http://sgjp.pl/morfeusz/) diff --git a/solr/contrib/analysis-extras/lib/morfologik-stemming-1.5.3.jar.sha1 b/solr/contrib/analysis-extras/lib/morfologik-stemming-1.5.3.jar.sha1 new file mode 100644 index 00000000000..c31642be45d --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-stemming-1.5.3.jar.sha1 @@ -0,0 +1 @@ +c4ead57b78fa71b00553ff21da6fb5a326e914e8 diff --git a/solr/contrib/analysis-extras/lib/morfologik-stemming-LICENSE-BSD.txt b/solr/contrib/analysis-extras/lib/morfologik-stemming-LICENSE-BSD.txt new file mode 100644 index 00000000000..f97fb7dfe38 --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-stemming-LICENSE-BSD.txt @@ -0,0 +1,29 @@ + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/solr/contrib/analysis-extras/lib/morfologik-stemming-NOTICE.txt b/solr/contrib/analysis-extras/lib/morfologik-stemming-NOTICE.txt new file mode 100644 index 00000000000..18ba2f3e39c --- /dev/null +++ b/solr/contrib/analysis-extras/lib/morfologik-stemming-NOTICE.txt @@ -0,0 +1,2 @@ +This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski +(http://morfologik.blogspot.com/). 
diff --git a/solr/contrib/uima/README.txt b/solr/contrib/uima/README.txt index 9d45910666e..70d49f8ff37 100644 --- a/solr/contrib/uima/README.txt +++ b/solr/contrib/uima/README.txt @@ -6,6 +6,7 @@ To start using Solr UIMA Metadata Extraction Library you should go through the f or set tags in solrconfig.xml appropriately to point those jar files. + 2. modify your schema.xml adding the fields you want to be hold metadata specifying proper values for type, indexed, stored and multiValued options: From c0f8cd69a8a8e267305c3d3383bed5616fde4b01 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 23 Jul 2012 19:26:00 +0000 Subject: [PATCH 2/6] LUCENE-4248: add producer assertions to Codec API / fix producer inconsistencies git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364763 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/codecs/BlockTreeTermsWriter.java | 2 +- .../lucene/codecs/PostingsConsumer.java | 2 +- .../apache/lucene/codecs/TermsConsumer.java | 2 +- .../index/FreqProxTermsWriterPerField.java | 4 +- .../org/apache/lucene/index/TestCodecs.java | 4 +- .../lucene/index/TestPostingsFormat.java | 4 +- .../asserting/AssertingPostingsFormat.java | 88 ++++++++++++++++++- 7 files changed, 95 insertions(+), 11 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java index 80156f869a7..8dc99b3d27d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java @@ -896,7 +896,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer { // w.close(); // } } else { - assert sumTotalTermFreq == 0; + assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1; assert sumDocFreq == 0; assert docCount == 0; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java 
b/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java index f9db84a90ff..e31f4b79343 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java @@ -146,6 +146,6 @@ public abstract class PostingsConsumer { df++; } } - return new TermStats(df, totTF); + return new TermStats(df, indexOptions == IndexOptions.DOCS_ONLY ? -1 : totTF); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java index 30419c95919..4148430f408 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java @@ -205,6 +205,6 @@ public abstract class TermsConsumer { } } } - finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); + finish(indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); } } diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 425f158afce..6a5f1f119bf 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -542,11 +542,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem } postingsConsumer.finishDoc(); } - termsConsumer.finishTerm(text, new TermStats(numDocs, totTF)); + termsConsumer.finishTerm(text, new TermStats(numDocs, writeTermFreq ? totTF : -1)); sumTotalTermFreq += totTF; sumDocFreq += numDocs; } - termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); + termsConsumer.finish(writeTermFreq ? 
sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality()); } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index 7f50fa8ed08..8be1027cd0a 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -116,7 +116,7 @@ public class TestCodecs extends LuceneTestCase { sumDF += term.docs.length; sumTotalTermCount += term.write(termsConsumer); } - termsConsumer.finish(sumTotalTermCount, sumDF, (int) visitedDocs.cardinality()); + termsConsumer.finish(omitTF ? -1 : sumTotalTermCount, sumDF, (int) visitedDocs.cardinality()); } } @@ -168,7 +168,7 @@ public class TestCodecs extends LuceneTestCase { postingsConsumer.finishDoc(); } } - termsConsumer.finishTerm(text, new TermStats(docs.length, totTF)); + termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF)); return totTF; } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java index c5bdba8ee29..7c55e092df8 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPostingsFormat.java @@ -428,12 +428,12 @@ public class TestPostingsFormat extends LuceneTestCase { postingsConsumer.finishDoc(); docCount++; } - termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF)); + termsConsumer.finishTerm(term, new TermStats(postings.size(), doFreq ? totalTF : -1)); sumTotalTF += totalTF; sumDF += postings.size(); } - termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality()); + termsConsumer.finish(doFreq ? 
sumTotalTF : -1, sumDF, seenDocs.cardinality()); } fieldsConsumer.close(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java index fb33e38deb2..863de992625 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/asserting/AssertingPostingsFormat.java @@ -18,16 +18,23 @@ package org.apache.lucene.codecs.asserting; */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsConsumer; import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.TermStats; +import org.apache.lucene.codecs.TermsConsumer; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.index.AssertingAtomicReader; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Terms; +import org.apache.lucene.util.BytesRef; /** * Just like {@link Lucene40PostingsFormat} but with additional asserts. @@ -39,10 +46,9 @@ public class AssertingPostingsFormat extends PostingsFormat { super("Asserting"); } - // TODO: we could add some useful checks here? 
@Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return in.fieldsConsumer(state); + return new AssertingFieldsConsumer(in.fieldsConsumer(state)); } @Override @@ -85,4 +91,82 @@ public class AssertingPostingsFormat extends PostingsFormat { return in.getUniqueTermCount(); } } + + static class AssertingFieldsConsumer extends FieldsConsumer { + private final FieldsConsumer in; + + AssertingFieldsConsumer(FieldsConsumer in) { + this.in = in; + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + TermsConsumer consumer = in.addField(field); + assert consumer != null; + return new AssertingTermsConsumer(consumer, field); + } + + @Override + public void close() throws IOException { + in.close(); + } + } + + static enum TermsConsumerState { INITIAL, START, FINISHED }; + static class AssertingTermsConsumer extends TermsConsumer { + private final TermsConsumer in; + private final FieldInfo fieldInfo; + private BytesRef lastTerm = null; + private TermsConsumerState state = TermsConsumerState.INITIAL; + + AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) { + this.in = in; + this.fieldInfo = fieldInfo; + } + + // TODO: AssertingPostingsConsumer + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed. + // TODO: this makes the api really confusing! we should try to clean this up! 
+ assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START; + state = TermsConsumerState.START; + assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0; + lastTerm = BytesRef.deepCopyOf(text); + return in.startTerm(text); + } + + @Override + public void finishTerm(BytesRef text, TermStats stats) throws IOException { + assert state == TermsConsumerState.START; + state = TermsConsumerState.INITIAL; + assert text.equals(lastTerm); + assert stats.docFreq > 0; // otherwise, this method should not be called. + if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) { + assert stats.totalTermFreq == -1; + } + in.finishTerm(text, stats); + } + + @Override + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { + // TODO: assert that if state == START (no finishTerm called), that no actual docs were fed. + // TODO: this makes the api really confusing! we should try to clean this up! + assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START; + state = TermsConsumerState.FINISHED; + assert docCount >= 0; + assert sumDocFreq >= docCount; + if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) { + assert sumTotalTermFreq == -1; + } else { + assert sumTotalTermFreq >= sumDocFreq; + } + in.finish(sumTotalTermFreq, sumDocFreq, docCount); + } + + @Override + public Comparator getComparator() throws IOException { + return in.getComparator(); + } + } } From 618c38fd98b582579e82759315cf477c1956ea82 Mon Sep 17 00:00:00 2001 From: David Wayne Smiley Date: Mon, 23 Jul 2012 20:19:02 +0000 Subject: [PATCH 3/6] LUCENE-4223 spatial docs: overview.html, SpatialStrategy, and added SpatialExample.java sample git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364782 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/spatial/SpatialStrategy.java | 22 ++- lucene/spatial/src/java/overview.html | 43 ++++- .../apache/lucene/spatial/SpatialExample.java | 180 
++++++++++++++++++ 3 files changed, 241 insertions(+), 4 deletions(-) create mode 100644 lucene/spatial/src/test/org/apache/lucene/spatial/SpatialExample.java diff --git a/lucene/spatial/src/java/org/apache/lucene/spatial/SpatialStrategy.java b/lucene/spatial/src/java/org/apache/lucene/spatial/SpatialStrategy.java index a04a5805861..81eadf8083e 100644 --- a/lucene/spatial/src/java/org/apache/lucene/spatial/SpatialStrategy.java +++ b/lucene/spatial/src/java/org/apache/lucene/spatial/SpatialStrategy.java @@ -28,10 +28,26 @@ import org.apache.lucene.search.Query; import org.apache.lucene.spatial.query.SpatialArgs; /** - * The SpatialStrategy encapsulates an approach to indexing and searching based on shapes. + * The SpatialStrategy encapsulates an approach to indexing and searching based + * on shapes. *

- * Note that a SpatialStrategy is not involved with the Lucene stored field values of shapes, which is - * immaterial to indexing & search. + * Different implementations will support different features. A strategy should + * document these common elements: + *

    + *
  • Can it index more than one shape per field?
  • + *
  • What types of shapes can be indexed?
  • + *
  • What types of query shapes can be used?
  • + *
  • What types of query operations are supported? + * This might vary per shape.
  • + *
  • Are there caches? Under what circumstances are they used? + * Roughly how big are they? Is it segmented by Lucene segments, such as is + * done by the Lucene {@link org.apache.lucene.search.FieldCache} and + * {@link org.apache.lucene.index.DocValues} (ideal) or is it for the entire + * index? + *
+ *

+ * Note that a SpatialStrategy is not involved with the Lucene stored field + * values of shapes, which is immaterial to indexing & search. *

* Thread-safe. * diff --git a/lucene/spatial/src/java/overview.html b/lucene/spatial/src/java/overview.html index 9ba5f0b63fe..51fe0031fec 100644 --- a/lucene/spatial/src/java/overview.html +++ b/lucene/spatial/src/java/overview.html @@ -16,8 +16,49 @@ --> - Apache Lucene Spatial Strategies + Apache Lucene Spatial Module + +

The Spatial Module for Apache Lucene

+ +

+ The spatial module is new in Lucene 4, replacing the old contrib module + that came before it. The principal interface to the module is + a {@link org.apache.lucene.spatial.SpatialStrategy} + which encapsulates an approach to indexing and searching + based on shapes. Different Strategies have different features and + performance profiles, which are documented at each Strategy class level. +

+

+ For some sample code showing how to use the API, see SpatialExample.java in + the tests. +

+

+ The spatial module uses + Spatial4j + heavily. Spatial4j is an ASL licensed library with these capabilities: +

    +
  • Provides shape implementations, namely point, rectangle, + and circle. Both geospatial contexts and plain 2D Euclidean/Cartesian contexts + are supported. + With an additional dependency, it adds polygon and other geometry shape + support via integration with + JTS Topology Suite. + This includes dateline wrap support.
  • +
  • Shape parsing and serialization, including + Well-Known Text (WKT) + (via JTS).
  • +
  • Distance and other spatial related math calculations.
  • +
+

+

+ Historical note: The new spatial module was once known as + Lucene Spatial Playground (LSP) as an external project. In ~March 2012, LSP + split into this new module as part of Lucene and Spatial4j externally. A + large chunk of the LSP implementation originated as SOLR-2155 which uses + trie/prefix-tree algorithms with a geohash encoding. +

+ \ No newline at end of file diff --git a/lucene/spatial/src/test/org/apache/lucene/spatial/SpatialExample.java b/lucene/spatial/src/test/org/apache/lucene/spatial/SpatialExample.java new file mode 100644 index 00000000000..a3963c02bf8 --- /dev/null +++ b/lucene/spatial/src/test/org/apache/lucene/spatial/SpatialExample.java @@ -0,0 +1,180 @@ +package org.apache.lucene.spatial; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import com.carrotsearch.randomizedtesting.RandomizedTest; +import com.spatial4j.core.context.SpatialContext; +import com.spatial4j.core.context.simple.SimpleSpatialContext; +import com.spatial4j.core.shape.Shape; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.queries.function.ValueSource; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy; +import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree; +import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree; +import org.apache.lucene.spatial.query.SpatialArgs; +import org.apache.lucene.spatial.query.SpatialArgsParser; +import org.apache.lucene.spatial.query.SpatialOperation; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; + +import java.io.IOException; + +/** + * This class serves as example code to show how to use the Lucene spatial + * module. + */ +public class SpatialExample extends RandomizedTest { + + public static void main(String[] args) throws IOException { + new SpatialExample().test(); + } + + public void test() throws IOException { + init(); + indexPoints(); + search(); + } + + /** + * The Spatial4j {@link SpatialContext} is a sort of global-ish singleton + * needed by Lucene spatial. 
It's a facade to the rest of Spatial4j, acting + * as a factory for {@link Shape}s and provides access to reading and writing + * them from Strings. + */ + private SpatialContext ctx;//"ctx" is the conventional variable name + + /** + * The Lucene spatial {@link SpatialStrategy} encapsulates an approach to + * indexing and searching shapes, and providing relevancy scores for them. + * It's a simple API to unify different approaches. + *

+ * Note that these are initialized with a field name. + */ + private SpatialStrategy strategy; + + private Directory directory; + + protected void init() { + //Typical geospatial context with kilometer units. + // These can also be constructed from a factory: SpatialContextFactory + this.ctx = SimpleSpatialContext.GEO_KM; + + int maxLevels = 10;//results in sub-meter precision for geohash + //TODO demo lookup by detail distance + // This can also be constructed from a factory: SpatialPrefixTreeFactory + SpatialPrefixTree grid = new GeohashPrefixTree(ctx, maxLevels); + + this.strategy = new RecursivePrefixTreeStrategy(grid, "myGeoField"); + + this.directory = new RAMDirectory(); + } + + private void indexPoints() throws IOException { + IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_50,null); + IndexWriter indexWriter = new IndexWriter(directory, iwConfig); + + //Spatial4j is x-y order for arguments + indexWriter.addDocument(newSampleDocument( + 2, ctx.makePoint(-80.93, 33.77))); + + //When parsing a string to a shape, the presence of a comma means it's y-x + // order (lat, lon) + indexWriter.addDocument(newSampleDocument( + 4, ctx.readShape("-50.7693246, 60.9289094"))); + + indexWriter.addDocument(newSampleDocument( + 20, ctx.makePoint(0.1,0.1), ctx.makePoint(0, 0))); + + indexWriter.close(); + } + + private Document newSampleDocument(int id, Shape... shapes) { + Document doc = new Document(); + doc.add(new IntField("id", id, Field.Store.YES)); + //Potentially more than one shape in this field is supported by some + // strategies; see the javadocs of the SpatialStrategy impl to see. 
+ for (Shape shape : shapes) { + for (IndexableField f : strategy.createIndexableFields(shape)) { + doc.add(f); + } + //store it too; the format is up to you + doc.add(new StoredField(strategy.getFieldName(), ctx.toString(shape))); + } + + return doc; + } + + private void search() throws IOException { + IndexReader indexReader = DirectoryReader.open(directory); + IndexSearcher indexSearcher = new IndexSearcher(indexReader); + Sort idSort = new Sort(new SortField("id", SortField.Type.INT)); + + //--Filter by circle (<= distance from a point) + { + //Search with circle + //note: SpatialArgs can be parsed from a string + SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects, + ctx.makeCircle(-80.0, 33.0, 200));//200km (since km == ctx.getDistanceUnits + Filter filter = strategy.makeFilter(args); + TopDocs docs = indexSearcher.search(new MatchAllDocsQuery(), filter, 10, idSort); + assertDocMatchedIds(indexSearcher, docs, 2); + } + //--Match all, order by distance + { + SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,//doesn't matter + ctx.makePoint(60, -50)); + ValueSource valueSource = strategy.makeValueSource(args);//the distance + Sort reverseDistSort = new Sort(valueSource.getSortField(false)).rewrite(indexSearcher);//true=asc dist + TopDocs docs = indexSearcher.search(new MatchAllDocsQuery(), 10, reverseDistSort); + assertDocMatchedIds(indexSearcher, docs, 4, 20, 2); + } + //demo arg parsing + { + SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects, + ctx.makeCircle(-80.0, 33.0, 200)); + SpatialArgs args2 = new SpatialArgsParser().parse("Intersects(Circle(33,-80 d=200))", ctx); + assertEquals(args.toString(),args2.toString()); + } + + indexReader.close(); + } + + private void assertDocMatchedIds(IndexSearcher indexSearcher, TopDocs docs, int... 
ids) throws IOException { + int[] gotIds = new int[docs.totalHits]; + for (int i = 0; i < gotIds.length; i++) { + gotIds[i] = indexSearcher.doc(docs.scoreDocs[i].doc).getField("id").numericValue().intValue(); + } + assertArrayEquals(ids,gotIds); + } + +} From 87182914a334d928b966cfcd8a2983c1973e01ac Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Mon, 23 Jul 2012 20:28:13 +0000 Subject: [PATCH 4/6] since we raised the session timeout, this needs to be willing to poll longer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364786 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/solr/cloud/LeaderElectionIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java index 887c8f2e6ad..d55ed7ec9f0 100644 --- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java @@ -191,7 +191,7 @@ public class LeaderElectionIntegrationTest extends SolrTestCaseJ4 { int newLeaderPort = getLeaderPort(leader); int retry = 0; while (leaderPort == newLeaderPort) { - if (retry++ == 20) { + if (retry++ == 60) { break; } Thread.sleep(1000); From e0d137f8e2ad41039a8486601e1ff0463f25840e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 23 Jul 2012 20:46:08 +0000 Subject: [PATCH 5/6] LUCENE-4828: add AssertingPostingsConsumer, fix minor inconsistencies in producers git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1364792 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/codecs/PostingsConsumer.java | 9 +- .../apache/lucene/codecs/TermsConsumer.java | 8 +- .../index/FreqProxTermsWriterPerField.java | 6 +- .../org/apache/lucene/index/TestCodecs.java | 4 +- .../lucene/index/TestPostingsFormat.java | 2 +- .../asserting/AssertingPostingsFormat.java | 101 ++++++++++++++++-- 6 
files changed, 110 insertions(+), 20 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java index e31f4b79343..099a1f26622 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsConsumer.java @@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSet; */ public abstract class PostingsConsumer { - /** Adds a new doc in this term. */ + /** Adds a new doc in this term. + * freq will be -1 when term frequencies are omitted + * for the field. */ public abstract void startDoc(int docID, int freq) throws IOException; /** Add a new position & payload, and start/end offset. A * null payload means no payload; a non-null payload with * zero length also means no payload. Caller may reuse * the {@link BytesRef} for the payload between calls - * (method must fully consume the payload). */ + * (method must fully consume the payload). startOffset + * and endOffset will be -1 when offsets are not indexed. */ public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException; /** Called when we are done adding positions & payloads @@ -78,7 +81,7 @@ public abstract class PostingsConsumer { break; } visitedDocs.set(doc); - this.startDoc(doc, 0); + this.startDoc(doc, -1); this.finishDoc(); df++; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java index 4148430f408..85dc6132b49 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/TermsConsumer.java @@ -57,10 +57,14 @@ public abstract class TermsConsumer { * no docs. */ public abstract PostingsConsumer startTerm(BytesRef text) throws IOException; - /** Finishes the current term; numDocs must be > 0. 
*/ + /** Finishes the current term; numDocs must be > 0. + * stats.totalTermFreq will be -1 when term + * frequencies are omitted for the field. */ public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; - /** Called when we are done adding terms to this field */ + /** Called when we are done adding terms to this field. + * sumTotalTermFreq will be -1 when term + * frequencies are omitted for the field. */ public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException; /** Return the BytesRef Comparator used to sort terms diff --git a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index 6a5f1f119bf..658ea591791 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/core/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem if (readTermFreq) { termDocFreq = postings.docFreqs[termID]; } else { - termDocFreq = 0; + termDocFreq = -1; } postings.lastDocCodes[termID] = -1; } else { @@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final int code = freq.readVInt(); if (!readTermFreq) { docID += code; - termDocFreq = 0; + termDocFreq = -1; } else { docID += code >>> 1; if ((code & 1) != 0) { @@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem // 2nd sweep does the real flush, but I suspect // that'd add too much time to flush. visitedDocs.set(docID); - postingsConsumer.startDoc(docID, termDocFreq); + postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1); if (docID < delDocLimit) { // Mark it deleted. 
TODO: we could also skip // writing its postings; this would be diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index 8be1027cd0a..fdb9307731c 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -154,7 +154,7 @@ public class TestCodecs extends LuceneTestCase { for(int i=0;i