merge trunk (1364720-1364799)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1364800 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-07-23 20:57:31 +00:00
commit 33f6da286e
26 changed files with 620 additions and 63 deletions

View File

@@ -896,7 +896,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// w.close();
// }
} else {
- assert sumTotalTermFreq == 0;
+ assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
assert sumDocFreq == 0;
assert docCount == 0;
}

View File

@@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSet;
*/
public abstract class PostingsConsumer {
- /** Adds a new doc in this term. */
+ /** Adds a new doc in this term.
+ * <code>freq</code> will be -1 when term frequencies are omitted
+ * for the field. */
public abstract void startDoc(int docID, int freq) throws IOException;
/** Add a new position & payload, and start/end offset. A
* null payload means no payload; a non-null payload with
* zero length also means no payload. Caller may reuse
* the {@link BytesRef} for the payload between calls
- * (method must fully consume the payload). */
+ * (method must fully consume the payload). <code>startOffset</code>
+ * and <code>endOffset</code> will be -1 when offsets are not indexed. */
public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
/** Called when we are done adding positions & payloads
@@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
break;
}
visitedDocs.set(doc);
- this.startDoc(doc, 0);
+ this.startDoc(doc, -1);
this.finishDoc();
df++;
}
@@ -146,6 +149,6 @@ public abstract class PostingsConsumer {
df++;
}
}
- return new TermStats(df, totTF);
+ return new TermStats(df, indexOptions == IndexOptions.DOCS_ONLY ? -1 : totTF);
}
}

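A note on the new contract: freq arrives as -1 when term frequencies are
omitted (IndexOptions.DOCS_ONLY), and startOffset/endOffset arrive as -1 when
offsets are not indexed, so an implementation must treat -1 as "value absent"
rather than accumulate it. A minimal illustrative sketch, not part of this
commit (DocCountingConsumer is a hypothetical name):

import java.io.IOException;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.util.BytesRef;

class DocCountingConsumer extends PostingsConsumer {
  long docFreq;        // counted for every doc
  long totalTermFreq;  // only meaningful when freqs are indexed

  @Override
  public void startDoc(int docID, int freq) throws IOException {
    docFreq++;
    if (freq != -1) { // -1 = freqs omitted for this field
      totalTermFreq += freq;
    }
  }

  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    // startOffset/endOffset arrive as -1 unless offsets are indexed
  }

  @Override
  public void finishDoc() throws IOException {
  }
}

Using -1 rather than 0 lets a checking codec (see AssertingPostingsFormat
below) distinguish "frequencies omitted" from an impossible zero frequency.
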
View File

@@ -57,10 +57,14 @@ public abstract class TermsConsumer {
* no docs. */
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
- /** Finishes the current term; numDocs must be > 0. */
+ /** Finishes the current term; numDocs must be > 0.
+ * <code>stats.totalTermFreq</code> will be -1 when term
+ * frequencies are omitted for the field. */
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
- /** Called when we are done adding terms to this field */
+ /** Called when we are done adding terms to this field.
+ * <code>sumTotalTermFreq</code> will be -1 when term
+ * frequencies are omitted for the field. */
public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
/** Return the BytesRef Comparator used to sort terms
@@ -205,6 +209,6 @@ public abstract class TermsConsumer {
}
}
}
- finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
+ finish(indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
}
}

View File

@@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
if (readTermFreq) {
termDocFreq = postings.docFreqs[termID];
} else {
- termDocFreq = 0;
+ termDocFreq = -1;
}
postings.lastDocCodes[termID] = -1;
} else {
@@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final int code = freq.readVInt();
if (!readTermFreq) {
docID += code;
- termDocFreq = 0;
+ termDocFreq = -1;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
@@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
visitedDocs.set(docID);
- postingsConsumer.startDoc(docID, termDocFreq);
+ postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be
@@ -542,11 +542,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
postingsConsumer.finishDoc();
}
- termsConsumer.finishTerm(text, new TermStats(numDocs, totTF));
+ termsConsumer.finishTerm(text, new TermStats(numDocs, writeTermFreq ? totTF : -1));
sumTotalTermFreq += totTF;
sumDocFreq += numDocs;
}
- termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
+ termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality());
}
}

View File

@@ -116,7 +116,7 @@ public class TestCodecs extends LuceneTestCase {
sumDF += term.docs.length;
sumTotalTermCount += term.write(termsConsumer);
}
- termsConsumer.finish(sumTotalTermCount, sumDF, (int) visitedDocs.cardinality());
+ termsConsumer.finish(omitTF ? -1 : sumTotalTermCount, sumDF, (int) visitedDocs.cardinality());
}
}
@@ -154,7 +154,7 @@
for(int i=0;i<docs.length;i++) {
final int termDocFreq;
if (field.omitTF) {
- termDocFreq = 0;
+ termDocFreq = -1;
} else {
termDocFreq = positions[i].length;
}
@@ -165,10 +165,10 @@
final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
}
- postingsConsumer.finishDoc();
}
+ postingsConsumer.finishDoc();
}
- termsConsumer.finishTerm(text, new TermStats(docs.length, totTF));
+ termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
return totTF;
}
}

View File

@@ -406,7 +406,7 @@ public class TestPostingsFormat extends LuceneTestCase {
if (VERBOSE) {
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
}
- postingsConsumer.startDoc(posting.docID, posting.positions.size());
+ postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
seenDocs.set(posting.docID);
if (doPos) {
totalTF += posting.positions.size();
@@ -428,12 +428,12 @@
postingsConsumer.finishDoc();
docCount++;
}
- termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF));
+ termsConsumer.finishTerm(term, new TermStats(postings.size(), doFreq ? totalTF : -1));
sumTotalTF += totalTF;
sumDF += postings.size();
}
- termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality());
+ termsConsumer.finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.cardinality());
}
fieldsConsumer.close();

View File

@@ -28,10 +28,26 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.spatial.query.SpatialArgs;
/**
- * The SpatialStrategy encapsulates an approach to indexing and searching based on shapes.
+ * The SpatialStrategy encapsulates an approach to indexing and searching based
+ * on shapes.
* <p/>
- * Note that a SpatialStrategy is not involved with the Lucene stored field values of shapes, which is
- * immaterial to indexing & search.
+ * Different implementations will support different features. A strategy should
+ * document these common elements:
+ * <ul>
+ * <li>Can it index more than one shape per field?</li>
+ * <li>What types of shapes can be indexed?</li>
+ * <li>What types of query shapes can be used?</li>
+ * <li>What types of query operations are supported?
+ * This might vary per shape.</li>
+ * <li>Are there caches? Under what circumstances are they used?
+ * Roughly how big are they? Is it segmented by Lucene segments, such as is
+ * done by the Lucene {@link org.apache.lucene.search.FieldCache} and
+ * {@link org.apache.lucene.index.DocValues} (ideal), or is it for the entire
+ * index?</li>
+ * </ul>
+ * <p/>
+ * Note that a SpatialStrategy is not involved with the Lucene stored field
+ * values of shapes, which is immaterial to indexing & search.
* <p/>
* Thread-safe.
*

View File

@@ -16,8 +16,49 @@
-->
<html>
<head>
-<title>Apache Lucene Spatial Strategies</title>
+<title>Apache Lucene Spatial Module</title>
</head>
<body>
+<h1>The Spatial Module for Apache Lucene</h1>
+<p>
+The spatial module is new in Lucene 4, replacing the old contrib module
+that came before it. The principal interface to the module is
+a {@link org.apache.lucene.spatial.SpatialStrategy}
+which encapsulates an approach to indexing and searching
+based on shapes. Different Strategies have different features and
+performance profiles, which are documented at each Strategy class level.
+</p>
+<p>
+For some sample code showing how to use the API, see SpatialExample.java in
+the tests.
+</p>
+<p>
+The spatial module uses
+<a href="https://github.com/spatial4j/spatial4j">Spatial4j</a>
+heavily. Spatial4j is an ASL-licensed library with these capabilities:
+<ul>
+<li>Provides shape implementations, namely point, rectangle,
+and circle. Both geospatial contexts and plain 2D Euclidean/Cartesian contexts
+are supported.
+With an additional dependency, it adds polygon and other geometry shape
+support via integration with
+<a href="http://sourceforge.net/projects/jts-topo-suite/">JTS Topology Suite</a>.
+This includes dateline wrap support.</li>
+<li>Shape parsing and serialization, including
+<a href="http://en.wikipedia.org/wiki/Well-known_text">Well-Known Text (WKT)</a>
+(via JTS).</li>
+<li>Distance and other spatial-related math calculations.</li>
+</ul>
+</p>
+<p>
+Historical note: the new spatial module was once known as
+Lucene Spatial Playground (LSP), an external project. In ~March 2012, LSP
+was split into this new module inside Lucene and the externally maintained
+Spatial4j. A large chunk of the LSP implementation originated in SOLR-2155,
+which uses trie/prefix-tree algorithms with a geohash encoding.
+</p>
</body>
</html>

View File

@@ -0,0 +1,180 @@
package org.apache.lucene.spatial;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.context.simple.SimpleSpatialContext;
import com.spatial4j.core.shape.Shape;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.spatial.query.SpatialArgs;
import org.apache.lucene.spatial.query.SpatialArgsParser;
import org.apache.lucene.spatial.query.SpatialOperation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* This class serves as example code to show how to use the Lucene spatial
* module.
*/
public class SpatialExample extends LuceneTestCase {
public static void main(String[] args) throws IOException {
new SpatialExample().test();
}
public void test() throws IOException {
init();
indexPoints();
search();
}
/**
* The Spatial4j {@link SpatialContext} is a sort of global-ish singleton
* needed by Lucene spatial. It's a facade to the rest of Spatial4j, acting
* as a factory for {@link Shape}s and provides access to reading and writing
* them from Strings.
*/
private SpatialContext ctx;//"ctx" is the conventional variable name
/**
* The Lucene spatial {@link SpatialStrategy} encapsulates an approach to
* indexing and searching shapes, and providing relevancy scores for them.
* It's a simple API to unify different approaches.
* <p />
* Note that these are initialized with a field name.
*/
private SpatialStrategy strategy;
private Directory directory;
protected void init() {
//Typical geospatial context with kilometer units.
// These can also be constructed from a factory: SpatialContextFactory
this.ctx = SimpleSpatialContext.GEO_KM;
int maxLevels = 10;//results in sub-meter precision for geohash
//TODO demo lookup by detail distance
// This can also be constructed from a factory: SpatialPrefixTreeFactory
SpatialPrefixTree grid = new GeohashPrefixTree(ctx, maxLevels);
this.strategy = new RecursivePrefixTreeStrategy(grid, "myGeoField");
this.directory = new RAMDirectory();
}
private void indexPoints() throws IOException {
IndexWriterConfig iwConfig = new IndexWriterConfig(TEST_VERSION_CURRENT,null);
IndexWriter indexWriter = new IndexWriter(directory, iwConfig);
//Spatial4j is x-y order for arguments
indexWriter.addDocument(newSampleDocument(
2, ctx.makePoint(-80.93, 33.77)));
//When parsing a string to a shape, the presence of a comma means it's y-x
// order (lon, lat)
indexWriter.addDocument(newSampleDocument(
4, ctx.readShape("-50.7693246, 60.9289094")));
indexWriter.addDocument(newSampleDocument(
20, ctx.makePoint(0.1,0.1), ctx.makePoint(0, 0)));
indexWriter.close();
}
private Document newSampleDocument(int id, Shape... shapes) {
Document doc = new Document();
doc.add(new IntField("id", id, Field.Store.YES));
//Some strategies support more than one shape in this field;
// see the javadocs of the SpatialStrategy impl for specifics.
for (Shape shape : shapes) {
for (IndexableField f : strategy.createIndexableFields(shape)) {
doc.add(f);
}
//store it too; the format is up to you
doc.add(new StoredField(strategy.getFieldName(), ctx.toString(shape)));
}
return doc;
}
private void search() throws IOException {
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
Sort idSort = new Sort(new SortField("id", SortField.Type.INT));
//--Filter by circle (<= distance from a point)
{
//Search with circle
//note: SpatialArgs can be parsed from a string
SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,
ctx.makeCircle(-80.0, 33.0, 200));//200km (since km == ctx.getDistanceUnits())
Filter filter = strategy.makeFilter(args);
TopDocs docs = indexSearcher.search(new MatchAllDocsQuery(), filter, 10, idSort);
assertDocMatchedIds(indexSearcher, docs, 2);
}
//--Match all, order by distance
{
SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,//doesn't matter
ctx.makePoint(60, -50));
ValueSource valueSource = strategy.makeValueSource(args);//the distance
Sort reverseDistSort = new Sort(valueSource.getSortField(false)).rewrite(indexSearcher);//true=asc dist
TopDocs docs = indexSearcher.search(new MatchAllDocsQuery(), 10, reverseDistSort);
assertDocMatchedIds(indexSearcher, docs, 4, 20, 2);
}
//demo arg parsing
{
SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,
ctx.makeCircle(-80.0, 33.0, 200));
SpatialArgs args2 = new SpatialArgsParser().parse("Intersects(Circle(33,-80 d=200))", ctx);
assertEquals(args.toString(),args2.toString());
}
indexReader.close();
}
private void assertDocMatchedIds(IndexSearcher indexSearcher, TopDocs docs, int... ids) throws IOException {
int[] gotIds = new int[docs.totalHits];
for (int i = 0; i < gotIds.length; i++) {
gotIds[i] = indexSearcher.doc(docs.scoreDocs[i].doc).getField("id").numericValue().intValue();
}
assertArrayEquals(ids,gotIds);
}
}

View File

@@ -18,16 +18,24 @@ package org.apache.lucene.codecs.asserting;
*/
import java.io.IOException;
+import java.util.Comparator;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.index.AssertingAtomicReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.OpenBitSet;
/**
* Just like {@link Lucene40PostingsFormat} but with additional asserts.
@@ -39,10 +47,9 @@ public class AssertingPostingsFormat extends PostingsFormat {
super("Asserting");
}
- // TODO: we could add some useful checks here?
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
- return in.fieldsConsumer(state);
+ return new AssertingFieldsConsumer(in.fieldsConsumer(state));
}
@Override
@@ -85,4 +92,164 @@
return in.getUniqueTermCount();
}
}
static class AssertingFieldsConsumer extends FieldsConsumer {
private final FieldsConsumer in;
AssertingFieldsConsumer(FieldsConsumer in) {
this.in = in;
}
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
TermsConsumer consumer = in.addField(field);
assert consumer != null;
return new AssertingTermsConsumer(consumer, field);
}
@Override
public void close() throws IOException {
in.close();
}
}
static enum TermsConsumerState { INITIAL, START, FINISHED };
static class AssertingTermsConsumer extends TermsConsumer {
private final TermsConsumer in;
private final FieldInfo fieldInfo;
private BytesRef lastTerm = null;
private TermsConsumerState state = TermsConsumerState.INITIAL;
private AssertingPostingsConsumer lastPostingsConsumer = null;
private long sumTotalTermFreq = 0;
private long sumDocFreq = 0;
private OpenBitSet visitedDocs = new OpenBitSet();
AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) {
this.in = in;
this.fieldInfo = fieldInfo;
}
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
state = TermsConsumerState.START;
assert lastTerm == null || in.getComparator().compare(text, lastTerm) > 0;
lastTerm = BytesRef.deepCopyOf(text);
return lastPostingsConsumer = new AssertingPostingsConsumer(in.startTerm(text), fieldInfo, visitedDocs);
}
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
assert state == TermsConsumerState.START;
state = TermsConsumerState.INITIAL;
assert text.equals(lastTerm);
assert stats.docFreq > 0; // otherwise, this method should not be called.
assert stats.docFreq == lastPostingsConsumer.docFreq;
sumDocFreq += stats.docFreq;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
assert stats.totalTermFreq == -1;
} else {
assert stats.totalTermFreq == lastPostingsConsumer.totalTermFreq;
sumTotalTermFreq += stats.totalTermFreq;
}
in.finishTerm(text, stats);
}
@Override
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
assert state == TermsConsumerState.INITIAL || state == TermsConsumerState.START && lastPostingsConsumer.docFreq == 0;
state = TermsConsumerState.FINISHED;
assert docCount >= 0;
assert docCount == visitedDocs.cardinality();
assert sumDocFreq >= docCount;
assert sumDocFreq == this.sumDocFreq;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
assert sumTotalTermFreq == -1;
} else {
assert sumTotalTermFreq >= sumDocFreq;
assert sumTotalTermFreq == this.sumTotalTermFreq;
}
in.finish(sumTotalTermFreq, sumDocFreq, docCount);
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return in.getComparator();
}
}
static enum PostingsConsumerState { INITIAL, START };
static class AssertingPostingsConsumer extends PostingsConsumer {
private final PostingsConsumer in;
private final FieldInfo fieldInfo;
private final OpenBitSet visitedDocs;
private PostingsConsumerState state = PostingsConsumerState.INITIAL;
private int freq;
private int positionCount;
private int lastPosition = 0;
private int lastStartOffset = 0;
int docFreq = 0;
long totalTermFreq = 0;
AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
this.in = in;
this.fieldInfo = fieldInfo;
this.visitedDocs = visitedDocs;
}
@Override
public void startDoc(int docID, int freq) throws IOException {
assert state == PostingsConsumerState.INITIAL;
state = PostingsConsumerState.START;
assert docID >= 0;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY) {
assert freq == -1;
this.freq = 0; // we don't expect any positions here
} else {
assert freq > 0;
this.freq = freq;
totalTermFreq += freq;
}
this.positionCount = 0;
this.lastPosition = 0;
this.lastStartOffset = 0;
docFreq++;
visitedDocs.set(docID);
in.startDoc(docID, freq);
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
assert state == PostingsConsumerState.START;
assert positionCount < freq;
positionCount++;
assert position >= lastPosition || position == -1; /* we still allow -1 from old 3.x indexes */
lastPosition = position;
if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
assert startOffset >= 0;
assert startOffset >= lastStartOffset;
lastStartOffset = startOffset;
assert endOffset >= startOffset;
} else {
assert startOffset == -1;
assert endOffset == -1;
}
if (payload != null) {
assert fieldInfo.hasPayloads();
}
in.addPosition(position, payload, startOffset, endOffset);
}
@Override
public void finishDoc() throws IOException {
assert state == PostingsConsumerState.START;
state = PostingsConsumerState.INITIAL;
if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
assert positionCount == 0; // we should not have fed any positions!
} else {
assert positionCount == freq;
}
in.finishDoc();
}
}
}

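All of the checks in AssertingPostingsFormat are plain Java asserts, so they
only fire when assertions are enabled (java -ea), as the Lucene test runner
does. A hypothetical wiring sketch, not part of this commit, showing how a
test might route every field through the asserting format; it assumes
Lucene40Codec's per-field override hook:

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.index.IndexWriterConfig;

class AssertingSetupSketch {
  static IndexWriterConfig withAsserts(IndexWriterConfig iwc) {
    final PostingsFormat asserting = new AssertingPostingsFormat();
    Codec codec = new Lucene40Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return asserting; // every field's postings now run the extra asserts
      }
    };
    return iwc.setCodec(codec); // flushes and merges go through the wrapper
  }
}
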
View File

@@ -131,6 +131,8 @@ Bug Fixes
* SOLR-3663: There are a couple of bugs in the sync process when a leader goes down and a
new leader is elected. (Mark Miller)
+* SOLR-3623: Fixed inconsistent treatment of third-party dependencies for
+ solr contribs analysis-extras & uima (hossman)
Other Changes
----------------------

View File

@@ -70,21 +70,32 @@
-->
<property name="solr.spec.version" value="5.0.0.${dateversion}" />
+<path id="solr.lucene.libs">
+ <!-- List of jars that will be used as the foundation for both
+ the base classpath, as well as copied into the lucene-libs dir
+ in the release.
+ -->
+ <!-- NOTE: lucene-core is explicitly not included because of the
+ base.classpath (compilation & tests are done directly against
+ the class files w/o needing to build the jar)
+ -->
+ <pathelement location="${analyzers-common.jar}"/>
+ <pathelement location="${analyzers-kuromoji.jar}"/>
+ <pathelement location="${analyzers-phonetic.jar}"/>
+ <pathelement location="${highlighter.jar}"/>
+ <pathelement location="${memory.jar}"/>
+ <pathelement location="${misc.jar}"/>
+ <pathelement location="${spatial.jar}"/>
+ <pathelement location="${suggest.jar}"/>
+ <pathelement location="${grouping.jar}"/>
+ <pathelement location="${queries.jar}"/>
+ <pathelement location="${queryparser.jar}"/>
+</path>
<path id="solr.base.classpath">
- <pathelement path="${analyzers-common.jar}"/>
- <pathelement path="${analyzers-kuromoji.jar}"/>
- <pathelement path="${analyzers-phonetic.jar}"/>
- <pathelement path="${analyzers-uima.jar}"/>
- <pathelement path="${highlighter.jar}"/>
- <pathelement path="${memory.jar}"/>
- <pathelement path="${misc.jar}"/>
- <pathelement path="${spatial.jar}"/>
- <pathelement path="${suggest.jar}"/>
- <pathelement path="${grouping.jar}"/>
- <pathelement path="${queries.jar}"/>
- <pathelement path="${queryparser.jar}"/>
<pathelement location="${common-solr.dir}/build/solr-solrj/classes/java"/>
<pathelement location="${common-solr.dir}/build/solr-core/classes/java"/>
+ <path refid="solr.lucene.libs" />
<path refid="additional.dependencies"/>
<path refid="base.classpath"/>
</path>
@@ -125,7 +136,7 @@
</target>
<target name="prep-lucene-jars"
- depends="jar-lucene-core, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-analyzers-morfologik, jar-suggest, jar-highlighter, jar-memory,
+ depends="jar-lucene-core, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-suggest, jar-highlighter, jar-memory,
jar-misc, jar-spatial, jar-grouping, jar-queries, jar-queryparser">
<property name="solr.deps.compiled" value="true"/>
</target>
@@ -137,19 +148,11 @@
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<copy todir="${lucene-libs}" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
+ <path refid="solr.lucene.libs" />
+ <!-- NOTE: lucene-core is not already included in "solr.lucene.libs"
+ because of its use in classpaths.
+ -->
<fileset file="${lucene-core.jar}" />
- <fileset file="${analyzers-common.jar}" />
- <fileset file="${analyzers-kuromoji.jar}" />
- <fileset file="${analyzers-phonetic.jar}" />
- <fileset file="${suggest.jar}" />
- <fileset file="${grouping.jar}" />
- <fileset file="${queries.jar}" />
- <fileset file="${queryparser.jar}" />
- <fileset file="${highlighter.jar}" />
- <fileset file="${memory.jar}" />
- <fileset file="${misc.jar}" />
- <fileset file="${spatial.jar}" />
- <fileset refid="analyzers-morfologik.fileset" />
</copy>
</sequential>
</target>

View File

@@ -9,8 +9,11 @@ Relies upon the following lucene components (in lucene-libs/):
* lucene-analyzers-icu-X.Y.jar
+* lucene-analyzers-morfologik-X.Y.jar
* lucene-analyzers-smartcn-X.Y.jar
* lucene-analyzers-stempel-X.Y.jar
-And the ICU library (in lib/):
+And the following third-party library (in lib/):
* icu4j-X.Y.jar
+* morfologik-*.jar

View File

@@ -24,13 +24,17 @@
</description>
<import file="../contrib-build.xml"/>
+<path id="analysis.extras.lucene.libs">
+ <pathelement location="${analyzers-icu.jar}"/>
+ <pathelement location="${analyzers-smartcn.jar}"/>
+ <pathelement location="${analyzers-stempel.jar}"/>
+ <pathelement location="${analyzers-morfologik.jar}"/>
+</path>
<path id="classpath">
- <fileset dir="lib" excludes="${common.classpath.excludes}"/>
- <pathelement path="${analyzers-icu.jar}"/>
- <pathelement path="${analyzers-smartcn.jar}"/>
- <pathelement path="${analyzers-stempel.jar}"/>
- <fileset refid="analyzers-morfologik.fileset" />
+ <fileset dir="lib" excludes="${common.classpath.excludes}"/>
+ <path refid="analysis.extras.lucene.libs" />
<path refid="solr.base.classpath"/>
</path>
@@ -38,10 +42,7 @@
depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, jar-analyzers-morfologik">
<mkdir dir="${build.dir}/lucene-libs"/>
<copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
- <fileset file="${analyzers-icu.jar}"/>
- <fileset file="${analyzers-smartcn.jar}"/>
- <fileset file="${analyzers-stempel.jar}"/>
- <fileset refid="analyzers-morfologik.fileset" />
+ <path refid="analysis.extras.lucene.libs" />
</copy>
</target>

View File

@@ -20,6 +20,9 @@
<info organisation="org.apache.solr" module="analysis-extras"/>
<dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
+ <dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>

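The three morfologik-* artifacts added above back the analyzers-morfologik
module that this change wires into analysis-extras. A hypothetical sketch of
the analyzer they enable; the single-argument MorfologikAnalyzer constructor
(defaulting to the Polish dictionary from morfologik-polish.jar) is an
assumption, not something this diff shows:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
import org.apache.lucene.util.Version;

class PolishAnalysisSketch {
  // Assumption: MorfologikAnalyzer(Version) exists and loads the default
  // Polish stemming dictionary shipped in morfologik-polish.jar.
  static Analyzer newPolishAnalyzer() {
    return new MorfologikAnalyzer(Version.LUCENE_40);
  }
}
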
View File

@@ -0,0 +1 @@
d1f729cd3019e6d86485226202f84458141a5688

View File

@@ -0,0 +1,29 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,2 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).

View File

@@ -0,0 +1 @@
8217b6f7ad018ceda0e824b2e60340000da4397a

View File

@@ -0,0 +1,62 @@
BSD-licensed dictionary of Polish (Morfologik)
Copyright (c) 2012, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
BSD-licensed dictionary of Polish (SGJP)
http://sgjp.pl/morfeusz/
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
Marcin Woliński, Robert Wołosz
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,6 @@
This product includes data from BSD-licensed dictionary of Polish (Morfologik)
(http://morfologik.blogspot.com/)
This product includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)

View File

@@ -0,0 +1 @@
c4ead57b78fa71b00553ff21da6fb5a326e914e8

View File

@@ -0,0 +1,29 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,2 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).

View File

@@ -6,6 +6,7 @@ To start using Solr UIMA Metadata Extraction Library you should go through the f
or set <lib/> tags in solrconfig.xml appropriately to point those jar files.
<lib dir="../../contrib/uima/lib" />
<lib dir="../../contrib/uima/lucene-libs" />
+<lib dir="../../dist/" regex="apache-solr-uima-\d.*\.jar" />
2. modify your schema.xml adding the fields you want to be hold metadata specifying proper values for type, indexed, stored and multiValued options:

View File

@@ -191,7 +191,7 @@ public class LeaderElectionIntegrationTest extends SolrTestCaseJ4 {
int newLeaderPort = getLeaderPort(leader);
int retry = 0;
while (leaderPort == newLeaderPort) {
- if (retry++ == 20) {
+ if (retry++ == 60) {
break;
}
Thread.sleep(1000);