merge trunk (1364720-1364799)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1364800 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-23 20:57:31 +00:00
commit 33f6da286e
26 changed files with 620 additions and 63 deletions

View File

@ -896,7 +896,7 @@ public class BlockTreeTermsWriter extends FieldsConsumer {
// w.close(); // w.close();
// } // }
} else { } else {
assert sumTotalTermFreq == 0; assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY && sumTotalTermFreq == -1;
assert sumDocFreq == 0; assert sumDocFreq == 0;
assert docCount == 0; assert docCount == 0;
} }

View File

@ -49,14 +49,17 @@ import org.apache.lucene.util.FixedBitSet;
*/ */
public abstract class PostingsConsumer { public abstract class PostingsConsumer {
/** Adds a new doc in this term. */ /** Adds a new doc in this term.
* <code>freq</code> will be -1 when term frequencies are omitted
* for the field. */
public abstract void startDoc(int docID, int freq) throws IOException; public abstract void startDoc(int docID, int freq) throws IOException;
/** Add a new position & payload, and start/end offset. A /** Add a new position & payload, and start/end offset. A
* null payload means no payload; a non-null payload with * null payload means no payload; a non-null payload with
* zero length also means no payload. Caller may reuse * zero length also means no payload. Caller may reuse
* the {@link BytesRef} for the payload between calls * the {@link BytesRef} for the payload between calls
* (method must fully consume the payload). */ * (method must fully consume the payload). <code>startOffset</code>
* and <code>endOffset</code> will be -1 when offsets are not indexed. */
public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException; public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;
/** Called when we are done adding positions & payloads /** Called when we are done adding positions & payloads
@ -78,7 +81,7 @@ public abstract class PostingsConsumer {
break; break;
} }
visitedDocs.set(doc); visitedDocs.set(doc);
this.startDoc(doc, 0); this.startDoc(doc, -1);
this.finishDoc(); this.finishDoc();
df++; df++;
} }
@ -146,6 +149,6 @@ public abstract class PostingsConsumer {
df++; df++;
} }
} }
return new TermStats(df, totTF); return new TermStats(df, indexOptions == IndexOptions.DOCS_ONLY ? -1 : totTF);
} }
} }

View File

@ -57,10 +57,14 @@ public abstract class TermsConsumer {
* no docs. */ * no docs. */
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException; public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0. */ /** Finishes the current term; numDocs must be > 0.
* <code>stats.totalTermFreq</code> will be -1 when term
* frequencies are omitted for the field. */
public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException; public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */ /** Called when we are done adding terms to this field.
* <code>sumTotalTermFreq</code> will be -1 when term
* frequencies are omitted for the field. */
public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException; public abstract void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException;
/** Return the BytesRef Comparator used to sort terms /** Return the BytesRef Comparator used to sort terms
@ -205,6 +209,6 @@ public abstract class TermsConsumer {
} }
} }
} }
finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); finish(indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality());
} }
} }

View File

@ -430,7 +430,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
if (readTermFreq) { if (readTermFreq) {
termDocFreq = postings.docFreqs[termID]; termDocFreq = postings.docFreqs[termID];
} else { } else {
termDocFreq = 0; termDocFreq = -1;
} }
postings.lastDocCodes[termID] = -1; postings.lastDocCodes[termID] = -1;
} else { } else {
@ -441,7 +441,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final int code = freq.readVInt(); final int code = freq.readVInt();
if (!readTermFreq) { if (!readTermFreq) {
docID += code; docID += code;
termDocFreq = 0; termDocFreq = -1;
} else { } else {
docID += code >>> 1; docID += code >>> 1;
if ((code & 1) != 0) { if ((code & 1) != 0) {
@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// 2nd sweep does the real flush, but I suspect // 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush. // that'd add too much time to flush.
visitedDocs.set(docID); visitedDocs.set(docID);
postingsConsumer.startDoc(docID, termDocFreq); postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
if (docID < delDocLimit) { if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip // Mark it deleted. TODO: we could also skip
// writing its postings; this would be // writing its postings; this would be
@ -542,11 +542,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
} }
postingsConsumer.finishDoc(); postingsConsumer.finishDoc();
} }
termsConsumer.finishTerm(text, new TermStats(numDocs, totTF)); termsConsumer.finishTerm(text, new TermStats(numDocs, writeTermFreq ? totTF : -1));
sumTotalTermFreq += totTF; sumTotalTermFreq += totTF;
sumDocFreq += numDocs; sumDocFreq += numDocs;
} }
termsConsumer.finish(sumTotalTermFreq, sumDocFreq, visitedDocs.cardinality()); termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality());
} }
} }

View File

@ -116,7 +116,7 @@ public class TestCodecs extends LuceneTestCase {
sumDF += term.docs.length; sumDF += term.docs.length;
sumTotalTermCount += term.write(termsConsumer); sumTotalTermCount += term.write(termsConsumer);
} }
termsConsumer.finish(sumTotalTermCount, sumDF, (int) visitedDocs.cardinality()); termsConsumer.finish(omitTF ? -1 : sumTotalTermCount, sumDF, (int) visitedDocs.cardinality());
} }
} }
@ -154,7 +154,7 @@ public class TestCodecs extends LuceneTestCase {
for(int i=0;i<docs.length;i++) { for(int i=0;i<docs.length;i++) {
final int termDocFreq; final int termDocFreq;
if (field.omitTF) { if (field.omitTF) {
termDocFreq = 0; termDocFreq = -1;
} else { } else {
termDocFreq = positions[i].length; termDocFreq = positions[i].length;
} }
@ -165,10 +165,10 @@ public class TestCodecs extends LuceneTestCase {
final PositionData pos = positions[i][j]; final PositionData pos = positions[i][j];
postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1); postingsConsumer.addPosition(pos.pos, pos.payload, -1, -1);
} }
postingsConsumer.finishDoc();
} }
postingsConsumer.finishDoc();
} }
termsConsumer.finishTerm(text, new TermStats(docs.length, totTF)); termsConsumer.finishTerm(text, new TermStats(docs.length, field.omitTF ? -1 : totTF));
return totTF; return totTF;
} }
} }

View File

@ -406,7 +406,7 @@ public class TestPostingsFormat extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size()); System.out.println(" " + docCount + ": docID=" + posting.docID + " freq=" + posting.positions.size());
} }
postingsConsumer.startDoc(posting.docID, posting.positions.size()); postingsConsumer.startDoc(posting.docID, doFreq ? posting.positions.size() : -1);
seenDocs.set(posting.docID); seenDocs.set(posting.docID);
if (doPos) { if (doPos) {
totalTF += posting.positions.size(); totalTF += posting.positions.size();
@ -428,12 +428,12 @@ public class TestPostingsFormat extends LuceneTestCase {
postingsConsumer.finishDoc(); postingsConsumer.finishDoc();
docCount++; docCount++;
} }
termsConsumer.finishTerm(term, new TermStats(postings.size(), totalTF)); termsConsumer.finishTerm(term, new TermStats(postings.size(), doFreq ? totalTF : -1));
sumTotalTF += totalTF; sumTotalTF += totalTF;
sumDF += postings.size(); sumDF += postings.size();
} }
termsConsumer.finish(sumTotalTF, sumDF, seenDocs.cardinality()); termsConsumer.finish(doFreq ? sumTotalTF : -1, sumDF, seenDocs.cardinality());
} }
fieldsConsumer.close(); fieldsConsumer.close();

View File

@ -28,10 +28,26 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.spatial.query.SpatialArgs; import org.apache.lucene.spatial.query.SpatialArgs;
/** /**
* The SpatialStrategy encapsulates an approach to indexing and searching based on shapes. * The SpatialStrategy encapsulates an approach to indexing and searching based
* on shapes.
* <p/> * <p/>
* Note that a SpatialStrategy is not involved with the Lucene stored field values of shapes, which is * Different implementations will support different features. A strategy should
* immaterial to indexing & search. * document these common elements:
* <ul>
* <li>Can it index more than one shape per field?</li>
* <li>What types of shapes can be indexed?</li>
* <li>What types of query shapes can be used?</li>
* <li>What types of query operations are supported?
* This might vary per shape.</li>
* <li>Are there caches? Under what circumstances are they used?
* Roughly how big are they? Is it segmented by Lucene segments, such as is
* done by the Lucene {@link org.apache.lucene.search.FieldCache} and
* {@link org.apache.lucene.index.DocValues} (ideal) or is it for the entire
* index?</li>
* </ul>
* <p/>
* Note that a SpatialStrategy is not involved with the Lucene stored field
* values of shapes, which is immaterial to indexing & search.
* <p/> * <p/>
* Thread-safe. * Thread-safe.
* *

View File

@ -16,8 +16,49 @@
--> -->
<html> <html>
<head> <head>
<title>Apache Lucene Spatial Strategies</title> <title>Apache Lucene Spatial Module</title>
</head> </head>
<body> <body>
<h1>The Spatial Module for Apache Lucene</h1>
<p>
The spatial module is new in Lucene 4, replacing the old contrib module
that came before it. The principal interface to the module is
a {@link org.apache.lucene.spatial.SpatialStrategy}
which encapsulates an approach to indexing and searching
based on shapes. Different Strategies have different features and
performance profiles, which are documented at each Strategy class level.
</p>
<p>
For some sample code showing how to use the API, see SpatialExample.java in
the tests.
</p>
<p>
The spatial module uses
<a href="https://github.com/spatial4j/spatial4j">Spatial4j</a>
heavily. Spatial4j is an ASL licensed library with these capabilities:
<ul>
<li>Provides shape implementations, namely point, rectangle,
and circle. Both geospatial contexts and plain 2D Euclidean/Cartesian contexts
are supported.
With an additional dependency, it adds polygon and other geometry shape
support via integration with
<a href="http://sourceforge.net/projects/jts-topo-suite/">JTS Topology Suite</a>.
This includes dateline wrap support.</li>
<li>Shape parsing and serialization, including
<a href="http://en.wikipedia.org/wiki/Well-known_text">Well-Known Text (WKT)</a>
(via JTS).</li>
<li>Distance and other spatial related math calculations.</li>
</ul>
</p>
<p>
Historical note: The new spatial module was once known as
Lucene Spatial Playground (LSP) as an external project. In ~March 2012, LSP
split into this new module as part of Lucene and Spatial4j externally. A
large chunk of the LSP implementation originated as SOLR-2155 which uses
trie/prefix-tree algorithms with a geohash encoding.
</p>
</body> </body>
</html> </html>

View File

@ -0,0 +1,180 @@
package org.apache.lucene.spatial;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.context.simple.SimpleSpatialContext;
import com.spatial4j.core.shape.Shape;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.spatial.query.SpatialArgs;
import org.apache.lucene.spatial.query.SpatialArgsParser;
import org.apache.lucene.spatial.query.SpatialOperation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import java.io.IOException;
/**
* This class serves as example code to show how to use the Lucene spatial
* module.
*/
public class SpatialExample extends LuceneTestCase {

  /** Convenience entry point that simply invokes {@link #test()}. */
  public static void main(String[] args) throws IOException {
    new SpatialExample().test();
  }

  /** Runs the three stages of the example in order: set-up, indexing, searching. */
  public void test() throws IOException {
    init();
    indexPoints();
    search();
  }

  /**
   * The Spatial4j {@link SpatialContext} is a sort of global-ish singleton
   * needed by Lucene spatial.  It's a facade to the rest of Spatial4j, acting
   * as a factory for {@link Shape}s and provides access to reading and writing
   * them from Strings.
   */
  private SpatialContext ctx;//"ctx" is the conventional variable name

  /**
   * The Lucene spatial {@link SpatialStrategy} encapsulates an approach to
   * indexing and searching shapes, and providing relevancy scores for them.
   * It's a simple API to unify different approaches.
   * <p />
   * Note that these are initialized with a field name.
   */
  private SpatialStrategy strategy;

  /** Index storage shared by {@link #indexPoints()} and {@link #search()}. */
  private Directory directory;

  /** Creates the spatial context, the strategy and the directory used below. */
  protected void init() {
    //Typical geospatial context with kilometer units.
    //  These can also be constructed from a factory: SpatialContextFactory
    this.ctx = SimpleSpatialContext.GEO_KM;

    int maxLevels = 10;//results in sub-meter precision for geohash
    //TODO demo lookup by detail distance
    //  This can also be constructed from a factory: SpatialPrefixTreeFactory
    SpatialPrefixTree grid = new GeohashPrefixTree(ctx, maxLevels);

    this.strategy = new RecursivePrefixTreeStrategy(grid, "myGeoField");

    this.directory = new RAMDirectory();
  }

  /** Indexes three sample documents (ids 2, 4 and 20) into {@link #directory}. */
  private void indexPoints() throws IOException {
    IndexWriterConfig iwConfig = new IndexWriterConfig(TEST_VERSION_CURRENT,null);
    IndexWriter indexWriter = new IndexWriter(directory, iwConfig);

    //Spatial4j is x-y order for arguments
    indexWriter.addDocument(newSampleDocument(
        2, ctx.makePoint(-80.93, 33.77)));

    //When parsing a string to a shape, the presence of a comma means it's y-x
    // order (lat, lon)
    indexWriter.addDocument(newSampleDocument(
        4, ctx.readShape("-50.7693246, 60.9289094")));

    indexWriter.addDocument(newSampleDocument(
        20, ctx.makePoint(0.1,0.1), ctx.makePoint(0, 0)));

    indexWriter.close();
  }

  /**
   * Builds a document holding a stored "id" field plus, for every given shape,
   * the strategy's indexable fields and a stored string form of the shape.
   */
  private Document newSampleDocument(int id, Shape... shapes) {
    Document doc = new Document();
    doc.add(new IntField("id", id, Field.Store.YES));
    //Potentially more than one shape in this field is supported by some
    // strategies; see the javadocs of the SpatialStrategy impl to see.
    for (Shape shape : shapes) {
      for (IndexableField f : strategy.createIndexableFields(shape)) {
        doc.add(f);
      }
      //store it too; the format is up to you
      doc.add(new StoredField(strategy.getFieldName(), ctx.toString(shape)));
    }

    return doc;
  }

  /**
   * Demonstrates filtering by a circle, sorting by distance, and parsing
   * {@link SpatialArgs} from a string.
   */
  private void search() throws IOException {
    IndexReader indexReader = DirectoryReader.open(directory);
    try {
      IndexSearcher indexSearcher = new IndexSearcher(indexReader);
      Sort idSort = new Sort(new SortField("id", SortField.Type.INT));

      //--Filter by circle (<= distance from a point)
      {
        //Search with circle
        //note: SpatialArgs can be parsed from a string
        SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,
            ctx.makeCircle(-80.0, 33.0, 200));//200km (since km == ctx.getDistanceUnits)
        Filter filter = strategy.makeFilter(args);
        TopDocs docs = indexSearcher.search(new MatchAllDocsQuery(), filter, 10, idSort);
        assertDocMatchedIds(indexSearcher, docs, 2);
      }

      //--Match all, order by distance
      {
        SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,//doesn't matter
            ctx.makePoint(60, -50));
        ValueSource valueSource = strategy.makeValueSource(args);//the distance
        Sort distSort = new Sort(valueSource.getSortField(false)).rewrite(indexSearcher);//false=asc dist
        TopDocs docs = indexSearcher.search(new MatchAllDocsQuery(), 10, distSort);
        assertDocMatchedIds(indexSearcher, docs, 4, 20, 2);
      }

      //demo arg parsing
      {
        SpatialArgs args = new SpatialArgs(SpatialOperation.Intersects,
            ctx.makeCircle(-80.0, 33.0, 200));
        SpatialArgs args2 = new SpatialArgsParser().parse("Intersects(Circle(33,-80 d=200))", ctx);
        assertEquals(args.toString(),args2.toString());
      }
    } finally {
      //release the reader even when one of the assertions above fails
      indexReader.close();
    }
  }

  /**
   * Asserts that the "id" field values of the returned hits equal
   * <code>ids</code>, in order.
   */
  private void assertDocMatchedIds(IndexSearcher indexSearcher, TopDocs docs, int... ids) throws IOException {
    //Size by the hits actually returned: totalHits counts every match and can
    // exceed scoreDocs.length when the search was capped at the top N.
    int[] gotIds = new int[docs.scoreDocs.length];
    for (int i = 0; i < gotIds.length; i++) {
      gotIds[i] = indexSearcher.doc(docs.scoreDocs[i].doc).getField("id").numericValue().intValue();
    }
    assertArrayEquals(ids,gotIds);
  }

}

View File

@ -18,16 +18,24 @@ package org.apache.lucene.codecs.asserting;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.index.AssertingAtomicReader; import org.apache.lucene.index.AssertingAtomicReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
/** /**
* Just like {@link Lucene40PostingsFormat} but with additional asserts. * Just like {@link Lucene40PostingsFormat} but with additional asserts.
@ -39,10 +47,9 @@ public class AssertingPostingsFormat extends PostingsFormat {
super("Asserting"); super("Asserting");
} }
// TODO: we could add some useful checks here?
@Override @Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return in.fieldsConsumer(state); return new AssertingFieldsConsumer(in.fieldsConsumer(state));
} }
@Override @Override
@ -85,4 +92,164 @@ public class AssertingPostingsFormat extends PostingsFormat {
return in.getUniqueTermCount(); return in.getUniqueTermCount();
} }
} }
/**
 * Wraps another {@link FieldsConsumer} so that every {@link TermsConsumer}
 * it hands out is itself wrapped in an {@link AssertingTermsConsumer}.
 */
static class AssertingFieldsConsumer extends FieldsConsumer {
  /** The real consumer that all calls are forwarded to. */
  private final FieldsConsumer delegate;

  AssertingFieldsConsumer(FieldsConsumer in) {
    delegate = in;
  }

  @Override
  public TermsConsumer addField(FieldInfo field) throws IOException {
    final TermsConsumer wrapped = delegate.addField(field);
    // the wrapped consumer must hand back a usable per-field consumer
    assert wrapped != null;
    return new AssertingTermsConsumer(wrapped, field);
  }

  @Override
  public void close() throws IOException {
    delegate.close();
  }
}
/**
 * Call-sequence states tracked by AssertingTermsConsumer: INITIAL before the
 * first term and after each finishTerm, START after startTerm, FINISHED after
 * finish.
 */
static enum TermsConsumerState { INITIAL, START, FINISHED };
/**
 * Wraps a {@link TermsConsumer} and asserts that its methods are called in a
 * legal sequence, that terms arrive in increasing comparator order, and that
 * the statistics passed to {@link #finishTerm} and {@link #finish} agree with
 * what was actually fed through the postings consumers.
 */
static class AssertingTermsConsumer extends TermsConsumer {
  /** The real consumer that all calls are forwarded to. */
  private final TermsConsumer delegate;
  private final FieldInfo fieldInfo;
  /** Every docID started for any term of this field. */
  private final OpenBitSet visitedDocs = new OpenBitSet();

  private TermsConsumerState state = TermsConsumerState.INITIAL;
  /** Private copy of the most recently started term, or null before the first one. */
  private BytesRef previousTerm = null;
  /** Postings consumer returned by the most recent {@link #startTerm} call. */
  private AssertingPostingsConsumer currentPostings = null;
  /** Totals accumulated from the stats passed to {@link #finishTerm}. */
  private long runningSumTotalTermFreq = 0;
  private long runningSumDocFreq = 0;

  AssertingTermsConsumer(TermsConsumer in, FieldInfo fieldInfo) {
    this.delegate = in;
    this.fieldInfo = fieldInfo;
  }

  @Override
  public PostingsConsumer startTerm(BytesRef text) throws IOException {
    // a started term may be left unfinished only if no doc was added to it
    assert state == TermsConsumerState.INITIAL
        || (state == TermsConsumerState.START && currentPostings.docFreq == 0);
    state = TermsConsumerState.START;
    // terms must be strictly increasing
    assert previousTerm == null || delegate.getComparator().compare(text, previousTerm) > 0;
    previousTerm = BytesRef.deepCopyOf(text);
    currentPostings = new AssertingPostingsConsumer(delegate.startTerm(text), fieldInfo, visitedDocs);
    return currentPostings;
  }

  @Override
  public void finishTerm(BytesRef text, TermStats stats) throws IOException {
    assert state == TermsConsumerState.START;
    state = TermsConsumerState.INITIAL;
    assert text.equals(previousTerm);
    assert stats.docFreq > 0; // otherwise, this method should not be called.
    assert stats.docFreq == currentPostings.docFreq;
    runningSumDocFreq += stats.docFreq;
    final boolean freqsOmitted = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY;
    if (!freqsOmitted) {
      assert stats.totalTermFreq == currentPostings.totalTermFreq;
      runningSumTotalTermFreq += stats.totalTermFreq;
    } else {
      assert stats.totalTermFreq == -1;
    }
    delegate.finishTerm(text, stats);
  }

  @Override
  public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
    assert state == TermsConsumerState.INITIAL
        || (state == TermsConsumerState.START && currentPostings.docFreq == 0);
    state = TermsConsumerState.FINISHED;
    assert docCount >= 0;
    assert docCount == visitedDocs.cardinality();
    assert sumDocFreq >= docCount;
    assert sumDocFreq == runningSumDocFreq;
    final boolean freqsOmitted = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY;
    if (!freqsOmitted) {
      assert sumTotalTermFreq >= sumDocFreq;
      assert sumTotalTermFreq == runningSumTotalTermFreq;
    } else {
      assert sumTotalTermFreq == -1;
    }
    delegate.finish(sumTotalTermFreq, sumDocFreq, docCount);
  }

  @Override
  public Comparator<BytesRef> getComparator() throws IOException {
    return delegate.getComparator();
  }
}
/**
 * Call-sequence states tracked by AssertingPostingsConsumer: INITIAL between
 * documents, START between startDoc and finishDoc.
 */
static enum PostingsConsumerState { INITIAL, START };
/**
 * Wraps a {@link PostingsConsumer} and asserts that documents and positions
 * are fed in a legal sequence and that the frequency, position, offset and
 * payload arguments agree with the field's {@link IndexOptions}.
 */
static class AssertingPostingsConsumer extends PostingsConsumer {
  /** The real consumer that all calls are forwarded to. */
  private final PostingsConsumer delegate;
  private final FieldInfo fieldInfo;
  /** Bit set supplied by the creator; each started docID is recorded in it. */
  private final OpenBitSet visitedDocs;

  private PostingsConsumerState state = PostingsConsumerState.INITIAL;
  /** Upper bound on positions for the current doc (0 when freqs are omitted). */
  private int expectedPositions;
  /** Positions added so far for the current doc. */
  private int positionsSeen;
  private int prevPosition = 0;
  private int prevStartOffset = 0;
  /** Number of docs started for this term; read by AssertingTermsConsumer. */
  int docFreq = 0;
  /** Sum of freq over the docs of this term; read by AssertingTermsConsumer. */
  long totalTermFreq = 0;

  AssertingPostingsConsumer(PostingsConsumer in, FieldInfo fieldInfo, OpenBitSet visitedDocs) {
    this.delegate = in;
    this.fieldInfo = fieldInfo;
    this.visitedDocs = visitedDocs;
  }

  @Override
  public void startDoc(int docID, int freq) throws IOException {
    assert state == PostingsConsumerState.INITIAL;
    state = PostingsConsumerState.START;
    assert docID >= 0;
    final boolean freqsOmitted = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY;
    if (!freqsOmitted) {
      assert freq > 0;
      expectedPositions = freq;
      totalTermFreq += freq;
    } else {
      assert freq == -1;
      expectedPositions = 0; // we don't expect any positions here
    }
    // reset the per-document trackers
    positionsSeen = 0;
    prevPosition = 0;
    prevStartOffset = 0;
    docFreq++;
    visitedDocs.set(docID);
    delegate.startDoc(docID, freq);
  }

  @Override
  public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
    assert state == PostingsConsumerState.START;
    assert positionsSeen < expectedPositions;
    positionsSeen++;
    assert position >= prevPosition || position == -1; /* we still allow -1 from old 3.x indexes */
    prevPosition = position;
    final boolean offsetsIndexed =
        fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    if (!offsetsIndexed) {
      assert startOffset == -1;
      assert endOffset == -1;
    } else {
      assert startOffset >= 0;
      assert startOffset >= prevStartOffset;
      prevStartOffset = startOffset;
      assert endOffset >= startOffset;
    }
    // a payload may only be supplied when the field has payloads
    assert payload == null || fieldInfo.hasPayloads();
    delegate.addPosition(position, payload, startOffset, endOffset);
  }

  @Override
  public void finishDoc() throws IOException {
    assert state == PostingsConsumerState.START;
    state = PostingsConsumerState.INITIAL;
    final boolean positionsIndexed =
        fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    if (positionsIndexed) {
      assert positionsSeen == expectedPositions;
    } else {
      assert positionsSeen == 0; // we should not have fed any positions!
    }
    delegate.finishDoc();
  }
}
} }

View File

@ -131,6 +131,8 @@ Bug Fixes
* SOLR-3663: There are a couple of bugs in the sync process when a leader goes down and a * SOLR-3663: There are a couple of bugs in the sync process when a leader goes down and a
new leader is elected. (Mark Miller) new leader is elected. (Mark Miller)
* SOLR-3623: Fixed inconsistent treatment of third-party dependencies for
solr contribs analysis-extras & uima (hossman)
Other Changes Other Changes
---------------------- ----------------------

View File

@ -70,21 +70,32 @@
--> -->
<property name="solr.spec.version" value="5.0.0.${dateversion}" /> <property name="solr.spec.version" value="5.0.0.${dateversion}" />
<path id="solr.lucene.libs">
<!-- List of jars that will be used as the foundation for both
the base classpath, as well as copied into the lucene-libs dir
in the release.
-->
<!-- NOTE: lucene-core is explicitly not included because of the
base.classpath (compilation & tests are done directly against
the class files w/o needing to build the jar)
-->
<pathelement location="${analyzers-common.jar}"/>
<pathelement location="${analyzers-kuromoji.jar}"/>
<pathelement location="${analyzers-phonetic.jar}"/>
<pathelement location="${highlighter.jar}"/>
<pathelement location="${memory.jar}"/>
<pathelement location="${misc.jar}"/>
<pathelement location="${spatial.jar}"/>
<pathelement location="${suggest.jar}"/>
<pathelement location="${grouping.jar}"/>
<pathelement location="${queries.jar}"/>
<pathelement location="${queryparser.jar}"/>
</path>
<path id="solr.base.classpath"> <path id="solr.base.classpath">
<pathelement path="${analyzers-common.jar}"/>
<pathelement path="${analyzers-kuromoji.jar}"/>
<pathelement path="${analyzers-phonetic.jar}"/>
<pathelement path="${analyzers-uima.jar}"/>
<pathelement path="${highlighter.jar}"/>
<pathelement path="${memory.jar}"/>
<pathelement path="${misc.jar}"/>
<pathelement path="${spatial.jar}"/>
<pathelement path="${suggest.jar}"/>
<pathelement path="${grouping.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${queryparser.jar}"/>
<pathelement location="${common-solr.dir}/build/solr-solrj/classes/java"/> <pathelement location="${common-solr.dir}/build/solr-solrj/classes/java"/>
<pathelement location="${common-solr.dir}/build/solr-core/classes/java"/> <pathelement location="${common-solr.dir}/build/solr-core/classes/java"/>
<path refid="solr.lucene.libs" />
<path refid="additional.dependencies"/> <path refid="additional.dependencies"/>
<path refid="base.classpath"/> <path refid="base.classpath"/>
</path> </path>
@ -125,7 +136,7 @@
</target> </target>
<target name="prep-lucene-jars" <target name="prep-lucene-jars"
depends="jar-lucene-core, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-analyzers-morfologik, jar-suggest, jar-highlighter, jar-memory, depends="jar-lucene-core, jar-analyzers-phonetic, jar-analyzers-kuromoji, jar-suggest, jar-highlighter, jar-memory,
jar-misc, jar-spatial, jar-grouping, jar-queries, jar-queryparser"> jar-misc, jar-spatial, jar-grouping, jar-queries, jar-queryparser">
<property name="solr.deps.compiled" value="true"/> <property name="solr.deps.compiled" value="true"/>
</target> </target>
@ -137,19 +148,11 @@
<propertyset refid="uptodate.and.compiled.properties"/> <propertyset refid="uptodate.and.compiled.properties"/>
</ant> </ant>
<copy todir="${lucene-libs}" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true"> <copy todir="${lucene-libs}" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
<path refid="solr.lucene.libs" />
<!-- NOTE: lucene-core is not already included in "solr.lucene.libs"
because of its use in classpaths.
-->
<fileset file="${lucene-core.jar}" /> <fileset file="${lucene-core.jar}" />
<fileset file="${analyzers-common.jar}" />
<fileset file="${analyzers-kuromoji.jar}" />
<fileset file="${analyzers-phonetic.jar}" />
<fileset file="${suggest.jar}" />
<fileset file="${grouping.jar}" />
<fileset file="${queries.jar}" />
<fileset file="${queryparser.jar}" />
<fileset file="${highlighter.jar}" />
<fileset file="${memory.jar}" />
<fileset file="${misc.jar}" />
<fileset file="${spatial.jar}" />
<fileset refid="analyzers-morfologik.fileset" />
</copy> </copy>
</sequential> </sequential>
</target> </target>

View File

@ -9,8 +9,11 @@ Relies upon the following lucene components (in lucene-libs/):
* lucene-analyzers-icu-X.Y.jar * lucene-analyzers-icu-X.Y.jar
* lucene-analyzers-smartcn-X.Y.jar * lucene-analyzers-smartcn-X.Y.jar
* lucene-analyzers-stempel-X.Y.jar * lucene-analyzers-stempel-X.Y.jar
* lucene-analyzers-morfologik-X.Y.jar
And the ICU library (in lib/): And the following third-party library (in lib/):
* icu4j-X.Y.jar * icu4j-X.Y.jar
* morfologik-*.jar

View File

@ -25,12 +25,16 @@
<import file="../contrib-build.xml"/> <import file="../contrib-build.xml"/>
<path id="analysis.extras.lucene.libs">
<pathelement location="${analyzers-icu.jar}"/>
<pathelement location="${analyzers-smartcn.jar}"/>
<pathelement location="${analyzers-stempel.jar}"/>
<pathelement location="${analyzers-morfologik.jar}"/>
</path>
<path id="classpath"> <path id="classpath">
<fileset dir="lib" excludes="${common.classpath.excludes}"/> <fileset dir="lib" excludes="${common.classpath.excludes}"/>
<pathelement path="${analyzers-icu.jar}"/> <path refid="analysis.extras.lucene.libs" />
<pathelement path="${analyzers-smartcn.jar}"/>
<pathelement path="${analyzers-stempel.jar}"/>
<fileset refid="analyzers-morfologik.fileset" />
<path refid="solr.base.classpath"/> <path refid="solr.base.classpath"/>
</path> </path>
@ -38,10 +42,7 @@
depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, jar-analyzers-morfologik"> depends="jar-analyzers-icu, jar-analyzers-smartcn, jar-analyzers-stempel, jar-analyzers-morfologik">
<mkdir dir="${build.dir}/lucene-libs"/> <mkdir dir="${build.dir}/lucene-libs"/>
<copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true"> <copy todir="${build.dir}/lucene-libs" preservelastmodified="true" flatten="true" failonerror="true" overwrite="true">
<fileset file="${analyzers-icu.jar}"/> <path refid="analysis.extras.lucene.libs" />
<fileset file="${analyzers-smartcn.jar}"/>
<fileset file="${analyzers-stempel.jar}"/>
<fileset refid="analyzers-morfologik.fileset" />
</copy> </copy>
</target> </target>

View File

@ -20,6 +20,9 @@
<info organisation="org.apache.solr" module="analysis-extras"/> <info organisation="org.apache.solr" module="analysis-extras"/>
<dependencies> <dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/> <dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies> </dependencies>
</ivy-module> </ivy-module>

View File

@ -0,0 +1 @@
d1f729cd3019e6d86485226202f84458141a5688

View File

@ -0,0 +1,29 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,2 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).

View File

@ -0,0 +1 @@
8217b6f7ad018ceda0e824b2e60340000da4397a

View File

@ -0,0 +1,62 @@
BSD-licensed dictionary of Polish (Morfologik)
Copyright (c) 2012, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
BSD-licensed dictionary of Polish (SGJP)
http://sgjp.pl/morfeusz/
Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
Marcin Woliński, Robert Wołosz
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,6 @@
This product includes data from BSD-licensed dictionary of Polish (Morfologik)
(http://morfologik.blogspot.com/)
This product includes data from BSD-licensed dictionary of Polish (SGJP)
(http://sgjp.pl/morfeusz/)

View File

@ -0,0 +1 @@
c4ead57b78fa71b00553ff21da6fb5a326e914e8

View File

@ -0,0 +1,29 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2012 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Morfologik nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,2 @@
This product includes BSD-licensed software developed by Dawid Weiss and Marcin Miłkowski
(http://morfologik.blogspot.com/).

View File

@ -6,6 +6,7 @@ To start using Solr UIMA Metadata Extraction Library you should go through the f
or set <lib/> tags in solrconfig.xml appropriately to point to those jar files. or set <lib/> tags in solrconfig.xml appropriately to point to those jar files.
<lib dir="../../contrib/uima/lib" /> <lib dir="../../contrib/uima/lib" />
<lib dir="../../contrib/uima/lucene-libs" />
<lib dir="../../dist/" regex="apache-solr-uima-\d.*\.jar" /> <lib dir="../../dist/" regex="apache-solr-uima-\d.*\.jar" />
2. modify your schema.xml adding the fields you want to hold metadata, specifying proper values for type, indexed, stored and multiValued options: 2. modify your schema.xml adding the fields you want to hold metadata, specifying proper values for type, indexed, stored and multiValued options:

View File

@ -191,7 +191,7 @@ public class LeaderElectionIntegrationTest extends SolrTestCaseJ4 {
int newLeaderPort = getLeaderPort(leader); int newLeaderPort = getLeaderPort(leader);
int retry = 0; int retry = 0;
while (leaderPort == newLeaderPort) { while (leaderPort == newLeaderPort) {
if (retry++ == 20) { if (retry++ == 60) {
break; break;
} }
Thread.sleep(1000); Thread.sleep(1000);