mirror of https://github.com/apache/lucene.git
LUCENE-6489: Move Payload queries to queries module and PayloadSpanUtil to sandbox
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1703392 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1b01528b95
commit
b570fb0352
|
@ -90,6 +90,9 @@ API Changes
|
|||
* LUCENE-6716: SpanPayloadCheckQuery now takes a List<BytesRef> rather than
|
||||
a Collection<byte[]>. (Alan Woodward)
|
||||
|
||||
* LUCENE-6489: The various span payload queries have been moved to the queries
|
||||
submodule, and PayloadSpanUtil is now in sandbox. (Alan Woodward)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-6708: TopFieldCollector does not compute the score several times on the
|
||||
|
|
|
@ -24,9 +24,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
* The payload of a Token.
|
||||
* <p>
|
||||
* The payload is stored in the index at each position, and can
|
||||
* be used to influence scoring when using Payload-based queries
|
||||
* in the {@link org.apache.lucene.search.payloads} and
|
||||
* {@link org.apache.lucene.search.spans} packages.
|
||||
* be used to influence scoring when using Payload-based queries.
|
||||
* <p>
|
||||
* NOTE: because the payload will be stored at each position, it's usually
|
||||
* best to use the minimum number of bytes necessary. Some codec implementations
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
* <h2>Search Basics</h2>
|
||||
* <p>
|
||||
* Lucene offers a wide variety of {@link org.apache.lucene.search.Query} implementations, most of which are in
|
||||
* this package, its subpackages ({@link org.apache.lucene.search.spans spans}, {@link org.apache.lucene.search.payloads payloads}),
|
||||
* this package, its subpackage ({@link org.apache.lucene.search.spans spans}),
|
||||
* or the <a href="{@docRoot}/../queries/overview-summary.html">queries module</a>. These implementations can be combined in a wide
|
||||
* variety of ways to provide complex querying capabilities along with information about where matches took place in the document
|
||||
* collection. The <a href="#query">Query Classes</a> section below highlights some of the more important Query classes. For details
|
||||
|
|
|
@ -19,8 +19,8 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockPayloadAnalyzer;
|
||||
|
@ -38,9 +38,8 @@ import org.apache.lucene.index.PostingsEnum;
|
|||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SlowCompositeReaderWrapper;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.payloads.PayloadSpanCollector;
|
||||
import org.apache.lucene.search.payloads.PayloadSpanUtil;
|
||||
import org.apache.lucene.search.spans.MultiSpansWrapper;
|
||||
import org.apache.lucene.search.spans.SpanCollector;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
|
@ -201,6 +200,22 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
store.close();
|
||||
}
|
||||
|
||||
static class PayloadSpanCollector implements SpanCollector {
|
||||
|
||||
List<BytesRef> payloads = new ArrayList<>();
|
||||
|
||||
@Override
|
||||
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
|
||||
if (postings.getPayload() != null)
|
||||
payloads.add(BytesRef.deepCopyOf(postings.getPayload()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
payloads.clear();
|
||||
}
|
||||
}
|
||||
|
||||
public void testPayloadsPos0() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockPayloadAnalyzer());
|
||||
|
@ -248,12 +263,11 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
}
|
||||
collector.reset();
|
||||
pspans.collect(collector);
|
||||
Collection<byte[]> payloads = collector.getPayloads();
|
||||
sawZero |= pspans.startPosition() == 0;
|
||||
for (byte[] bytes : payloads) {
|
||||
for (BytesRef payload : collector.payloads) {
|
||||
count++;
|
||||
if (VERBOSE) {
|
||||
System.out.println(" payload: " + new String(bytes, StandardCharsets.UTF_8));
|
||||
System.out.println(" payload: " + Term.toString(payload));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -276,17 +290,6 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
assertEquals(4, count);
|
||||
assertTrue(sawZero);
|
||||
|
||||
sawZero = false;
|
||||
PayloadSpanUtil psu = new PayloadSpanUtil(is.getTopReaderContext());
|
||||
Collection<byte[]> pls = psu.getPayloadsForQuery(snq);
|
||||
count = pls.size();
|
||||
for (byte[] bytes : pls) {
|
||||
String s = new String(bytes, StandardCharsets.UTF_8);
|
||||
//System.out.println(s);
|
||||
sawZero |= s.equals("pos: 0");
|
||||
}
|
||||
assertEquals(8, count);
|
||||
assertTrue(sawZero);
|
||||
writer.close();
|
||||
is.getIndexReader().close();
|
||||
dir.close();
|
||||
|
|
|
@ -83,7 +83,7 @@ import org.apache.lucene.search.join.QueryBitSetProducer;
|
|||
import org.apache.lucene.search.join.ScoreMode;
|
||||
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
|
||||
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
|
||||
import org.apache.lucene.search.payloads.SpanPayloadCheckQuery;
|
||||
import org.apache.lucene.queries.payloads.SpanPayloadCheckQuery;
|
||||
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanNotQuery;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -22,7 +22,7 @@ import org.apache.lucene.search.Explanation;
|
|||
* An abstract class that defines a way for PayloadScoreQuery instances to transform
|
||||
* the cumulative effects of payload scores for a document.
|
||||
*
|
||||
* @see org.apache.lucene.search.payloads.PayloadScoreQuery for more information
|
||||
* @see org.apache.lucene.queries.payloads.PayloadScoreQuery for more information
|
||||
*
|
||||
* @lucene.experimental This class and its derivations are experimental and subject to
|
||||
* change
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
|
@ -20,8 +20,8 @@
|
|||
* <p>
|
||||
* The following Query implementations are provided:
|
||||
* <ol>
|
||||
* <li>{@link org.apache.lucene.search.payloads.PayloadScoreQuery PayloadScoreQuery} -- For all terms matched by
|
||||
* <li>{@link org.apache.lucene.queries.payloads.PayloadScoreQuery PayloadScoreQuery} -- For all terms matched by
|
||||
* a SpanQuery, boost the score based on the value of the payload located at those terms.</li>
|
||||
* </ol>
|
||||
*/
|
||||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
|
||||
/*
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
|
@ -16,6 +16,13 @@ package org.apache.lucene.search.payloads;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
|
@ -32,11 +39,11 @@ import org.apache.lucene.index.PostingsEnum;
|
|||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.search.spans.MultiSpansWrapper;
|
||||
import org.apache.lucene.search.spans.SpanCollector;
|
||||
import org.apache.lucene.search.spans.SpanFirstQuery;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanNotQuery;
|
||||
|
@ -48,13 +55,6 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
public class TestPayloadSpans extends LuceneTestCase {
|
||||
private IndexSearcher searcher;
|
||||
private Similarity similarity = new ClassicSimilarity();
|
||||
|
@ -74,15 +74,15 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
SpanTermQuery stq;
|
||||
Spans spans;
|
||||
stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "seventy"));
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
|
||||
spans = MultiSpansWrapper.wrap(indexReader, stq, SpanWeight.Postings.PAYLOADS);
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 100, 1, 1, 1);
|
||||
checkSpans(spans, 100, 1, 1, 1);
|
||||
|
||||
stq = new SpanTermQuery(new Term(PayloadHelper.NO_PAYLOAD_FIELD, "seventy"));
|
||||
spans = MultiSpansWrapper.wrap(indexReader, stq, SpanWeight.Postings.PAYLOADS);
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 100, 0, 0, 0);
|
||||
checkSpans(spans, 100, 0, 0, 0);
|
||||
}
|
||||
|
||||
public void testSpanFirst() throws IOException {
|
||||
|
@ -91,20 +91,19 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
SpanFirstQuery sfq;
|
||||
match = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
|
||||
sfq = new SpanFirstQuery(match, 2);
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
Spans spans = MultiSpansWrapper.wrap(indexReader, sfq, SpanWeight.Postings.PAYLOADS);
|
||||
checkSpans(spans, collector, 109, 1, 1, 1);
|
||||
checkSpans(spans, 109, 1, 1, 1);
|
||||
//Test more complicated subclause
|
||||
SpanQuery[] clauses = new SpanQuery[2];
|
||||
clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "one"));
|
||||
clauses[1] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "hundred"));
|
||||
match = new SpanNearQuery(clauses, 0, true);
|
||||
sfq = new SpanFirstQuery(match, 2);
|
||||
checkSpans(MultiSpansWrapper.wrap(indexReader, sfq, SpanWeight.Postings.PAYLOADS), collector, 100, 2, 1, 1);
|
||||
checkSpans(MultiSpansWrapper.wrap(indexReader, sfq, SpanWeight.Postings.PAYLOADS), 100, 2, 1, 1);
|
||||
|
||||
match = new SpanNearQuery(clauses, 0, false);
|
||||
sfq = new SpanFirstQuery(match, 2);
|
||||
checkSpans(MultiSpansWrapper.wrap(indexReader, sfq, SpanWeight.Postings.PAYLOADS), collector, 100, 2, 1, 1);
|
||||
checkSpans(MultiSpansWrapper.wrap(indexReader, sfq, SpanWeight.Postings.PAYLOADS), 100, 2, 1, 1);
|
||||
|
||||
}
|
||||
|
||||
|
@ -127,8 +126,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
IndexReader reader = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
checkSpans(MultiSpansWrapper.wrap(reader, snq, SpanWeight.Postings.PAYLOADS), collector, 1, new int[]{2});
|
||||
checkSpans(MultiSpansWrapper.wrap(reader, snq, SpanWeight.Postings.PAYLOADS), 1, new int[]{2});
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
@ -137,7 +135,6 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
SpanTermQuery stq;
|
||||
Spans spans;
|
||||
IndexSearcher searcher = getSearcher();
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
|
||||
stq = new SpanTermQuery(new Term(PayloadHelper.FIELD, "mark"));
|
||||
spans = MultiSpansWrapper.wrap(searcher.getIndexReader(), stq, SpanWeight.Postings.PAYLOADS);
|
||||
|
@ -151,7 +148,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
|
||||
spans = MultiSpansWrapper.wrap(searcher.getIndexReader(), spanNearQuery, SpanWeight.Postings.PAYLOADS);
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 2, new int[]{3,3});
|
||||
checkSpans(spans, 2, new int[]{3,3});
|
||||
|
||||
|
||||
clauses[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "xx"));
|
||||
|
@ -163,7 +160,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
spans = MultiSpansWrapper.wrap(searcher.getIndexReader(), spanNearQuery, SpanWeight.Postings.PAYLOADS);
|
||||
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 1, new int[]{3});
|
||||
checkSpans(spans, 1, new int[]{3});
|
||||
|
||||
clauses = new SpanQuery[2];
|
||||
|
||||
|
@ -184,7 +181,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
// yy within 6 of xx within 6 of rr
|
||||
spans = MultiSpansWrapper.wrap(searcher.getIndexReader(), nestedSpanNearQuery, SpanWeight.Postings.PAYLOADS);
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 2, new int[]{3,3});
|
||||
checkSpans(spans, 2, new int[]{3,3});
|
||||
closeIndexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
@ -212,12 +209,11 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
clauses3[0] = new SpanTermQuery(new Term(PayloadHelper.FIELD, "np"));
|
||||
clauses3[1] = snq;
|
||||
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);
|
||||
spans = MultiSpansWrapper.wrap(searcher.getIndexReader(), nestedSpanNearQuery, SpanWeight.Postings.PAYLOADS);
|
||||
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 1, new int[]{3});
|
||||
checkSpans(spans, 1, new int[]{3});
|
||||
closeIndexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
@ -252,10 +248,9 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
|
||||
SpanNearQuery nestedSpanNearQuery = new SpanNearQuery(clauses3, 6, false);
|
||||
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
spans = MultiSpansWrapper.wrap(searcher.getIndexReader(), nestedSpanNearQuery, SpanWeight.Postings.PAYLOADS);
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
checkSpans(spans, collector, 2, new int[]{8, 8});
|
||||
checkSpans(spans, 2, new int[]{8, 8});
|
||||
closeIndexReader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
@ -277,7 +272,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
|
||||
SpanQuery[] sqs = { stq1, stq2 };
|
||||
SpanNearQuery snq = new SpanNearQuery(sqs, 1, true);
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
VerifyingCollector collector = new VerifyingCollector();
|
||||
Spans spans = MultiSpansWrapper.wrap(is.getIndexReader(), snq, SpanWeight.Postings.PAYLOADS);
|
||||
|
||||
TopDocs topDocs = is.search(snq, 1);
|
||||
|
@ -287,9 +282,8 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
|
||||
collector.reset();
|
||||
spans.collect(collector);
|
||||
Collection<byte[]> payloads = collector.getPayloads();
|
||||
for (final byte [] payload : payloads) {
|
||||
payloadSet.add(new String(payload, StandardCharsets.UTF_8));
|
||||
for (final BytesRef payload : collector.payloads) {
|
||||
payloadSet.add(Term.toString(payload));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -317,7 +311,7 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
|
||||
SpanQuery[] sqs = { stq1, stq2 };
|
||||
SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
VerifyingCollector collector = new VerifyingCollector();
|
||||
Spans spans = MultiSpansWrapper.wrap(is.getIndexReader(), snq, SpanWeight.Postings.PAYLOADS);
|
||||
|
||||
TopDocs topDocs = is.search(snq, 1);
|
||||
|
@ -327,10 +321,8 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
|
||||
collector.reset();
|
||||
spans.collect(collector);
|
||||
Collection<byte[]> payloads = collector.getPayloads();
|
||||
|
||||
for (final byte [] payload : payloads) {
|
||||
payloadSet.add(new String(payload, StandardCharsets.UTF_8));
|
||||
for (final BytesRef payload: collector.payloads) {
|
||||
payloadSet.add(Term.toString(payload));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -358,20 +350,18 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
|
||||
SpanQuery[] sqs = { stq1, stq2 };
|
||||
SpanNearQuery snq = new SpanNearQuery(sqs, 0, true);
|
||||
PayloadSpanCollector collector = new PayloadSpanCollector();
|
||||
Spans spans = MultiSpansWrapper.wrap(is.getIndexReader(), snq, SpanWeight.Postings.PAYLOADS);
|
||||
|
||||
TopDocs topDocs = is.search(snq, 1);
|
||||
Set<String> payloadSet = new HashSet<>();
|
||||
VerifyingCollector collector = new VerifyingCollector();
|
||||
for (int i = 0; i < topDocs.scoreDocs.length; i++) {
|
||||
while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
|
||||
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
|
||||
collector.reset();
|
||||
spans.collect(collector);
|
||||
Collection<byte[]> payloads = collector.getPayloads();
|
||||
|
||||
for (final byte [] payload : payloads) {
|
||||
payloadSet.add(new String(payload, StandardCharsets.UTF_8));
|
||||
for (final BytesRef payload : collector.payloads) {
|
||||
payloadSet.add(Term.toString(payload));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -387,57 +377,51 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testPayloadSpanUtil() throws Exception {
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
|
||||
newIndexWriterConfig(new PayloadAnalyzer()).setSimilarity(similarity));
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField(PayloadHelper.FIELD, "xx rr yy mm pp", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
writer.close();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
static class VerifyingCollector implements SpanCollector {
|
||||
|
||||
PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext());
|
||||
|
||||
Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(PayloadHelper.FIELD, "rr")));
|
||||
if(VERBOSE) {
|
||||
System.out.println("Num payloads:" + payloads.size());
|
||||
for (final byte [] bytes : payloads) {
|
||||
System.out.println(new String(bytes, StandardCharsets.UTF_8));
|
||||
List<BytesRef> payloads = new ArrayList<>();
|
||||
|
||||
@Override
|
||||
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
|
||||
if (postings.getPayload() != null) {
|
||||
payloads.add(BytesRef.deepCopyOf(postings.getPayload()));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
payloads.clear();
|
||||
}
|
||||
|
||||
public void verify(int expectedLength, int expectedFirstByte) {
|
||||
for (BytesRef payload : payloads) {
|
||||
assertEquals("Incorrect payload length", expectedLength, payload.length);
|
||||
assertEquals("Incorrect first byte", expectedFirstByte, payload.bytes[0]);
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
private void checkSpans(Spans spans, PayloadSpanCollector collector, int expectedNumSpans, int expectedNumPayloads,
|
||||
private void checkSpans(Spans spans, int expectedNumSpans, int expectedNumPayloads,
|
||||
int expectedPayloadLength, int expectedFirstByte) throws IOException {
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
//each position match should have a span associated with it, since there is just one underlying term query, there should
|
||||
//only be one entry in the span
|
||||
VerifyingCollector collector = new VerifyingCollector();
|
||||
int seen = 0;
|
||||
while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
|
||||
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
|
||||
collector.reset();
|
||||
spans.collect(collector);
|
||||
|
||||
Collection<byte[]> payload = collector.getPayloads();
|
||||
assertEquals("payload size", expectedNumPayloads, payload.size());
|
||||
for (final byte [] thePayload : payload) {
|
||||
assertEquals("payload length", expectedPayloadLength, thePayload.length);
|
||||
assertEquals("payload first byte", expectedFirstByte, thePayload[0]);
|
||||
}
|
||||
|
||||
collector.verify(expectedPayloadLength, expectedFirstByte);
|
||||
assertEquals("expectedNumPayloads", expectedNumPayloads, collector.payloads.size());
|
||||
seen++;
|
||||
}
|
||||
}
|
||||
assertEquals("expectedNumSpans", expectedNumSpans, seen);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private IndexSearcher getSearcher() throws Exception {
|
||||
directory = newDirectory();
|
||||
String[] docs = new String[]{"xx rr yy mm pp","xx yy mm rr pp", "nopayload qq ss pp np", "one two three four five six seven eight nine ten eleven", "nine one two three four five six seven eight eleven ten"};
|
||||
|
@ -459,25 +443,16 @@ public class TestPayloadSpans extends LuceneTestCase {
|
|||
return searcher;
|
||||
}
|
||||
|
||||
private void checkSpans(Spans spans, PayloadSpanCollector collector, int numSpans, int[] numPayloads) throws IOException {
|
||||
private void checkSpans(Spans spans, int numSpans, int[] numPayloads) throws IOException {
|
||||
int cnt = 0;
|
||||
|
||||
VerifyingCollector collector = new VerifyingCollector();
|
||||
while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
|
||||
while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
|
||||
if(VERBOSE)
|
||||
System.out.println("\nSpans Dump --");
|
||||
collector.reset();
|
||||
spans.collect(collector);
|
||||
|
||||
Collection<byte[]> payload = collector.getPayloads();
|
||||
if(VERBOSE) {
|
||||
System.out.println("payloads for span:" + payload.size());
|
||||
for (final byte [] bytes : payload) {
|
||||
System.out.println("doc:" + spans.docID() + " s:" + spans.startPosition() + " e:" + spans.endPosition() + " "
|
||||
+ new String(bytes, StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
assertEquals("payload size", numPayloads[cnt], payload.size());
|
||||
assertEquals("payload size", numPayloads[cnt], collector.payloads.size());
|
||||
|
||||
cnt++;
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.queries.payloads;
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
|
@ -3,8 +3,8 @@ package org.apache.lucene.queryparser.xml.builders;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queryparser.xml.DOMUtils;
|
||||
import org.apache.lucene.queryparser.xml.ParserException;
|
||||
import org.apache.lucene.search.payloads.AveragePayloadFunction;
|
||||
import org.apache.lucene.search.payloads.PayloadScoreQuery;
|
||||
import org.apache.lucene.queries.payloads.AveragePayloadFunction;
|
||||
import org.apache.lucene.queries.payloads.PayloadScoreQuery;
|
||||
import org.apache.lucene.search.spans.SpanBoostQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.payloads;
|
||||
package org.apache.lucene.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -17,6 +17,12 @@ package org.apache.lucene.search.payloads;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
@ -36,12 +42,6 @@ import org.apache.lucene.search.spans.SpanTermQuery;
|
|||
import org.apache.lucene.search.spans.SpanWeight;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Experimental class to get set of payloads for most standard Lucene queries.
|
||||
* Operates like Highlighter - IndexReader should only contain doc of interest,
|
|
@ -0,0 +1,21 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Experimental classes for interacting with payloads
|
||||
*/
|
||||
package org.apache.lucene.payloads;
|
|
@ -0,0 +1,130 @@
|
|||
package org.apache.lucene.payloads;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestPayloadSpanUtil extends LuceneTestCase {
|
||||
|
||||
public static final String FIELD = "f";
|
||||
|
||||
public void testPayloadSpanUtil() throws Exception {
|
||||
Directory directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
|
||||
newIndexWriterConfig(new PayloadAnalyzer()).setSimilarity(new ClassicSimilarity()));
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField(FIELD, "xx rr yy mm pp", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
|
||||
IndexReader reader = writer.getReader();
|
||||
writer.close();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
|
||||
PayloadSpanUtil psu = new PayloadSpanUtil(searcher.getTopReaderContext());
|
||||
|
||||
Collection<byte[]> payloads = psu.getPayloadsForQuery(new TermQuery(new Term(FIELD, "rr")));
|
||||
if(VERBOSE) {
|
||||
System.out.println("Num payloads:" + payloads.size());
|
||||
for (final byte [] bytes : payloads) {
|
||||
System.out.println(new String(bytes, StandardCharsets.UTF_8));
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
directory.close();
|
||||
}
|
||||
|
||||
final class PayloadAnalyzer extends Analyzer {
|
||||
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer result = new MockTokenizer(MockTokenizer.SIMPLE, true);
|
||||
return new TokenStreamComponents(result, new PayloadFilter(result));
|
||||
}
|
||||
}
|
||||
|
||||
final class PayloadFilter extends TokenFilter {
|
||||
Set<String> entities = new HashSet<>();
|
||||
Set<String> nopayload = new HashSet<>();
|
||||
int pos;
|
||||
PayloadAttribute payloadAtt;
|
||||
CharTermAttribute termAtt;
|
||||
PositionIncrementAttribute posIncrAtt;
|
||||
|
||||
public PayloadFilter(TokenStream input) {
|
||||
super(input);
|
||||
pos = 0;
|
||||
entities.add("xx");
|
||||
entities.add("one");
|
||||
nopayload.add("nopayload");
|
||||
nopayload.add("np");
|
||||
termAtt = addAttribute(CharTermAttribute.class);
|
||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
payloadAtt = addAttribute(PayloadAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
String token = termAtt.toString();
|
||||
|
||||
if (!nopayload.contains(token)) {
|
||||
if (entities.contains(token)) {
|
||||
payloadAtt.setPayload(new BytesRef(token + ":Entity:"+ pos ));
|
||||
} else {
|
||||
payloadAtt.setPayload(new BytesRef(token + ":Noise:" + pos ));
|
||||
}
|
||||
}
|
||||
pos += posIncrAtt.getPositionIncrement();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
this.pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -17,14 +17,14 @@ package org.apache.lucene.search.spans;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.SlowCompositeReaderWrapper;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
*
|
||||
* A wrapper to perform span operations on a non-leaf reader context
|
|
@ -28,7 +28,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.payloads.SpanPayloadCheckQuery;
|
||||
import org.apache.lucene.queries.payloads.SpanPayloadCheckQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
|
|
Loading…
Reference in New Issue