mirror of https://github.com/apache/lucene.git
LUCENE-3602: Added query time joining.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232223 13f79535-47bb-0310-9956-ffa450edef68
parent 23bd21c968
commit 7cdb8028c1

@@ -58,6 +58,8 @@ New Features
   way as DirectSpellChecker. This can be used to merge top-N results from more than one
   SpellChecker. (James Dyer via Robert Muir)
 
+* LUCENE-3602: Added query time joining under the join module. (Martijn van Groningen, Michael McCandless)
+
 API Changes
 
 * LUCENE-2606: Changed RegexCapabilities interface to fix thread
@@ -0,0 +1,61 @@
package org.apache.lucene.search.join;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

import java.io.IOException;

/**
 * Utility for query time joining using {@link TermsQuery} and {@link TermsCollector}.
 *
 * @lucene.experimental
 */
public final class JoinUtil {

  // No instances allowed
  private JoinUtil() {
  }

  /**
   * Method for query time joining.
   * <p/>
   * Execute the returned query with an {@link IndexSearcher} to retrieve all documents whose terms in the
   * to field match the terms collected from the from field of documents matching the specified fromQuery.
   *
   * @param fromField                 The from field to join from
   * @param multipleValuesPerDocument Whether the from field has multiple terms per document
   * @param toField                   The to field to join to
   * @param fromQuery                 The query to match documents on the from side
   * @param fromSearcher              The searcher that executed the specified fromQuery
   * @return a {@link Query} instance that can be used to join documents based on the
   *         terms in the from and to field
   * @throws IOException If I/O related errors occur
   */
  public static Query createJoinQuery(String fromField,
                                      boolean multipleValuesPerDocument,
                                      String toField,
                                      Query fromQuery,
                                      IndexSearcher fromSearcher) throws IOException {
    TermsCollector termsCollector = TermsCollector.create(fromField, multipleValuesPerDocument);
    fromSearcher.search(fromQuery, termsCollector);
    return new TermsQuery(toField, termsCollector.getCollectorTerms());
  }

}
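JoinUtil is the whole public surface of the new feature. A minimal usage sketch, mirroring the example in package.html further down; fromSearcher, toSearcher and searchTerm are assumed to exist in the caller's code:

  // Minimal sketch only; the searchers and searchTerm are assumed context.
  String fromField = "from";                 // field holding the join keys on the from side
  boolean multipleValuesPerDocument = false; // true if a from document can carry several keys
  String toField = "to";                     // field holding the join keys on the to side
  Query fromQuery = new TermQuery(new Term("content", searchTerm));

  Query joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
  TopDocs topDocs = toSearcher.search(joinQuery, 10); // toSearcher may be the same searcher as fromSearcher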
@@ -0,0 +1,123 @@
package org.apache.lucene.search.join;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;

import java.io.IOException;

/**
 * A collector that collects all terms from a specified field for documents matching the query.
 *
 * @lucene.experimental
 */
abstract class TermsCollector extends Collector {

  final String field;
  final BytesRefHash collectorTerms = new BytesRefHash();

  TermsCollector(String field) {
    this.field = field;
  }

  public BytesRefHash getCollectorTerms() {
    return collectorTerms;
  }

  public void setScorer(Scorer scorer) throws IOException {
  }

  public boolean acceptsDocsOutOfOrder() {
    return true;
  }

  /**
   * Chooses the right {@link TermsCollector} implementation.
   *
   * @param field                     The field to collect terms for
   * @param multipleValuesPerDocument Whether the field to collect terms for has multiple values per document.
   * @return a {@link TermsCollector} instance
   */
  static TermsCollector create(String field, boolean multipleValuesPerDocument) {
    return multipleValuesPerDocument ? new MV(field) : new SV(field);
  }

  // impl that works with multiple values per document
  static class MV extends TermsCollector {

    private DocTermOrds docTermOrds;
    private TermsEnum docTermsEnum;
    private DocTermOrds.TermOrdsIterator reuse;

    MV(String field) {
      super(field);
    }

    public void collect(int doc) throws IOException {
      reuse = docTermOrds.lookup(doc, reuse);
      int[] buffer = new int[5];

      int chunk;
      do {
        chunk = reuse.read(buffer);
        if (chunk == 0) {
          return;
        }

        for (int idx = 0; idx < chunk; idx++) {
          int key = buffer[idx];
          docTermsEnum.seekExact((long) key);
          collectorTerms.add(docTermsEnum.term());
        }
      } while (chunk >= buffer.length); // a full buffer means there may be more ords for this doc
    }

    public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
      docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader, field);
      docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader);
      reuse = null; // LUCENE-3377 needs to be fixed first then this statement can be removed...
    }
  }

  // impl that works with single value per document
  static class SV extends TermsCollector {

    final BytesRef spare = new BytesRef();
    private FieldCache.DocTerms fromDocTerms;

    SV(String field) {
      super(field);
    }

    public void collect(int doc) throws IOException {
      collectorTerms.add(fromDocTerms.getTerm(doc, spare));
    }

    public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
      fromDocTerms = FieldCache.DEFAULT.getTerms(context.reader, field);
    }
  }

}
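TermsCollector is the first pass of the join. A sketch of how it is driven, which is exactly what JoinUtil.createJoinQuery above does; only the variable names are illustrative:

  // First pass (sketch): collect every term of fromField from documents matching fromQuery.
  TermsCollector termsCollector = TermsCollector.create(fromField, multipleValuesPerDocument);
  fromSearcher.search(fromQuery, termsCollector);              // SV or MV fills the BytesRefHash
  BytesRefHash fromTerms = termsCollector.getCollectorTerms(); // second pass: new TermsQuery(toField, fromTerms)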
@@ -0,0 +1,135 @@
package org.apache.lucene.search.join;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;

import java.io.IOException;
import java.util.Comparator;

/**
 * A query that has an array of terms from a specific field. This query will match documents that have one or more
 * terms in the specified field that match the terms specified in the array.
 *
 * @lucene.experimental
 */
class TermsQuery extends MultiTermQuery {

  private final BytesRefHash terms;

  /**
   * @param field The field that should contain terms that are specified in the previous parameter
   * @param terms The terms that matching documents should have. The terms must be sorted by natural order.
   */
  TermsQuery(String field, BytesRefHash terms) {
    super(field);
    this.terms = terms;
  }

  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if (this.terms.size() == 0) {
      return TermsEnum.EMPTY;
    }

    return new SeekingTermSetTermsEnum(terms.iterator(null), this.terms);
  }

  public String toString(String string) {
    return "TermsQuery{" +
        "field=" + field +
        '}';
  }

  static class SeekingTermSetTermsEnum extends FilteredTermsEnum {

    private final BytesRefHash terms;
    private final int[] ords;
    private final int lastElement;

    private final BytesRef lastTerm;
    private final BytesRef spare = new BytesRef();
    private final Comparator<BytesRef> comparator;

    private BytesRef seekTerm;
    private int upto = 0;

    SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms) throws IOException {
      super(tenum);
      this.terms = terms;

      lastElement = terms.size() - 1;
      ords = terms.sort(comparator = tenum.getComparator());
      lastTerm = terms.get(ords[lastElement], new BytesRef());
      seekTerm = terms.get(ords[upto], spare);
    }

    @Override
    protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException {
      BytesRef temp = seekTerm;
      seekTerm = null;
      return temp;
    }

    protected AcceptStatus accept(BytesRef term) throws IOException {
      if (comparator.compare(term, lastTerm) > 0) {
        return AcceptStatus.END;
      }

      BytesRef currentTerm = terms.get(ords[upto], spare);
      if (comparator.compare(term, currentTerm) == 0) {
        if (upto == lastElement) {
          return AcceptStatus.YES;
        } else {
          seekTerm = terms.get(ords[++upto], spare);
          return AcceptStatus.YES_AND_SEEK;
        }
      } else {
        if (upto == lastElement) {
          return AcceptStatus.NO;
        } else { // Our current term doesn't match the given term.
          int cmp;
          do { // We may be behind the given term by more than one step. Keep incrementing until we're at the same term or higher.
            if (upto == lastElement) {
              return AcceptStatus.NO;
            }
            // typically the terms dict is a superset of the query's terms, so it's unusual that we
            // have to skip many of our terms, so we don't do a binary search here
            seekTerm = terms.get(ords[++upto], spare);
          } while ((cmp = comparator.compare(seekTerm, term)) < 0);
          if (cmp == 0) {
            if (upto == lastElement) {
              return AcceptStatus.YES;
            }
            seekTerm = terms.get(ords[++upto], spare);
            return AcceptStatus.YES_AND_SEEK;
          } else {
            return AcceptStatus.NO_AND_SEEK;
          }
        }
      }
    }

  }

}
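SeekingTermSetTermsEnum walks the index's terms dictionary and the sorted query term set in lockstep, seeking forward whichever side is behind. The same leapfrog idea, reduced to two sorted String arrays as a self-contained illustration (not Lucene API):

  import java.util.ArrayList;
  import java.util.List;

  // Illustrative only: the leapfrog intersection that SeekingTermSetTermsEnum performs,
  // shown over sorted arrays instead of a TermsEnum and a BytesRefHash.
  class SortedIntersectionSketch {
    static List<String> intersect(String[] queryTerms, String[] dictTerms) {
      List<String> hits = new ArrayList<String>();
      int q = 0, d = 0;
      while (q < queryTerms.length && d < dictTerms.length) {
        int cmp = queryTerms[q].compareTo(dictTerms[d]);
        if (cmp == 0) {        // like AcceptStatus.YES(_AND_SEEK): accept, advance both sides
          hits.add(dictTerms[d]);
          q++;
          d++;
        } else if (cmp < 0) {  // query term is behind: advance the query side (the ++upto above)
          q++;
        } else {               // dictionary is behind: seek the enum forward (NO_AND_SEEK)
          d++;
        }
      }
      return hits;
    }
  }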
@@ -1,7 +1,11 @@
 <html>
 <body>
 
-<p>This module supports index-time joins while searching, where joined
+<p>This module supports index-time and query-time joins.</p>
+
+<h2>Index-time joins</h2>
+
+<p>The index-time joining supports joins while searching, where joined
 documents are indexed as a single document block using
 {@link org.apache.lucene.index.IndexWriter#addDocuments}. This is useful for any normalized content (XML documents or database tables). In database terms, all rows for all
 joined tables matching a single row of the primary table must be

@@ -34,5 +38,37 @@
 org.apache.lucene.search.join.ToChildBlockJoinQuery}. This wraps
 any query matching parent documents, creating the joined query
 matching only child documents.
+
+<h2>Search-time joins</h2>
+
+<p>
+  Query-time joining is terms based and implemented as a two-pass search. The first pass collects all the terms in the fromField
+  from documents matching the fromQuery. The second pass returns all documents whose toField contains terms matching the terms
+  collected in the first pass.
+</p>
+<p>Query-time joining has the following input:</p>
+<ul>
+  <li><code>fromField</code>: The from field to join from.
+  <li><code>fromQuery</code>: The query executed to collect the from terms. This is usually the user-specified query.
+  <li><code>multipleValuesPerDocument</code>: Whether the fromField contains more than one value per document
+  <li><code>toField</code>: The to field to join to
+</ul>
+<p>
+  Query-time joining is accessible through a single static method. The caller supplies the described input and an
+  <code>IndexSearcher</code> from which the from terms are collected. The returned
+  query can be executed with the same <code>IndexSearcher</code>, but also with another <code>IndexSearcher</code>.
+  Example usage of {@link org.apache.lucene.search.join.JoinUtil#createJoinQuery(String, boolean, String, org.apache.lucene.search.Query, org.apache.lucene.search.IndexSearcher)}:
+</p>
+<pre class="prettyprint">
+  String fromField = "from"; // Name of the from field
+  boolean multipleValuesPerDocument = false; // Set to true only when your fromField has multiple values per document in your index
+  String toField = "to"; // Name of the to field
+  Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
+
+  MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
+  TopDocs topDocs = toSearcher.search(joinQuery, 10); // Note: toSearcher can be the same as the fromSearcher
+  // Render topDocs...
+</pre>
+
 </body>
 </html>
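The example above covers the single-valued case. When the from field holds multiple values per document, only the flag changes and the DocTermOrds-based collector is used instead; a hedged sketch, with field names borrowed from the test below and an assumed searcher:

  // Sketch: join from a multi-valued field; exercises the MV (DocTermOrds-based) collector.
  Query fromQuery = new TermQuery(new Term("name", "name2"));
  Query joinQuery = JoinUtil.createJoinQuery("productId", true /* multipleValuesPerDocument */, "id", fromQuery, searcher);
  TopDocs hits = searcher.search(joinQuery, 10);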
@@ -0,0 +1,357 @@
package org.apache.lucene.search.join;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.Test;

import java.io.IOException;
import java.util.*;

public class TestJoinUtil extends LuceneTestCase {

  public void testSimple() throws Exception {
    final String idField = "id";
    final String toField = "productId";

    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(
        random,
        dir,
        newIndexWriterConfig(TEST_VERSION_CURRENT,
            new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    // 0
    Document doc = new Document();
    doc.add(new Field("description", "random text", TextField.TYPE_STORED));
    doc.add(new Field("name", "name1", TextField.TYPE_STORED));
    doc.add(new Field(idField, "1", TextField.TYPE_STORED));
    w.addDocument(doc);

    // 1
    doc = new Document();
    doc.add(new Field("price", "10.0", TextField.TYPE_STORED));
    doc.add(new Field(idField, "2", TextField.TYPE_STORED));
    doc.add(new Field(toField, "1", TextField.TYPE_STORED));
    w.addDocument(doc);

    // 2
    doc = new Document();
    doc.add(new Field("price", "20.0", TextField.TYPE_STORED));
    doc.add(new Field(idField, "3", TextField.TYPE_STORED));
    doc.add(new Field(toField, "1", TextField.TYPE_STORED));
    w.addDocument(doc);

    // 3
    doc = new Document();
    doc.add(new Field("description", "more random text", TextField.TYPE_STORED));
    doc.add(new Field("name", "name2", TextField.TYPE_STORED));
    doc.add(new Field(idField, "4", TextField.TYPE_STORED));
    w.addDocument(doc);
    w.commit();

    // 4
    doc = new Document();
    doc.add(new Field("price", "10.0", TextField.TYPE_STORED));
    doc.add(new Field(idField, "5", TextField.TYPE_STORED));
    doc.add(new Field(toField, "4", TextField.TYPE_STORED));
    w.addDocument(doc);

    // 5
    doc = new Document();
    doc.add(new Field("price", "20.0", TextField.TYPE_STORED));
    doc.add(new Field(idField, "6", TextField.TYPE_STORED));
    doc.add(new Field(toField, "4", TextField.TYPE_STORED));
    w.addDocument(doc);

    IndexSearcher indexSearcher = new IndexSearcher(w.getReader());
    w.close();

    // Search for product
    Query joinQuery =
        JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name2")), indexSearcher);

    TopDocs result = indexSearcher.search(joinQuery, 10);
    assertEquals(2, result.totalHits);
    assertEquals(4, result.scoreDocs[0].doc);
    assertEquals(5, result.scoreDocs[1].doc);

    joinQuery = JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name1")), indexSearcher);
    result = indexSearcher.search(joinQuery, 10);
    assertEquals(2, result.totalHits);
    assertEquals(1, result.scoreDocs[0].doc);
    assertEquals(2, result.scoreDocs[1].doc);

    // Search for offer
    joinQuery = JoinUtil.createJoinQuery(toField, false, idField, new TermQuery(new Term("id", "5")), indexSearcher);
    result = indexSearcher.search(joinQuery, 10);
    assertEquals(1, result.totalHits);
    assertEquals(3, result.scoreDocs[0].doc);

    indexSearcher.getIndexReader().close();
    dir.close();
  }

  @Test
  public void testSingleValueRandomJoin() throws Exception {
    int maxIndexIter = _TestUtil.nextInt(random, 6, 12);
    int maxSearchIter = _TestUtil.nextInt(random, 13, 26);
    executeRandomJoin(false, maxIndexIter, maxSearchIter);
  }

  @Test
  // This test takes longer than the single-value test, which is why the number of iterations is smaller.
  public void testMultiValueRandomJoin() throws Exception {
    int maxIndexIter = _TestUtil.nextInt(random, 3, 6);
    int maxSearchIter = _TestUtil.nextInt(random, 6, 12);
    executeRandomJoin(true, maxIndexIter, maxSearchIter);
  }

  private void executeRandomJoin(boolean multipleValuesPerDocument, int maxIndexIter, int maxSearchIter) throws Exception {
    for (int indexIter = 1; indexIter <= maxIndexIter; indexIter++) {
      if (VERBOSE) {
        System.out.println("indexIter=" + indexIter);
      }
      Directory dir = newDirectory();
      RandomIndexWriter w = new RandomIndexWriter(
          random,
          dir,
          newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())
      );
      int numberOfDocumentsToIndex = _TestUtil.nextInt(random, 87, 764);
      IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument);

      IndexReader topLevelReader = w.getReader();
      w.close();
      for (int searchIter = 1; searchIter <= maxSearchIter; searchIter++) {
        if (VERBOSE) {
          System.out.println("searchIter=" + searchIter);
        }
        IndexSearcher indexSearcher = newSearcher(topLevelReader);

        int r = random.nextInt(context.randomUniqueValues.length);
        boolean from = context.randomFrom[r];
        String randomValue = context.randomUniqueValues[r];
        FixedBitSet expectedResult = createExpectedResult(randomValue, from, indexSearcher.getIndexReader(), context);

        Query actualQuery = new TermQuery(new Term("value", randomValue));
        if (VERBOSE) {
          System.out.println("actualQuery=" + actualQuery);
        }
        Query joinQuery;
        if (from) {
          joinQuery = JoinUtil.createJoinQuery("from", multipleValuesPerDocument, "to", actualQuery, indexSearcher);
        } else {
          joinQuery = JoinUtil.createJoinQuery("to", multipleValuesPerDocument, "from", actualQuery, indexSearcher);
        }
        if (VERBOSE) {
          System.out.println("joinQuery=" + joinQuery);
        }

        // Need to know all documents that have matches. TopDocs doesn't give us that, and then we'd also be testing TopDocsCollector...
        final FixedBitSet actualResult = new FixedBitSet(indexSearcher.getIndexReader().maxDoc());
        indexSearcher.search(joinQuery, new Collector() {

          int docBase;

          public void collect(int doc) throws IOException {
            actualResult.set(doc + docBase);
          }

          public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException {
            docBase = context.docBase;
          }

          public void setScorer(Scorer scorer) throws IOException {
          }

          public boolean acceptsDocsOutOfOrder() {
            return true;
          }
        });

        if (VERBOSE) {
          System.out.println("expected cardinality:" + expectedResult.cardinality());
          DocIdSetIterator iterator = expectedResult.iterator();
          for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
            System.out.println(String.format("Expected doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id")));
          }
          System.out.println("actual cardinality:" + actualResult.cardinality());
          iterator = actualResult.iterator();
          for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) {
            System.out.println(String.format("Actual doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id")));
          }
        }

        assertEquals(expectedResult, actualResult);
      }
      topLevelReader.close();
      dir.close();
    }
  }

  private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument) throws IOException {
    return createContext(nDocs, writer, writer, multipleValuesPerDocument);
  }

  private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument) throws IOException {
    IndexIterationContext context = new IndexIterationContext();
    int numRandomValues = nDocs / 2;
    context.randomUniqueValues = new String[numRandomValues];
    Set<String> trackSet = new HashSet<String>();
    context.randomFrom = new boolean[numRandomValues];
    for (int i = 0; i < numRandomValues; i++) {
      String uniqueRandomValue;
      do {
        uniqueRandomValue = _TestUtil.randomRealisticUnicodeString(random);
        // uniqueRandomValue = _TestUtil.randomSimpleString(random);
      } while ("".equals(uniqueRandomValue) || trackSet.contains(uniqueRandomValue));
      // Generated values must be unique; empty strings aren't allowed.
      trackSet.add(uniqueRandomValue);
      context.randomFrom[i] = random.nextBoolean();
      context.randomUniqueValues[i] = uniqueRandomValue;
    }

    for (int i = 0; i < nDocs; i++) {
      String id = Integer.toString(i);
      int randomI = random.nextInt(context.randomUniqueValues.length);
      String value = context.randomUniqueValues[randomI];
      Document document = new Document();
      document.add(newField(random, "id", id, TextField.TYPE_STORED));
      document.add(newField(random, "value", value, TextField.TYPE_STORED));

      boolean from = context.randomFrom[randomI];
      int numberOfLinkValues = multipleValuesPerDocument ? 2 + random.nextInt(10) : 1;
      RandomDoc doc = new RandomDoc(id, numberOfLinkValues, value);
      for (int j = 0; j < numberOfLinkValues; j++) {
        String linkValue = context.randomUniqueValues[random.nextInt(context.randomUniqueValues.length)];
        doc.linkValues.add(linkValue);
        if (from) {
          if (!context.fromDocuments.containsKey(linkValue)) {
            context.fromDocuments.put(linkValue, new ArrayList<RandomDoc>());
          }
          if (!context.randomValueFromDocs.containsKey(value)) {
            context.randomValueFromDocs.put(value, new ArrayList<RandomDoc>());
          }

          context.fromDocuments.get(linkValue).add(doc);
          context.randomValueFromDocs.get(value).add(doc);
          document.add(newField(random, "from", linkValue, TextField.TYPE_STORED));
        } else {
          if (!context.toDocuments.containsKey(linkValue)) {
            context.toDocuments.put(linkValue, new ArrayList<RandomDoc>());
          }
          if (!context.randomValueToDocs.containsKey(value)) {
            context.randomValueToDocs.put(value, new ArrayList<RandomDoc>());
          }

          context.toDocuments.get(linkValue).add(doc);
          context.randomValueToDocs.get(value).add(doc);
          document.add(newField(random, "to", linkValue, TextField.TYPE_STORED));
        }
      }

      final RandomIndexWriter w;
      if (from) {
        w = fromWriter;
      } else {
        w = toWriter;
      }

      w.addDocument(document);
      if (random.nextInt(10) == 4) {
        w.commit();
      }
      if (VERBOSE) {
        System.out.println("Added document[" + i + "]: " + document);
      }
    }
    return context;
  }

  private FixedBitSet createExpectedResult(String queryValue, boolean from, IndexReader topLevelReader, IndexIterationContext context) throws IOException {
    final Map<String, List<RandomDoc>> randomValueDocs;
    final Map<String, List<RandomDoc>> linkValueDocuments;
    if (from) {
      randomValueDocs = context.randomValueFromDocs;
      linkValueDocuments = context.toDocuments;
    } else {
      randomValueDocs = context.randomValueToDocs;
      linkValueDocuments = context.fromDocuments;
    }

    FixedBitSet expectedResult = new FixedBitSet(topLevelReader.maxDoc());
    List<RandomDoc> matchingDocs = randomValueDocs.get(queryValue);
    if (matchingDocs == null) {
      return new FixedBitSet(topLevelReader.maxDoc());
    }

    for (RandomDoc matchingDoc : matchingDocs) {
      for (String linkValue : matchingDoc.linkValues) {
        List<RandomDoc> otherMatchingDocs = linkValueDocuments.get(linkValue);
        if (otherMatchingDocs == null) {
          continue;
        }

        for (RandomDoc otherSideDoc : otherMatchingDocs) {
          DocsEnum docsEnum = MultiFields.getTermDocsEnum(topLevelReader, MultiFields.getLiveDocs(topLevelReader), "id", new BytesRef(otherSideDoc.id), false);
          assert docsEnum != null;
          int doc = docsEnum.nextDoc();
          expectedResult.set(doc);
        }
      }
    }
    return expectedResult;
  }

  private static class IndexIterationContext {

    String[] randomUniqueValues;
    boolean[] randomFrom;
    Map<String, List<RandomDoc>> fromDocuments = new HashMap<String, List<RandomDoc>>();
    Map<String, List<RandomDoc>> toDocuments = new HashMap<String, List<RandomDoc>>();
    Map<String, List<RandomDoc>> randomValueFromDocs = new HashMap<String, List<RandomDoc>>();
    Map<String, List<RandomDoc>> randomValueToDocs = new HashMap<String, List<RandomDoc>>();

  }

  private static class RandomDoc {

    final String id;
    final List<String> linkValues;
    final String value;

    private RandomDoc(String id, int numberOfLinkValues, String value) {
      this.id = id;
      linkValues = new ArrayList<String>(numberOfLinkValues);
      this.value = value;
    }
  }

}