diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index cc9df3763ae..f38fe203e45 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -58,6 +58,8 @@ New Features way as DirectSpellChecker. This can be used to merge top-N results from more than one SpellChecker. (James Dyer via Robert Muir) + * LUCENE-3602: Added query time joining under the join module. (Martijn van Groningen, Michael McCandless) + API Changes * LUCENE-2606: Changed RegexCapabilities interface to fix thread diff --git a/modules/join/src/java/org/apache/lucene/search/join/JoinUtil.java b/modules/join/src/java/org/apache/lucene/search/join/JoinUtil.java new file mode 100644 index 00000000000..481fd945910 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/JoinUtil.java @@ -0,0 +1,61 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; + +import java.io.IOException; + +/** + * Utility for query time joining using {@link TermsQuery} and {@link TermsCollector}. + * + * @lucene.experimental + */ +public final class JoinUtil { + + // No instances allowed + private JoinUtil() { + } + + /** + * Method for query time joining. + *

+ * Execute the returned query with a {@link IndexSearcher} to retrieve all documents that have the same terms in the + * to field that match with documents matching the specified fromQuery and have the same terms in the from field. + * + * @param fromField The from field to join from + * @param multipleValuesPerDocument Whether the from field has multiple terms per document + * @param toField The to field to join to + * @param fromQuery The query to match documents on the from side + * @param fromSearcher The searcher that executed the specified fromQuery + * @return a {@link Query} instance that can be used to join documents based on the + * terms in the from and to field + * @throws IOException If I/O related errors occur + */ + public static Query createJoinQuery(String fromField, + boolean multipleValuesPerDocument, + String toField, + Query fromQuery, + IndexSearcher fromSearcher) throws IOException { + TermsCollector termsCollector = TermsCollector.create(fromField, multipleValuesPerDocument); + fromSearcher.search(fromQuery, termsCollector); + return new TermsQuery(toField, termsCollector.getCollectorTerms()); + } + +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/TermsCollector.java b/modules/join/src/java/org/apache/lucene/search/join/TermsCollector.java new file mode 100644 index 00000000000..b8df2256deb --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/TermsCollector.java @@ -0,0 +1,123 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.DocTermOrds; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; + +import java.io.IOException; + +/** + * A collector that collects all terms from a specified field matching the query. + * + * @lucene.experimental + */ +abstract class TermsCollector extends Collector { + + final String field; + final BytesRefHash collectorTerms = new BytesRefHash(); + + TermsCollector(String field) { + this.field = field; + } + + public BytesRefHash getCollectorTerms() { + return collectorTerms; + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public boolean acceptsDocsOutOfOrder() { + return true; + } + + /** + * Chooses the right {@link TermsCollector} implementation. + * + * @param field The field to collect terms for + * @param multipleValuesPerDocument Whether the field to collect terms for has multiple values per document. + * @return a {@link TermsCollector} instance + */ + static TermsCollector create(String field, boolean multipleValuesPerDocument) { + return multipleValuesPerDocument ? new MV(field) : new SV(field); + } + + // impl that works with multiple values per document + static class MV extends TermsCollector { + + private DocTermOrds docTermOrds; + private TermsEnum docTermsEnum; + private DocTermOrds.TermOrdsIterator reuse; + + MV(String field) { + super(field); + } + + public void collect(int doc) throws IOException { + reuse = docTermOrds.lookup(doc, reuse); + int[] buffer = new int[5]; + + int chunk; + do { + chunk = reuse.read(buffer); + if (chunk == 0) { + return; + } + + for (int idx = 0; idx < chunk; idx++) { + int key = buffer[idx]; + docTermsEnum.seekExact((long) key); + collectorTerms.add(docTermsEnum.term()); + } + } while (chunk >= buffer.length); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader, field); + docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader); + reuse = null; // LUCENE-3377 needs to be fixed first then this statement can be removed... + } + } + + // impl that works with single value per document + static class SV extends TermsCollector { + + final BytesRef spare = new BytesRef(); + private FieldCache.DocTerms fromDocTerms; + + SV(String field) { + super(field); + } + + public void collect(int doc) throws IOException { + collectorTerms.add(fromDocTerms.getTerm(doc, spare)); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + fromDocTerms = FieldCache.DEFAULT.getTerms(context.reader, field); + } + } + +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/modules/join/src/java/org/apache/lucene/search/join/TermsQuery.java new file mode 100644 index 00000000000..71136c4eec2 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/TermsQuery.java @@ -0,0 +1,135 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.FilteredTermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; + +import java.io.IOException; +import java.util.Comparator; + +/** + * A query that has an array of terms from a specific field. This query will match documents have one or more terms in + * the specified field that match with the terms specified in the array. + * + * @lucene.experimental + */ +class TermsQuery extends MultiTermQuery { + + private final BytesRefHash terms; + + /** + * @param field The field that should contain terms that are specified in the previous parameter + * @param terms The terms that matching documents should have. The terms must be sorted by natural order. + */ + TermsQuery(String field, BytesRefHash terms) { + super(field); + this.terms = terms; + } + + protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { + if (this.terms.size() == 0) { + return TermsEnum.EMPTY; + } + + return new SeekingTermSetTermsEnum(terms.iterator(null), this.terms); + } + + public String toString(String string) { + return "TermsQuery{" + + "field=" + field + + '}'; + } + + static class SeekingTermSetTermsEnum extends FilteredTermsEnum { + + private final BytesRefHash terms; + private final int[] ords; + private final int lastElement; + + private final BytesRef lastTerm; + private final BytesRef spare = new BytesRef(); + private final Comparator comparator; + + private BytesRef seekTerm; + private int upto = 0; + + SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms) throws IOException { + super(tenum); + this.terms = terms; + + lastElement = terms.size() - 1; + ords = terms.sort(comparator = tenum.getComparator()); + lastTerm = terms.get(ords[lastElement], new BytesRef()); + seekTerm = terms.get(ords[upto], spare); + } + + @Override + protected BytesRef nextSeekTerm(BytesRef currentTerm) throws IOException { + BytesRef temp = seekTerm; + seekTerm = null; + return temp; + } + + protected AcceptStatus accept(BytesRef term) throws IOException { + if (comparator.compare(term, lastTerm) > 0) { + return AcceptStatus.END; + } + + BytesRef currentTerm = terms.get(ords[upto], spare); + if (comparator.compare(term, currentTerm) == 0) { + if (upto == lastElement) { + return AcceptStatus.YES; + } else { + seekTerm = terms.get(ords[++upto], spare); + return AcceptStatus.YES_AND_SEEK; + } + } else { + if (upto == lastElement) { + return AcceptStatus.NO; + } else { // Our current term doesn't match the the given term. + int cmp; + do { // We maybe are behind the given term by more than one step. Keep incrementing till we're the same or higher. + if (upto == lastElement) { + return AcceptStatus.NO; + } + // typically the terms dict is a superset of query's terms so it's unusual that we have to skip many of + // our terms so we don't do a binary search here + seekTerm = terms.get(ords[++upto], spare); + } while ((cmp = comparator.compare(seekTerm, term)) < 0); + if (cmp == 0) { + if (upto == lastElement) { + return AcceptStatus.YES; + } + seekTerm = terms.get(ords[++upto], spare); + return AcceptStatus.YES_AND_SEEK; + } else { + return AcceptStatus.NO_AND_SEEK; + } + } + } + } + + } + +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/package.html b/modules/join/src/java/org/apache/lucene/search/join/package.html index 8b886f3dfdb..0cd5cd86a64 100644 --- a/modules/join/src/java/org/apache/lucene/search/join/package.html +++ b/modules/join/src/java/org/apache/lucene/search/join/package.html @@ -1,7 +1,11 @@ -

This module supports index-time joins while searching, where joined +

This modules support index-time and query-time joins.

+ +

Index-time joins

+ +

The index-time joining support joins while searching, where joined documents are indexed as a single document block using {@link org.apache.lucene.index.IndexWriter#addDocuments}. This is useful for any normalized content (XML documents or database tables). In database terms, all rows for all joined tables matching a single row of the primary table must be @@ -34,5 +38,37 @@ org.apache.lucene.search.join.ToChildBlockJoinQuery}. This wraps any query matching parent documents, creating the joined query matching only child documents. + +

Search-time joins

+ +

+ The query time joining is terms based and implemented as two pass search. The first pass collects all the terms from a fromField + that match the fromQuery. The second pass returns all documents that have matching terms in a toField to the terms + collected in the first pass. +

+

Query time joining has the following input:

+ +

+ Basically the query-time joining is accessible from one static method. The user of this method supplies the method + with the described input and a IndexSearcher where the from terms need to be collected from. The returned + query can be executed with the same IndexSearcher, but also with another IndexSearcher. + Example usage of the {@link org.apache.lucene.search.join.JoinUtil#createJoinQuery(String, boolean, String, org.apache.lucene.search.Query, org.apache.lucene.search.IndexSearcher)} : +

+
+  String fromField = "from"; // Name of the from field
+  boolean multipleValuesPerDocument = false; // Set only yo true in the case when your fromField has multiple values per document in your index
+  String fromField = "to"; // Name of the to field
+  Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
+
+  MultiTermQuery joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
+  TopDocs topDocs = toSearcher.search(joinQuery, 10); // Note: toSearcher can be the same as the fromSearcher
+  // Render topDocs...
+
+ diff --git a/modules/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java b/modules/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java new file mode 100644 index 00000000000..60c50bb0b25 --- /dev/null +++ b/modules/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java @@ -0,0 +1,357 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.Test; + +import java.io.IOException; +import java.util.*; + +public class TestJoinUtil extends LuceneTestCase { + + public void testSimple() throws Exception { + final String idField = "id"; + final String toField = "productId"; + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + + // 0 + Document doc = new Document(); + doc.add(new Field("description", "random text", TextField.TYPE_STORED)); + doc.add(new Field("name", "name1", TextField.TYPE_STORED)); + doc.add(new Field(idField, "1", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 1 + doc = new Document(); + doc.add(new Field("price", "10.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "2", TextField.TYPE_STORED)); + doc.add(new Field(toField, "1", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 2 + doc = new Document(); + doc.add(new Field("price", "20.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "3", TextField.TYPE_STORED)); + doc.add(new Field(toField, "1", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 3 + doc = new Document(); + doc.add(new Field("description", "more random text", TextField.TYPE_STORED)); + doc.add(new Field("name", "name2", TextField.TYPE_STORED)); + doc.add(new Field(idField, "4", TextField.TYPE_STORED)); + w.addDocument(doc); + w.commit(); + + // 4 + doc = new Document(); + doc.add(new Field("price", "10.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "5", TextField.TYPE_STORED)); + doc.add(new Field(toField, "4", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 5 + doc = new Document(); + doc.add(new Field("price", "20.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "6", TextField.TYPE_STORED)); + doc.add(new Field(toField, "4", TextField.TYPE_STORED)); + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + // Search for product + Query joinQuery = + JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name2")), indexSearcher); + + TopDocs result = indexSearcher.search(joinQuery, 10); + assertEquals(2, result.totalHits); + assertEquals(4, result.scoreDocs[0].doc); + assertEquals(5, result.scoreDocs[1].doc); + + joinQuery = JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name1")), indexSearcher); + result = indexSearcher.search(joinQuery, 10); + assertEquals(2, result.totalHits); + assertEquals(1, result.scoreDocs[0].doc); + assertEquals(2, result.scoreDocs[1].doc); + + // Search for offer + joinQuery = JoinUtil.createJoinQuery(toField, false, idField, new TermQuery(new Term("id", "5")), indexSearcher); + result = indexSearcher.search(joinQuery, 10); + assertEquals(1, result.totalHits); + assertEquals(3, result.scoreDocs[0].doc); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + @Test + public void testSingleValueRandomJoin() throws Exception { + int maxIndexIter = _TestUtil.nextInt(random, 6, 12); + int maxSearchIter = _TestUtil.nextInt(random, 13, 26); + executeRandomJoin(false, maxIndexIter, maxSearchIter); + } + + @Test + // This test really takes more time, that is why the number of iterations are smaller. + public void testMultiValueRandomJoin() throws Exception { + int maxIndexIter = _TestUtil.nextInt(random, 3, 6); + int maxSearchIter = _TestUtil.nextInt(random, 6, 12); + executeRandomJoin(true, maxIndexIter, maxSearchIter); + } + + private void executeRandomJoin(boolean multipleValuesPerDocument, int maxIndexIter, int maxSearchIter) throws Exception { + for (int indexIter = 1; indexIter <= maxIndexIter; indexIter++) { + if (VERBOSE) { + System.out.println("indexIter=" + indexIter); + } + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy()) + ); + int numberOfDocumentsToIndex = _TestUtil.nextInt(random, 87, 764); + IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument); + + IndexReader topLevelReader = w.getReader(); + w.close(); + for (int searchIter = 1; searchIter <= maxSearchIter; searchIter++) { + if (VERBOSE) { + System.out.println("searchIter=" + searchIter); + } + IndexSearcher indexSearcher = newSearcher(topLevelReader); + + int r = random.nextInt(context.randomUniqueValues.length); + boolean from = context.randomFrom[r]; + String randomValue = context.randomUniqueValues[r]; + FixedBitSet expectedResult = createExpectedResult(randomValue, from, indexSearcher.getIndexReader(), context); + + Query actualQuery = new TermQuery(new Term("value", randomValue)); + if (VERBOSE) { + System.out.println("actualQuery=" + actualQuery); + } + Query joinQuery; + if (from) { + joinQuery = JoinUtil.createJoinQuery("from", multipleValuesPerDocument, "to", actualQuery, indexSearcher); + } else { + joinQuery = JoinUtil.createJoinQuery("to", multipleValuesPerDocument, "from", actualQuery, indexSearcher); + } + if (VERBOSE) { + System.out.println("joinQuery=" + joinQuery); + } + + // Need to know all documents that have matches. TopDocs doesn't give me that and then I'd be also testing TopDocsCollector... + final FixedBitSet actualResult = new FixedBitSet(indexSearcher.getIndexReader().maxDoc()); + indexSearcher.search(joinQuery, new Collector() { + + int docBase; + + public void collect(int doc) throws IOException { + actualResult.set(doc + docBase); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + docBase = context.docBase; + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public boolean acceptsDocsOutOfOrder() { + return true; + } + }); + + if (VERBOSE) { + System.out.println("expected cardinality:" + expectedResult.cardinality()); + DocIdSetIterator iterator = expectedResult.iterator(); + for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) { + System.out.println(String.format("Expected doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id"))); + } + System.out.println("actual cardinality:" + actualResult.cardinality()); + iterator = actualResult.iterator(); + for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) { + System.out.println(String.format("Actual doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id"))); + } + } + + assertEquals(expectedResult, actualResult); + } + topLevelReader.close(); + dir.close(); + } + } + + private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument) throws IOException { + return createContext(nDocs, writer, writer, multipleValuesPerDocument); + } + + private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument) throws IOException { + IndexIterationContext context = new IndexIterationContext(); + int numRandomValues = nDocs / 2; + context.randomUniqueValues = new String[numRandomValues]; + Set trackSet = new HashSet(); + context.randomFrom = new boolean[numRandomValues]; + for (int i = 0; i < numRandomValues; i++) { + String uniqueRandomValue; + do { + uniqueRandomValue = _TestUtil.randomRealisticUnicodeString(random); +// uniqueRandomValue = _TestUtil.randomSimpleString(random); + } while ("".equals(uniqueRandomValue) || trackSet.contains(uniqueRandomValue)); + // Generate unique values and empty strings aren't allowed. + trackSet.add(uniqueRandomValue); + context.randomFrom[i] = random.nextBoolean(); + context.randomUniqueValues[i] = uniqueRandomValue; + } + + for (int i = 0; i < nDocs; i++) { + String id = Integer.toString(i); + int randomI = random.nextInt(context.randomUniqueValues.length); + String value = context.randomUniqueValues[randomI]; + Document document = new Document(); + document.add(newField(random, "id", id, TextField.TYPE_STORED)); + document.add(newField(random, "value", value, TextField.TYPE_STORED)); + + boolean from = context.randomFrom[randomI]; + int numberOfLinkValues = multipleValuesPerDocument ? 2 + random.nextInt(10) : 1; + RandomDoc doc = new RandomDoc(id, numberOfLinkValues, value); + for (int j = 0; j < numberOfLinkValues; j++) { + String linkValue = context.randomUniqueValues[random.nextInt(context.randomUniqueValues.length)]; + doc.linkValues.add(linkValue); + if (from) { + if (!context.fromDocuments.containsKey(linkValue)) { + context.fromDocuments.put(linkValue, new ArrayList()); + } + if (!context.randomValueFromDocs.containsKey(value)) { + context.randomValueFromDocs.put(value, new ArrayList()); + } + + context.fromDocuments.get(linkValue).add(doc); + context.randomValueFromDocs.get(value).add(doc); + document.add(newField(random, "from", linkValue, TextField.TYPE_STORED)); + } else { + if (!context.toDocuments.containsKey(linkValue)) { + context.toDocuments.put(linkValue, new ArrayList()); + } + if (!context.randomValueToDocs.containsKey(value)) { + context.randomValueToDocs.put(value, new ArrayList()); + } + + context.toDocuments.get(linkValue).add(doc); + context.randomValueToDocs.get(value).add(doc); + document.add(newField(random, "to", linkValue, TextField.TYPE_STORED)); + } + } + + final RandomIndexWriter w; + if (from) { + w = fromWriter; + } else { + w = toWriter; + } + + w.addDocument(document); + if (random.nextInt(10) == 4) { + w.commit(); + } + if (VERBOSE) { + System.out.println("Added document[" + i + "]: " + document); + } + } + return context; + } + + private FixedBitSet createExpectedResult(String queryValue, boolean from, IndexReader topLevelReader, IndexIterationContext context) throws IOException { + final Map> randomValueDocs; + final Map> linkValueDocuments; + if (from) { + randomValueDocs = context.randomValueFromDocs; + linkValueDocuments = context.toDocuments; + } else { + randomValueDocs = context.randomValueToDocs; + linkValueDocuments = context.fromDocuments; + } + + FixedBitSet expectedResult = new FixedBitSet(topLevelReader.maxDoc()); + List matchingDocs = randomValueDocs.get(queryValue); + if (matchingDocs == null) { + return new FixedBitSet(topLevelReader.maxDoc()); + } + + for (RandomDoc matchingDoc : matchingDocs) { + for (String linkValue : matchingDoc.linkValues) { + List otherMatchingDocs = linkValueDocuments.get(linkValue); + if (otherMatchingDocs == null) { + continue; + } + + for (RandomDoc otherSideDoc : otherMatchingDocs) { + DocsEnum docsEnum = MultiFields.getTermDocsEnum(topLevelReader, MultiFields.getLiveDocs(topLevelReader), "id", new BytesRef(otherSideDoc.id), false); + assert docsEnum != null; + int doc = docsEnum.nextDoc(); + expectedResult.set(doc); + } + } + } + return expectedResult; + } + + private static class IndexIterationContext { + + String[] randomUniqueValues; + boolean[] randomFrom; + Map> fromDocuments = new HashMap>(); + Map> toDocuments = new HashMap>(); + Map> randomValueFromDocs = new HashMap>(); + Map> randomValueToDocs = new HashMap>(); + + } + + private static class RandomDoc { + + final String id; + final List linkValues; + final String value; + + private RandomDoc(String id, int numberOfLinkValues, String value) { + this.id = id; + linkValues = new ArrayList(numberOfLinkValues); + this.value = value; + } + } + +}