diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index cc9df3763ae..f38fe203e45 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -58,6 +58,8 @@ New Features way as DirectSpellChecker. This can be used to merge top-N results from more than one SpellChecker. (James Dyer via Robert Muir) + * LUCENE-3602: Added query time joining under the join module. (Martijn van Groningen, Michael McCandless) + API Changes * LUCENE-2606: Changed RegexCapabilities interface to fix thread diff --git a/modules/join/src/java/org/apache/lucene/search/join/JoinUtil.java b/modules/join/src/java/org/apache/lucene/search/join/JoinUtil.java new file mode 100644 index 00000000000..481fd945910 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/JoinUtil.java @@ -0,0 +1,61 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; + +import java.io.IOException; + +/** + * Utility for query time joining using {@link TermsQuery} and {@link TermsCollector}. + * + * @lucene.experimental + */ +public final class JoinUtil { + + // No instances allowed + private JoinUtil() { + } + + /** + * Method for query time joining. + *
+ * Execute the returned query with a {@link IndexSearcher} to retrieve all documents that have the same terms in the + * to field that match with documents matching the specified fromQuery and have the same terms in the from field. + * + * @param fromField The from field to join from + * @param multipleValuesPerDocument Whether the from field has multiple terms per document + * @param toField The to field to join to + * @param fromQuery The query to match documents on the from side + * @param fromSearcher The searcher that executed the specified fromQuery + * @return a {@link Query} instance that can be used to join documents based on the + * terms in the from and to field + * @throws IOException If I/O related errors occur + */ + public static Query createJoinQuery(String fromField, + boolean multipleValuesPerDocument, + String toField, + Query fromQuery, + IndexSearcher fromSearcher) throws IOException { + TermsCollector termsCollector = TermsCollector.create(fromField, multipleValuesPerDocument); + fromSearcher.search(fromQuery, termsCollector); + return new TermsQuery(toField, termsCollector.getCollectorTerms()); + } + +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/TermsCollector.java b/modules/join/src/java/org/apache/lucene/search/join/TermsCollector.java new file mode 100644 index 00000000000..b8df2256deb --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/TermsCollector.java @@ -0,0 +1,123 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.DocTermOrds; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; + +import java.io.IOException; + +/** + * A collector that collects all terms from a specified field matching the query. + * + * @lucene.experimental + */ +abstract class TermsCollector extends Collector { + + final String field; + final BytesRefHash collectorTerms = new BytesRefHash(); + + TermsCollector(String field) { + this.field = field; + } + + public BytesRefHash getCollectorTerms() { + return collectorTerms; + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public boolean acceptsDocsOutOfOrder() { + return true; + } + + /** + * Chooses the right {@link TermsCollector} implementation. + * + * @param field The field to collect terms for + * @param multipleValuesPerDocument Whether the field to collect terms for has multiple values per document. 
+ * @return a {@link TermsCollector} instance + */ + static TermsCollector create(String field, boolean multipleValuesPerDocument) { + return multipleValuesPerDocument ? new MV(field) : new SV(field); + } + + // impl that works with multiple values per document + static class MV extends TermsCollector { + + private DocTermOrds docTermOrds; + private TermsEnum docTermsEnum; + private DocTermOrds.TermOrdsIterator reuse; + + MV(String field) { + super(field); + } + + public void collect(int doc) throws IOException { + reuse = docTermOrds.lookup(doc, reuse); + int[] buffer = new int[5]; + + int chunk; + do { + chunk = reuse.read(buffer); + if (chunk == 0) { + return; + } + + for (int idx = 0; idx < chunk; idx++) { + int key = buffer[idx]; + docTermsEnum.seekExact((long) key); + collectorTerms.add(docTermsEnum.term()); + } + } while (chunk >= buffer.length); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader, field); + docTermsEnum = docTermOrds.getOrdTermsEnum(context.reader); + reuse = null; // LUCENE-3377 needs to be fixed first then this statement can be removed... + } + } + + // impl that works with single value per document + static class SV extends TermsCollector { + + final BytesRef spare = new BytesRef(); + private FieldCache.DocTerms fromDocTerms; + + SV(String field) { + super(field); + } + + public void collect(int doc) throws IOException { + collectorTerms.add(fromDocTerms.getTerm(doc, spare)); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + fromDocTerms = FieldCache.DEFAULT.getTerms(context.reader, field); + } + } + +} diff --git a/modules/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/modules/join/src/java/org/apache/lucene/search/join/TermsQuery.java new file mode 100644 index 00000000000..71136c4eec2 --- /dev/null +++ b/modules/join/src/java/org/apache/lucene/search/join/TermsQuery.java @@ -0,0 +1,135 @@ +package org.apache.lucene.search.join; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.FilteredTermsEnum; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; + +import java.io.IOException; +import java.util.Comparator; + +/** + * A query that has an array of terms from a specific field. This query will match documents have one or more terms in + * the specified field that match with the terms specified in the array. 
+ * + * @lucene.experimental + */ +class TermsQuery extends MultiTermQuery { + + private final BytesRefHash terms; + + /** + * @param field The field that should contain terms that are specified in the previous parameter + * @param terms The terms that matching documents should have. The terms must be sorted by natural order. + */ + TermsQuery(String field, BytesRefHash terms) { + super(field); + this.terms = terms; + } + + protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { + if (this.terms.size() == 0) { + return TermsEnum.EMPTY; + } + + return new SeekingTermSetTermsEnum(terms.iterator(null), this.terms); + } + + public String toString(String string) { + return "TermsQuery{" + + "field=" + field + + '}'; + } + + static class SeekingTermSetTermsEnum extends FilteredTermsEnum { + + private final BytesRefHash terms; + private final int[] ords; + private final int lastElement; + + private final BytesRef lastTerm; + private final BytesRef spare = new BytesRef(); + private final ComparatorThis module supports index-time joins while searching, where joined +
This module supports index-time and query-time joins.
+ +Index-time joining supports joins while searching, where joined documents are indexed as a single document block using {@link org.apache.lucene.index.IndexWriter#addDocuments}. This is useful for any normalized content (XML documents or database tables). In database terms, all rows for all joined tables matching a single row of the primary table must be @@ -34,5 +38,37 @@ org.apache.lucene.search.join.ToChildBlockJoinQuery}. This wraps any query matching parent documents, creating the joined query matching only child documents.
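A minimal, illustrative sketch of this block-join pattern (not part of this patch; the "type"/"parent" marker field, the writer/searcher variables, and the exact constructor arguments are assumptions and may vary by Lucene version):

    // Index one block: child documents first, the parent document last.
    List<Document> block = new ArrayList<Document>();

    Document child = new Document();
    child.add(new Field("price", "10.0", TextField.TYPE_STORED));
    block.add(child);

    Document parent = new Document();
    parent.add(new Field("type", "parent", TextField.TYPE_STORED));
    parent.add(new Field("name", "name1", TextField.TYPE_STORED));
    block.add(parent);

    writer.addDocuments(block); // the block is indexed as contiguous documents

    // Search: identify parents with a filter, match them with a query, and join down to their children.
    Filter parentsFilter = new CachingWrapperFilter(
        new QueryWrapperFilter(new TermQuery(new Term("type", "parent"))));
    Query parentQuery = new TermQuery(new Term("name", "name1"));
    Query childJoinQuery = new ToChildBlockJoinQuery(parentQuery, parentsFilter, false);
    TopDocs childHits = searcher.search(childJoinQuery, 10);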
+ Query-time joining is terms based and implemented as a two-pass search. The first pass collects all the terms from a fromField + that match the fromQuery. The second pass returns all documents that have terms in a toField matching the terms + collected in the first pass. +
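Internally, the two passes correspond to the TermsCollector and TermsQuery classes added by this patch; roughly (both classes are package-private, so application code should go through JoinUtil rather than this sketch):

    // Pass 1: collect every term in fromField from the documents that match fromQuery.
    TermsCollector termsCollector = TermsCollector.create(fromField, multipleValuesPerDocument);
    fromSearcher.search(fromQuery, termsCollector);

    // Pass 2: build a query matching any document whose toField contains one of the collected terms.
    Query joinQuery = new TermsQuery(toField, termsCollector.getCollectorTerms());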
+Query-time joining has the following inputs:
+fromField
: The from field to join from.
+ fromQuery
: The query executed to collect the from terms. This is usually the user-specified query.
+ multipleValuesPerDocument
: Whether the fromField contains more than one value per document.
+ toField
: The to field to join to.
+
+ The query-time joining is accessible through a single static method. The user of this method supplies the method
+ with the described inputs and an IndexSearcher from which the from terms are to be collected. The returned
+ query can be executed with the same IndexSearcher, but also with another IndexSearcher.
+ Example usage of the {@link org.apache.lucene.search.join.JoinUtil#createJoinQuery(String, boolean, String, org.apache.lucene.search.Query, org.apache.lucene.search.IndexSearcher)} :
+
+ String fromField = "from"; // Name of the from field
+ boolean multipleValuesPerDocument = false; // Set to true only when your fromField has multiple values per document in your index
+ String toField = "to"; // Name of the to field
+ Query fromQuery = new TermQuery(new Term("content", searchTerm)); // Query executed to collect from values to join to the to values
+
+ Query joinQuery = JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery, fromSearcher);
+ TopDocs topDocs = toSearcher.search(joinQuery, 10); // Note: toSearcher can be the same as the fromSearcher
+ // Render topDocs...
+
+
diff --git a/modules/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java b/modules/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java
new file mode 100644
index 00000000000..60c50bb0b25
--- /dev/null
+++ b/modules/join/src/test/org/apache/lucene/search/join/TestJoinUtil.java
@@ -0,0 +1,357 @@
+package org.apache.lucene.search.join;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.Test; + +import java.io.IOException; +import java.util.*; + +public class TestJoinUtil extends LuceneTestCase { + + public void testSimple() throws Exception { + final String idField = "id"; + final String toField = "productId"; + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())); + + // 0 + Document doc = new Document(); + doc.add(new Field("description", "random text", TextField.TYPE_STORED)); + doc.add(new Field("name", "name1", TextField.TYPE_STORED)); + doc.add(new Field(idField, "1", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 1 + doc = new Document(); + doc.add(new Field("price", "10.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "2", TextField.TYPE_STORED)); + doc.add(new Field(toField, "1", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 2 + doc = new Document(); + doc.add(new Field("price", "20.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "3", TextField.TYPE_STORED)); + doc.add(new Field(toField, "1", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 3 + doc = new Document(); + doc.add(new Field("description", "more random text", TextField.TYPE_STORED)); + doc.add(new Field("name", "name2", TextField.TYPE_STORED)); + doc.add(new Field(idField, "4", TextField.TYPE_STORED)); + w.addDocument(doc); + w.commit(); + + // 4 + doc = new Document(); + doc.add(new Field("price", "10.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "5", TextField.TYPE_STORED)); + doc.add(new Field(toField, "4", TextField.TYPE_STORED)); + w.addDocument(doc); + + // 5 + doc = new Document(); + doc.add(new Field("price", "20.0", TextField.TYPE_STORED)); + doc.add(new Field(idField, "6", TextField.TYPE_STORED)); + doc.add(new Field(toField, "4", TextField.TYPE_STORED)); + w.addDocument(doc); + + IndexSearcher indexSearcher = new IndexSearcher(w.getReader()); + w.close(); + + // Search for product + Query joinQuery = + JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name2")), indexSearcher); + + TopDocs result = indexSearcher.search(joinQuery, 10); + assertEquals(2, result.totalHits); + assertEquals(4, result.scoreDocs[0].doc); + assertEquals(5, result.scoreDocs[1].doc); + + joinQuery = JoinUtil.createJoinQuery(idField, false, toField, new TermQuery(new Term("name", "name1")), indexSearcher); + result = indexSearcher.search(joinQuery, 10); + assertEquals(2, result.totalHits); + assertEquals(1, result.scoreDocs[0].doc); + assertEquals(2, result.scoreDocs[1].doc); + + // Search for offer + joinQuery = JoinUtil.createJoinQuery(toField, false, idField, new TermQuery(new Term("id", "5")), indexSearcher); + result = indexSearcher.search(joinQuery, 10); + assertEquals(1, result.totalHits); + assertEquals(3, result.scoreDocs[0].doc); + + indexSearcher.getIndexReader().close(); + dir.close(); + } + + @Test + public void 
testSingleValueRandomJoin() throws Exception { + int maxIndexIter = _TestUtil.nextInt(random, 6, 12); + int maxSearchIter = _TestUtil.nextInt(random, 13, 26); + executeRandomJoin(false, maxIndexIter, maxSearchIter); + } + + @Test + // This test really takes more time, that is why the number of iterations are smaller. + public void testMultiValueRandomJoin() throws Exception { + int maxIndexIter = _TestUtil.nextInt(random, 3, 6); + int maxSearchIter = _TestUtil.nextInt(random, 6, 12); + executeRandomJoin(true, maxIndexIter, maxSearchIter); + } + + private void executeRandomJoin(boolean multipleValuesPerDocument, int maxIndexIter, int maxSearchIter) throws Exception { + for (int indexIter = 1; indexIter <= maxIndexIter; indexIter++) { + if (VERBOSE) { + System.out.println("indexIter=" + indexIter); + } + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + random, + dir, + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy()) + ); + int numberOfDocumentsToIndex = _TestUtil.nextInt(random, 87, 764); + IndexIterationContext context = createContext(numberOfDocumentsToIndex, w, multipleValuesPerDocument); + + IndexReader topLevelReader = w.getReader(); + w.close(); + for (int searchIter = 1; searchIter <= maxSearchIter; searchIter++) { + if (VERBOSE) { + System.out.println("searchIter=" + searchIter); + } + IndexSearcher indexSearcher = newSearcher(topLevelReader); + + int r = random.nextInt(context.randomUniqueValues.length); + boolean from = context.randomFrom[r]; + String randomValue = context.randomUniqueValues[r]; + FixedBitSet expectedResult = createExpectedResult(randomValue, from, indexSearcher.getIndexReader(), context); + + Query actualQuery = new TermQuery(new Term("value", randomValue)); + if (VERBOSE) { + System.out.println("actualQuery=" + actualQuery); + } + Query joinQuery; + if (from) { + joinQuery = JoinUtil.createJoinQuery("from", multipleValuesPerDocument, "to", actualQuery, indexSearcher); + } else { + joinQuery = JoinUtil.createJoinQuery("to", multipleValuesPerDocument, "from", actualQuery, indexSearcher); + } + if (VERBOSE) { + System.out.println("joinQuery=" + joinQuery); + } + + // Need to know all documents that have matches. TopDocs doesn't give me that and then I'd be also testing TopDocsCollector... 
+ final FixedBitSet actualResult = new FixedBitSet(indexSearcher.getIndexReader().maxDoc()); + indexSearcher.search(joinQuery, new Collector() { + + int docBase; + + public void collect(int doc) throws IOException { + actualResult.set(doc + docBase); + } + + public void setNextReader(IndexReader.AtomicReaderContext context) throws IOException { + docBase = context.docBase; + } + + public void setScorer(Scorer scorer) throws IOException { + } + + public boolean acceptsDocsOutOfOrder() { + return true; + } + }); + + if (VERBOSE) { + System.out.println("expected cardinality:" + expectedResult.cardinality()); + DocIdSetIterator iterator = expectedResult.iterator(); + for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) { + System.out.println(String.format("Expected doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id"))); + } + System.out.println("actual cardinality:" + actualResult.cardinality()); + iterator = actualResult.iterator(); + for (int doc = iterator.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iterator.nextDoc()) { + System.out.println(String.format("Actual doc[%d] with id value %s", doc, indexSearcher.doc(doc).get("id"))); + } + } + + assertEquals(expectedResult, actualResult); + } + topLevelReader.close(); + dir.close(); + } + } + + private IndexIterationContext createContext(int nDocs, RandomIndexWriter writer, boolean multipleValuesPerDocument) throws IOException { + return createContext(nDocs, writer, writer, multipleValuesPerDocument); + } + + private IndexIterationContext createContext(int nDocs, RandomIndexWriter fromWriter, RandomIndexWriter toWriter, boolean multipleValuesPerDocument) throws IOException { + IndexIterationContext context = new IndexIterationContext(); + int numRandomValues = nDocs / 2; + context.randomUniqueValues = new String[numRandomValues]; + Set