diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9800fa8d38c..f03765d27cb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -65,6 +65,11 @@ API Changes * LUCENE-5114: Remove unused boolean useCache parameter from TermsEnum.seekCeil and .seekExact (Mike McCandless) +Optimizations + +* LUCENE-5088: Added TermFilter to filter docs by a specific term. + (Martijn van Groningen) + ======================= Lucene 4.4.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/queries/src/java/org/apache/lucene/queries/TermFilter.java b/lucene/queries/src/java/org/apache/lucene/queries/TermFilter.java new file mode 100644 index 00000000000..0a61b978fdc --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/TermFilter.java @@ -0,0 +1,100 @@ +package org.apache.lucene.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.util.Bits; + +import java.io.IOException; + +/** + * A filter that includes documents that match with a specific term. + */ +final public class TermFilter extends Filter { + + private final Term term; + + /** + * @param term The term documents need to have in order to be a match for this filter. + */ + public TermFilter(Term term) { + if (term == null) { + throw new IllegalArgumentException("Term must not be null"); + } else if (term.field() == null) { + throw new IllegalArgumentException("Field must not be null"); + } + this.term = term; + } + + /** + * @return The term this filter includes documents with. + */ + public Term getTerm() { + return term; + } + + @Override + public DocIdSet getDocIdSet(AtomicReaderContext context, final Bits acceptDocs) throws IOException { + Terms terms = context.reader().terms(term.field()); + if (terms == null) { + return null; + } + + final TermsEnum termsEnum = terms.iterator(null); + if (!termsEnum.seekExact(term.bytes())) { + return null; + } + return new DocIdSet() { + @Override + public DocIdSetIterator iterator() throws IOException { + return termsEnum.docs(acceptDocs, null, DocsEnum.FLAG_NONE); + } + + }; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + TermFilter that = (TermFilter) o; + + if (term != null ? !term.equals(that.term) : that.term != null) return false; + + return true; + } + + @Override + public int hashCode() { + return term != null ? term.hashCode() : 0; + } + + @Override + public String toString() { + return term.field() + ":" + term.text(); + } + +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TermFilterTest.java b/lucene/queries/src/test/org/apache/lucene/queries/TermFilterTest.java new file mode 100644 index 00000000000..a8705e0c0b6 --- /dev/null +++ b/lucene/queries/src/test/org/apache/lucene/queries/TermFilterTest.java @@ -0,0 +1,182 @@ +package org.apache.lucene.queries; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.SlowCompositeReaderWrapper; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +public class TermFilterTest extends LuceneTestCase { + + public void testCachability() throws Exception { + TermFilter a = termFilter("field1", "a"); + HashSet cachedFilters = new HashSet(); + cachedFilters.add(a); + assertTrue("Must be cached", cachedFilters.contains(termFilter("field1", "a"))); + assertFalse("Must not be cached", cachedFilters.contains(termFilter("field1", "b"))); + assertFalse("Must not be cached", cachedFilters.contains(termFilter("field2", "a"))); + } + + public void testMissingTermAndField() throws Exception { + String fieldName = "field1"; + Directory rd = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), rd); + Document doc = new Document(); + doc.add(newStringField(fieldName, "value1", Field.Store.NO)); + w.addDocument(doc); + IndexReader reader = new SlowCompositeReaderWrapper(w.getReader()); + assertTrue(reader.getContext() instanceof AtomicReaderContext); + AtomicReaderContext context = (AtomicReaderContext) reader.getContext(); + w.close(); + + DocIdSet idSet = termFilter(fieldName, "value1").getDocIdSet(context, context.reader().getLiveDocs()); + assertNotNull("must not be null", idSet); + DocIdSetIterator iter = idSet.iterator(); + assertEquals(iter.nextDoc(), 0); + assertEquals(iter.nextDoc(), DocIdSetIterator.NO_MORE_DOCS); + + idSet = termFilter(fieldName, "value2").getDocIdSet(context, context.reader().getLiveDocs()); + assertNull("must be null", idSet); + + idSet = termFilter("field2", "value1").getDocIdSet(context, context.reader().getLiveDocs()); + assertNull("must be null", idSet); + + reader.close(); + rd.close(); + } + + public void testRandom() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + int num = atLeast(100); + List terms = new ArrayList(); + for (int i = 0; i < num; i++) { + String field = "field" + i; + String string = _TestUtil.randomRealisticUnicodeString(random()); + terms.add(new Term(field, string)); + Document doc = new Document(); + doc.add(newStringField(field, string, Field.Store.NO)); + w.addDocument(doc); + } + IndexReader reader = w.getReader(); + w.close(); + + IndexSearcher searcher = newSearcher(reader); + + int numQueries = atLeast(10); + for (int i = 0; i < numQueries; i++) { + Term term = terms.get(random().nextInt(num)); + TopDocs queryResult = searcher.search(new TermQuery(term), reader.maxDoc()); + + MatchAllDocsQuery matchAll = new MatchAllDocsQuery(); + final TermFilter filter = termFilter(term); + TopDocs filterResult = searcher.search(matchAll, filter, reader.maxDoc()); + assertEquals(filterResult.totalHits, queryResult.totalHits); + ScoreDoc[] scoreDocs = filterResult.scoreDocs; + for (int j = 0; j < scoreDocs.length; j++) { + assertEquals(scoreDocs[j].doc, queryResult.scoreDocs[j].doc); + } + } + + reader.close(); + dir.close(); + } + + public void testHashCodeAndEquals() { + int num = atLeast(100); + for (int i = 0; i < num; i++) { + String field1 = "field" + i; + String field2 = "field" + i + num; + String value1 = _TestUtil.randomRealisticUnicodeString(random()); + String value2 = _TestUtil.randomRealisticUnicodeString(random()); + + TermFilter filter1 = termFilter(field1, value1); + TermFilter filter2 = termFilter(field1, value2); + TermFilter filter3 = termFilter(field2, value1); + TermFilter filter4 = termFilter(field2, value2); + TermFilter[] filters = new TermFilter[]{filter1, filter2, filter3, filter4}; + for (int j = 0; j < filters.length; j++) { + TermFilter termFilter = filters[j]; + for (int k = 0; k < filters.length; k++) { + TermFilter otherTermFilter = filters[k]; + if (j == k) { + assertEquals(termFilter, otherTermFilter); + assertEquals(termFilter.hashCode(), otherTermFilter.hashCode()); + assertTrue(termFilter.equals(otherTermFilter)); + } else { + assertFalse(termFilter.equals(otherTermFilter)); + } + } + } + + TermFilter filter5 = termFilter(field2, value2); + assertEquals(filter5, filter4); + assertEquals(filter5.hashCode(), filter4.hashCode()); + assertTrue(filter5.equals(filter4)); + + assertEquals(filter5, filter4); + assertTrue(filter5.equals(filter4)); + } + } + + public void testNoTerms() { + try { + new TermFilter(null); + fail("must fail - no term!"); + } catch (IllegalArgumentException e) {} + + try { + new TermFilter(new Term(null)); + fail("must fail - no field!"); + } catch (IllegalArgumentException e) {} + } + + public void testToString() { + TermFilter termsFilter = new TermFilter(new Term("field1", "a")); + assertEquals("field1:a", termsFilter.toString()); + } + + private TermFilter termFilter(String field, String value) { + return termFilter(new Term(field, value)); + } + + private TermFilter termFilter(Term term) { + return new TermFilter(term); + } + +}