diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6f668c9e72b..7441219c291 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -89,6 +89,10 @@ New Features tail" suggestions, when a primary suggester fails to find a suggestion. (Mike McCandless) +* LUCENE-5251: New DocumentDictionary allows building suggesters via + contents of existing field, weight and optionally payload stored + fields in an index (Areek Zillur via Mike McCandless) + Bug Fixes * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java new file mode 100644 index 00000000000..35199613512 --- /dev/null +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java @@ -0,0 +1,174 @@ +package org.apache.lucene.search.suggest; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.StorableField; +import org.apache.lucene.index.StoredDocument; +import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.search.spell.TermFreqPayloadIterator; +import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester; // javadoc +import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; // javadoc +import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; // javadoc +import org.apache.lucene.search.suggest.jaspell.JaspellLookup; // javadoc +import org.apache.lucene.search.suggest.tst.TSTLookup; // javadoc +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; + +/** + * Dictionary with terms, weights and optionally payload information + * taken from stored fields in a Lucene index. + * + * NOTE: + * + */ +public class DocumentDictionary implements Dictionary { + + private final IndexReader reader; + private final String field; + private final String weightField; + private final String payloadField; + + /** + * Creates a new dictionary with the contents of the fields named field + * for the terms and weightField for the weights that will be used for + * the corresponding terms. + */ + public DocumentDictionary(IndexReader reader, String field, String weightField) { + this.reader = reader; + this.field = field; + this.weightField = weightField; + this.payloadField = null; + } + + /** + * Creates a new dictionary with the contents of the fields named field + * for the terms, weightField for the weights that will be used for the + * the corresponding terms and payloadField for the corresponding payloads + * for the entry. + */ + public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField) { + this.reader = reader; + this.field = field; + this.weightField = weightField; + this.payloadField = payloadField; + } + + @Override + public BytesRefIterator getWordsIterator() throws IOException { + return new TermWeightPayloadIterator(payloadField!=null); + } + + final class TermWeightPayloadIterator implements TermFreqPayloadIterator { + private final int docCount; + private final Set relevantFields; + private final boolean withPayload; + private final Bits liveDocs; + private int currentDocId = -1; + private long currentWeight; + private BytesRef currentPayload; + + /** + * Creates an iterator over term, weight and payload fields from the lucene + * index. setting withPayload to false, implies an iterator + * over only term and weight. + */ + public TermWeightPayloadIterator(boolean withPayload) throws IOException { + docCount = reader.maxDoc() - 1; + this.withPayload = withPayload; + currentPayload = null; + liveDocs = MultiFields.getLiveDocs(reader); + List relevantFieldList; + if(withPayload) { + relevantFieldList = Arrays.asList(field, weightField, payloadField); + } else { + relevantFieldList = Arrays.asList(field, weightField); + } + this.relevantFields = new HashSet<>(relevantFieldList); + } + + @Override + public long weight() { + return currentWeight; + } + + @Override + public BytesRef next() throws IOException { + while (currentDocId < docCount) { + currentDocId++; + if (liveDocs != null && !liveDocs.get(currentDocId)) { + continue; + } + + StoredDocument doc = reader.document(currentDocId, relevantFields); + + if (withPayload) { + StorableField payload = doc.getField(payloadField); + if (payload == null) { + throw new IllegalArgumentException(payloadField + " does not exist"); + } else if (payload.binaryValue() == null) { + throw new IllegalArgumentException(payloadField + " does not have binary value"); + } + currentPayload = payload.binaryValue(); + } + + StorableField weight = doc.getField(weightField); + if (weight == null) { + throw new IllegalArgumentException(weightField + " does not exist"); + } else if (weight.numericValue() == null) { + throw new IllegalArgumentException(weightField + " does not have numeric value"); + } + currentWeight = weight.numericValue().longValue(); + + StorableField fieldVal = doc.getField(field); + if (fieldVal == null) { + throw new IllegalArgumentException(field + " does not exist"); + } else if(fieldVal.stringValue() == null) { + throw new IllegalArgumentException(field + " does not have string value"); + } + + return new BytesRef(fieldVal.stringValue()); + } + return null; + } + + @Override + public BytesRef payload() { + return currentPayload; + } + + } +} diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java new file mode 100644 index 00000000000..efdd09cb522 --- /dev/null +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java @@ -0,0 +1,168 @@ +package org.apache.lucene.search.suggest; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.search.spell.TermFreqPayloadIterator; +import org.apache.lucene.search.suggest.DocumentDictionary; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class DocumentDictionaryTest extends LuceneTestCase { + + static final String FIELD_NAME = "f1"; + static final String WEIGHT_FIELD_NAME = "w1"; + static final String PAYLOAD_FIELD_NAME = "p1"; + + private Map generateIndexDocuments(int ndocs) { + Map docs = new HashMap<>(); + for(int i = 0; i < ndocs ; i++) { + Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES); + Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i)); + Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i); + Document doc = new Document(); + doc.add(field); + doc.add(payload); + doc.add(weight); + docs.put(field.stringValue(), doc); + } + return docs; + } + + @Test + public void testBasic() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + Map docs = generateIndexDocuments(10); + for(Document doc: docs.values()) { + writer.addDocument(doc); + } + writer.commit(); + writer.close(); + IndexReader ir = DirectoryReader.open(dir); + Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME); + TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator(); + BytesRef f; + while((f = tfp.next())!=null) { + Document doc = docs.remove(f.utf8ToString()); + assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); + assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue()); + assertTrue(tfp.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue())); + } + assertTrue(docs.isEmpty()); + ir.close(); + dir.close(); + } + + @Test + public void testWithoutPayload() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + Map docs = generateIndexDocuments(10); + for(Document doc: docs.values()) { + writer.addDocument(doc); + } + writer.commit(); + writer.close(); + IndexReader ir = DirectoryReader.open(dir); + Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME); + TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator(); + BytesRef f; + while((f = tfp.next())!=null) { + Document doc = docs.remove(f.utf8ToString()); + assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); + assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue()); + assertEquals(tfp.payload(), null); + } + assertTrue(docs.isEmpty()); + ir.close(); + dir.close(); + } + + @Test + public void testWithDeletions() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + Map docs = generateIndexDocuments(10); + Random rand = random(); + List termsToDel = new ArrayList<>(); + for(Document doc : docs.values()) { + if(rand.nextBoolean()) { + termsToDel.add(doc.get(FIELD_NAME)); + } + writer.addDocument(doc); + } + writer.commit(); + + Term[] delTerms = new Term[termsToDel.size()]; + for(int i=0; i < termsToDel.size() ; i++) { + delTerms[i] = new Term(FIELD_NAME, termsToDel.get(i)); + } + + for(Term delTerm: delTerms) { + writer.deleteDocuments(delTerm); + } + writer.commit(); + writer.close(); + + for(String termToDel: termsToDel) { + assertTrue(null!=docs.remove(termToDel)); + } + + IndexReader ir = DirectoryReader.open(dir); + assertEquals(ir.numDocs(), docs.size()); + Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME); + TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator(); + BytesRef f; + while((f = tfp.next())!=null) { + Document doc = docs.remove(f.utf8ToString()); + assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME)))); + assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue()); + assertEquals(tfp.payload(), null); + } + assertTrue(docs.isEmpty()); + ir.close(); + dir.close(); + } +}