From 5e935d710604a5d03d20e3aebe265e1c799e2f2c Mon Sep 17 00:00:00 2001
From: Alan Woodward
Date: Mon, 25 Jan 2016 16:10:49 +0000
Subject: [PATCH] LUCENE-6992: Allow adding IndexableField to MemoryIndex

---
 lucene/CHANGES.txt                            |   3 +
 .../lucene/index/memory/MemoryIndex.java      | 107 +++++++++++++-----
 .../lucene/index/memory/TestMemoryIndex.java  |  27 +++++
 3 files changed, 106 insertions(+), 31 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4b9b4662c85..36a667495ef 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -257,6 +257,9 @@ Other
   Locale#forLanguageTag() and Locale#toString() were placed on list of
   forbidden signatures. (Uwe Schindler, Robert Muir)
 
+* LUCENE-6992: You can now add IndexableFields directly to a MemoryIndex,
+  and create a MemoryIndex from a lucene Document. (Alan Woodward)
+
 ======================= Lucene 5.4.1 =======================
 
 Bug Fixes
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index dec42d522f3..e5e461b871c 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -33,46 +33,18 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.PointValues;
-import org.apache.lucene.index.DocValuesType;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.FieldInfos;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.NumericDocValues;
-import org.apache.lucene.index.OrdTermState;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.SortedDocValues;
-import org.apache.lucene.index.SortedNumericDocValues;
-import org.apache.lucene.index.SortedSetDocValues;
-import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.TermState;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.*;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.SimpleCollector;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.ByteBlockPool;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefArray;
-import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.*;
 import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
-import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.IntBlockPool.SliceReader;
 import org.apache.lucene.util.IntBlockPool.SliceWriter;
-import org.apache.lucene.util.IntBlockPool;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.RecyclingByteBlockAllocator;
-import org.apache.lucene.util.RecyclingIntBlockAllocator;
 
 /**
  * High-performance single-document main memory Apache Lucene fulltext search index.
@@ -288,6 +260,46 @@ public class MemoryIndex {
     addField(fieldName, stream, 1.0f, analyzer.getPositionIncrementGap(fieldName), analyzer.getOffsetGap(fieldName));
   }
 
+  /**
+   * Builds a MemoryIndex from a lucene {@link Document} using an analyzer
+   *
+   * @param document the document to index
+   * @param analyzer the analyzer to use
+   * @return a MemoryIndex
+   */
+  public static MemoryIndex fromDocument(Document document, Analyzer analyzer) {
+    return fromDocument(document, analyzer, false, false, 0);
+  }
+
+  /**
+   * Builds a MemoryIndex from a lucene {@link Document} using an analyzer
+   * @param document the document to index
+   * @param analyzer the analyzer to use
+   * @param storeOffsets true if offsets should be stored
+   * @param storePayloads true if payloads should be stored
+   * @return a MemoryIndex
+   */
+  public static MemoryIndex fromDocument(Document document, Analyzer analyzer, boolean storeOffsets, boolean storePayloads) {
+    return fromDocument(document, analyzer, storeOffsets, storePayloads, 0);
+  }
+
+  /**
+   * Builds a MemoryIndex from a lucene {@link Document} using an analyzer
+   * @param document the document to index
+   * @param analyzer the analyzer to use
+   * @param storeOffsets true if offsets should be stored
+   * @param storePayloads true if payloads should be stored
+   * @param maxReusedBytes the number of bytes that should remain in the internal memory pools after {@link #reset()} is called
+   * @return a MemoryIndex
+   */
+  public static MemoryIndex fromDocument(Document document, Analyzer analyzer, boolean storeOffsets, boolean storePayloads, long maxReusedBytes) {
+    MemoryIndex mi = new MemoryIndex(storeOffsets, storePayloads, maxReusedBytes);
+    for (IndexableField field : document) {
+      mi.addField(field, analyzer);
+    }
+    return mi;
+  }
+
   /**
    * Convenience method; Creates and returns a token stream that generates a
    * token for each keyword in the given collection, "as is", without any
@@ -339,6 +351,39 @@ public class MemoryIndex {
   public void addField(String fieldName, TokenStream stream) {
     addField(fieldName, stream, 1.0f);
   }
+
+  /**
+   * Adds a lucene {@link IndexableField} to the MemoryIndex using the provided analyzer
+   * @param field the field to add
+   * @param analyzer the analyzer to use for term analysis
+   * @throws IllegalArgumentException if the field is a DocValues or Point field, as these
+   *                                  structures are not supported by MemoryIndex
+   */
+  public void addField(IndexableField field, Analyzer analyzer) {
+    addField(field, analyzer, 1.0f);
+  }
+
+  /**
+   * Adds a lucene {@link IndexableField} to the MemoryIndex using the provided analyzer
+   * @param field the field to add
+   * @param analyzer the analyzer to use for term analysis
+   * @param boost a field boost
+   * @throws IllegalArgumentException if the field is a DocValues or Point field, as these
+   *                                  structures are not supported by MemoryIndex
+   */
+  public void addField(IndexableField field, Analyzer analyzer, float boost) {
+    if (field.fieldType().docValuesType() != DocValuesType.NONE)
+      throw new IllegalArgumentException("MemoryIndex does not support DocValues fields");
+    if (field.fieldType().pointDimensionCount() != 0)
+      throw new IllegalArgumentException("MemoryIndex does not support Points");
+    if (analyzer == null) {
+      addField(field.name(), field.tokenStream(null, null), boost);
+    }
+    else {
+      addField(field.name(), field.tokenStream(analyzer, null), boost,
+          analyzer.getPositionIncrementGap(field.name()), analyzer.getOffsetGap(field.name()));
+    }
+  }
 
   /**
    * Iterates over the given token stream and adds the resulting terms to the index;
diff --git a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
index 9011231ae63..6f889048e57 100644
--- a/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
+++ b/lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java
@@ -22,12 +22,17 @@ import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockPayloadAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.similarities.BM25Similarity;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
@@ -156,5 +161,27 @@ public class TestMemoryIndex extends LuceneTestCase {
     TestUtil.checkReader(reader);
   }
 
+  @Test
+  public void testBuildFromDocument() {
+
+    Document doc = new Document();
+    doc.add(new TextField("field1", "some text", Field.Store.NO));
+    doc.add(new TextField("field1", "some more text", Field.Store.NO));
+    doc.add(new StringField("field2", "untokenized text", Field.Store.NO));
+
+    analyzer.setPositionIncrementGap(100);
+
+    MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
+
+    assertThat(mi.search(new TermQuery(new Term("field1", "text"))), not(0.0f));
+    assertThat(mi.search(new TermQuery(new Term("field2", "text"))), is(0.0f));
+    assertThat(mi.search(new TermQuery(new Term("field2", "untokenized text"))), not(0.0f));
+
+    assertThat(mi.search(new PhraseQuery("field1", "some", "more", "text")), not(0.0f));
+    assertThat(mi.search(new PhraseQuery("field1", "some", "text")), not(0.0f));
+    assertThat(mi.search(new PhraseQuery("field1", "text", "some")), is(0.0f));
+
+  }
+
 }
 
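
Usage sketch (illustrative only, not part of the patch above): the snippet below exercises the two entry points this change adds, MemoryIndex.fromDocument() and addField(IndexableField, Analyzer). It assumes StandardAnalyzer from lucene-analyzers-common is on the classpath alongside lucene-memory; the example class name and field names are made up.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class MemoryIndexFromDocumentExample {

  public static void main(String[] args) {
    Analyzer analyzer = new StandardAnalyzer();

    // Build a single-document in-memory index from a lucene Document in one call.
    Document doc = new Document();
    doc.add(new TextField("body", "memory index example text", Field.Store.NO));
    MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);

    // search() returns a relevance score; 0.0f means the query did not match.
    float hit = mi.search(new TermQuery(new Term("body", "example")));
    float miss = mi.search(new TermQuery(new Term("body", "absent")));
    System.out.println("hit=" + hit + " miss=" + miss);

    // Fields can also be added one at a time with the new addField overload;
    // DocValues and Point fields are rejected with IllegalArgumentException.
    MemoryIndex incremental = new MemoryIndex();
    incremental.addField(new TextField("title", "another example", Field.Store.NO), analyzer);
    System.out.println("title match=" + incremental.search(new TermQuery(new Term("title", "another"))));
  }
}

As the patch shows, fromDocument() is just a convenience wrapper that loops over the Document's IndexableFields and calls the per-field addField(field, analyzer) for each one.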