LUCENE-6992: Allow adding IndexableField to MemoryIndex

Alan Woodward 2016-01-25 16:10:49 +00:00
parent a3b2ad334c
commit 5e935d7106
3 changed files with 106 additions and 31 deletions

lucene/CHANGES.txt

@@ -257,6 +257,9 @@ Other
Locale#forLanguageTag() and Locale#toString() were placed on list
of forbidden signatures. (Uwe Schindler, Robert Muir)
* LUCENE-6988: You can now add IndexableFields directly to a MemoryIndex,
and create a MemoryIndex from a Lucene Document. (Alan Woodward)
======================= Lucene 5.4.1 =======================
Bug Fixes
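
To make the new entry concrete, here is a minimal, self-contained sketch of the API it describes. The field name, text, and choice of StandardAnalyzer are illustrative assumptions, not part of the patch:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class FromDocumentExample {
  public static void main(String[] args) {
    Document doc = new Document();
    doc.add(new TextField("body", "quick brown fox", Field.Store.NO));

    // Build the single-document in-memory index straight from the Document.
    MemoryIndex index = MemoryIndex.fromDocument(doc, new StandardAnalyzer());

    // search() returns a score greater than 0 for a match, 0 otherwise.
    float score = index.search(new TermQuery(new Term("body", "fox")));
    System.out.println(score > 0.0f ? "match" : "no match");
  }
}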

lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java

@@ -33,46 +33,18 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.*;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
import org.apache.lucene.util.IntBlockPool.SliceReader;
import org.apache.lucene.util.IntBlockPool.SliceWriter;
/**
* High-performance single-document main memory Apache Lucene fulltext search index.
@@ -288,6 +260,46 @@ public class MemoryIndex {
addField(fieldName, stream, 1.0f, analyzer.getPositionIncrementGap(fieldName), analyzer.getOffsetGap(fieldName));
}
/**
 * Builds a MemoryIndex from a Lucene {@link Document} using an analyzer.
 *
 * @param document the document to index
 * @param analyzer the analyzer to use
 * @return a MemoryIndex
 */
public static MemoryIndex fromDocument(Document document, Analyzer analyzer) {
  return fromDocument(document, analyzer, false, false, 0);
}
/**
 * Builds a MemoryIndex from a Lucene {@link Document} using an analyzer.
 *
 * @param document the document to index
 * @param analyzer the analyzer to use
 * @param storeOffsets <code>true</code> if offsets should be stored
 * @param storePayloads <code>true</code> if payloads should be stored
 * @return a MemoryIndex
 */
public static MemoryIndex fromDocument(Document document, Analyzer analyzer, boolean storeOffsets, boolean storePayloads) {
  return fromDocument(document, analyzer, storeOffsets, storePayloads, 0);
}
/**
 * Builds a MemoryIndex from a Lucene {@link Document} using an analyzer.
 *
 * @param document the document to index
 * @param analyzer the analyzer to use
 * @param storeOffsets <code>true</code> if offsets should be stored
 * @param storePayloads <code>true</code> if payloads should be stored
 * @param maxReusedBytes the number of bytes that should remain in the internal memory pools after {@link #reset()} is called
 * @return a MemoryIndex
 */
public static MemoryIndex fromDocument(Document document, Analyzer analyzer, boolean storeOffsets, boolean storePayloads, long maxReusedBytes) {
  MemoryIndex mi = new MemoryIndex(storeOffsets, storePayloads, maxReusedBytes);
  for (IndexableField field : document) {
    mi.addField(field, analyzer);
  }
  return mi;
}
/**
* Convenience method; Creates and returns a token stream that generates a
* token for each keyword in the given collection, "as is", without any
@@ -339,6 +351,39 @@ public class MemoryIndex {
public void addField(String fieldName, TokenStream stream) {
addField(fieldName, stream, 1.0f);
}
/**
 * Adds a Lucene {@link IndexableField} to the MemoryIndex using the provided analyzer.
 *
 * @param field the field to add
 * @param analyzer the analyzer to use for term analysis
 * @throws IllegalArgumentException if the field is a DocValues or Point field, as these
 *                                  structures are not supported by MemoryIndex
 */
public void addField(IndexableField field, Analyzer analyzer) {
  addField(field, analyzer, 1.0f);
}
/**
 * Adds a Lucene {@link IndexableField} to the MemoryIndex using the provided analyzer.
 *
 * @param field the field to add
 * @param analyzer the analyzer to use for term analysis
 * @param boost a field boost
 * @throws IllegalArgumentException if the field is a DocValues or Point field, as these
 *                                  structures are not supported by MemoryIndex
 */
public void addField(IndexableField field, Analyzer analyzer, float boost) {
  if (field.fieldType().docValuesType() != DocValuesType.NONE) {
    throw new IllegalArgumentException("MemoryIndex does not support DocValues fields");
  }
  if (field.fieldType().pointDimensionCount() != 0) {
    throw new IllegalArgumentException("MemoryIndex does not support Points");
  }
  if (analyzer == null) {
    addField(field.name(), field.tokenStream(null, null), boost);
  } else {
    addField(field.name(), field.tokenStream(analyzer, null), boost,
        analyzer.getPositionIncrementGap(field.name()), analyzer.getOffsetGap(field.name()));
  }
}
/**
* Iterates over the given token stream and adds the resulting terms to the index;

lucene/memory/src/test/org/apache/lucene/index/memory/TestMemoryIndex.java

@@ -22,12 +22,17 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
@@ -156,5 +161,27 @@ public class TestMemoryIndex extends LuceneTestCase {
TestUtil.checkReader(reader);
}
@Test
public void testBuildFromDocument() {
  Document doc = new Document();
  doc.add(new TextField("field1", "some text", Field.Store.NO));
  doc.add(new TextField("field1", "some more text", Field.Store.NO));
  doc.add(new StringField("field2", "untokenized text", Field.Store.NO));
  analyzer.setPositionIncrementGap(100);
  MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
  assertThat(mi.search(new TermQuery(new Term("field1", "text"))), not(0.0f));
  assertThat(mi.search(new TermQuery(new Term("field2", "text"))), is(0.0f));
  assertThat(mi.search(new TermQuery(new Term("field2", "untokenized text"))), not(0.0f));
  assertThat(mi.search(new PhraseQuery("field1", "some", "more", "text")), not(0.0f));
  assertThat(mi.search(new PhraseQuery("field1", "some", "text")), not(0.0f));
  assertThat(mi.search(new PhraseQuery("field1", "text", "some")), is(0.0f));
}
}
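
Finally, a hedged sketch of the maxReusedBytes variant together with reset(), mirroring the per-field loop that fromDocument runs internally. The buffer size and loop are illustrative, and this assumes the three-argument MemoryIndex constructor used in the patch above is public:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class ReuseExample {
  public static void main(String[] args) {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    // Assumption: keep up to 64kB of internal buffers pooled across reset() calls.
    MemoryIndex index = new MemoryIndex(false, false, 1 << 16);

    for (String text : new String[] {"first document", "second document"}) {
      Document doc = new Document();
      doc.add(new TextField("body", text, Field.Store.NO));
      for (IndexableField field : doc) {  // same loop fromDocument runs internally
        index.addField(field, analyzer);
      }
      System.out.println(index.search(new TermQuery(new Term("body", "document"))));
      index.reset();  // frees everything beyond maxReusedBytes for the next document
    }
  }
}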