LUCENE-3354: FieldCache can cache DocTermOrds.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158393 13f79535-47bb-0310-9956-ffa450edef68
2011-08-16 17:44:16 +00:00 · 2011-08-16 17:44:16 +00:00 · 4dfa638783
parent 7eab19aff7
commit 4dfa638783
5 changed files with 160 additions and 42 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -473,6 +473,8 @@ New features
  different usecases like merging, flushing and reading.
  (Simon Willnauer, Mike McCandless, Varun Thacker)

+* LUCENE-3354: FieldCache can cache DocTermOrds. (Martijn van Groningen)
+
 Optimizations

 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
--- a/lucene/src/java/org/apache/lucene/search/FieldCache.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldCache.java
@ -17,6 +17,7 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

+import org.apache.lucene.index.DocTermOrds;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.cache.EntryCreator;
@ -653,6 +654,18 @@ public interface FieldCache {
  public DocTermsIndex getTermsIndex (IndexReader reader, String field, boolean fasterButMoreRAM)
  throws IOException;

+  /**
+   * Checks the internal cache for an appropriate entry, and if none is found, reads the term values
+   * in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve
+   * the terms (as ords) per document.
+   *
+   * @param reader  Used to build a {@link DocTermOrds} instance
+   * @param field   Which field contains the strings.
+   * @return a {@link DocTermOrds} instance
+   * @throws IOException  If any error occurs.
+   */
+  public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException;
+
  /**
   * EXPERT: A unique Identifier/Description for each item in the FieldCache. 
   * Can be useful for logging/debugging.
--- a/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
+++ b/lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
@ -17,31 +17,15 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

+import org.apache.lucene.index.DocTermOrds;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.cache.*;
+import org.apache.lucene.search.cache.CachedArray.*;
+import org.apache.lucene.util.FieldCacheSanityChecker;
+
 import java.io.IOException;
 import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.WeakHashMap;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.cache.ByteValuesCreator;
-import org.apache.lucene.search.cache.DocTermsCreator;
-import org.apache.lucene.search.cache.DocTermsIndexCreator;
-import org.apache.lucene.search.cache.DoubleValuesCreator;
-import org.apache.lucene.search.cache.EntryCreator;
-import org.apache.lucene.search.cache.FloatValuesCreator;
-import org.apache.lucene.search.cache.IntValuesCreator;
-import org.apache.lucene.search.cache.LongValuesCreator;
-import org.apache.lucene.search.cache.ShortValuesCreator;
-import org.apache.lucene.search.cache.CachedArray.ByteValues;
-import org.apache.lucene.search.cache.CachedArray.DoubleValues;
-import org.apache.lucene.search.cache.CachedArray.FloatValues;
-import org.apache.lucene.search.cache.CachedArray.IntValues;
-import org.apache.lucene.search.cache.CachedArray.LongValues;
-import org.apache.lucene.search.cache.CachedArray.ShortValues;
-import org.apache.lucene.util.FieldCacheSanityChecker;
+import java.util.*;

 /**
 * Expert: The default cache implementation, storing all values in memory.
@ -70,6 +54,7 @@ public class FieldCacheImpl implements FieldCache {  // Made Public so that
    caches.put(Double.TYPE, new Cache<DoubleValues>(this));
    caches.put(DocTermsIndex.class, new Cache<DocTermsIndex>(this));
    caches.put(DocTerms.class, new Cache<DocTerms>(this));
+    caches.put(DocTermOrds.class, new Cache<DocTermOrds>(this));
  }
  
  public synchronized void purgeAllCaches() {
@ -393,6 +378,11 @@ public class FieldCacheImpl implements FieldCache {  // Made Public so that
    return (DocTerms)caches.get(DocTerms.class).get(reader, new Entry(field, creator));
  }

+  @SuppressWarnings("unchecked")
+  public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException {
+    return (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new Entry(field, new DocTermOrdsCreator(field, 0)));
+  }
+
  private volatile PrintStream infoStream;

  public void setInfoStream(PrintStream stream) {
--- a/lucene/src/java/org/apache/lucene/search/cache/DocTermOrdsCreator.java
+++ b/lucene/src/java/org/apache/lucene/search/cache/DocTermOrdsCreator.java
@ -0,0 +1,51 @@
+package org.apache.lucene.search.cache;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.DocTermOrds;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+
+/**
+ * Creates {@link DocTermOrds} instances.
+ */
+public class DocTermOrdsCreator extends EntryCreatorWithOptions<DocTermOrds> {
+
+  private final String field;
+
+  public DocTermOrdsCreator(String field, int flag) {
+    super(flag);
+    this.field = field;
+  }
+
+  @Override
+  public DocTermOrds create(IndexReader reader) throws IOException {
+    return new DocTermOrds(reader, field);
+  }
+
+  @Override
+  public DocTermOrds validate(DocTermOrds entry, IndexReader reader) throws IOException {
+    return entry;
+  }
+
+  @Override
+  public EntryKey getCacheKey() {
+    return new SimpleEntryKey(DocTermOrdsCreator.class, field);
+  }
+}
--- a/lucene/src/test/org/apache/lucene/search/TestFieldCache.java
+++ b/lucene/src/test/org/apache/lucene/search/TestFieldCache.java
@ -19,28 +19,33 @@ package org.apache.lucene.search;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
-import org.apache.lucene.util.BytesRef;
-import java.io.IOException;
+
 import java.io.ByteArrayOutputStream;
+import java.io.IOException;
 import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedHashSet;
+import java.util.List;

 public class TestFieldCache extends LuceneTestCase {
  protected IndexReader reader;
  private int NUM_DOCS;
+  private int NUM_ORDS;
  private String[] unicodeStrings;
+  private BytesRef[][] multiValued;
  private Directory directory;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    NUM_DOCS = atLeast(1000);
+    NUM_ORDS = atLeast(2);
    directory = newDirectory();
    RandomIndexWriter writer= new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
    long theLong = Long.MAX_VALUE;
@ -50,6 +55,7 @@ public class TestFieldCache extends LuceneTestCase {
    int theInt = Integer.MAX_VALUE;
    float theFloat = Float.MAX_VALUE;
    unicodeStrings = new String[NUM_DOCS];
+    multiValued = new BytesRef[NUM_DOCS][NUM_ORDS];
    if (VERBOSE) {
      System.out.println("TEST: setUp");
    }
@ -65,21 +71,19 @@ public class TestFieldCache extends LuceneTestCase {

      // sometimes skip the field:
      if (random.nextInt(40) != 17) {
-        String s = null;
-        if (i > 0 && random.nextInt(3) == 1) {
-          // reuse past string -- try to find one that's not null
-          for(int iter=0;iter<10 && s==null;iter++) {
-            s = unicodeStrings[random.nextInt(i)];
-          }
-          if (s == null) {
-            s = _TestUtil.randomUnicodeString(random, 250);
-          }
-        } else {
-          s = _TestUtil.randomUnicodeString(random, 250);
-        }
-        unicodeStrings[i] = s;
+        unicodeStrings[i] = generateString(i);
        doc.add(newField("theRandomUnicodeString", unicodeStrings[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
      }
+
+      // sometimes skip the field:
+      if (random.nextInt(10) != 8) {
+        for (int j = 0; j < NUM_ORDS; j++) {
+          String newValue = generateString(i);
+          multiValued[i][j] = new BytesRef(newValue);
+          doc.add(newField("theRandomUnicodeMultiValuedField", newValue, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
+        }
+        Arrays.sort(multiValued[i]);
+      }
      writer.addDocument(doc);
    }
    reader = writer.getReader();
@ -210,6 +214,47 @@ public class TestFieldCache extends LuceneTestCase {
    // test bad field
    terms = cache.getTerms(reader, "bogusfield");

+    // getDocTermOrds
+    DocTermOrds termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField");
+    TermsEnum termsEnum = termOrds.getOrdTermsEnum(reader);
+    assertSame("Second request to cache return same DocTermOrds", termOrds, cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField"));
+    DocTermOrds.TermOrdsIterator reuse = null;
+    for (int i = 0; i < NUM_DOCS; i++) {
+      reuse = termOrds.lookup(i, reuse);
+      final int[] buffer = new int[5];
+      // This will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId
+      List<BytesRef> values = new ArrayList<BytesRef>(new LinkedHashSet<BytesRef>(Arrays.asList(multiValued[i])));
+      for (;;) {
+        int chunk = reuse.read(buffer);
+        if (chunk == 0) {
+          for (int ord = 0; ord < values.size(); ord++) {
+            BytesRef term = values.get(ord);
+            assertNull(String.format("Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term);
+          }
+          break;
+        }
+
+        for(int idx=0; idx < chunk; idx++) {
+          int key = buffer[idx];
+          termsEnum.seekExact((long) key);
+          String actual = termsEnum.term().utf8ToString();
+          String expected = values.get(idx).utf8ToString();
+          if (!expected.equals(actual)) {
+              reuse = termOrds.lookup(i, reuse);
+              reuse.read(buffer);
+          }
+          assertTrue(String.format("Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual));
+        }
+
+        if (chunk < buffer.length) {
+          break;
+        }
+      }
+    }
+
+    // test bad field
+    termOrds = cache.getDocTermOrds(reader, "bogusfield");
+
    FieldCache.DEFAULT.purge(reader);
  }

@ -223,4 +268,21 @@ public class TestFieldCache extends LuceneTestCase {
    r.close();
    dir.close();
  }
+
+  private String generateString(int i) {
+    String s = null;
+    if (i > 0 && random.nextInt(3) == 1) {
+      // reuse past string -- try to find one that's not null
+      for(int iter = 0; iter < 10 && s == null;iter++) {
+        s = unicodeStrings[random.nextInt(i)];
+      }
+      if (s == null) {
+        s = _TestUtil.randomUnicodeString(random, 250);
+      }
+    } else {
+      s = _TestUtil.randomUnicodeString(random, 250);
+    }
+    return s;
+  }
+
 }