LUCENE-4090: PerFieldPostingsFormat cannot use name as suffix

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344441 13f79535-47bb-0310-9956-ffa450edef68
2012-05-30 20:15:58 +00:00 · 2012-05-30 20:15:58 +00:00 · 8963cf411b
parent 2d91c246d7
commit 8963cf411b
7 changed files with 119 additions and 18 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldPostingsFormat.java
@ -17,8 +17,9 @@ package org.apache.lucene.codecs.perfield;
 * limitations under the License.
 */

+import java.io.Closeable;
 import java.io.IOException;
-import java.util.IdentityHashMap;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.ServiceLoader; // javadocs
@ -46,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
 * <p>
 * Files written by each posting format have an additional suffix containing the 
 * format name. For example, in a per-field configuration instead of <tt>_1.prx</tt> 
- * filenames would look like <tt>_1_Lucene40.prx</tt>.
+ * filenames would look like <tt>_1_Lucene40_0.prx</tt>.
 * @see ServiceLoader
 * @lucene.experimental
 */
@ -55,6 +56,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
  public static final String PER_FIELD_NAME = "PerField40";
  
  public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format";
+  public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix";

  public PerFieldPostingsFormat() {
    super(PER_FIELD_NAME);
@ -65,11 +67,22 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
      throws IOException {
    return new FieldsWriter(state);
  }
+  
+  static class FieldsConsumerAndSuffix implements Closeable {
+    FieldsConsumer consumer;
+    int suffix;
+    
+    @Override
+    public void close() throws IOException {
+      consumer.close();
+    }
+  }
    
  private class FieldsWriter extends FieldsConsumer {

-    private final Map<PostingsFormat,FieldsConsumer> formats = new IdentityHashMap<PostingsFormat,FieldsConsumer>();
-
+    private final Map<PostingsFormat,FieldsConsumerAndSuffix> formats = new HashMap<PostingsFormat,FieldsConsumerAndSuffix>();
+    private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
+    
    private final SegmentWriteState segmentWriteState;

    public FieldsWriter(SegmentWriteState state) throws IOException {
@ -82,26 +95,48 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
      if (format == null) {
        throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\"");
      }
+      final String formatName = format.getName();
      
-      String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, format.getName());
+      String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
      assert previousValue == null;
-
-      FieldsConsumer consumer = formats.get(format);
+      
+      Integer suffix;
+      
+      FieldsConsumerAndSuffix consumer = formats.get(format);
      if (consumer == null) {
        // First time we are seeing this format; create a new instance
+        
+        // bump the suffix
+        suffix = suffixes.get(formatName);
+        if (suffix == null) {
+          suffix = 0;
+        } else {
+          suffix = suffix + 1;
+        }
+        suffixes.put(formatName, suffix);
+        
        final String segmentSuffix = getFullSegmentSuffix(field.name,
                                                          segmentWriteState.segmentSuffix,
-                                                          format.getName());
-        consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
+                                                          getSuffix(formatName, Integer.toString(suffix)));
+        consumer = new FieldsConsumerAndSuffix();
+        consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
+        consumer.suffix = suffix;
        formats.put(format, consumer);
+      } else {
+        // we've already seen this format, so just grab its suffix
+        assert suffixes.containsKey(formatName);
+        suffix = consumer.suffix;
      }
+      
+      previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
+      assert previousValue == null;

      // TODO: we should only provide the "slice" of FIS
      // that this PF actually sees ... then stuff like
      // .hasProx could work correctly?
      // NOTE: .hasProx is already broken in the same way for the non-perfield case,
      // if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
-      return consumer.addField(field);
+      return consumer.consumer.addField(field);
    }

    @Override
@ -110,6 +145,10 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
      IOUtils.close(formats.values());
    }
  }
+  
+  static String getSuffix(String formatName, String suffix) {
+    return formatName + "_" + suffix;
+  }

  static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
    if (outerSegmentSuffix.length() == 0) {
@ -125,7 +164,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
  private class FieldsReader extends FieldsProducer {

    private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>();
-    private final Map<PostingsFormat,FieldsProducer> formats = new IdentityHashMap<PostingsFormat,FieldsProducer>();
+    private final Map<String,FieldsProducer> formats = new HashMap<String,FieldsProducer>();

    public FieldsReader(final SegmentReadState readState) throws IOException {

@ -139,11 +178,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
            final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
            if (formatName != null) {
              // null formatName means the field is in fieldInfos, but has no postings!
+              final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
+              assert suffix != null;
              PostingsFormat format = PostingsFormat.forName(formatName);
-              if (!formats.containsKey(format)) {
-                formats.put(format, format.fieldsProducer(new SegmentReadState(readState, formatName)));
+              String segmentSuffix = getSuffix(formatName, suffix);
+              if (!formats.containsKey(segmentSuffix)) {
+                formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
              }
-              fields.put(fieldName, formats.get(format));
+              fields.put(fieldName, formats.get(segmentSuffix));
            }
          }
        }
--- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldPostingsFormat.java
@ -19,10 +19,12 @@ package org.apache.lucene.codecs.perfield;
 import java.io.IOException;

 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat;
+import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@ -34,6 +36,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.IndexSearcher;
@ -264,4 +267,60 @@ public class TestPerFieldPostingsFormat extends LuceneTestCase {
    }
    dir.close();
  }
+  
+  public void testSameCodecDifferentInstance() throws Exception {
+    Codec codec = new Lucene40Codec() {
+      @Override
+      public PostingsFormat getPostingsFormatForField(String field) {
+        if ("id".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else if ("date".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else {
+          return super.getPostingsFormatForField(field);
+        }
+      }
+    };
+    doTestMixedPostings(codec);
+  }
+  
+  public void testSameCodecDifferentParams() throws Exception {
+    Codec codec = new Lucene40Codec() {
+      @Override
+      public PostingsFormat getPostingsFormatForField(String field) {
+        if ("id".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else if ("date".equals(field)) {
+          return new Pulsing40PostingsFormat(2);
+        } else {
+          return super.getPostingsFormatForField(field);
+        }
+      }
+    };
+    doTestMixedPostings(codec);
+  }
+  
+  private void doTestMixedPostings(Codec codec) throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setCodec(codec);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
+    // turn on vectors for the checkindex cross-check
+    ft.setStoreTermVectors(true);
+    ft.setStoreTermVectorOffsets(true);
+    ft.setStoreTermVectorPositions(true);
+    Field idField = new Field("id", "", ft);
+    Field dateField = new Field("date", "", ft);
+    doc.add(idField);
+    doc.add(dateField);
+    for (int i = 0; i < 100; i++) {
+      idField.setStringValue(Integer.toString(random().nextInt(50)));
+      dateField.setStringValue(Integer.toString(random().nextInt(100)));
+      iw.addDocument(doc);
+    }
+    iw.close();
+    dir.close(); // checkindex
+  }
 }
--- a/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip
+++ b/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip
--- a/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip
+++ b/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip
--- a/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip
+++ b/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip
--- a/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip
+++ b/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
@ -87,12 +87,11 @@ public class RandomCodec extends Lucene40Codec {
    int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
    int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);

-    // TODO: make it possible to specify min/max iterms per block via CL:
-    minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
-    maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
    add(avoidCodecs,
        new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
        new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
+        // add pulsing again with (usually) different parameters
+        new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
        new MockSepPostingsFormat(),
        new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
        new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
@ -100,7 +99,8 @@ public class RandomCodec extends Lucene40Codec {
        new NestedPulsingPostingsFormat(),
        new Lucene40WithOrds(),
        new SimpleTextPostingsFormat(),
-        new MemoryPostingsFormat(random.nextBoolean()));
+        new MemoryPostingsFormat(true),
+        new MemoryPostingsFormat(false));

    Collections.shuffle(formats, random);
  }