mirror of https://github.com/apache/lucene.git
LUCENE-4090: PerFieldPostingsFormat cannot use name as suffix
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344441 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 2d91c246d7
commit 8963cf411b
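Background: the per-field file suffix used to be just the format's name, so two instances of the same postings format (same name, different parameters) on different fields ended up writing to the same files. This commit appends a per-name counter to the suffix, records it in a new PER_FIELD_SUFFIX_KEY field attribute, and keys the reader's producer cache by that full suffix, so files come out as _1_Lucene40_0.prx instead of _1_Lucene40.prx and each format instance keeps its own files.

A minimal sketch of the setup this makes safe, adapted from the tests added in this commit (the wrapper class name is made up; Lucene40Codec and Pulsing40PostingsFormat are the 4.0 classes that appear in the diff below):

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;

public class SameNameDifferentParams {
  // Two fields use the same postings format name ("Pulsing40") but different
  // instances with different parameters. Without the per-name suffix counter,
  // both instances would resolve to the same per-field files.
  public static Codec codec() {
    return new Lucene40Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return new Pulsing40PostingsFormat(1);
        } else if ("date".equals(field)) {
          return new Pulsing40PostingsFormat(2);
        } else {
          return super.getPostingsFormatForField(field);
        }
      }
    };
  }
}

Setting this codec on an IndexWriterConfig (iwc.setCodec(...)) and indexing both fields is exactly what the new doTestMixedPostings test below exercises.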
PerFieldPostingsFormat.java

@@ -17,8 +17,9 @@ package org.apache.lucene.codecs.perfield;
  * limitations under the License.
  */
 
+import java.io.Closeable;
 import java.io.IOException;
-import java.util.IdentityHashMap;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.ServiceLoader; // javadocs
@@ -46,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
  * <p>
  * Files written by each posting format have an additional suffix containing the
  * format name. For example, in a per-field configuration instead of <tt>_1.prx</tt>
- * filenames would look like <tt>_1_Lucene40.prx</tt>.
+ * filenames would look like <tt>_1_Lucene40_0.prx</tt>.
  * @see ServiceLoader
  * @lucene.experimental
  */
@@ -55,6 +56,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
   public static final String PER_FIELD_NAME = "PerField40";
 
   public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format";
+  public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix";
 
   public PerFieldPostingsFormat() {
     super(PER_FIELD_NAME);
@@ -65,11 +67,22 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
       throws IOException {
     return new FieldsWriter(state);
   }
 
+  static class FieldsConsumerAndSuffix implements Closeable {
+    FieldsConsumer consumer;
+    int suffix;
+
+    @Override
+    public void close() throws IOException {
+      consumer.close();
+    }
+  }
+
   private class FieldsWriter extends FieldsConsumer {
 
-    private final Map<PostingsFormat,FieldsConsumer> formats = new IdentityHashMap<PostingsFormat,FieldsConsumer>();
+    private final Map<PostingsFormat,FieldsConsumerAndSuffix> formats = new HashMap<PostingsFormat,FieldsConsumerAndSuffix>();
+    private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
 
     private final SegmentWriteState segmentWriteState;
 
     public FieldsWriter(SegmentWriteState state) throws IOException {
@@ -82,26 +95,48 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
       if (format == null) {
         throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\"");
       }
+      final String formatName = format.getName();
 
-      String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, format.getName());
+      String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
       assert previousValue == null;
 
-      FieldsConsumer consumer = formats.get(format);
+      Integer suffix;
+
+      FieldsConsumerAndSuffix consumer = formats.get(format);
       if (consumer == null) {
         // First time we are seeing this format; create a new instance
+
+        // bump the suffix
+        suffix = suffixes.get(formatName);
+        if (suffix == null) {
+          suffix = 0;
+        } else {
+          suffix = suffix + 1;
+        }
+        suffixes.put(formatName, suffix);
+
         final String segmentSuffix = getFullSegmentSuffix(field.name,
                                                           segmentWriteState.segmentSuffix,
-                                                          format.getName());
-        consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
+                                                          getSuffix(formatName, Integer.toString(suffix)));
+        consumer = new FieldsConsumerAndSuffix();
+        consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
+        consumer.suffix = suffix;
         formats.put(format, consumer);
+      } else {
+        // we've already seen this format, so just grab its suffix
+        assert suffixes.containsKey(formatName);
+        suffix = consumer.suffix;
       }
 
+      previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
+      assert previousValue == null;
+
       // TODO: we should only provide the "slice" of FIS
       // that this PF actually sees ... then stuff like
       // .hasProx could work correctly?
       // NOTE: .hasProx is already broken in the same way for the non-perfield case,
       // if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
-      return consumer.addField(field);
+      return consumer.consumer.addField(field);
     }
 
     @Override
@@ -110,6 +145,10 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
       IOUtils.close(formats.values());
     }
   }
 
+  static String getSuffix(String formatName, String suffix) {
+    return formatName + "_" + suffix;
+  }
+
   static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
     if (outerSegmentSuffix.length() == 0) {
@@ -125,7 +164,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
   private class FieldsReader extends FieldsProducer {
 
     private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>();
-    private final Map<PostingsFormat,FieldsProducer> formats = new IdentityHashMap<PostingsFormat,FieldsProducer>();
+    private final Map<String,FieldsProducer> formats = new HashMap<String,FieldsProducer>();
 
     public FieldsReader(final SegmentReadState readState) throws IOException {
 
@@ -139,11 +178,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
         final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
         if (formatName != null) {
           // null formatName means the field is in fieldInfos, but has no postings!
+          final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
+          assert suffix != null;
           PostingsFormat format = PostingsFormat.forName(formatName);
-          if (!formats.containsKey(format)) {
-            formats.put(format, format.fieldsProducer(new SegmentReadState(readState, formatName)));
+          String segmentSuffix = getSuffix(formatName, suffix);
+          if (!formats.containsKey(segmentSuffix)) {
+            formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
           }
-          fields.put(fieldName, formats.get(format));
+          fields.put(fieldName, formats.get(segmentSuffix));
         }
       }
     }
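Taken together, the writer-side change is just suffix bookkeeping: the first time a format name is seen it gets suffix 0, each further instance with the same name bumps the counter, and the value is stored per field so the reader can reconstruct the same segment suffix. A rough standalone sketch of that logic (illustrative only; SuffixSketch and nextSuffix are made-up names, the real code lives in FieldsWriter.addField and getSuffix above):

import java.util.HashMap;
import java.util.Map;

class SuffixSketch {
  private final Map<String,Integer> suffixes = new HashMap<String,Integer>();

  // First instance of a format name gets 0; each further instance bumps the counter.
  int nextSuffix(String formatName) {
    Integer suffix = suffixes.get(formatName);
    suffix = (suffix == null) ? 0 : suffix + 1;
    suffixes.put(formatName, suffix);
    return suffix;
  }

  // Mirrors getSuffix() above: ("Pulsing40", 0) -> "Pulsing40_0", so segment files
  // are named _1_Pulsing40_0.prx, _1_Pulsing40_1.prx, and so on, instead of both
  // instances colliding on a single _1_Pulsing40.prx.
  static String suffixedName(String formatName, int suffix) {
    return formatName + "_" + suffix;
  }
}

On the read side the producer cache is keyed by this full suffix string rather than by the PostingsFormat instance, since PostingsFormat.forName(name) resolves by name and would not distinguish two differently parameterized writers.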
TestPerFieldPostingsFormat.java
@@ -19,10 +19,12 @@ package org.apache.lucene.codecs.perfield;
 import java.io.IOException;
 
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat;
+import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -34,6 +36,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.IndexSearcher;
@@ -264,4 +267,60 @@ public class TestPerFieldPostingsFormat extends LuceneTestCase {
     }
     dir.close();
   }
+
+  public void testSameCodecDifferentInstance() throws Exception {
+    Codec codec = new Lucene40Codec() {
+      @Override
+      public PostingsFormat getPostingsFormatForField(String field) {
+        if ("id".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else if ("date".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else {
+          return super.getPostingsFormatForField(field);
+        }
+      }
+    };
+    doTestMixedPostings(codec);
+  }
+
+  public void testSameCodecDifferentParams() throws Exception {
+    Codec codec = new Lucene40Codec() {
+      @Override
+      public PostingsFormat getPostingsFormatForField(String field) {
+        if ("id".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else if ("date".equals(field)) {
+          return new Pulsing40PostingsFormat(2);
+        } else {
+          return super.getPostingsFormatForField(field);
+        }
+      }
+    };
+    doTestMixedPostings(codec);
+  }
+
+  private void doTestMixedPostings(Codec codec) throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setCodec(codec);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
+    // turn on vectors for the checkindex cross-check
+    ft.setStoreTermVectors(true);
+    ft.setStoreTermVectorOffsets(true);
+    ft.setStoreTermVectorPositions(true);
+    Field idField = new Field("id", "", ft);
+    Field dateField = new Field("date", "", ft);
+    doc.add(idField);
+    doc.add(dateField);
+    for (int i = 0; i < 100; i++) {
+      idField.setStringValue(Integer.toString(random().nextInt(50)));
+      dateField.setStringValue(Integer.toString(random().nextInt(100)));
+      iw.addDocument(doc);
+    }
+    iw.close();
+    dir.close(); // checkindex
+  }
 }
4 binary files not shown.
RandomCodec.java

@@ -87,12 +87,11 @@ public class RandomCodec extends Lucene40Codec {
     int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
     int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
 
-    // TODO: make it possible to specify min/max iterms per block via CL:
-    minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
-    maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
     add(avoidCodecs,
         new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
         new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
+        // add pulsing again with (usually) different parameters
+        new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
         new MockSepPostingsFormat(),
         new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
         new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
@@ -100,7 +99,8 @@ public class RandomCodec extends Lucene40Codec {
         new NestedPulsingPostingsFormat(),
         new Lucene40WithOrds(),
         new SimpleTextPostingsFormat(),
-        new MemoryPostingsFormat(random.nextBoolean()));
+        new MemoryPostingsFormat(true),
+        new MemoryPostingsFormat(false));
 
     Collections.shuffle(formats, random);
   }