LUCENE-4090: PerFieldPostingsFormat cannot use name as suffix

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344441 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-05-30 20:15:58 +00:00
parent 2d91c246d7
commit 8963cf411b
7 changed files with 119 additions and 18 deletions

View File

@ -17,8 +17,9 @@ package org.apache.lucene.codecs.perfield;
* limitations under the License. * limitations under the License.
*/ */
import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
import java.util.IdentityHashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.ServiceLoader; // javadocs import java.util.ServiceLoader; // javadocs
@ -46,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
* <p> * <p>
* Files written by each posting format have an additional suffix containing the * Files written by each posting format have an additional suffix containing the
* format name. For example, in a per-field configuration instead of <tt>_1.prx</tt> * format name. For example, in a per-field configuration instead of <tt>_1.prx</tt>
* filenames would look like <tt>_1_Lucene40.prx</tt>. * filenames would look like <tt>_1_Lucene40_0.prx</tt>.
* @see ServiceLoader * @see ServiceLoader
* @lucene.experimental * @lucene.experimental
*/ */
@ -55,6 +56,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
public static final String PER_FIELD_NAME = "PerField40"; public static final String PER_FIELD_NAME = "PerField40";
public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format"; public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format";
public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix";
public PerFieldPostingsFormat() { public PerFieldPostingsFormat() {
super(PER_FIELD_NAME); super(PER_FIELD_NAME);
@ -66,9 +68,20 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
return new FieldsWriter(state); return new FieldsWriter(state);
} }
/**
 * Simple holder that pairs the {@link FieldsConsumer} created for a postings
 * format with the integer suffix that was assigned to it.
 * <p>
 * Both fields are assigned by the code that creates the holder; closing the
 * holder closes the wrapped consumer.
 */
static class FieldsConsumerAndSuffix implements Closeable {
  /** Suffix number assigned to this consumer. */
  int suffix;

  /** The wrapped consumer; {@link #close()} delegates to it. */
  FieldsConsumer consumer;

  /** Closes the wrapped {@link #consumer}. */
  @Override
  public void close() throws IOException {
    this.consumer.close();
  }
}
private class FieldsWriter extends FieldsConsumer { private class FieldsWriter extends FieldsConsumer {
private final Map<PostingsFormat,FieldsConsumer> formats = new IdentityHashMap<PostingsFormat,FieldsConsumer>(); private final Map<PostingsFormat,FieldsConsumerAndSuffix> formats = new HashMap<PostingsFormat,FieldsConsumerAndSuffix>();
private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
private final SegmentWriteState segmentWriteState; private final SegmentWriteState segmentWriteState;
@ -82,26 +95,48 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
if (format == null) { if (format == null) {
throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\""); throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\"");
} }
final String formatName = format.getName();
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, format.getName()); String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
assert previousValue == null; assert previousValue == null;
FieldsConsumer consumer = formats.get(format); Integer suffix;
FieldsConsumerAndSuffix consumer = formats.get(format);
if (consumer == null) { if (consumer == null) {
// First time we are seeing this format; create a new instance // First time we are seeing this format; create a new instance
// bump the suffix
suffix = suffixes.get(formatName);
if (suffix == null) {
suffix = 0;
} else {
suffix = suffix + 1;
}
suffixes.put(formatName, suffix);
final String segmentSuffix = getFullSegmentSuffix(field.name, final String segmentSuffix = getFullSegmentSuffix(field.name,
segmentWriteState.segmentSuffix, segmentWriteState.segmentSuffix,
format.getName()); getSuffix(formatName, Integer.toString(suffix)));
consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix)); consumer = new FieldsConsumerAndSuffix();
consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
consumer.suffix = suffix;
formats.put(format, consumer); formats.put(format, consumer);
} else {
// we've already seen this format, so just grab its suffix
assert suffixes.containsKey(formatName);
suffix = consumer.suffix;
} }
previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
assert previousValue == null;
// TODO: we should only provide the "slice" of FIS // TODO: we should only provide the "slice" of FIS
// that this PF actually sees ... then stuff like // that this PF actually sees ... then stuff like
// .hasProx could work correctly? // .hasProx could work correctly?
// NOTE: .hasProx is already broken in the same way for the non-perfield case, // NOTE: .hasProx is already broken in the same way for the non-perfield case,
// if there is a fieldinfo with prox that has no postings, you get a 0 byte file. // if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
return consumer.addField(field); return consumer.consumer.addField(field);
} }
@Override @Override
@ -111,6 +146,10 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
} }
} }
/**
 * Builds the suffix string for a format: the format name, an underscore,
 * and the given suffix, e.g. {@code "Lucene40"} and {@code "0"} give
 * {@code "Lucene40_0"}.
 *
 * @param formatName name of the postings format
 * @param suffix     suffix to append after the underscore
 * @return {@code formatName + "_" + suffix}
 */
static String getSuffix(String formatName, String suffix) {
  final StringBuilder joined = new StringBuilder();
  // append(String) renders a null argument as "null", exactly like string concatenation does
  joined.append(formatName);
  joined.append("_");
  joined.append(suffix);
  return joined.toString();
}
static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) { static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
if (outerSegmentSuffix.length() == 0) { if (outerSegmentSuffix.length() == 0) {
return segmentSuffix; return segmentSuffix;
@ -125,7 +164,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
private class FieldsReader extends FieldsProducer { private class FieldsReader extends FieldsProducer {
private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>(); private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>();
private final Map<PostingsFormat,FieldsProducer> formats = new IdentityHashMap<PostingsFormat,FieldsProducer>(); private final Map<String,FieldsProducer> formats = new HashMap<String,FieldsProducer>();
public FieldsReader(final SegmentReadState readState) throws IOException { public FieldsReader(final SegmentReadState readState) throws IOException {
@ -139,11 +178,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY); final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
if (formatName != null) { if (formatName != null) {
// null formatName means the field is in fieldInfos, but has no postings! // null formatName means the field is in fieldInfos, but has no postings!
final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
assert suffix != null;
PostingsFormat format = PostingsFormat.forName(formatName); PostingsFormat format = PostingsFormat.forName(formatName);
if (!formats.containsKey(format)) { String segmentSuffix = getSuffix(formatName, suffix);
formats.put(format, format.fieldsProducer(new SegmentReadState(readState, formatName))); if (!formats.containsKey(segmentSuffix)) {
formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
} }
fields.put(fieldName, formats.get(format)); fields.put(fieldName, formats.get(segmentSuffix));
} }
} }
} }

View File

@ -19,10 +19,12 @@ package org.apache.lucene.codecs.perfield;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec; import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat; import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat; import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
@ -34,6 +36,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
@ -264,4 +267,60 @@ public class TestPerFieldPostingsFormat extends LuceneTestCase {
} }
dir.close(); dir.close();
} }
/**
 * Runs {@link #doTestMixedPostings(Codec)} with a codec that returns a newly
 * constructed {@code Pulsing40PostingsFormat(1)} for each of the "id" and
 * "date" fields, i.e. two distinct instances built with the same parameter.
 * All other fields use the format chosen by the superclass.
 */
public void testSameCodecDifferentInstance() throws Exception {
  final Codec perFieldCodec = new Lucene40Codec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      final boolean pulsingField = "id".equals(field) || "date".equals(field);
      if (!pulsingField) {
        return super.getPostingsFormatForField(field);
      }
      // a fresh instance on every call, always with the same parameter
      return new Pulsing40PostingsFormat(1);
    }
  };
  doTestMixedPostings(perFieldCodec);
}
/**
 * Runs {@link #doTestMixedPostings(Codec)} with a codec that returns
 * {@code Pulsing40PostingsFormat(1)} for the "id" field and
 * {@code Pulsing40PostingsFormat(2)} for the "date" field, i.e. the same
 * format class constructed with different parameters. All other fields use
 * the format chosen by the superclass.
 */
public void testSameCodecDifferentParams() throws Exception {
  final Codec perFieldCodec = new Lucene40Codec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      final boolean isId = "id".equals(field);
      if (isId || "date".equals(field)) {
        // "id" gets parameter 1, "date" gets parameter 2
        return new Pulsing40PostingsFormat(isId ? 1 : 2);
      }
      return super.getPostingsFormatForField(field);
    }
  };
  doTestMixedPostings(perFieldCodec);
}
/**
 * Indexes 100 documents with the given codec, each holding an "id" and a
 * "date" field filled with random integer strings, then closes the writer
 * and the directory.
 * <p>
 * The method makes no explicit assertions of its own.
 *
 * @param codec codec installed on the {@link IndexWriterConfig}
 */
private void doTestMixedPostings(Codec codec) throws Exception {
  final Directory directory = newDirectory();
  final IndexWriterConfig config =
      newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
  config.setCodec(codec);
  final RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

  // term vectors (with offsets and positions) are enabled for the checkindex cross-check
  final FieldType vectorType = new FieldType(TextField.TYPE_UNSTORED);
  vectorType.setStoreTermVectors(true);
  vectorType.setStoreTermVectorOffsets(true);
  vectorType.setStoreTermVectorPositions(true);

  // one document instance is reused; only the field values change per iteration
  final Field id = new Field("id", "", vectorType);
  final Field date = new Field("date", "", vectorType);
  final Document document = new Document();
  document.add(id);
  document.add(date);

  int added = 0;
  while (added < 100) {
    // the id value is drawn before the date value, in that order
    final int idValue = random().nextInt(50);
    id.setStringValue(Integer.toString(idValue));
    final int dateValue = random().nextInt(100);
    date.setStringValue(Integer.toString(dateValue));
    writer.addDocument(document);
    added++;
  }

  writer.close();
  // original note here reads "checkindex" -- presumably closing the test directory runs it; verify
  directory.close();
}
} }

View File

@ -87,12 +87,11 @@ public class RandomCodec extends Lucene40Codec {
int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100); int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100); int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
// TODO: make it possible to specify min/max items per block via CL:
minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
add(avoidCodecs, add(avoidCodecs,
new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock), new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
// add pulsing again with (usually) different parameters
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
new MockSepPostingsFormat(), new MockSepPostingsFormat(),
new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)), new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)), new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
@ -100,7 +99,8 @@ public class RandomCodec extends Lucene40Codec {
new NestedPulsingPostingsFormat(), new NestedPulsingPostingsFormat(),
new Lucene40WithOrds(), new Lucene40WithOrds(),
new SimpleTextPostingsFormat(), new SimpleTextPostingsFormat(),
new MemoryPostingsFormat(random.nextBoolean())); new MemoryPostingsFormat(true),
new MemoryPostingsFormat(false));
Collections.shuffle(formats, random); Collections.shuffle(formats, random);
} }