LUCENE-4090: PerFieldPostingsFormat cannot use name as suffix

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344441 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-05-30 20:15:58 +00:00
parent 2d91c246d7
commit 8963cf411b
7 changed files with 119 additions and 18 deletions

View File

@ -17,8 +17,9 @@ package org.apache.lucene.codecs.perfield;
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.IdentityHashMap;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.ServiceLoader; // javadocs
@ -46,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
* <p>
* Files written by each posting format have an additional suffix containing the
* format name. For example, in a per-field configuration instead of <tt>_1.prx</tt>
* filenames would look like <tt>_1_Lucene40.prx</tt>.
* filenames would look like <tt>_1_Lucene40_0.prx</tt>.
* @see ServiceLoader
* @lucene.experimental
*/
@ -55,6 +56,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
public static final String PER_FIELD_NAME = "PerField40";
public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format";
public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix";
public PerFieldPostingsFormat() {
super(PER_FIELD_NAME);
@ -65,11 +67,22 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
throws IOException {
return new FieldsWriter(state);
}
static class FieldsConsumerAndSuffix implements Closeable {
FieldsConsumer consumer;
int suffix;
@Override
public void close() throws IOException {
consumer.close();
}
}
private class FieldsWriter extends FieldsConsumer {
private final Map<PostingsFormat,FieldsConsumer> formats = new IdentityHashMap<PostingsFormat,FieldsConsumer>();
private final Map<PostingsFormat,FieldsConsumerAndSuffix> formats = new HashMap<PostingsFormat,FieldsConsumerAndSuffix>();
private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
private final SegmentWriteState segmentWriteState;
public FieldsWriter(SegmentWriteState state) throws IOException {
@ -82,26 +95,48 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
if (format == null) {
throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\"");
}
final String formatName = format.getName();
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, format.getName());
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
assert previousValue == null;
FieldsConsumer consumer = formats.get(format);
Integer suffix;
FieldsConsumerAndSuffix consumer = formats.get(format);
if (consumer == null) {
// First time we are seeing this format; create a new instance
// bump the suffix
suffix = suffixes.get(formatName);
if (suffix == null) {
suffix = 0;
} else {
suffix = suffix + 1;
}
suffixes.put(formatName, suffix);
final String segmentSuffix = getFullSegmentSuffix(field.name,
segmentWriteState.segmentSuffix,
format.getName());
consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
getSuffix(formatName, Integer.toString(suffix)));
consumer = new FieldsConsumerAndSuffix();
consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
consumer.suffix = suffix;
formats.put(format, consumer);
} else {
// we've already seen this format, so just grab its suffix
assert suffixes.containsKey(formatName);
suffix = consumer.suffix;
}
previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
assert previousValue == null;
// TODO: we should only provide the "slice" of FIS
// that this PF actually sees ... then stuff like
// .hasProx could work correctly?
// NOTE: .hasProx is already broken in the same way for the non-perfield case,
// if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
return consumer.addField(field);
return consumer.consumer.addField(field);
}
@Override
@ -110,6 +145,10 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
IOUtils.close(formats.values());
}
}
static String getSuffix(String formatName, String suffix) {
return formatName + "_" + suffix;
}
static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
if (outerSegmentSuffix.length() == 0) {
@ -125,7 +164,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
private class FieldsReader extends FieldsProducer {
private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>();
private final Map<PostingsFormat,FieldsProducer> formats = new IdentityHashMap<PostingsFormat,FieldsProducer>();
private final Map<String,FieldsProducer> formats = new HashMap<String,FieldsProducer>();
public FieldsReader(final SegmentReadState readState) throws IOException {
@ -139,11 +178,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
if (formatName != null) {
// null formatName means the field is in fieldInfos, but has no postings!
final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
assert suffix != null;
PostingsFormat format = PostingsFormat.forName(formatName);
if (!formats.containsKey(format)) {
formats.put(format, format.fieldsProducer(new SegmentReadState(readState, formatName)));
String segmentSuffix = getSuffix(formatName, suffix);
if (!formats.containsKey(segmentSuffix)) {
formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
}
fields.put(fieldName, formats.get(format));
fields.put(fieldName, formats.get(segmentSuffix));
}
}
}

View File

@ -19,10 +19,12 @@ package org.apache.lucene.codecs.perfield;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -34,6 +36,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
@ -264,4 +267,60 @@ public class TestPerFieldPostingsFormat extends LuceneTestCase {
}
dir.close();
}
public void testSameCodecDifferentInstance() throws Exception {
Codec codec = new Lucene40Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if ("id".equals(field)) {
return new Pulsing40PostingsFormat(1);
} else if ("date".equals(field)) {
return new Pulsing40PostingsFormat(1);
} else {
return super.getPostingsFormatForField(field);
}
}
};
doTestMixedPostings(codec);
}
public void testSameCodecDifferentParams() throws Exception {
Codec codec = new Lucene40Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if ("id".equals(field)) {
return new Pulsing40PostingsFormat(1);
} else if ("date".equals(field)) {
return new Pulsing40PostingsFormat(2);
} else {
return super.getPostingsFormatForField(field);
}
}
};
doTestMixedPostings(codec);
}
private void doTestMixedPostings(Codec codec) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setCodec(codec);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
// turn on vectors for the checkindex cross-check
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
Field idField = new Field("id", "", ft);
Field dateField = new Field("date", "", ft);
doc.add(idField);
doc.add(dateField);
for (int i = 0; i < 100; i++) {
idField.setStringValue(Integer.toString(random().nextInt(50)));
dateField.setStringValue(Integer.toString(random().nextInt(100)));
iw.addDocument(doc);
}
iw.close();
dir.close(); // checkindex
}
}

View File

@ -87,12 +87,11 @@ public class RandomCodec extends Lucene40Codec {
int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
// TODO: make it possible to specify min/max iterms per block via CL:
minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
add(avoidCodecs,
new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
// add pulsing again with (usually) different parameters
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
new MockSepPostingsFormat(),
new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
@ -100,7 +99,8 @@ public class RandomCodec extends Lucene40Codec {
new NestedPulsingPostingsFormat(),
new Lucene40WithOrds(),
new SimpleTextPostingsFormat(),
new MemoryPostingsFormat(random.nextBoolean()));
new MemoryPostingsFormat(true),
new MemoryPostingsFormat(false));
Collections.shuffle(formats, random);
}