mirror of https://github.com/apache/lucene.git
LUCENE-4090: PerFieldPostingsFormat cannot use name as suffix
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344441 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2d91c246d7
commit
8963cf411b
|
@ -17,8 +17,9 @@ package org.apache.lucene.codecs.perfield;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.ServiceLoader; // javadocs
|
||||
|
@ -46,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
|
|||
* <p>
|
||||
* Files written by each posting format have an additional suffix containing the
|
||||
* format name. For example, in a per-field configuration instead of <tt>_1.prx</tt>
|
||||
* filenames would look like <tt>_1_Lucene40.prx</tt>.
|
||||
* filenames would look like <tt>_1_Lucene40_0.prx</tt>.
|
||||
* @see ServiceLoader
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -55,6 +56,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
public static final String PER_FIELD_NAME = "PerField40";
|
||||
|
||||
public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format";
|
||||
public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix";
|
||||
|
||||
public PerFieldPostingsFormat() {
|
||||
super(PER_FIELD_NAME);
|
||||
|
@ -65,11 +67,22 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
throws IOException {
|
||||
return new FieldsWriter(state);
|
||||
}
|
||||
|
||||
static class FieldsConsumerAndSuffix implements Closeable {
|
||||
FieldsConsumer consumer;
|
||||
int suffix;
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
consumer.close();
|
||||
}
|
||||
}
|
||||
|
||||
private class FieldsWriter extends FieldsConsumer {
|
||||
|
||||
private final Map<PostingsFormat,FieldsConsumer> formats = new IdentityHashMap<PostingsFormat,FieldsConsumer>();
|
||||
|
||||
private final Map<PostingsFormat,FieldsConsumerAndSuffix> formats = new HashMap<PostingsFormat,FieldsConsumerAndSuffix>();
|
||||
private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
|
||||
|
||||
private final SegmentWriteState segmentWriteState;
|
||||
|
||||
public FieldsWriter(SegmentWriteState state) throws IOException {
|
||||
|
@ -82,26 +95,48 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
if (format == null) {
|
||||
throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\"");
|
||||
}
|
||||
final String formatName = format.getName();
|
||||
|
||||
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, format.getName());
|
||||
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
|
||||
assert previousValue == null;
|
||||
|
||||
FieldsConsumer consumer = formats.get(format);
|
||||
|
||||
Integer suffix;
|
||||
|
||||
FieldsConsumerAndSuffix consumer = formats.get(format);
|
||||
if (consumer == null) {
|
||||
// First time we are seeing this format; create a new instance
|
||||
|
||||
// bump the suffix
|
||||
suffix = suffixes.get(formatName);
|
||||
if (suffix == null) {
|
||||
suffix = 0;
|
||||
} else {
|
||||
suffix = suffix + 1;
|
||||
}
|
||||
suffixes.put(formatName, suffix);
|
||||
|
||||
final String segmentSuffix = getFullSegmentSuffix(field.name,
|
||||
segmentWriteState.segmentSuffix,
|
||||
format.getName());
|
||||
consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
|
||||
getSuffix(formatName, Integer.toString(suffix)));
|
||||
consumer = new FieldsConsumerAndSuffix();
|
||||
consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
|
||||
consumer.suffix = suffix;
|
||||
formats.put(format, consumer);
|
||||
} else {
|
||||
// we've already seen this format, so just grab its suffix
|
||||
assert suffixes.containsKey(formatName);
|
||||
suffix = consumer.suffix;
|
||||
}
|
||||
|
||||
previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
|
||||
assert previousValue == null;
|
||||
|
||||
// TODO: we should only provide the "slice" of FIS
|
||||
// that this PF actually sees ... then stuff like
|
||||
// .hasProx could work correctly?
|
||||
// NOTE: .hasProx is already broken in the same way for the non-perfield case,
|
||||
// if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
|
||||
return consumer.addField(field);
|
||||
return consumer.consumer.addField(field);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -110,6 +145,10 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
IOUtils.close(formats.values());
|
||||
}
|
||||
}
|
||||
|
||||
static String getSuffix(String formatName, String suffix) {
|
||||
return formatName + "_" + suffix;
|
||||
}
|
||||
|
||||
static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
|
||||
if (outerSegmentSuffix.length() == 0) {
|
||||
|
@ -125,7 +164,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
private class FieldsReader extends FieldsProducer {
|
||||
|
||||
private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>();
|
||||
private final Map<PostingsFormat,FieldsProducer> formats = new IdentityHashMap<PostingsFormat,FieldsProducer>();
|
||||
private final Map<String,FieldsProducer> formats = new HashMap<String,FieldsProducer>();
|
||||
|
||||
public FieldsReader(final SegmentReadState readState) throws IOException {
|
||||
|
||||
|
@ -139,11 +178,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
|
||||
if (formatName != null) {
|
||||
// null formatName means the field is in fieldInfos, but has no postings!
|
||||
final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
|
||||
assert suffix != null;
|
||||
PostingsFormat format = PostingsFormat.forName(formatName);
|
||||
if (!formats.containsKey(format)) {
|
||||
formats.put(format, format.fieldsProducer(new SegmentReadState(readState, formatName)));
|
||||
String segmentSuffix = getSuffix(formatName, suffix);
|
||||
if (!formats.containsKey(segmentSuffix)) {
|
||||
formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
|
||||
}
|
||||
fields.put(fieldName, formats.get(format));
|
||||
fields.put(fieldName, formats.get(segmentSuffix));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,10 +19,12 @@ package org.apache.lucene.codecs.perfield;
|
|||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
|
||||
import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat;
|
||||
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
|
||||
import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -34,6 +36,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.LogDocMergePolicy;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -264,4 +267,60 @@ public class TestPerFieldPostingsFormat extends LuceneTestCase {
|
|||
}
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSameCodecDifferentInstance() throws Exception {
|
||||
Codec codec = new Lucene40Codec() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
if ("id".equals(field)) {
|
||||
return new Pulsing40PostingsFormat(1);
|
||||
} else if ("date".equals(field)) {
|
||||
return new Pulsing40PostingsFormat(1);
|
||||
} else {
|
||||
return super.getPostingsFormatForField(field);
|
||||
}
|
||||
}
|
||||
};
|
||||
doTestMixedPostings(codec);
|
||||
}
|
||||
|
||||
public void testSameCodecDifferentParams() throws Exception {
|
||||
Codec codec = new Lucene40Codec() {
|
||||
@Override
|
||||
public PostingsFormat getPostingsFormatForField(String field) {
|
||||
if ("id".equals(field)) {
|
||||
return new Pulsing40PostingsFormat(1);
|
||||
} else if ("date".equals(field)) {
|
||||
return new Pulsing40PostingsFormat(2);
|
||||
} else {
|
||||
return super.getPostingsFormatForField(field);
|
||||
}
|
||||
}
|
||||
};
|
||||
doTestMixedPostings(codec);
|
||||
}
|
||||
|
||||
private void doTestMixedPostings(Codec codec) throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||
iwc.setCodec(codec);
|
||||
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
|
||||
Document doc = new Document();
|
||||
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
|
||||
// turn on vectors for the checkindex cross-check
|
||||
ft.setStoreTermVectors(true);
|
||||
ft.setStoreTermVectorOffsets(true);
|
||||
ft.setStoreTermVectorPositions(true);
|
||||
Field idField = new Field("id", "", ft);
|
||||
Field dateField = new Field("date", "", ft);
|
||||
doc.add(idField);
|
||||
doc.add(dateField);
|
||||
for (int i = 0; i < 100; i++) {
|
||||
idField.setStringValue(Integer.toString(random().nextInt(50)));
|
||||
dateField.setStringValue(Integer.toString(random().nextInt(100)));
|
||||
iw.addDocument(doc);
|
||||
}
|
||||
iw.close();
|
||||
dir.close(); // checkindex
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -87,12 +87,11 @@ public class RandomCodec extends Lucene40Codec {
|
|||
int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
|
||||
int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
|
||||
|
||||
// TODO: make it possible to specify min/max iterms per block via CL:
|
||||
minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
|
||||
maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
|
||||
add(avoidCodecs,
|
||||
new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
|
||||
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
|
||||
// add pulsing again with (usually) different parameters
|
||||
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
|
||||
new MockSepPostingsFormat(),
|
||||
new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
|
||||
new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
|
||||
|
@ -100,7 +99,8 @@ public class RandomCodec extends Lucene40Codec {
|
|||
new NestedPulsingPostingsFormat(),
|
||||
new Lucene40WithOrds(),
|
||||
new SimpleTextPostingsFormat(),
|
||||
new MemoryPostingsFormat(random.nextBoolean()));
|
||||
new MemoryPostingsFormat(true),
|
||||
new MemoryPostingsFormat(false));
|
||||
|
||||
Collections.shuffle(formats, random);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue