mirror of https://github.com/apache/lucene.git
LUCENE-4090: PerFieldPostingsFormat cannot use name as suffix
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344441 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 2d91c246d7
commit 8963cf411b
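Background: the per-field file suffix used to be just the format's name, so two instances of the same postings format (same name, different parameters) on different fields ended up writing to the same files. This commit appends a per-name counter to the suffix, records it in a new PER_FIELD_SUFFIX_KEY field attribute, and keys the reader's producer cache by that full suffix, so files come out as _1_Lucene40_0.prx instead of _1_Lucene40.prx and each format instance keeps its own files.

A minimal sketch of the setup this makes safe, adapted from the tests added in this commit (the wrapper class name is made up; Lucene40Codec and Pulsing40PostingsFormat are the 4.0 classes that appear in the diff below):

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;

public class SameNameDifferentParams {
  // Two fields use the same postings format name ("Pulsing40") but different
  // instances with different parameters. Without the per-name suffix counter,
  // both instances would resolve to the same per-field files.
  public static Codec codec() {
    return new Lucene40Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        if ("id".equals(field)) {
          return new Pulsing40PostingsFormat(1);
        } else if ("date".equals(field)) {
          return new Pulsing40PostingsFormat(2);
        } else {
          return super.getPostingsFormatForField(field);
        }
      }
    };
  }
}

Setting this codec on an IndexWriterConfig (iwc.setCodec(...)) and indexing both fields is exactly what the new doTestMixedPostings test below exercises.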
PerFieldPostingsFormat.java

@@ -17,8 +17,9 @@ package org.apache.lucene.codecs.perfield;
  * limitations under the License.
  */
 
+import java.io.Closeable;
 import java.io.IOException;
-import java.util.IdentityHashMap;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.ServiceLoader; // javadocs
@@ -46,7 +47,7 @@ import org.apache.lucene.util.IOUtils;
  * <p>
  * Files written by each posting format have an additional suffix containing the
  * format name. For example, in a per-field configuration instead of <tt>_1.prx</tt>
- * filenames would look like <tt>_1_Lucene40.prx</tt>.
+ * filenames would look like <tt>_1_Lucene40_0.prx</tt>.
  * @see ServiceLoader
  * @lucene.experimental
  */
@@ -55,6 +56,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
   public static final String PER_FIELD_NAME = "PerField40";
 
   public static final String PER_FIELD_FORMAT_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".format";
+  public static final String PER_FIELD_SUFFIX_KEY = PerFieldPostingsFormat.class.getSimpleName() + ".suffix";
 
   public PerFieldPostingsFormat() {
     super(PER_FIELD_NAME);
@@ -65,11 +67,22 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
       throws IOException {
     return new FieldsWriter(state);
   }
 
+  static class FieldsConsumerAndSuffix implements Closeable {
+    FieldsConsumer consumer;
+    int suffix;
+
+    @Override
+    public void close() throws IOException {
+      consumer.close();
+    }
+  }
+
   private class FieldsWriter extends FieldsConsumer {
 
-    private final Map<PostingsFormat,FieldsConsumer> formats = new IdentityHashMap<PostingsFormat,FieldsConsumer>();
+    private final Map<PostingsFormat,FieldsConsumerAndSuffix> formats = new HashMap<PostingsFormat,FieldsConsumerAndSuffix>();
+    private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
 
     private final SegmentWriteState segmentWriteState;
 
     public FieldsWriter(SegmentWriteState state) throws IOException {
@@ -82,26 +95,48 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
       if (format == null) {
         throw new IllegalStateException("invalid null PostingsFormat for field=\"" + field.name + "\"");
       }
+      final String formatName = format.getName();
 
-      String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, format.getName());
+      String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
       assert previousValue == null;
 
-      FieldsConsumer consumer = formats.get(format);
+      Integer suffix;
+
+      FieldsConsumerAndSuffix consumer = formats.get(format);
       if (consumer == null) {
         // First time we are seeing this format; create a new instance
+
+        // bump the suffix
+        suffix = suffixes.get(formatName);
+        if (suffix == null) {
+          suffix = 0;
+        } else {
+          suffix = suffix + 1;
+        }
+        suffixes.put(formatName, suffix);
+
         final String segmentSuffix = getFullSegmentSuffix(field.name,
                                                           segmentWriteState.segmentSuffix,
-                                                          format.getName());
-        consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
+                                                          getSuffix(formatName, Integer.toString(suffix)));
+        consumer = new FieldsConsumerAndSuffix();
+        consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
+        consumer.suffix = suffix;
         formats.put(format, consumer);
+      } else {
+        // we've already seen this format, so just grab its suffix
+        assert suffixes.containsKey(formatName);
+        suffix = consumer.suffix;
       }
 
+      previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
+      assert previousValue == null;
+
       // TODO: we should only provide the "slice" of FIS
       // that this PF actually sees ... then stuff like
       // .hasProx could work correctly?
       // NOTE: .hasProx is already broken in the same way for the non-perfield case,
       // if there is a fieldinfo with prox that has no postings, you get a 0 byte file.
-      return consumer.addField(field);
+      return consumer.consumer.addField(field);
     }
 
     @Override
@@ -110,6 +145,10 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
       IOUtils.close(formats.values());
     }
   }
 
+  static String getSuffix(String formatName, String suffix) {
+    return formatName + "_" + suffix;
+  }
+
   static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
     if (outerSegmentSuffix.length() == 0) {
@@ -125,7 +164,7 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
   private class FieldsReader extends FieldsProducer {
 
     private final Map<String,FieldsProducer> fields = new TreeMap<String,FieldsProducer>();
-    private final Map<PostingsFormat,FieldsProducer> formats = new IdentityHashMap<PostingsFormat,FieldsProducer>();
+    private final Map<String,FieldsProducer> formats = new HashMap<String,FieldsProducer>();
 
     public FieldsReader(final SegmentReadState readState) throws IOException {
 
@@ -139,11 +178,14 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
         final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
         if (formatName != null) {
           // null formatName means the field is in fieldInfos, but has no postings!
+          final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
+          assert suffix != null;
           PostingsFormat format = PostingsFormat.forName(formatName);
-          if (!formats.containsKey(format)) {
-            formats.put(format, format.fieldsProducer(new SegmentReadState(readState, formatName)));
+          String segmentSuffix = getSuffix(formatName, suffix);
+          if (!formats.containsKey(segmentSuffix)) {
+            formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
           }
-          fields.put(fieldName, formats.get(format));
+          fields.put(fieldName, formats.get(segmentSuffix));
         }
       }
     }
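Taken together, the writer-side change is just suffix bookkeeping: the first time a format name is seen it gets suffix 0, each further instance with the same name bumps the counter, and the value is stored per field so the reader can reconstruct the same segment suffix. A rough standalone sketch of that logic (illustrative only; SuffixSketch and nextSuffix are made-up names, the real code lives in FieldsWriter.addField and getSuffix above):

import java.util.HashMap;
import java.util.Map;

class SuffixSketch {
  private final Map<String,Integer> suffixes = new HashMap<String,Integer>();

  // First instance of a format name gets 0; each further instance bumps the counter.
  int nextSuffix(String formatName) {
    Integer suffix = suffixes.get(formatName);
    suffix = (suffix == null) ? 0 : suffix + 1;
    suffixes.put(formatName, suffix);
    return suffix;
  }

  // Mirrors getSuffix() above: ("Pulsing40", 0) -> "Pulsing40_0", so segment files
  // are named _1_Pulsing40_0.prx, _1_Pulsing40_1.prx, and so on, instead of both
  // instances colliding on a single _1_Pulsing40.prx.
  static String suffixedName(String formatName, int suffix) {
    return formatName + "_" + suffix;
  }
}

On the read side the producer cache is keyed by this full suffix string rather than by the PostingsFormat instance, since PostingsFormat.forName(name) resolves by name and would not distinguish two differently parameterized writers.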
TestPerFieldPostingsFormat.java
@@ -19,10 +19,12 @@ package org.apache.lucene.codecs.perfield;
 import java.io.IOException;
 
 import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene40.Lucene40Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.codecs.mocksep.MockSepPostingsFormat;
+import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
 import org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -34,6 +36,7 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.search.IndexSearcher;
@@ -264,4 +267,60 @@ public class TestPerFieldPostingsFormat extends LuceneTestCase {
     }
     dir.close();
   }
+
+  public void testSameCodecDifferentInstance() throws Exception {
+    Codec codec = new Lucene40Codec() {
+      @Override
+      public PostingsFormat getPostingsFormatForField(String field) {
+        if ("id".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else if ("date".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else {
+          return super.getPostingsFormatForField(field);
+        }
+      }
+    };
+    doTestMixedPostings(codec);
+  }
+
+  public void testSameCodecDifferentParams() throws Exception {
+    Codec codec = new Lucene40Codec() {
+      @Override
+      public PostingsFormat getPostingsFormatForField(String field) {
+        if ("id".equals(field)) {
+          return new Pulsing40PostingsFormat(1);
+        } else if ("date".equals(field)) {
+          return new Pulsing40PostingsFormat(2);
+        } else {
+          return super.getPostingsFormatForField(field);
+        }
+      }
+    };
+    doTestMixedPostings(codec);
+  }
+
+  private void doTestMixedPostings(Codec codec) throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setCodec(codec);
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
+    // turn on vectors for the checkindex cross-check
+    ft.setStoreTermVectors(true);
+    ft.setStoreTermVectorOffsets(true);
+    ft.setStoreTermVectorPositions(true);
+    Field idField = new Field("id", "", ft);
+    Field dateField = new Field("date", "", ft);
+    doc.add(idField);
+    doc.add(dateField);
+    for (int i = 0; i < 100; i++) {
+      idField.setStringValue(Integer.toString(random().nextInt(50)));
+      dateField.setStringValue(Integer.toString(random().nextInt(100)));
+      iw.addDocument(doc);
+    }
+    iw.close();
+    dir.close(); // checkindex
+  }
 }
4 binary files not shown.
RandomCodec.java

@@ -87,12 +87,11 @@ public class RandomCodec extends Lucene40Codec {
     int minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
     int maxItemsPerBlock = 2*(Math.max(2, minItemsPerBlock-1)) + random.nextInt(100);
 
-    // TODO: make it possible to specify min/max iterms per block via CL:
-    minItemsPerBlock = _TestUtil.nextInt(random, 2, 100);
-    maxItemsPerBlock = 2*(Math.max(1, minItemsPerBlock-1)) + random.nextInt(100);
     add(avoidCodecs,
         new Lucene40PostingsFormat(minItemsPerBlock, maxItemsPerBlock),
         new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
+        // add pulsing again with (usually) different parameters
+        new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
         new MockSepPostingsFormat(),
         new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
         new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
@@ -100,7 +99,8 @@ public class RandomCodec extends Lucene40Codec {
         new NestedPulsingPostingsFormat(),
         new Lucene40WithOrds(),
         new SimpleTextPostingsFormat(),
-        new MemoryPostingsFormat(random.nextBoolean()));
+        new MemoryPostingsFormat(true),
+        new MemoryPostingsFormat(false));
 
     Collections.shuffle(formats, random);
   }