add per-field docvalues format. lucene41 (and maybe simpletext) are currently broken because they dont respect segment suffixes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1415208 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-11-29 15:28:02 +00:00
parent fdce30e099
commit c4a28360fc
10 changed files with 440 additions and 7 deletions

View File

@ -74,6 +74,10 @@ public class SimpleTextSimpleDocValuesFormat extends SimpleDocValuesFormat {
// used for sorted bytes
final static BytesRef NUMVALUES = new BytesRef(" numvalues ");
final static BytesRef ORDPATTERN = new BytesRef(" ordpattern ");
public SimpleTextSimpleDocValuesFormat() {
super("SimpleText");
}
@Override
public SimpleDVConsumer fieldsConsumer(SegmentWriteState state) throws IOException {

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.codecs.lucene41.values.Lucene41DocValuesFormat
org.apache.lucene.codecs.simpletext.SimpleTextSimpleDocValuesFormat

View File

@ -18,18 +18,81 @@ package org.apache.lucene.codecs;
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.NamedSPILoader;
public abstract class SimpleDocValuesFormat {
public abstract class SimpleDocValuesFormat implements NamedSPILoader.NamedSPI {
private static final NamedSPILoader<SimpleDocValuesFormat> loader =
new NamedSPILoader<SimpleDocValuesFormat>(SimpleDocValuesFormat.class);
/** Unique name that's used to retrieve this format when
* reading the index.
*/
private final String name;
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected SimpleDocValuesFormat() {
/**
* Creates a new docvalues format.
* <p>
* The provided name will be written into the index segment in some configurations
* (such as when using {@code PerFieldDocValuesFormat}): in such configurations,
* for the segment to be read this class should be registered with Java's
* SPI mechanism (registered in META-INF/ of your jar file, etc).
* @param name must be all ascii alphanumeric, and less than 128 characters in length.
*/
// nocommit: @code -> @link
protected SimpleDocValuesFormat(String name) {
NamedSPILoader.checkServiceName(name);
this.name = name;
}
public abstract SimpleDVConsumer fieldsConsumer(SegmentWriteState state) throws IOException;
// nocommit do this:
public abstract SimpleDVProducer fieldsProducer(SegmentReadState state) throws IOException;
@Override
public final String getName() {
return name;
}
@Override
public String toString() {
return "DocValuesFormat(name=" + name + ")";
}
/** looks up a format by name */
public static SimpleDocValuesFormat forName(String name) {
if (loader == null) {
throw new IllegalStateException("You called DocValuesFormat.forName() before all formats could be initialized. "+
"This likely happens if you call it from a DocValuesFormat's ctor.");
}
return loader.lookup(name);
}
/** returns a list of all available format names */
public static Set<String> availableDocValuesFormats() {
if (loader == null) {
throw new IllegalStateException("You called DocValuesFormat.availableDocValuesFormats() before all formats could be initialized. "+
"This likely happens if you call it from a DocValuesFormat's ctor.");
}
return loader.availableServices();
}
/**
* Reloads the DocValues format list from the given {@link ClassLoader}.
* Changes to the docvalues formats are visible after the method ends, all
* iterators ({@link #availableDocValuesFormats()},...) stay consistent.
*
* <p><b>NOTE:</b> Only new docvalues formats are added, existing ones are
* never removed or replaced.
*
* <p><em>This method is expensive and should only be called for discovery
* of new docvalues formats on the given classpath/classloader!</em>
*/
public static void reloadDocValuesFormats(ClassLoader classloader) {
loader.reload(classloader);
}
}

View File

@ -36,6 +36,7 @@ import org.apache.lucene.codecs.lucene40.Lucene40SegmentInfoFormat;
import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40TermVectorsFormat;
import org.apache.lucene.codecs.lucene41.values.Lucene41DocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
/**
@ -65,6 +66,14 @@ public class Lucene41Codec extends Codec {
return Lucene41Codec.this.getPostingsFormatForField(field);
}
};
private final SimpleDocValuesFormat simpleDocValuesFormat = new PerFieldDocValuesFormat() {
@Override
public SimpleDocValuesFormat getDocValuesFormatForField(String field) {
return Lucene41Codec.this.getDocValuesFormatForField(field);
}
};
/** Sole constructor. */
public Lucene41Codec() {
@ -120,7 +129,14 @@ public class Lucene41Codec extends Codec {
return defaultFormat;
}
private final SimpleDocValuesFormat simpleDocValuesFormat = new Lucene41DocValuesFormat();
/** Returns the docvalues format that should be used for writing
* new segments of <code>field</code>.
*
* The default implementation always returns "Lucene41"
*/
public SimpleDocValuesFormat getDocValuesFormatForField(String field) {
return defaultDVFormat;
}
@Override
public SimpleDocValuesFormat simpleDocValuesFormat() {
@ -128,4 +144,5 @@ public class Lucene41Codec extends Codec {
}
private final PostingsFormat defaultFormat = PostingsFormat.forName("Lucene41");
private final SimpleDocValuesFormat defaultDVFormat = SimpleDocValuesFormat.forName("Lucene41");
}

View File

@ -27,6 +27,10 @@ import org.apache.lucene.index.SegmentWriteState;
public class Lucene41DocValuesFormat extends SimpleDocValuesFormat {
public Lucene41DocValuesFormat() {
super("Lucene41");
}
@Override
public SimpleDVConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {

View File

@ -0,0 +1,258 @@
package org.apache.lucene.codecs.perfield;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.ServiceLoader; // javadocs
import java.util.TreeMap;
import org.apache.lucene.codecs.BinaryDocValuesConsumer;
import org.apache.lucene.codecs.NumericDocValuesConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SimpleDVConsumer;
import org.apache.lucene.codecs.SimpleDVProducer;
import org.apache.lucene.codecs.SimpleDocValuesFormat;
import org.apache.lucene.codecs.SortedDocValuesConsumer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.IOUtils;
/**
* Enables per field docvalues support.
* <p>
* Note, when extending this class, the name ({@link #getName}) is
* written into the index. In order for the field to be read, the
* name must resolve to your implementation via {@link #forName(String)}.
* This method uses Java's
* {@link ServiceLoader Service Provider Interface} to resolve format names.
* <p>
* Files written by each docvalues format have an additional suffix containing the
* format name. For example, in a per-field configuration instead of <tt>_1.dat</tt>
* filenames would look like <tt>_1_Lucene40_0.dat</tt>.
* @see ServiceLoader
* @lucene.experimental
*/
public abstract class PerFieldDocValuesFormat extends SimpleDocValuesFormat {
/** Name of this {@link PostingsFormat}. */
public static final String PER_FIELD_NAME = "PerFieldDV40";
/** {@link FieldInfo} attribute name used to store the
* format name for each field. */
public static final String PER_FIELD_FORMAT_KEY = PerFieldDocValuesFormat.class.getSimpleName() + ".format";
/** {@link FieldInfo} attribute name used to store the
* segment suffix name for each field. */
public static final String PER_FIELD_SUFFIX_KEY = PerFieldDocValuesFormat.class.getSimpleName() + ".suffix";
/** Sole constructor. */
public PerFieldDocValuesFormat() {
super(PER_FIELD_NAME);
}
@Override
public final SimpleDVConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {
return new FieldsWriter(state);
}
static class SimpleDVConsumerAndSuffix implements Closeable {
SimpleDVConsumer consumer;
int suffix;
@Override
public void close() throws IOException {
consumer.close();
}
}
private class FieldsWriter extends SimpleDVConsumer {
private final Map<SimpleDocValuesFormat,SimpleDVConsumerAndSuffix> formats = new HashMap<SimpleDocValuesFormat,SimpleDVConsumerAndSuffix>();
private final Map<String,Integer> suffixes = new HashMap<String,Integer>();
private final SegmentWriteState segmentWriteState;
public FieldsWriter(SegmentWriteState state) {
segmentWriteState = state;
}
@Override
public NumericDocValuesConsumer addNumericField(FieldInfo field, long minValue, long maxValue) throws IOException {
return getInstance(field).addNumericField(field, minValue, maxValue);
}
@Override
public BinaryDocValuesConsumer addBinaryField(FieldInfo field, boolean fixedLength, int maxLength) throws IOException {
return getInstance(field).addBinaryField(field, fixedLength, maxLength);
}
@Override
public SortedDocValuesConsumer addSortedField(FieldInfo field, int valueCount, boolean fixedLength, int maxLength) throws IOException {
return getInstance(field).addSortedField(field, valueCount, fixedLength, maxLength);
}
private SimpleDVConsumer getInstance(FieldInfo field) throws IOException {
final SimpleDocValuesFormat format = getDocValuesFormatForField(field.name);
if (format == null) {
throw new IllegalStateException("invalid null DocValuesFormat for field=\"" + field.name + "\"");
}
final String formatName = format.getName();
String previousValue = field.putAttribute(PER_FIELD_FORMAT_KEY, formatName);
assert previousValue == null;
Integer suffix;
SimpleDVConsumerAndSuffix consumer = formats.get(format);
if (consumer == null) {
// First time we are seeing this format; create a new instance
// bump the suffix
suffix = suffixes.get(formatName);
if (suffix == null) {
suffix = 0;
} else {
suffix = suffix + 1;
}
suffixes.put(formatName, suffix);
final String segmentSuffix = getFullSegmentSuffix(field.name,
segmentWriteState.segmentSuffix,
getSuffix(formatName, Integer.toString(suffix)));
consumer = new SimpleDVConsumerAndSuffix();
consumer.consumer = format.fieldsConsumer(new SegmentWriteState(segmentWriteState, segmentSuffix));
consumer.suffix = suffix;
formats.put(format, consumer);
} else {
// we've already seen this format, so just grab its suffix
assert suffixes.containsKey(formatName);
suffix = consumer.suffix;
}
previousValue = field.putAttribute(PER_FIELD_SUFFIX_KEY, Integer.toString(suffix));
assert previousValue == null;
// TODO: we should only provide the "slice" of FIS
// that this PF actually sees ...
return consumer.consumer;
}
@Override
public void close() throws IOException {
// Close all subs
IOUtils.close(formats.values());
}
}
static String getSuffix(String formatName, String suffix) {
return formatName + "_" + suffix;
}
static String getFullSegmentSuffix(String fieldName, String outerSegmentSuffix, String segmentSuffix) {
if (outerSegmentSuffix.length() == 0) {
return segmentSuffix;
} else {
// TODO: support embedding; I think it should work but
// we need a test confirm to confirm
// return outerSegmentSuffix + "_" + segmentSuffix;
throw new IllegalStateException("cannot embed PerFieldPostingsFormat inside itself (field \"" + fieldName + "\" returned PerFieldPostingsFormat)");
}
}
private class FieldsReader extends SimpleDVProducer {
private final Map<String,SimpleDVProducer> fields = new TreeMap<String,SimpleDVProducer>();
private final Map<String,SimpleDVProducer> formats = new HashMap<String,SimpleDVProducer>();
public FieldsReader(final SegmentReadState readState) throws IOException {
// Read _X.per and init each format:
boolean success = false;
try {
// Read field name -> format name
for (FieldInfo fi : readState.fieldInfos) {
if (fi.hasDocValues()) {
final String fieldName = fi.name;
final String formatName = fi.getAttribute(PER_FIELD_FORMAT_KEY);
if (formatName != null) {
// null formatName means the field is in fieldInfos, but has no docvalues!
final String suffix = fi.getAttribute(PER_FIELD_SUFFIX_KEY);
assert suffix != null;
SimpleDocValuesFormat format = SimpleDocValuesFormat.forName(formatName);
String segmentSuffix = getSuffix(formatName, suffix);
if (!formats.containsKey(segmentSuffix)) {
formats.put(segmentSuffix, format.fieldsProducer(new SegmentReadState(readState, segmentSuffix)));
}
fields.put(fieldName, formats.get(segmentSuffix));
}
}
}
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(formats.values());
}
}
}
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
SimpleDVProducer producer = fields.get(field.name);
return producer == null ? null : producer.getNumeric(field);
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
SimpleDVProducer producer = fields.get(field.name);
return producer == null ? null : producer.getBinary(field);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
SimpleDVProducer producer = fields.get(field.name);
return producer == null ? null : producer.getSorted(field);
}
@Override
public void close() throws IOException {
IOUtils.close(formats.values());
}
}
@Override
public final SimpleDVProducer fieldsProducer(SegmentReadState state) throws IOException {
return new FieldsReader(state);
}
/**
* Returns the doc values format that should be used for writing
* new segments of <code>field</code>.
* <p>
* The field to format mapping is written to the index, so
* this method is only invoked when writing, not when reading. */
public abstract SimpleDocValuesFormat getDocValuesFormatForField(String field);
}

View File

@ -37,7 +37,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.util.IOUtils;
/**
* Enables per field format support.
* Enables per field postings support.
* <p>
* Note, when extending this class, the name ({@link #getName}) is
* written into the index. In order for the field to be read, the

View File

@ -521,7 +521,10 @@ public class CheckIndex {
msg(" diagnostics = " + diagnostics);
}
// TODO: we could append the info attributes() to the msg?
Map<String,String> atts = info.info.attributes();
if (atts != null && !atts.isEmpty()) {
msg(" attributes = " + atts);
}
if (!info.hasDeletions()) {
msg(" no deletions");
@ -1442,6 +1445,11 @@ public class CheckIndex {
// nocommit
private void checkSimpleDocValues(FieldInfo fi, SegmentReader reader) throws Exception {
// nocommit: just for debugging
Map<String,String> atts = fi.attributes();
if (atts != null) {
msg(" field: " + fi.name + ": " + atts);
}
switch(fi.getDocValuesType()) {
case BYTES_FIXED_SORTED:
case BYTES_VAR_SORTED:

View File

@ -21,6 +21,8 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.SimpleDocValuesFormat;
import org.apache.lucene.codecs.lucene41.Lucene41Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatDocValuesField;
@ -42,6 +44,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Ignore;
/**
* A very simple demo used in the API documentation (src/java/overview.html).
@ -666,4 +669,61 @@ public class TestDemoDocValue extends LuceneTestCase {
ireader.close();
directory.close();
}
@Ignore("broken until we fix e.g. Lucene41's impl to actually handle suffixes correctly")
// nocommit: if we are going to pass down suffixes to segmentread/writestate,
// then they should be respected by *all* codec apis!
public void testDemoTwoFieldsTwoFormats() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
Directory directory = newDirectory();
// we don't use RandomIndexWriter because it might add more docvalues than we expect !!!!1
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
// TODO: Fix the CFS/suffixing of Lucene41DocValues so it actually works with this
final SimpleDocValuesFormat fast = SimpleDocValuesFormat.forName("Lucene41");
final SimpleDocValuesFormat slow = SimpleDocValuesFormat.forName("SimpleText");
iwc.setCodec(new Lucene41Codec() {
@Override
public SimpleDocValuesFormat getDocValuesFormatForField(String field) {
if ("dv1".equals(field)) {
return fast;
} else {
return slow;
}
}
});
IndexWriter iwriter = new IndexWriter(directory, iwc);
Document doc = new Document();
String longTerm = "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
String text = "This is the text to be indexed. " + longTerm;
doc.add(newTextField("fieldname", text, Field.Store.YES));
doc.add(new PackedLongDocValuesField("dv1", 5));
doc.add(new StraightBytesDocValuesField("dv2", new BytesRef("hello world")));
iwriter.addDocument(doc);
iwriter.close();
// Now search the index:
IndexReader ireader = DirectoryReader.open(directory); // read-only=true
IndexSearcher isearcher = new IndexSearcher(ireader);
assertEquals(1, isearcher.search(new TermQuery(new Term("fieldname", longTerm)), 1).totalHits);
Query query = new TermQuery(new Term("fieldname", "text"));
TopDocs hits = isearcher.search(query, null, 1);
assertEquals(1, hits.totalHits);
BytesRef scratch = new BytesRef();
// Iterate through the results:
for (int i = 0; i < hits.scoreDocs.length; i++) {
StoredDocument hitDoc = isearcher.doc(hits.scoreDocs[i].doc);
assertEquals(text, hitDoc.get("fieldname"));
assert ireader.leaves().size() == 1;
NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv1");
assertEquals(5, dv.get(hits.scoreDocs[i].doc));
BinaryDocValues dv2 = ireader.leaves().get(0).reader().getBinaryDocValues("dv2");
dv2.get(hits.scoreDocs[i].doc, scratch);
assertEquals(new BytesRef("hello world"), scratch);
}
ireader.close();
directory.close();
}
}

View File

@ -36,6 +36,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.SimpleDocValuesFormat;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.handler.admin.CoreAdminHandler;
@ -177,6 +178,7 @@ public class SolrResourceLoader implements ResourceLoader
void reloadLuceneSPI() {
// Codecs:
PostingsFormat.reloadPostingsFormats(this.classLoader);
SimpleDocValuesFormat.reloadDocValuesFormats(this.classLoader);
Codec.reloadCodecs(this.classLoader);
// Analysis:
CharFilterFactory.reloadCharFilters(this.classLoader);