git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1438847 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-26 04:47:29 +00:00
parent e9b5edc750
commit 710a1ca160
21 changed files with 163 additions and 50 deletions

View File

@ -19,13 +19,20 @@ package org.apache.lucene.collation;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.search.FieldCacheRangeFilter;
import org.apache.lucene.util.BytesRef;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;
/**
* nocommit
* Indexes collation keys as a single-valued {@link SortedDocValuesField}.
* <p>
* This is more efficient than {@link ICUCollationKeyAnalyzer} if the field
* only has one value: no uninversion is necessary to sort on the field,
* locale-sensitive range queries can still work via {@link FieldCacheRangeFilter},
* and the underlying data structures built at index-time are likely more efficient
* and use less memory than FieldCache.
*/
public final class ICUCollationDocValuesField extends Field {
private final String name;
@ -33,6 +40,17 @@ public final class ICUCollationDocValuesField extends Field {
private final BytesRef bytes = new BytesRef();
private final RawCollationKey key = new RawCollationKey();
/**
* Create a new ICUCollationDocValuesField.
* <p>
* NOTE: you should not create a new one for each document, instead
* just make one and reuse it during your indexing process, setting
* the value via {@link #setStringValue(String)}.
* @param name field name
* @param collator Collator for generating collation keys.
*/
// TODO: can we make this trap-free? maybe just synchronize on the collator
// instead?
public ICUCollationDocValuesField(String name, Collator collator) {
super(name, SortedDocValuesField.TYPE);
this.name = name;
@ -48,6 +66,7 @@ public final class ICUCollationDocValuesField extends Field {
return name;
}
@Override
public void setStringValue(String value) {
collator.getRawCollationKey(value, key);
bytes.bytes = key.bytes;
@ -60,5 +79,5 @@ public final class ICUCollationDocValuesField extends Field {
return bytes;
}
// nocommit: make this thing trap-free
// nocommit: UOE the other field methods? or set to empty bytesref initially so this just works...
}

View File

@ -58,6 +58,10 @@ import org.apache.lucene.util.PriorityQueue;
* @lucene.experimental
*/
public abstract class DocValuesConsumer implements Closeable {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected DocValuesConsumer() {}
/**
* Writes numeric docvalues for a field.
@ -232,7 +236,7 @@ public abstract class DocValuesConsumer implements Closeable {
});
}
public static class SortedBytesMerger {
static class SortedBytesMerger {
public int numMergedTerms;

View File

@ -62,8 +62,20 @@ public abstract class DocValuesFormat implements NamedSPILoader.NamedSPI {
this.name = name;
}
/** Returns a {@link DocValuesConsumer} to write docvalues to the
* index. */
public abstract DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException;
/**
* Returns a {@link DocValuesProducer} to read docvalues from the index.
* <p>
* NOTE: by the time this call returns, it must hold open any files it will
* need to use; else, those files may be deleted. Additionally, required files
* may be deleted during the execution of this call before there is a chance
* to open them. Under these circumstances an IOException should be thrown by
* the implementation. IOExceptions are expected and will automatically cause
* a retry of the segment opening logic with the newly revised segments.
*/
public abstract DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException;
@Override

View File

@ -25,16 +25,29 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
// nocommit add javadocs stating that this must open all
// necessary files "on init", not later eg in .getXXX, else
// an IW that deletes a commit will cause an SR to hit
// exceptions....
/** Abstract API that produces numeric, binary and
* sorted docvalues. Each accessor looks up the values of a
* single field, identified by its {@link FieldInfo}.
*
* @lucene.experimental
*/
public abstract class DocValuesProducer implements Closeable {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected DocValuesProducer() {}
/** Returns {@link NumericDocValues} for this field.
* The returned instance need not be thread-safe: it will only be
* used by a single thread.
* @param field the field whose numeric values are requested
* @throws IOException if an I/O error occurs while obtaining the values */
public abstract NumericDocValues getNumeric(FieldInfo field) throws IOException;
/** Returns {@link BinaryDocValues} for this field.
* The returned instance need not be thread-safe: it will only be
* used by a single thread.
* @param field the field whose binary values are requested
* @throws IOException if an I/O error occurs while obtaining the values */
public abstract BinaryDocValues getBinary(FieldInfo field) throws IOException;
/** Returns {@link SortedDocValues} for this field.
* The returned instance need not be thread-safe: it will only be
* used by a single thread.
* @param field the field whose sorted values are requested
* @throws IOException if an I/O error occurs while obtaining the values */
public abstract SortedDocValues getSorted(FieldInfo field) throws IOException;
}

View File

@ -35,7 +35,15 @@ public abstract class NormsFormat {
* index. */
public abstract DocValuesConsumer normsConsumer(SegmentWriteState state) throws IOException;
/** Returns a {@link DocValuesProducer} to read norms from the
* index. */
/**
* Returns a {@link DocValuesProducer} to read norms from the index.
* <p>
* NOTE: by the time this call returns, it must hold open any files it will
* need to use; else, those files may be deleted. Additionally, required files
* may be deleted during the execution of this call before there is a chance
* to open them. Under these circumstances an IOException should be thrown by
* the implementation. IOExceptions are expected and will automatically cause
* a retry of the segment opening logic with the newly revised segments.
*/
public abstract DocValuesProducer normsProducer(SegmentReadState state) throws IOException;
}

View File

@ -1,8 +1,5 @@
package org.apache.lucene.document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -20,14 +17,45 @@ import org.apache.lucene.util.BytesRef;
* limitations under the License.
*/
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;
/**
* Field that stores a per-document {@link BytesRef} value.
* <p>
* The values are stored directly with no sharing, which is a good fit when
* the fields don't share (many) values, such as a title field. If values
* may be shared and sorted it's better to use {@link SortedDocValuesField}.
* Here's an example usage:
*
* <pre class="prettyprint">
* document.add(new BinaryDocValuesField(name, new BytesRef("hello")));
* </pre>
*
* <p>
* If you also need to store the value, you should add a
* separate {@link StoredField} instance.
*
* @see BinaryDocValues
* */
public class BinaryDocValuesField extends StoredField {
/**
* Type for straight bytes DocValues.
*/
public static final FieldType TYPE = new FieldType();
static {
TYPE.setDocValueType(FieldInfo.DocValuesType.BINARY);
TYPE.freeze();
}
/**
* Create a new binary DocValues field.
* @param name field name
* @param value binary content
* @throws IllegalArgumentException if the field name is null
*/
public BinaryDocValuesField(String name, BytesRef value) {
super(name, TYPE);
fieldsData = value;

View File

@ -1,8 +1,5 @@
package org.apache.lucene.document;
import org.apache.lucene.index.AtomicReader; // javadocs
import org.apache.lucene.search.FieldCache; // javadocs
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -20,6 +17,9 @@ import org.apache.lucene.search.FieldCache; // javadocs
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReader; // javadocs
import org.apache.lucene.search.FieldCache; // javadocs
/**
* Syntactic sugar for encoding doubles as NumericDocValues
* via {@link Double#doubleToRawLongBits(double)}.
@ -33,6 +33,12 @@ import org.apache.lucene.search.FieldCache; // javadocs
*/
public class DoubleDocValuesField extends NumericDocValuesField {
/**
* Creates a new DocValues field with the specified 64-bit double value
* @param name field name
* @param value 64-bit double value
* @throws IllegalArgumentException if the field name is null
*/
public DoubleDocValuesField(String name, double value) {
super(name, Double.doubleToRawLongBits(value));
}

View File

@ -1,8 +1,5 @@
package org.apache.lucene.document;
import org.apache.lucene.index.AtomicReader; // javadocs
import org.apache.lucene.search.FieldCache; // javadocs
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -20,6 +17,9 @@ import org.apache.lucene.search.FieldCache; // javadocs
* limitations under the License.
*/
import org.apache.lucene.index.AtomicReader; // javadocs
import org.apache.lucene.search.FieldCache; // javadocs
/**
* Syntactic sugar for encoding floats as NumericDocValues
* via {@link Float#floatToRawIntBits(float)}.
@ -33,6 +33,12 @@ import org.apache.lucene.search.FieldCache; // javadocs
*/
public class FloatDocValuesField extends NumericDocValuesField {
/**
* Creates a new DocValues field with the specified 32-bit float value
* @param name field name
* @param value 32-bit float value
* @throws IllegalArgumentException if the field name is null
*/
public FloatDocValuesField(String name, float value) {
super(name, Float.floatToRawIntBits(value));
}

View File

@ -25,7 +25,7 @@ import org.apache.lucene.index.FieldInfo;
* sorting or value retrieval. Here's an example usage:
*
* <pre class="prettyprint">
* document.add(new LongDocValuesField(name, 22L));
* document.add(new NumericDocValuesField(name, 22L));
* </pre>
*
* <p>

View File

@ -27,7 +27,7 @@ import org.apache.lucene.util.BytesRef;
* sorting. Here's an example usage:
*
* <pre class="prettyprint">
* document.add(new SortedBytesDocValuesField(name, new BytesRef("hello")));
* document.add(new SortedDocValuesField(name, new BytesRef("hello")));
* </pre>
*
* <p>

View File

@ -23,6 +23,10 @@ import org.apache.lucene.util.BytesRef;
* A per-document byte[]
*/
public abstract class BinaryDocValues {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected BinaryDocValues() {}
/** Lookup the value for document.
*

View File

@ -1257,6 +1257,10 @@ public class CheckIndex {
return status;
}
/**
* Test docvalues.
* @lucene.experimental
*/
public static Status.DocValuesStatus testDocValues(AtomicReader reader,
PrintStream infoStream) {
final Status.DocValuesStatus status = new Status.DocValuesStatus();

View File

@ -48,6 +48,9 @@ import org.apache.lucene.util.Version;
// nocommit move this back to test-framework!!!
public class MultiDocValues {
/** No instantiation */
private MultiDocValues() {}
/** returns a NumericDocValues for a reader's norms (potentially merging on-the-fly) */
// moved to src/java so SlowWrapper can use it... uggggggh
public static NumericDocValues getNormValues(final IndexReader r, final String field) throws IOException {

View File

@ -21,6 +21,11 @@ package org.apache.lucene.index;
* A per-document numeric value.
*/
public abstract class NumericDocValues {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected NumericDocValues() {}
/**
* Returns the numeric value for the specified document ID.
* @param docID document ID to lookup

View File

@ -28,6 +28,11 @@ import org.apache.lucene.util.BytesRef;
* are dense and in increasing sorted order.
*/
public abstract class SortedDocValues extends BinaryDocValues {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected SortedDocValues() {}
/**
* Returns the ordinal for the specified docID.
* @param docID document ID to lookup

View File

@ -31,6 +31,7 @@ public class SortedDocValuesTermsEnum extends TermsEnum {
private int currentOrd = -1;
private final BytesRef term = new BytesRef();
/** Creates a new TermsEnum over the provided values */
public SortedDocValuesTermsEnum(SortedDocValues values) {
this.values = values;
}

View File

@ -46,27 +46,39 @@ import org.apache.lucene.util.RamUsageEstimator;
*/
public interface FieldCache {
/** Field values as 8-bit signed bytes. */
public static abstract class Bytes {
/** Return a single byte representation of this field's value.
* @param docID document ID to look up */
public abstract byte get(int docID);
}
/** Field values as 16-bit signed shorts. */
public static abstract class Shorts {
/** Return a short representation of this field's value.
* @param docID document ID to look up */
public abstract short get(int docID);
}
/** Field values as 32-bit signed integers. */
public static abstract class Ints {
/** Return an integer representation of this field's value.
* @param docID document ID to look up */
public abstract int get(int docID);
}
/** Field values as 64-bit signed long integers. */
public static abstract class Longs {
/** Return a long representation of this field's value.
* @param docID document ID to look up */
public abstract long get(int docID);
}
/** Field values as 32-bit floats. */
public static abstract class Floats {
/** Return a float representation of this field's value.
* @param docID document ID to look up */
public abstract float get(int docID);
}
/** Field values as 64-bit doubles. */
public static abstract class Doubles {
/** Return a double representation of this field's value.
* @param docID document ID to look up */
public abstract double get(int docID);
}
@ -138,7 +150,7 @@ public interface FieldCache {
* @see FieldCache#getDoubles(AtomicReader, String, FieldCache.DoubleParser, boolean)
*/
public interface DoubleParser extends Parser {
/** Return an long representation of this field's value. */
/** Return a double representation of this field's value. */
public double parseDouble(BytesRef term);
}

View File

@ -1184,6 +1184,7 @@ public class PackedInts {
return new Header(format, valueCount, bitsPerValue, version);
}
/** Header identifying the structure of a packed integer array. */
public static class Header {
private final Format format;

View File

@ -80,8 +80,8 @@ field fall into a single group.</p>
<p>Known limitations:</p>
<ul>
<li> For the two-pass grouping search, the group field must be a
single-valued indexed field.
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.search.FieldCache.DocTermsIndex} for this field.
single-valued indexed field (or indexed as a {@link org.apache.lucene.document.SortedDocValuesField}).
{@link org.apache.lucene.search.FieldCache} is used to load the {@link org.apache.lucene.index.SortedDocValues} for this field.
<li> Although Solr supports grouping by function and this module has an abstraction of what a group is, there are currently only
implementations for grouping based on terms.
<li> Sharding is not directly supported, though is not too
@ -196,25 +196,5 @@ fields, <code>FieldCache</code>, etc.).
<code>GroupingSearch</code> convenience utility
</p>
<p>
There are also DocValues based implementations available for the group collectors. There are factory methods
available for creating dv based instances. A typical example using dv based grouping with the
<code>GroupingSearch</code> convenience utility:
</p>
<pre class="prettyprint">
boolean diskResident = true; // Whether values should fetched directly from disk by passing the Java heap space.
DocValues.Type docValuesType = DocValues.Type.BYTES_VAR_SORTED;
GroupingSearch groupingSearch = new GroupingSearch("author", docValuesType, diskResident);
groupingSearch.setGroupSort(groupSort);
groupingSearch.setFillSortFields(fillFields);
TermQuery query = new TermQuery(new Term("content", searchTerm));
// The docValuesType variable decides the generic type. When float is used this Double and in case of int this is Long
TopGroups&lt;BytesRef&gt; result = groupingSearch.search(indexSearcher, query, groupOffset, groupLimit);
// Render groupsResult...
</pre>
</body>
</html>

View File

@ -23,6 +23,7 @@ import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
/** Read-write version of {@link Lucene40DocValuesFormat} for testing */
public class Lucene40RWDocValuesFormat extends Lucene40DocValuesFormat {
@Override

View File

@ -1,11 +1,5 @@
package org.apache.lucene.codecs.lucene40;
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -23,6 +17,13 @@ import org.apache.lucene.index.SegmentWriteState;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
/** Read-write version of {@link Lucene40NormsFormat} for testing */
public class Lucene40RWNormsFormat extends Lucene40NormsFormat {
@Override