LUCENE-4690: Performance improvements and non-hashing versions of NumericUtils.*ToPrefixCoded

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1438242 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2013-01-24 22:41:59 +00:00
parent 18dfd4256e
commit 533fd2eeba
6 changed files with 74 additions and 56 deletions

View File

@ -58,6 +58,9 @@ Optimizations
* LUCENE-3298: FST can now be larger than 2.1 GB / 2.1 B nodes.
(James Dyer, Mike McCandless)
* LUCENE-4690: Performance improvements and non-hashing versions
of NumericUtils.*ToPrefixCoded() (yonik)
New Features
* LUCENE-4686: New specialized DGapVInt8IntEncoder for facets (now the

View File

@ -82,7 +82,7 @@ public final class NumericUtils {
/**
* The maximum term length (used for <code>byte[]</code> buffer size)
* for encoding <code>long</code> values.
* @see #longToPrefixCoded(long,int,BytesRef)
* @see #longToPrefixCodedBytes
*/
public static final int BUF_SIZE_LONG = 63/7 + 2;
@ -95,7 +95,7 @@ public final class NumericUtils {
/**
* The maximum term length (used for <code>byte[]</code> buffer size)
* for encoding <code>int</code> values.
* @see #intToPrefixCoded(int,int,BytesRef)
* @see #intToPrefixCodedBytes
*/
public static final int BUF_SIZE_INT = 31/7 + 2;
@ -109,28 +109,8 @@ public final class NumericUtils {
* @return the hash code for indexing (TermsHash)
*/
public static int longToPrefixCoded(final long val, final int shift, final BytesRef bytes) {
if (shift>63 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
int hash, nChars = (63-shift)/7 + 1;
bytes.offset = 0;
bytes.length = nChars+1;
if (bytes.bytes.length < bytes.length) {
bytes.grow(NumericUtils.BUF_SIZE_LONG);
}
bytes.bytes[0] = (byte) (hash = (SHIFT_START_LONG + shift));
long sortableBits = val ^ 0x8000000000000000L;
sortableBits >>>= shift;
while (nChars > 0) {
// Store 7 bits per byte for compatibility
// with UTF-8 encoding of terms
bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
// calculate hash
for (int i = 1; i < bytes.length; i++) {
hash = 31*hash + bytes.bytes[i];
}
return hash;
longToPrefixCodedBytes(val, shift, bytes);
return bytes.hashCode();
}
/**
@ -143,15 +123,57 @@ public final class NumericUtils {
* @return the hash code for indexing (TermsHash)
*/
public static int intToPrefixCoded(final int val, final int shift, final BytesRef bytes) {
if (shift>31 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
int hash, nChars = (31-shift)/7 + 1;
bytes.offset = 0;
bytes.length = nChars+1;
if (bytes.bytes.length < bytes.length) {
bytes.grow(NumericUtils.BUF_SIZE_INT);
intToPrefixCodedBytes(val, shift, bytes);
return bytes.hashCode();
}
bytes.bytes[0] = (byte) (hash = (SHIFT_START_INT + shift));
/**
* Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
* This is method is used by {@link NumericTokenStream}.
* After encoding, {@code bytes.offset} will always be 0.
* @param val the numeric value
* @param shift how many bits to strip from the right
* @param bytes will contain the encoded value
*/
public static void longToPrefixCodedBytes(final long val, final int shift, final BytesRef bytes) {
if ((shift & ~0x3f) != 0) // ensure shift is 0..63
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
int nChars = (((63-shift)*37)>>8) + 1; // i/7 is the same as (i*37)>>8 for i in 0..63
bytes.offset = 0;
bytes.length = nChars+1; // one extra for the byte that contains the shift info
if (bytes.bytes.length < bytes.length) {
bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG]; // use the max
}
bytes.bytes[0] = (byte)(SHIFT_START_LONG + shift);
long sortableBits = val ^ 0x8000000000000000L;
sortableBits >>>= shift;
while (nChars > 0) {
// Store 7 bits per byte for compatibility
// with UTF-8 encoding of terms
bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
}
/**
* Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
* This is method is used by {@link NumericTokenStream}.
* After encoding, {@code bytes.offset} will always be 0.
* @param val the numeric value
* @param shift how many bits to strip from the right
* @param bytes will contain the encoded value
*/
public static void intToPrefixCodedBytes(final int val, final int shift, final BytesRef bytes) {
if ((shift & ~0x1f) != 0) // ensure shift is 0..31
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
int nChars = (((31-shift)*37)>>8) + 1; // i/7 is the same as (i*37)>>8 for i in 0..63
bytes.offset = 0;
bytes.length = nChars+1; // one extra for the byte that contains the shift info
if (bytes.bytes.length < bytes.length) {
bytes.bytes = new byte[NumericUtils.BUF_SIZE_LONG]; // use the max
}
bytes.bytes[0] = (byte)(SHIFT_START_INT + shift);
int sortableBits = val ^ 0x80000000;
sortableBits >>>= shift;
while (nChars > 0) {
@ -160,13 +182,9 @@ public final class NumericUtils {
bytes.bytes[nChars--] = (byte)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
// calculate hash
for (int i = 1; i < bytes.length; i++) {
hash = 31*hash + bytes.bytes[i];
}
return hash;
}
/**
* Returns the shift value from a prefix encoded {@code long}.
* @throws NumberFormatException if the supplied {@link BytesRef} is
@ -197,7 +215,7 @@ public final class NumericUtils {
* This method can be used to decode a term's value.
* @throws NumberFormatException if the supplied {@link BytesRef} is
* not correctly prefix encoded.
* @see #longToPrefixCoded(long,int,BytesRef)
* @see #longToPrefixCodedBytes
*/
public static long prefixCodedToLong(final BytesRef val) {
long sortableBits = 0L;
@ -221,7 +239,7 @@ public final class NumericUtils {
* This method can be used to decode a term's value.
* @throws NumberFormatException if the supplied {@link BytesRef} is
* not correctly prefix encoded.
* @see #intToPrefixCoded(int,int,BytesRef)
* @see #intToPrefixCodedBytes
*/
public static int prefixCodedToInt(final BytesRef val) {
int sortableBits = 0;
@ -402,8 +420,8 @@ public final class NumericUtils {
*/
public void addRange(final long min, final long max, final int shift) {
final BytesRef minBytes = new BytesRef(BUF_SIZE_LONG), maxBytes = new BytesRef(BUF_SIZE_LONG);
longToPrefixCoded(min, shift, minBytes);
longToPrefixCoded(max, shift, maxBytes);
longToPrefixCodedBytes(min, shift, minBytes);
longToPrefixCodedBytes(max, shift, maxBytes);
addRange(minBytes, maxBytes);
}
@ -431,8 +449,8 @@ public final class NumericUtils {
*/
public void addRange(final int min, final int max, final int shift) {
final BytesRef minBytes = new BytesRef(BUF_SIZE_INT), maxBytes = new BytesRef(BUF_SIZE_INT);
intToPrefixCoded(min, shift, minBytes);
intToPrefixCoded(max, shift, maxBytes);
intToPrefixCodedBytes(min, shift, minBytes);
intToPrefixCodedBytes(max, shift, maxBytes);
addRange(minBytes, maxBytes);
}

View File

@ -380,8 +380,8 @@ public class TestNumericRangeQuery32 extends LuceneTestCase {
int a=lower; lower=upper; upper=a;
}
final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_INT), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
NumericUtils.intToPrefixCoded(lower, 0, lowerBytes);
NumericUtils.intToPrefixCoded(upper, 0, upperBytes);
NumericUtils.intToPrefixCodedBytes(lower, 0, lowerBytes);
NumericUtils.intToPrefixCodedBytes(upper, 0, upperBytes);
// test inclusive range
NumericRangeQuery<Integer> tq=NumericRangeQuery.newIntRange(field, precisionStep, lower, upper, true, true);

View File

@ -405,8 +405,8 @@ public class TestNumericRangeQuery64 extends LuceneTestCase {
long a=lower; lower=upper; upper=a;
}
final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
NumericUtils.longToPrefixCoded(lower, 0, lowerBytes);
NumericUtils.longToPrefixCoded(upper, 0, upperBytes);
NumericUtils.longToPrefixCodedBytes(lower, 0, lowerBytes);
NumericUtils.longToPrefixCodedBytes(upper, 0, upperBytes);
// test inclusive range
NumericRangeQuery<Long> tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, true);

View File

@ -28,7 +28,7 @@ public class TestNumericUtils extends LuceneTestCase {
// generate a series of encoded longs, each numerical one bigger than the one before
BytesRef last=null, act=new BytesRef(NumericUtils.BUF_SIZE_LONG);
for (long l=-100000L; l<100000L; l++) {
NumericUtils.longToPrefixCoded(l, 0, act);
NumericUtils.longToPrefixCodedBytes(l, 0, act);
if (last!=null) {
// test if smaller
assertTrue("actual bigger than last (BytesRef)", last.compareTo(act) < 0 );
@ -46,7 +46,7 @@ public class TestNumericUtils extends LuceneTestCase {
// generate a series of encoded ints, each numerical one bigger than the one before
BytesRef last=null, act=new BytesRef(NumericUtils.BUF_SIZE_INT);
for (int i=-100000; i<100000; i++) {
NumericUtils.intToPrefixCoded(i, 0, act);
NumericUtils.intToPrefixCodedBytes(i, 0, act);
if (last!=null) {
// test if smaller
assertTrue("actual bigger than last (BytesRef)", last.compareTo(act) < 0 );
@ -69,7 +69,7 @@ public class TestNumericUtils extends LuceneTestCase {
for (int i=0; i<vals.length; i++) {
prefixVals[i] = new BytesRef(NumericUtils.BUF_SIZE_LONG);
NumericUtils.longToPrefixCoded(vals[i], 0, prefixVals[i]);
NumericUtils.longToPrefixCodedBytes(vals[i], 0, prefixVals[i]);
// check forward and back conversion
assertEquals( "forward and back conversion should generate same long", vals[i], NumericUtils.prefixCodedToLong(prefixVals[i]) );
@ -92,7 +92,7 @@ public class TestNumericUtils extends LuceneTestCase {
final BytesRef ref = new BytesRef(NumericUtils.BUF_SIZE_LONG);
for (int i=0; i<vals.length; i++) {
for (int j=0; j<64; j++) {
NumericUtils.longToPrefixCoded(vals[i], j, ref);
NumericUtils.longToPrefixCodedBytes(vals[i], j, ref);
long prefixVal=NumericUtils.prefixCodedToLong(ref);
long mask=(1L << j) - 1L;
assertEquals( "difference between prefix val and original value for "+vals[i]+" with shift="+j, vals[i] & mask, vals[i]-prefixVal );
@ -109,7 +109,7 @@ public class TestNumericUtils extends LuceneTestCase {
for (int i=0; i<vals.length; i++) {
prefixVals[i] = new BytesRef(NumericUtils.BUF_SIZE_INT);
NumericUtils.intToPrefixCoded(vals[i], 0, prefixVals[i]);
NumericUtils.intToPrefixCodedBytes(vals[i], 0, prefixVals[i]);
// check forward and back conversion
assertEquals( "forward and back conversion should generate same int", vals[i], NumericUtils.prefixCodedToInt(prefixVals[i]) );
@ -132,7 +132,7 @@ public class TestNumericUtils extends LuceneTestCase {
final BytesRef ref = new BytesRef(NumericUtils.BUF_SIZE_LONG);
for (int i=0; i<vals.length; i++) {
for (int j=0; j<32; j++) {
NumericUtils.intToPrefixCoded(vals[i], j, ref);
NumericUtils.intToPrefixCodedBytes(vals[i], j, ref);
int prefixVal=NumericUtils.prefixCodedToInt(ref);
int mask=(1 << j) - 1;
assertEquals( "difference between prefix val and original value for "+vals[i]+" with shift="+j, vals[i] & mask, vals[i]-prefixVal );

View File

@ -29,8 +29,6 @@ import org.apache.lucene.document.FieldType.NumericType;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.index.GeneralField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.DoubleFieldSource;
@ -48,7 +46,6 @@ import org.apache.solr.analysis.TrieTokenizerFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.function.*;
/**
* Provides field types to support for Lucene's {@link
@ -311,19 +308,19 @@ public class TrieField extends PrimitiveFieldType {
String s = val.toString();
switch (type) {
case INTEGER:
NumericUtils.intToPrefixCoded(Integer.parseInt(s), 0, result);
NumericUtils.intToPrefixCodedBytes(Integer.parseInt(s), 0, result);
break;
case FLOAT:
NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
NumericUtils.intToPrefixCodedBytes(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
break;
case LONG:
NumericUtils.longToPrefixCoded(Long.parseLong(s), 0, result);
NumericUtils.longToPrefixCodedBytes(Long.parseLong(s), 0, result);
break;
case DOUBLE:
NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
NumericUtils.longToPrefixCodedBytes(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
break;
case DATE:
NumericUtils.longToPrefixCoded(dateField.parseMath(null, s).getTime(), 0, result);
NumericUtils.longToPrefixCodedBytes(dateField.parseMath(null, s).getTime(), 0, result);
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
@ -419,17 +416,17 @@ public class TrieField extends PrimitiveFieldType {
if (val != null) {
switch (type) {
case INTEGER:
NumericUtils.intToPrefixCoded(val.intValue(), 0, bytes);
NumericUtils.intToPrefixCodedBytes(val.intValue(), 0, bytes);
break;
case FLOAT:
NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(val.floatValue()), 0, bytes);
NumericUtils.intToPrefixCodedBytes(NumericUtils.floatToSortableInt(val.floatValue()), 0, bytes);
break;
case LONG: //fallthrough!
case DATE:
NumericUtils.longToPrefixCoded(val.longValue(), 0, bytes);
NumericUtils.longToPrefixCodedBytes(val.longValue(), 0, bytes);
break;
case DOUBLE:
NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(val.doubleValue()), 0, bytes);
NumericUtils.longToPrefixCodedBytes(NumericUtils.doubleToSortableLong(val.doubleValue()), 0, bytes);
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + f.name());
@ -441,7 +438,7 @@ public class TrieField extends PrimitiveFieldType {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field contents: "+f.name());
switch (type) {
case INTEGER:
NumericUtils.intToPrefixCoded(toInt(bytesRef.bytes, bytesRef.offset), 0, bytes);
NumericUtils.intToPrefixCodedBytes(toInt(bytesRef.bytes, bytesRef.offset), 0, bytes);
break;
case FLOAT: {
// WARNING: Code Duplication! Keep in sync with o.a.l.util.NumericUtils!
@ -449,12 +446,12 @@ public class TrieField extends PrimitiveFieldType {
// code in next 2 lines is identical to: int v = NumericUtils.floatToSortableInt(Float.intBitsToFloat(toInt(arr)));
int v = toInt(bytesRef.bytes, bytesRef.offset);
if (v<0) v ^= 0x7fffffff;
NumericUtils.intToPrefixCoded(v, 0, bytes);
NumericUtils.intToPrefixCodedBytes(v, 0, bytes);
break;
}
case LONG: //fallthrough!
case DATE:
NumericUtils.longToPrefixCoded(toLong(bytesRef.bytes, bytesRef.offset), 0, bytes);
NumericUtils.longToPrefixCodedBytes(toLong(bytesRef.bytes, bytesRef.offset), 0, bytes);
break;
case DOUBLE: {
// WARNING: Code Duplication! Keep in sync with o.a.l.util.NumericUtils!
@ -462,7 +459,7 @@ public class TrieField extends PrimitiveFieldType {
// code in next 2 lines is identical to: long v = NumericUtils.doubleToSortableLong(Double.longBitsToDouble(toLong(arr)));
long v = toLong(bytesRef.bytes, bytesRef.offset);
if (v<0) v ^= 0x7fffffffffffffffL;
NumericUtils.longToPrefixCoded(v, 0, bytes);
NumericUtils.longToPrefixCodedBytes(v, 0, bytes);
break;
}
default: