mirror of https://github.com/apache/lucene.git
LUCENE-2221: Use JVM intrinsics instead of BitUtil.{ntz,pop}.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1411712 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2c91a229cd
commit
c02dfdc6ac
|
@ -149,6 +149,11 @@ Bug Fixes
|
|||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
|
||||
Long.numberOfTrailingZeros (which are intrinsics since Java 6u18) instead of
|
||||
pure java bit twiddling routines in order to improve performance on modern
|
||||
JVMs/hardware. (Dawid Weiss, Adrien Grand)
|
||||
|
||||
* LUCENE-4509: Enable stored fields compression by default in the Lucene 4.1
|
||||
default codec. (Adrien Grand)
|
||||
|
||||
|
|
|
@ -24,791 +24,57 @@ public final class BitUtil {
|
|||
|
||||
private BitUtil() {} // no instance
|
||||
|
||||
/** Returns the number of bits set in the long */
|
||||
public static int pop(long x) {
|
||||
/* Hacker's Delight 32 bit pop function:
|
||||
* http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
|
||||
*
|
||||
int pop(unsigned x) {
|
||||
x = x - ((x >> 1) & 0x55555555);
|
||||
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
|
||||
x = (x + (x >> 4)) & 0x0F0F0F0F;
|
||||
x = x + (x >> 8);
|
||||
x = x + (x >> 16);
|
||||
return x & 0x0000003F;
|
||||
// The pop methods used to rely on bit-manipulation tricks for speed but it
|
||||
// turns out that it is faster to use the Long.bitCount method (which is an
|
||||
// intrinsic since Java 6u18) in a naive loop, see LUCENE-2221
|
||||
|
||||
/** Returns the number of set bits in an array of longs. */
|
||||
public static long pop_array(long[] arr, int wordOffset, int numWords) {
|
||||
long popCount = 0;
|
||||
for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
|
||||
popCount += Long.bitCount(arr[i]);
|
||||
}
|
||||
***/
|
||||
|
||||
// 64 bit java version of the C function from above
|
||||
x = x - ((x >>> 1) & 0x5555555555555555L);
|
||||
x = (x & 0x3333333333333333L) + ((x >>>2 ) & 0x3333333333333333L);
|
||||
x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL;
|
||||
x = x + (x >>> 8);
|
||||
x = x + (x >>> 16);
|
||||
x = x + (x >>> 32);
|
||||
return ((int)x) & 0x7F;
|
||||
}
|
||||
|
||||
/*** Returns the number of set bits in an array of longs. */
|
||||
public static long pop_array(long A[], int wordOffset, int numWords) {
|
||||
/*
|
||||
* Robert Harley and David Seal's bit counting algorithm, as documented
|
||||
* in the revisions of Hacker's Delight
|
||||
* http://www.hackersdelight.org/revisions.pdf
|
||||
* http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc
|
||||
*
|
||||
* This function was adapted to Java, and extended to use 64 bit words.
|
||||
* if only we had access to wider registers like SSE from java...
|
||||
*
|
||||
* This function can be transformed to compute the popcount of other functions
|
||||
* on bitsets via something like this:
|
||||
* sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
|
||||
*
|
||||
*/
|
||||
int n = wordOffset+numWords;
|
||||
long tot=0, tot8=0;
|
||||
long ones=0, twos=0, fours=0;
|
||||
|
||||
int i;
|
||||
for (i = wordOffset; i <= n - 8; i+=8) {
|
||||
/*** C macro from Hacker's Delight
|
||||
#define CSA(h,l, a,b,c) \
|
||||
{unsigned u = a ^ b; unsigned v = c; \
|
||||
h = (a & b) | (u & v); l = u ^ v;}
|
||||
***/
|
||||
|
||||
long twosA,twosB,foursA,foursB,eights;
|
||||
|
||||
// CSA(twosA, ones, ones, A[i], A[i+1])
|
||||
{
|
||||
long b=A[i], c=A[i+1];
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, A[i+2], A[i+3])
|
||||
{
|
||||
long b=A[i+2], c=A[i+3];
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursA, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
//CSA(twosA, ones, ones, A[i+4], A[i+5])
|
||||
{
|
||||
long b=A[i+4], c=A[i+5];
|
||||
long u=ones^b;
|
||||
twosA=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, A[i+6], A[i+7])
|
||||
{
|
||||
long b=A[i+6], c=A[i+7];
|
||||
long u=ones^b;
|
||||
twosB=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursB, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursB=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
|
||||
//CSA(eights, fours, fours, foursA, foursB)
|
||||
{
|
||||
long u=fours^foursA;
|
||||
eights=(fours&foursA)|(u&foursB);
|
||||
fours=u^foursB;
|
||||
}
|
||||
tot8 += pop(eights);
|
||||
}
|
||||
|
||||
// handle trailing words in a binary-search manner...
|
||||
// derived from the loop above by setting specific elements to 0.
|
||||
// the original method in Hackers Delight used a simple for loop:
|
||||
// for (i = i; i < n; i++) // Add in the last elements
|
||||
// tot = tot + pop(A[i]);
|
||||
|
||||
if (i<=n-4) {
|
||||
long twosA, twosB, foursA, eights;
|
||||
{
|
||||
long b=A[i], c=A[i+1];
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long b=A[i+2], c=A[i+3];
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=4;
|
||||
}
|
||||
|
||||
if (i<=n-2) {
|
||||
long b=A[i], c=A[i+1];
|
||||
long u=ones ^ b;
|
||||
long twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
|
||||
long foursA=twos&twosA;
|
||||
twos=twos^twosA;
|
||||
|
||||
long eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=2;
|
||||
}
|
||||
|
||||
if (i<n) {
|
||||
tot += pop(A[i]);
|
||||
}
|
||||
|
||||
tot += (pop(fours)<<2)
|
||||
+ (pop(twos)<<1)
|
||||
+ pop(ones)
|
||||
+ (tot8<<3);
|
||||
|
||||
return tot;
|
||||
return popCount;
|
||||
}
|
||||
|
||||
/** Returns the popcount or cardinality of the two sets after an intersection.
|
||||
* Neither array is modified.
|
||||
*/
|
||||
public static long pop_intersect(long A[], long B[], int wordOffset, int numWords) {
|
||||
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g'
|
||||
int n = wordOffset+numWords;
|
||||
long tot=0, tot8=0;
|
||||
long ones=0, twos=0, fours=0;
|
||||
|
||||
int i;
|
||||
for (i = wordOffset; i <= n - 8; i+=8) {
|
||||
long twosA,twosB,foursA,foursB,eights;
|
||||
|
||||
// CSA(twosA, ones, ones, (A[i] & B[i]), (A[i+1] & B[i+1]))
|
||||
{
|
||||
long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+2] & B[i+2]), (A[i+3] & B[i+3]))
|
||||
{
|
||||
long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursA, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
//CSA(twosA, ones, ones, (A[i+4] & B[i+4]), (A[i+5] & B[i+5]))
|
||||
{
|
||||
long b=(A[i+4] & B[i+4]), c=(A[i+5] & B[i+5]);
|
||||
long u=ones^b;
|
||||
twosA=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+6] & B[i+6]), (A[i+7] & B[i+7]))
|
||||
{
|
||||
long b=(A[i+6] & B[i+6]), c=(A[i+7] & B[i+7]);
|
||||
long u=ones^b;
|
||||
twosB=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursB, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursB=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
|
||||
//CSA(eights, fours, fours, foursA, foursB)
|
||||
{
|
||||
long u=fours^foursA;
|
||||
eights=(fours&foursA)|(u&foursB);
|
||||
fours=u^foursB;
|
||||
}
|
||||
tot8 += pop(eights);
|
||||
* Neither array is modified. */
|
||||
public static long pop_intersect(long[] arr1, long[] arr2, int wordOffset, int numWords) {
|
||||
long popCount = 0;
|
||||
for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
|
||||
popCount += Long.bitCount(arr1[i] & arr2[i]);
|
||||
}
|
||||
|
||||
|
||||
if (i<=n-4) {
|
||||
long twosA, twosB, foursA, eights;
|
||||
{
|
||||
long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long b=(A[i+2] & B[i+2]), c=(A[i+3] & B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=4;
|
||||
}
|
||||
|
||||
if (i<=n-2) {
|
||||
long b=(A[i] & B[i]), c=(A[i+1] & B[i+1]);
|
||||
long u=ones ^ b;
|
||||
long twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
|
||||
long foursA=twos&twosA;
|
||||
twos=twos^twosA;
|
||||
|
||||
long eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=2;
|
||||
}
|
||||
|
||||
if (i<n) {
|
||||
tot += pop((A[i] & B[i]));
|
||||
}
|
||||
|
||||
tot += (pop(fours)<<2)
|
||||
+ (pop(twos)<<1)
|
||||
+ pop(ones)
|
||||
+ (tot8<<3);
|
||||
|
||||
return tot;
|
||||
return popCount;
|
||||
}
|
||||
|
||||
/** Returns the popcount or cardinality of the union of two sets.
|
||||
* Neither array is modified.
|
||||
*/
|
||||
public static long pop_union(long A[], long B[], int wordOffset, int numWords) {
|
||||
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \| B[\1]\)/g'
|
||||
int n = wordOffset+numWords;
|
||||
long tot=0, tot8=0;
|
||||
long ones=0, twos=0, fours=0;
|
||||
|
||||
int i;
|
||||
for (i = wordOffset; i <= n - 8; i+=8) {
|
||||
/*** C macro from Hacker's Delight
|
||||
#define CSA(h,l, a,b,c) \
|
||||
{unsigned u = a ^ b; unsigned v = c; \
|
||||
h = (a & b) | (u & v); l = u ^ v;}
|
||||
***/
|
||||
|
||||
long twosA,twosB,foursA,foursB,eights;
|
||||
|
||||
// CSA(twosA, ones, ones, (A[i] | B[i]), (A[i+1] | B[i+1]))
|
||||
{
|
||||
long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+2] | B[i+2]), (A[i+3] | B[i+3]))
|
||||
{
|
||||
long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursA, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
//CSA(twosA, ones, ones, (A[i+4] | B[i+4]), (A[i+5] | B[i+5]))
|
||||
{
|
||||
long b=(A[i+4] | B[i+4]), c=(A[i+5] | B[i+5]);
|
||||
long u=ones^b;
|
||||
twosA=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+6] | B[i+6]), (A[i+7] | B[i+7]))
|
||||
{
|
||||
long b=(A[i+6] | B[i+6]), c=(A[i+7] | B[i+7]);
|
||||
long u=ones^b;
|
||||
twosB=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursB, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursB=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
|
||||
//CSA(eights, fours, fours, foursA, foursB)
|
||||
{
|
||||
long u=fours^foursA;
|
||||
eights=(fours&foursA)|(u&foursB);
|
||||
fours=u^foursB;
|
||||
}
|
||||
tot8 += pop(eights);
|
||||
/** Returns the popcount or cardinality of the union of two sets.
|
||||
* Neither array is modified. */
|
||||
public static long pop_union(long[] arr1, long[] arr2, int wordOffset, int numWords) {
|
||||
long popCount = 0;
|
||||
for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
|
||||
popCount += Long.bitCount(arr1[i] | arr2[i]);
|
||||
}
|
||||
|
||||
|
||||
if (i<=n-4) {
|
||||
long twosA, twosB, foursA, eights;
|
||||
{
|
||||
long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long b=(A[i+2] | B[i+2]), c=(A[i+3] | B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=4;
|
||||
}
|
||||
|
||||
if (i<=n-2) {
|
||||
long b=(A[i] | B[i]), c=(A[i+1] | B[i+1]);
|
||||
long u=ones ^ b;
|
||||
long twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
|
||||
long foursA=twos&twosA;
|
||||
twos=twos^twosA;
|
||||
|
||||
long eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=2;
|
||||
}
|
||||
|
||||
if (i<n) {
|
||||
tot += pop((A[i] | B[i]));
|
||||
}
|
||||
|
||||
tot += (pop(fours)<<2)
|
||||
+ (pop(twos)<<1)
|
||||
+ pop(ones)
|
||||
+ (tot8<<3);
|
||||
|
||||
return tot;
|
||||
return popCount;
|
||||
}
|
||||
|
||||
/** Returns the popcount or cardinality of A & ~B
|
||||
* Neither array is modified.
|
||||
*/
|
||||
public static long pop_andnot(long A[], long B[], int wordOffset, int numWords) {
|
||||
// generated from pop_array via sed 's/A\[\([^]]*\)\]/\(A[\1] \& ~B[\1]\)/g'
|
||||
int n = wordOffset+numWords;
|
||||
long tot=0, tot8=0;
|
||||
long ones=0, twos=0, fours=0;
|
||||
|
||||
int i;
|
||||
for (i = wordOffset; i <= n - 8; i+=8) {
|
||||
/*** C macro from Hacker's Delight
|
||||
#define CSA(h,l, a,b,c) \
|
||||
{unsigned u = a ^ b; unsigned v = c; \
|
||||
h = (a & b) | (u & v); l = u ^ v;}
|
||||
***/
|
||||
|
||||
long twosA,twosB,foursA,foursB,eights;
|
||||
|
||||
// CSA(twosA, ones, ones, (A[i] & ~B[i]), (A[i+1] & ~B[i+1]))
|
||||
{
|
||||
long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+2] & ~B[i+2]), (A[i+3] & ~B[i+3]))
|
||||
{
|
||||
long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursA, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
//CSA(twosA, ones, ones, (A[i+4] & ~B[i+4]), (A[i+5] & ~B[i+5]))
|
||||
{
|
||||
long b=(A[i+4] & ~B[i+4]), c=(A[i+5] & ~B[i+5]);
|
||||
long u=ones^b;
|
||||
twosA=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+6] & ~B[i+6]), (A[i+7] & ~B[i+7]))
|
||||
{
|
||||
long b=(A[i+6] & ~B[i+6]), c=(A[i+7] & ~B[i+7]);
|
||||
long u=ones^b;
|
||||
twosB=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursB, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursB=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
|
||||
//CSA(eights, fours, fours, foursA, foursB)
|
||||
{
|
||||
long u=fours^foursA;
|
||||
eights=(fours&foursA)|(u&foursB);
|
||||
fours=u^foursB;
|
||||
}
|
||||
tot8 += pop(eights);
|
||||
/** Returns the popcount or cardinality of A & ~B.
|
||||
* Neither array is modified. */
|
||||
public static long pop_andnot(long[] arr1, long[] arr2, int wordOffset, int numWords) {
|
||||
long popCount = 0;
|
||||
for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
|
||||
popCount += Long.bitCount(arr1[i] & ~arr2[i]);
|
||||
}
|
||||
|
||||
|
||||
if (i<=n-4) {
|
||||
long twosA, twosB, foursA, eights;
|
||||
{
|
||||
long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long b=(A[i+2] & ~B[i+2]), c=(A[i+3] & ~B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=4;
|
||||
}
|
||||
|
||||
if (i<=n-2) {
|
||||
long b=(A[i] & ~B[i]), c=(A[i+1] & ~B[i+1]);
|
||||
long u=ones ^ b;
|
||||
long twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
|
||||
long foursA=twos&twosA;
|
||||
twos=twos^twosA;
|
||||
|
||||
long eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=2;
|
||||
}
|
||||
|
||||
if (i<n) {
|
||||
tot += pop((A[i] & ~B[i]));
|
||||
}
|
||||
|
||||
tot += (pop(fours)<<2)
|
||||
+ (pop(twos)<<1)
|
||||
+ pop(ones)
|
||||
+ (tot8<<3);
|
||||
|
||||
return tot;
|
||||
return popCount;
|
||||
}
|
||||
|
||||
public static long pop_xor(long A[], long B[], int wordOffset, int numWords) {
|
||||
int n = wordOffset+numWords;
|
||||
long tot=0, tot8=0;
|
||||
long ones=0, twos=0, fours=0;
|
||||
|
||||
int i;
|
||||
for (i = wordOffset; i <= n - 8; i+=8) {
|
||||
/*** C macro from Hacker's Delight
|
||||
#define CSA(h,l, a,b,c) \
|
||||
{unsigned u = a ^ b; unsigned v = c; \
|
||||
h = (a & b) | (u & v); l = u ^ v;}
|
||||
***/
|
||||
|
||||
long twosA,twosB,foursA,foursB,eights;
|
||||
|
||||
// CSA(twosA, ones, ones, (A[i] ^ B[i]), (A[i+1] ^ B[i+1]))
|
||||
{
|
||||
long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+2] ^ B[i+2]), (A[i+3] ^ B[i+3]))
|
||||
{
|
||||
long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursA, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
//CSA(twosA, ones, ones, (A[i+4] ^ B[i+4]), (A[i+5] ^ B[i+5]))
|
||||
{
|
||||
long b=(A[i+4] ^ B[i+4]), c=(A[i+5] ^ B[i+5]);
|
||||
long u=ones^b;
|
||||
twosA=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
// CSA(twosB, ones, ones, (A[i+6] ^ B[i+6]), (A[i+7] ^ B[i+7]))
|
||||
{
|
||||
long b=(A[i+6] ^ B[i+6]), c=(A[i+7] ^ B[i+7]);
|
||||
long u=ones^b;
|
||||
twosB=(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
//CSA(foursB, twos, twos, twosA, twosB)
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursB=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
|
||||
//CSA(eights, fours, fours, foursA, foursB)
|
||||
{
|
||||
long u=fours^foursA;
|
||||
eights=(fours&foursA)|(u&foursB);
|
||||
fours=u^foursB;
|
||||
}
|
||||
tot8 += pop(eights);
|
||||
/** Returns the popcount or cardinality of A ^ B
|
||||
* Neither array is modified. */
|
||||
public static long pop_xor(long[] arr1, long[] arr2, int wordOffset, int numWords) {
|
||||
long popCount = 0;
|
||||
for (int i = wordOffset, end = wordOffset + numWords; i < end; ++i) {
|
||||
popCount += Long.bitCount(arr1[i] ^ arr2[i]);
|
||||
}
|
||||
|
||||
|
||||
if (i<=n-4) {
|
||||
long twosA, twosB, foursA, eights;
|
||||
{
|
||||
long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
|
||||
long u=ones ^ b;
|
||||
twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long b=(A[i+2] ^ B[i+2]), c=(A[i+3] ^ B[i+3]);
|
||||
long u=ones^b;
|
||||
twosB =(ones&b)|(u&c);
|
||||
ones=u^c;
|
||||
}
|
||||
{
|
||||
long u=twos^twosA;
|
||||
foursA=(twos&twosA)|(u&twosB);
|
||||
twos=u^twosB;
|
||||
}
|
||||
eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=4;
|
||||
}
|
||||
|
||||
if (i<=n-2) {
|
||||
long b=(A[i] ^ B[i]), c=(A[i+1] ^ B[i+1]);
|
||||
long u=ones ^ b;
|
||||
long twosA=(ones & b)|( u & c);
|
||||
ones=u^c;
|
||||
|
||||
long foursA=twos&twosA;
|
||||
twos=twos^twosA;
|
||||
|
||||
long eights=fours&foursA;
|
||||
fours=fours^foursA;
|
||||
|
||||
tot8 += pop(eights);
|
||||
i+=2;
|
||||
}
|
||||
|
||||
if (i<n) {
|
||||
tot += pop((A[i] ^ B[i]));
|
||||
}
|
||||
|
||||
tot += (pop(fours)<<2)
|
||||
+ (pop(twos)<<1)
|
||||
+ pop(ones)
|
||||
+ (tot8<<3);
|
||||
|
||||
return tot;
|
||||
}
|
||||
|
||||
/* python code to generate ntzTable
|
||||
def ntz(val):
|
||||
if val==0: return 8
|
||||
i=0
|
||||
while (val&0x01)==0:
|
||||
i = i+1
|
||||
val >>= 1
|
||||
return i
|
||||
print ','.join([ str(ntz(i)) for i in range(256) ])
|
||||
***/
|
||||
/** table of number of trailing zeros in a byte */
|
||||
public static final byte[] ntzTable = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
|
||||
|
||||
|
||||
/** Returns number of trailing zeros in a 64 bit long value. */
|
||||
public static int ntz(long val) {
|
||||
// A full binary search to determine the low byte was slower than
|
||||
// a linear search for nextSetBit(). This is most likely because
|
||||
// the implementation of nextSetBit() shifts bits to the right, increasing
|
||||
// the probability that the first non-zero byte is in the rhs.
|
||||
//
|
||||
// This implementation does a single binary search at the top level only
|
||||
// so that all other bit shifting can be done on ints instead of longs to
|
||||
// remain friendly to 32 bit architectures. In addition, the case of a
|
||||
// non-zero first byte is checked for first because it is the most common
|
||||
// in dense bit arrays.
|
||||
|
||||
int lower = (int)val;
|
||||
int lowByte = lower & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte];
|
||||
|
||||
if (lower!=0) {
|
||||
lowByte = (lower>>>8) & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 8;
|
||||
lowByte = (lower>>>16) & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 16;
|
||||
// no need to mask off low byte for the last byte in the 32 bit word
|
||||
// no need to check for zero on the last byte either.
|
||||
return ntzTable[lower>>>24] + 24;
|
||||
} else {
|
||||
// grab upper 32 bits
|
||||
int upper=(int)(val>>32);
|
||||
lowByte = upper & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 32;
|
||||
lowByte = (upper>>>8) & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 40;
|
||||
lowByte = (upper>>>16) & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 48;
|
||||
// no need to mask off low byte for the last byte in the 32 bit word
|
||||
// no need to check for zero on the last byte either.
|
||||
return ntzTable[upper>>>24] + 56;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns number of trailing zeros in a 32 bit int value. */
|
||||
public static int ntz(int val) {
|
||||
// This implementation does a single binary search at the top level only.
|
||||
// In addition, the case of a non-zero first byte is checked for first
|
||||
// because it is the most common in dense bit arrays.
|
||||
|
||||
int lowByte = val & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte];
|
||||
lowByte = (val>>>8) & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 8;
|
||||
lowByte = (val>>>16) & 0xff;
|
||||
if (lowByte != 0) return ntzTable[lowByte] + 16;
|
||||
// no need to mask off low byte for the last byte.
|
||||
// no need to check for zero on the last byte either.
|
||||
return ntzTable[val>>>24] + 24;
|
||||
}
|
||||
|
||||
/** returns 0 based index of first set bit
|
||||
* (only works for x!=0)
|
||||
* <br/> This is an alternate implementation of ntz()
|
||||
*/
|
||||
public static int ntz2(long x) {
|
||||
int n = 0;
|
||||
int y = (int)x;
|
||||
if (y==0) {n+=32; y = (int)(x>>>32); } // the only 64 bit shift necessary
|
||||
if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; }
|
||||
if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; }
|
||||
return (ntzTable[ y & 0xff ]) + n;
|
||||
}
|
||||
|
||||
/** returns 0 based index of first set bit
|
||||
* <br/> This is an alternate implementation of ntz()
|
||||
*/
|
||||
public static int ntz3(long x) {
|
||||
// another implementation taken from Hackers Delight, extended to 64 bits
|
||||
// and converted to Java.
|
||||
// Many 32 bit ntz algorithms are at http://www.hackersdelight.org/HDcode/ntz.cc
|
||||
int n = 1;
|
||||
|
||||
// do the first step as a long, all others as ints.
|
||||
int y = (int)x;
|
||||
if (y==0) {n+=32; y = (int)(x>>>32); }
|
||||
if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; }
|
||||
if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; }
|
||||
if ((y & 0x0000000F) == 0) { n+=4; y>>>=4; }
|
||||
if ((y & 0x00000003) == 0) { n+=2; y>>>=2; }
|
||||
return n - (y & 1);
|
||||
}
|
||||
|
||||
/** table of number of leading zeros in a byte */
|
||||
public static final byte[] nlzTable = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
/** Returns the number of leading zero bits.
|
||||
*/
|
||||
public static int nlz(long x) {
|
||||
int n = 0;
|
||||
// do the first step as a long
|
||||
int y = (int)(x>>>32);
|
||||
if (y==0) {n+=32; y = (int)(x); }
|
||||
if ((y & 0xFFFF0000) == 0) { n+=16; y<<=16; }
|
||||
if ((y & 0xFF000000) == 0) { n+=8; y<<=8; }
|
||||
return n + nlzTable[y >>> 24];
|
||||
/* implementation without table:
|
||||
if ((y & 0xF0000000) == 0) { n+=4; y<<=4; }
|
||||
if ((y & 0xC0000000) == 0) { n+=2; y<<=2; }
|
||||
if ((y & 0x80000000) == 0) { n+=1; y<<=1; }
|
||||
if ((y & 0x80000000) == 0) { n+=1;}
|
||||
return n;
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
/** returns true if v is a power of two or zero*/
|
||||
public static boolean isPowerOfTwo(int v) {
|
||||
return ((v & (v-1)) == 0);
|
||||
}
|
||||
|
||||
/** returns true if v is a power of two or zero*/
|
||||
public static boolean isPowerOfTwo(long v) {
|
||||
return ((v & (v-1)) == 0);
|
||||
return popCount;
|
||||
}
|
||||
|
||||
/** returns the next highest power of two, or the current value if it's already a power of two or zero*/
|
||||
|
|
|
@ -155,13 +155,13 @@ public final class FixedBitSet extends DocIdSet implements Bits {
|
|||
long word = bits[i] >> subIndex; // skip all the bits to the right of index
|
||||
|
||||
if (word!=0) {
|
||||
return (i<<6) + subIndex + BitUtil.ntz(word);
|
||||
return (i<<6) + subIndex + Long.numberOfTrailingZeros(word);
|
||||
}
|
||||
|
||||
while(++i < bits.length) {
|
||||
word = bits[i];
|
||||
if (word != 0) {
|
||||
return (i<<6) + BitUtil.ntz(word);
|
||||
return (i<<6) + Long.numberOfTrailingZeros(word);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -629,12 +629,12 @@ public class OpenBitSet extends DocIdSet implements Bits, Cloneable {
|
|||
long word = bits[i] >> subIndex; // skip all the bits to the right of index
|
||||
|
||||
if (word!=0) {
|
||||
return (i<<6) + subIndex + BitUtil.ntz(word);
|
||||
return (i<<6) + subIndex + Long.numberOfTrailingZeros(word);
|
||||
}
|
||||
|
||||
while(++i < wlen) {
|
||||
word = bits[i];
|
||||
if (word!=0) return (i<<6) + BitUtil.ntz(word);
|
||||
if (word!=0) return (i<<6) + Long.numberOfTrailingZeros(word);
|
||||
}
|
||||
|
||||
return -1;
|
||||
|
@ -650,12 +650,12 @@ public class OpenBitSet extends DocIdSet implements Bits, Cloneable {
|
|||
long word = bits[i] >>> subIndex; // skip all the bits to the right of index
|
||||
|
||||
if (word!=0) {
|
||||
return (((long)i)<<6) + (subIndex + BitUtil.ntz(word));
|
||||
return (((long)i)<<6) + (subIndex + Long.numberOfTrailingZeros(word));
|
||||
}
|
||||
|
||||
while(++i < wlen) {
|
||||
word = bits[i];
|
||||
if (word!=0) return (((long)i)<<6) + BitUtil.ntz(word);
|
||||
if (word!=0) return (((long)i)<<6) + Long.numberOfTrailingZeros(word);
|
||||
}
|
||||
|
||||
return -1;
|
||||
|
|
|
@ -1,133 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.util;
|
||||
|
||||
public class TestBitUtil extends LuceneTestCase {
|
||||
|
||||
private static int slowNlz(long x) {
|
||||
if (x == 0L) return 64;
|
||||
int nlz = 0;
|
||||
while ( ((x << nlz) & (1L << 63)) == 0) {
|
||||
nlz++;
|
||||
}
|
||||
return nlz;
|
||||
}
|
||||
|
||||
private void checkNlz(long x) {
|
||||
assertEquals(slowNlz(x), BitUtil.nlz(x));
|
||||
assertEquals(Long.numberOfLeadingZeros(x), BitUtil.nlz(x));
|
||||
}
|
||||
|
||||
public void testNlz() {
|
||||
checkNlz(0L);
|
||||
checkNlz(1L);
|
||||
checkNlz(-1L);
|
||||
for (int i = 1; i <= 63; i++) {
|
||||
checkNlz(1L << i);
|
||||
checkNlz((1L << i) + (1L << (i>>1)));
|
||||
}
|
||||
}
|
||||
|
||||
public void testBitUtils() {
|
||||
long num = 100000;
|
||||
assertEquals( 5, BitUtil.ntz(num) );
|
||||
assertEquals( 5, BitUtil.ntz2(num) );
|
||||
assertEquals( 5, BitUtil.ntz3(num) );
|
||||
|
||||
num = 10;
|
||||
assertEquals( 1, BitUtil.ntz(num) );
|
||||
assertEquals( 1, BitUtil.ntz2(num) );
|
||||
assertEquals( 1, BitUtil.ntz3(num) );
|
||||
|
||||
for (int i=0; i<64; i++) {
|
||||
num = 1L << i;
|
||||
assertEquals( i, BitUtil.ntz(num) );
|
||||
assertEquals( i, BitUtil.ntz2(num) );
|
||||
assertEquals( i, BitUtil.ntz3(num) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private long testArg(int shift) {
|
||||
return (1L << shift) + (1L << (shift>>1));
|
||||
}
|
||||
|
||||
private long nlzBitUtilBasicLoop(int iters) {
|
||||
long sumRes = 0;
|
||||
while (iters-- >= 0) {
|
||||
for (int i = 1; i <= 63; i++) {
|
||||
long a = testArg(i);
|
||||
sumRes += BitUtil.nlz(a);
|
||||
sumRes += BitUtil.nlz(a + 1);
|
||||
sumRes += BitUtil.nlz(a - 1);
|
||||
sumRes += BitUtil.nlz(a + 10);
|
||||
sumRes += BitUtil.nlz(a - 10);
|
||||
}
|
||||
}
|
||||
return sumRes;
|
||||
}
|
||||
|
||||
private long nlzLongBasicLoop(int iters) {
|
||||
long sumRes = 0;
|
||||
while (iters-- >= 0) {
|
||||
for (int i = 1; i <= 63; i++) {
|
||||
long a = testArg(i);
|
||||
sumRes += Long.numberOfLeadingZeros(a);
|
||||
sumRes += Long.numberOfLeadingZeros(a + 1);
|
||||
sumRes += Long.numberOfLeadingZeros(a - 1);
|
||||
sumRes += Long.numberOfLeadingZeros(a + 10);
|
||||
sumRes += Long.numberOfLeadingZeros(a - 10);
|
||||
}
|
||||
}
|
||||
return sumRes;
|
||||
}
|
||||
|
||||
public void tstPerfNlz() { // See LUCENE-3197, prefer to use Long.numberOfLeadingZeros() over BitUtil.nlz().
|
||||
final long measureMilliSecs = 2000;
|
||||
final int basicIters = 100000;
|
||||
long startTime;
|
||||
long endTime;
|
||||
long curTime;
|
||||
long dummy = 0; // avoid optimizing away
|
||||
|
||||
dummy = 0;
|
||||
int bitUtilLoops = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
endTime = startTime + measureMilliSecs;
|
||||
do {
|
||||
dummy += nlzBitUtilBasicLoop(basicIters);
|
||||
bitUtilLoops++;
|
||||
curTime = System.currentTimeMillis();
|
||||
} while (curTime < endTime);
|
||||
int bitUtilPsTime = (int) (1000000000 * (curTime - startTime) / (basicIters * 5 * 63 * (float) bitUtilLoops));
|
||||
System.out.println("BitUtil nlz time: " + (bitUtilPsTime/1) + " picosec/call, dummy: " + dummy);
|
||||
|
||||
|
||||
dummy = 0;
|
||||
int longLoops = 0;
|
||||
startTime = System.currentTimeMillis();
|
||||
endTime = startTime + measureMilliSecs;
|
||||
do {
|
||||
dummy += nlzLongBasicLoop(basicIters);
|
||||
longLoops++;
|
||||
curTime = System.currentTimeMillis();
|
||||
} while (curTime < endTime);
|
||||
int longPsTime = (int) (1000000000 * (curTime - startTime) / (basicIters * 5 * 63 * (float) longLoops));
|
||||
System.out.println("Long nlz time: " + longPsTime + " picosec/call, dummy: " + dummy);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue