mirror of https://github.com/apache/lucene.git
[10.x] Ensure Panama float vector distance impls inlinable (#14031)
This commit reduces the bytecode size of the Panama float vector distance implementations to below the maximum size at which a hot method can still be inlined (325 bytes). For example, previously: org.apache.lucene.internal.vectorization.PanamaVectorUtilSupport::dotProductBody (355 bytes) failed to inline: callee is too large. After: org.apache.lucene.internal.vectorization.PanamaVectorUtilSupport::dotProductBody (3xx bytes) inline (hot). This helps things a little. Co-authored-by: Robert Muir <rmuir@apache.org>
This commit is contained in:
parent
8762de7f11
commit
290847a80e
|
@ -96,6 +96,9 @@ Optimizations
|
||||||
* GITHUB#14032: Speed up PostingsEnum when positions are requested.
|
* GITHUB#14032: Speed up PostingsEnum when positions are requested.
|
||||||
(Adrien Grand)
|
(Adrien Grand)
|
||||||
|
|
||||||
|
* GITHUB#14031: Ensure Panama float vector distance impls inlinable.
|
||||||
|
(Robert Muir, Chris Hegarty)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
|
* GITHUB#13832: Fixed an issue where the DefaultPassageFormatter.format method did not format passages as intended
|
||||||
|
|
|
@ -75,6 +75,9 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cached vector sizes for smaller method bodies
|
||||||
|
private static final int FLOAT_SPECIES_LENGTH = FLOAT_SPECIES.length();
|
||||||
|
|
||||||
// the way FMA should work! if available use it, otherwise fall back to mul/add
|
// the way FMA should work! if available use it, otherwise fall back to mul/add
|
||||||
private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) {
|
private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) {
|
||||||
if (Constants.HAS_FAST_VECTOR_FMA) {
|
if (Constants.HAS_FAST_VECTOR_FMA) {
|
||||||
|
@ -99,7 +102,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
float res = 0;
|
float res = 0;
|
||||||
|
|
||||||
// if the array size is large (> 2x platform vector size), its worth the overhead to vectorize
|
// if the array size is large (> 2x platform vector size), its worth the overhead to vectorize
|
||||||
if (a.length > 2 * FLOAT_SPECIES.length()) {
|
if (a.length > 2 * FLOAT_SPECIES_LENGTH) {
|
||||||
i += FLOAT_SPECIES.loopBound(a.length);
|
i += FLOAT_SPECIES.loopBound(a.length);
|
||||||
res += dotProductBody(a, b, i);
|
res += dotProductBody(a, b, i);
|
||||||
}
|
}
|
||||||
|
@ -120,30 +123,33 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
int unrolledLimit = limit - 3 * FLOAT_SPECIES.length();
|
final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH;
|
||||||
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) {
|
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) {
|
||||||
// one
|
// one
|
||||||
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
||||||
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
||||||
acc1 = fma(va, vb, acc1);
|
acc1 = fma(va, vb, acc1);
|
||||||
|
|
||||||
// two
|
// two
|
||||||
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length());
|
final int i2 = i + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length());
|
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2);
|
||||||
|
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2);
|
||||||
acc2 = fma(vc, vd, acc2);
|
acc2 = fma(vc, vd, acc2);
|
||||||
|
|
||||||
// three
|
// three
|
||||||
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length());
|
final int i3 = i2 + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length());
|
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3);
|
||||||
|
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3);
|
||||||
acc3 = fma(ve, vf, acc3);
|
acc3 = fma(ve, vf, acc3);
|
||||||
|
|
||||||
// four
|
// four
|
||||||
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length());
|
final int i4 = i3 + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length());
|
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4);
|
||||||
|
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4);
|
||||||
acc4 = fma(vg, vh, acc4);
|
acc4 = fma(vg, vh, acc4);
|
||||||
}
|
}
|
||||||
// vector tail: less scalar computations for unaligned sizes, esp with big vector sizes
|
// vector tail: less scalar computations for unaligned sizes, esp with big vector sizes
|
||||||
for (; i < limit; i += FLOAT_SPECIES.length()) {
|
for (; i < limit; i += FLOAT_SPECIES_LENGTH) {
|
||||||
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
||||||
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
||||||
acc1 = fma(va, vb, acc1);
|
acc1 = fma(va, vb, acc1);
|
||||||
|
@ -162,7 +168,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
float norm2 = 0;
|
float norm2 = 0;
|
||||||
|
|
||||||
// if the array size is large (> 2x platform vector size), its worth the overhead to vectorize
|
// if the array size is large (> 2x platform vector size), its worth the overhead to vectorize
|
||||||
if (a.length > 2 * FLOAT_SPECIES.length()) {
|
if (a.length > 2 * FLOAT_SPECIES_LENGTH) {
|
||||||
i += FLOAT_SPECIES.loopBound(a.length);
|
i += FLOAT_SPECIES.loopBound(a.length);
|
||||||
float[] ret = cosineBody(a, b, i);
|
float[] ret = cosineBody(a, b, i);
|
||||||
sum += ret[0];
|
sum += ret[0];
|
||||||
|
@ -190,8 +196,8 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
FloatVector norm1_2 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector norm1_2 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
FloatVector norm2_1 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector norm2_1 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
FloatVector norm2_2 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector norm2_2 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
int unrolledLimit = limit - FLOAT_SPECIES.length();
|
final int unrolledLimit = limit - FLOAT_SPECIES_LENGTH;
|
||||||
for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES.length()) {
|
for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES_LENGTH) {
|
||||||
// one
|
// one
|
||||||
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
||||||
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
||||||
|
@ -200,14 +206,15 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
norm2_1 = fma(vb, vb, norm2_1);
|
norm2_1 = fma(vb, vb, norm2_1);
|
||||||
|
|
||||||
// two
|
// two
|
||||||
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length());
|
final int i2 = i + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length());
|
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2);
|
||||||
|
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2);
|
||||||
sum2 = fma(vc, vd, sum2);
|
sum2 = fma(vc, vd, sum2);
|
||||||
norm1_2 = fma(vc, vc, norm1_2);
|
norm1_2 = fma(vc, vc, norm1_2);
|
||||||
norm2_2 = fma(vd, vd, norm2_2);
|
norm2_2 = fma(vd, vd, norm2_2);
|
||||||
}
|
}
|
||||||
// vector tail: less scalar computations for unaligned sizes, esp with big vector sizes
|
// vector tail: less scalar computations for unaligned sizes, esp with big vector sizes
|
||||||
for (; i < limit; i += FLOAT_SPECIES.length()) {
|
for (; i < limit; i += FLOAT_SPECIES_LENGTH) {
|
||||||
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
||||||
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
||||||
sum1 = fma(va, vb, sum1);
|
sum1 = fma(va, vb, sum1);
|
||||||
|
@ -227,7 +234,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
float res = 0;
|
float res = 0;
|
||||||
|
|
||||||
// if the array size is large (> 2x platform vector size), its worth the overhead to vectorize
|
// if the array size is large (> 2x platform vector size), its worth the overhead to vectorize
|
||||||
if (a.length > 2 * FLOAT_SPECIES.length()) {
|
if (a.length > 2 * FLOAT_SPECIES_LENGTH) {
|
||||||
i += FLOAT_SPECIES.loopBound(a.length);
|
i += FLOAT_SPECIES.loopBound(a.length);
|
||||||
res += squareDistanceBody(a, b, i);
|
res += squareDistanceBody(a, b, i);
|
||||||
}
|
}
|
||||||
|
@ -240,6 +247,12 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** helper: returns fma(a.sub(b), a.sub(b), c) */
|
||||||
|
private static FloatVector square(FloatVector a, FloatVector b, FloatVector c) {
|
||||||
|
FloatVector diff = a.sub(b);
|
||||||
|
return fma(diff, diff, c);
|
||||||
|
}
|
||||||
|
|
||||||
/** vectorized square distance body */
|
/** vectorized square distance body */
|
||||||
private float squareDistanceBody(float[] a, float[] b, int limit) {
|
private float squareDistanceBody(float[] a, float[] b, int limit) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
@ -249,38 +262,36 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
|
||||||
FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES);
|
FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES);
|
||||||
int unrolledLimit = limit - 3 * FLOAT_SPECIES.length();
|
final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH;
|
||||||
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) {
|
for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) {
|
||||||
// one
|
// one
|
||||||
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
||||||
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
||||||
FloatVector diff1 = va.sub(vb);
|
acc1 = square(va, vb, acc1);
|
||||||
acc1 = fma(diff1, diff1, acc1);
|
|
||||||
|
|
||||||
// two
|
// two
|
||||||
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length());
|
final int i2 = i + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length());
|
FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2);
|
||||||
FloatVector diff2 = vc.sub(vd);
|
FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2);
|
||||||
acc2 = fma(diff2, diff2, acc2);
|
acc2 = square(vc, vd, acc2);
|
||||||
|
|
||||||
// three
|
// three
|
||||||
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length());
|
final int i3 = i2 + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length());
|
FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3);
|
||||||
FloatVector diff3 = ve.sub(vf);
|
FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3);
|
||||||
acc3 = fma(diff3, diff3, acc3);
|
acc3 = square(ve, vf, acc3);
|
||||||
|
|
||||||
// four
|
// four
|
||||||
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length());
|
final int i4 = i3 + FLOAT_SPECIES_LENGTH;
|
||||||
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length());
|
FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4);
|
||||||
FloatVector diff4 = vg.sub(vh);
|
FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4);
|
||||||
acc4 = fma(diff4, diff4, acc4);
|
acc4 = square(vg, vh, acc4);
|
||||||
}
|
}
|
||||||
// vector tail: less scalar computations for unaligned sizes, esp with big vector sizes
|
// vector tail: less scalar computations for unaligned sizes, esp with big vector sizes
|
||||||
for (; i < limit; i += FLOAT_SPECIES.length()) {
|
for (; i < limit; i += FLOAT_SPECIES_LENGTH) {
|
||||||
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i);
|
||||||
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i);
|
||||||
FloatVector diff = va.sub(vb);
|
acc1 = square(va, vb, acc1);
|
||||||
acc1 = fma(diff, diff, acc1);
|
|
||||||
}
|
}
|
||||||
// reduce
|
// reduce
|
||||||
FloatVector res1 = acc1.add(acc2);
|
FloatVector res1 = acc1.add(acc2);
|
||||||
|
|
Loading…
Reference in New Issue