diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0c8eb903113..bb888113969 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -103,9 +103,6 @@ Optimizations * GITHUB#14032: Speed up PostingsEnum when positions are requested. (Adrien Grand) -* GITHUB#14031: Ensure Panama float vector distance impls inlinable. - (Robert Muir, Chris Hegarty) - * GITHUB#14011: Reduce allocation rate in HNSW concurrent merge. (Viliam Durina) Bug Fixes diff --git a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 18ef76914bb..9273f7c5a81 100644 --- a/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -75,9 +75,6 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { } } - // cached vector sizes for smaller method bodies - private static final int FLOAT_SPECIES_LENGTH = FLOAT_SPECIES.length(); - // the way FMA should work! if available use it, otherwise fall back to mul/add private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) { if (Constants.HAS_FAST_VECTOR_FMA) { @@ -102,7 +99,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { float res = 0; // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize - if (a.length > 2 * FLOAT_SPECIES_LENGTH) { + if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); res += dotProductBody(a, b, i); } @@ -123,33 +120,30 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES); - final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH; - for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) { + int unrolledLimit = limit - 3 * FLOAT_SPECIES.length(); + for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) { // one FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); acc1 = fma(va, vb, acc1); // two - final int i2 = i + FLOAT_SPECIES_LENGTH; - FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2); - FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2); + FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length()); + FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length()); acc2 = fma(vc, vd, acc2); // three - final int i3 = i2 + FLOAT_SPECIES_LENGTH; - FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3); - FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3); + FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length()); + FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length()); acc3 = fma(ve, vf, acc3); // four - final int i4 = i3 + FLOAT_SPECIES_LENGTH; - FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4); - FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4); + FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length()); + FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length()); acc4 = fma(vg, vh, acc4); } // vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += FLOAT_SPECIES_LENGTH) { + for (; i < limit; i += FLOAT_SPECIES.length()) { FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); acc1 = fma(va, vb, acc1); @@ -168,7 +162,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { float norm2 = 0; // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize - if (a.length > 2 * FLOAT_SPECIES_LENGTH) { + if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); float[] ret = cosineBody(a, b, i); sum += ret[0]; @@ -196,8 +190,8 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { FloatVector norm1_2 = FloatVector.zero(FLOAT_SPECIES); FloatVector norm2_1 = FloatVector.zero(FLOAT_SPECIES); FloatVector norm2_2 = FloatVector.zero(FLOAT_SPECIES); - final int unrolledLimit = limit - FLOAT_SPECIES_LENGTH; - for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES_LENGTH) { + int unrolledLimit = limit - FLOAT_SPECIES.length(); + for (; i < unrolledLimit; i += 2 * FLOAT_SPECIES.length()) { // one FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); @@ -206,15 +200,14 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { norm2_1 = fma(vb, vb, norm2_1); // two - final int i2 = i + FLOAT_SPECIES_LENGTH; - FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2); - FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2); + FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length()); + FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length()); sum2 = fma(vc, vd, sum2); norm1_2 = fma(vc, vc, norm1_2); norm2_2 = fma(vd, vd, norm2_2); } // vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += FLOAT_SPECIES_LENGTH) { + for (; i < limit; i += FLOAT_SPECIES.length()) { FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); sum1 = fma(va, vb, sum1); @@ -234,7 +227,7 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { float res = 0; // if the array size is large (> 2x platform vector size), its worth the overhead to vectorize - if (a.length > 2 * FLOAT_SPECIES_LENGTH) { + if (a.length > 2 * FLOAT_SPECIES.length()) { i += FLOAT_SPECIES.loopBound(a.length); res += squareDistanceBody(a, b, i); } @@ -247,12 +240,6 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { return res; } - /** helper: returns fma(a.sub(b), a.sub(b), c) */ - private static FloatVector square(FloatVector a, FloatVector b, FloatVector c) { - FloatVector diff = a.sub(b); - return fma(diff, diff, c); - } - /** vectorized square distance body */ private float squareDistanceBody(float[] a, float[] b, int limit) { int i = 0; @@ -262,36 +249,38 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { FloatVector acc2 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc3 = FloatVector.zero(FLOAT_SPECIES); FloatVector acc4 = FloatVector.zero(FLOAT_SPECIES); - final int unrolledLimit = limit - 3 * FLOAT_SPECIES_LENGTH; - for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES_LENGTH) { + int unrolledLimit = limit - 3 * FLOAT_SPECIES.length(); + for (; i < unrolledLimit; i += 4 * FLOAT_SPECIES.length()) { // one FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); - acc1 = square(va, vb, acc1); + FloatVector diff1 = va.sub(vb); + acc1 = fma(diff1, diff1, acc1); // two - final int i2 = i + FLOAT_SPECIES_LENGTH; - FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i2); - FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i2); - acc2 = square(vc, vd, acc2); + FloatVector vc = FloatVector.fromArray(FLOAT_SPECIES, a, i + FLOAT_SPECIES.length()); + FloatVector vd = FloatVector.fromArray(FLOAT_SPECIES, b, i + FLOAT_SPECIES.length()); + FloatVector diff2 = vc.sub(vd); + acc2 = fma(diff2, diff2, acc2); // three - final int i3 = i2 + FLOAT_SPECIES_LENGTH; - FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i3); - FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i3); - acc3 = square(ve, vf, acc3); + FloatVector ve = FloatVector.fromArray(FLOAT_SPECIES, a, i + 2 * FLOAT_SPECIES.length()); + FloatVector vf = FloatVector.fromArray(FLOAT_SPECIES, b, i + 2 * FLOAT_SPECIES.length()); + FloatVector diff3 = ve.sub(vf); + acc3 = fma(diff3, diff3, acc3); // four - final int i4 = i3 + FLOAT_SPECIES_LENGTH; - FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i4); - FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i4); - acc4 = square(vg, vh, acc4); + FloatVector vg = FloatVector.fromArray(FLOAT_SPECIES, a, i + 3 * FLOAT_SPECIES.length()); + FloatVector vh = FloatVector.fromArray(FLOAT_SPECIES, b, i + 3 * FLOAT_SPECIES.length()); + FloatVector diff4 = vg.sub(vh); + acc4 = fma(diff4, diff4, acc4); } // vector tail: less scalar computations for unaligned sizes, esp with big vector sizes - for (; i < limit; i += FLOAT_SPECIES_LENGTH) { + for (; i < limit; i += FLOAT_SPECIES.length()) { FloatVector va = FloatVector.fromArray(FLOAT_SPECIES, a, i); FloatVector vb = FloatVector.fromArray(FLOAT_SPECIES, b, i); - acc1 = square(va, vb, acc1); + FloatVector diff = va.sub(vb); + acc1 = fma(diff, diff, acc1); } // reduce FloatVector res1 = acc1.add(acc2);