mirror of https://github.com/apache/lucene.git
Added JMH micro-benchmarks submodule (#12663)
This commit is contained in:
@ -27,6 +27,7 @@ configure(rootProject) {
["ForbiddenApis", "help/forbiddenApis.txt", "How to add/apply rules for forbidden APIs."],
["LocalSettings", "help/localSettings.txt", "Local settings, overrides and build performance tweaks."],
["Regeneration", "help/regeneration.txt", "How to refresh generated and derived resources."],
["Jmh", "lucene/benchmark-jmh/README.txt", "JMH micro-benchmarks."],
["Git", "help/git.txt", "Git assistance and guides."],
["IDEs", "help/IDEs.txt", "IDE support."],
["Publishing", "help/publishing.txt", "Maven and other artifact publishing, signing, etc."],
@ -17,7 +17,7 @@
// Configure Java project defaults.
allprojects {
allprojects { project ->
plugins.withType(JavaPlugin) {
sourceCompatibility = rootProject.minJavaVersion
targetCompatibility = rootProject.minJavaVersion
@ -69,12 +69,20 @@ allprojects {
"-proc:none", // proc:none was added because of LOG4J2-1925 / JDK-8186647
if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) {
options.compilerArgs += "-Werror"
if (project.path == ":lucene:benchmark-jmh") {
// JMH benchmarks use JMH preprocessor and incubating modules.
} else {
// proc:none was added because of LOG4J2-1925 / JDK-8186647
options.compilerArgs += [
if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) {
options.compilerArgs += "-Werror"
@ -40,7 +40,9 @@ configure(rootProject) {
// Exclude the native module.
// Exclude test fixtures.
// Exclude JMH benchmarks.
// Exclude all subprojects that are modular test projects and those explicitly
@ -0,0 +1,21 @@
The :lucene:benchmark-jmh module can be used to compile
and execute JMH (https://github.com/openjdk/jmh) micro-benchmarks.
Look at existing classes and JMH documentation for inspiration on how
to write good micro-benchmarks.
To compile the project and prepare JMH launcher, run:
gradlew :lucene:benchmark-jmh:assemble
The above target will display the exact commands to execute JMH from the
command line, for example:
java --module-path lucene\benchmark-jmh\build\benchmarks --module org.apache.lucene.benchmark.jmh
You can pass any JMH options to the above command, for example:
-h displays verbose help for all options
-l list available benchmarks
-lp list benchmarks that pass the filter and their parameters
regexp execute all benchmarks containing regexp
@ -0,0 +1,107 @@
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
plugins {
id "java-library"
description = 'Lucene JMH micro-benchmarking module'
dependencies {
moduleImplementation project(':lucene:core')
moduleImplementation "org.openjdk.jmh:jmh-core:1.37"
annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37"
// Exclude JMH-generated files and certain classes that require incubating classes
// from forbiddenapis validation.
tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
failOnMissingClasses = false
// Skip certain infrastructure tasks that we can't use or don't care about.
tasks.matching { it.name in [
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception
// but this seems fine for test/build only tools).
"validateJarChecksums", "validateJarLicenses",
// No special javadocs for JMH benchmarks.
]}.configureEach {
it.enabled = false
// Assemble benchmark JAR and its dependencies here.
File dependenciesDir = project.layout.buildDirectory.dir("benchmarks").get().asFile
def syncDependencies = tasks.register("copyDependencies", Sync, {
from configurations.runtimeClasspath
from jar
into dependenciesDir
// Module entry point. For some reason can't be a package from outside the module
// so I wrote a simple redirecting delegate.
tasks.compileJava.configure {
jar.configure { Jar jarTask ->
dependsOn configurations.runtimeClasspath
manifest {
"Main-Class": "org.apache.lucene.benchmark.jmh.Main",
"Class-Path": new Object () {
String toString() {
return configurations.runtimeClasspath.collect { f -> f.name }.join(" ")
assemble {
dependsOn syncDependencies
doLast {
JMH benchmarks compiled. Run them with:
java -jar ${rootDir.toPath().relativize(dependenciesDir.toPath().resolve(jar.archiveFile.get().asFile.name))}
java --module-path ${rootDir.toPath().relativize(dependenciesDir.toPath())} --module org.apache.lucene.benchmark.jmh
JMH options you can use with the above:
-h displays verbose help for all options
-l list available benchmarks
-lp list benchmarks that pass the filter and their parameters
regexp execute all benchmark containing regexp
@ -0,0 +1,27 @@
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
/** Lucene JMH benchmarks. */
module org.apache.lucene.benchmark.jmh {
requires jmh.core;
requires jdk.incubator.vector;
requires jdk.unsupported;
requires org.apache.lucene.core;
exports org.apache.lucene.benchmark.jmh;
exports org.apache.lucene.benchmark.jmh.jmh_generated;
@ -0,0 +1,282 @@
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.lucene.benchmark.jmh;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.ShortVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorShape;
import jdk.incubator.vector.VectorSpecies;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.*;
@Warmup(iterations = 3, time = 3)
@Measurement(iterations = 5, time = 3)
value = 1,
jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public class BinaryCosineBenchmark {
private byte[] a;
private byte[] b;
@Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
// @Param({"1", "4", "6", "8", "13", "16", "25", "32", "64", "100" })
// @Param({"1024"})
// @Param({"16", "32", "64"})
int size;
/**
 * Allocates the two input arrays with {@code size} elements each and cross-checks the
 * vectorized implementation ({@link #cosineDistanceNew()}) against the scalar reference
 * ({@link #cosineDistanceOld()}), failing fast if their results differ.
 *
 * @throws RuntimeException if the two implementations return different values
 */
public void init() {
a = new byte[size];
b = new byte[size];
// Exact float comparison: both implementations are expected to produce identical results.
// NOTE(review): the step that fills a and b is not visible in this view; for all-zero arrays
// both methods return NaN and NaN != NaN is true, so this check would throw -- confirm the
// arrays are populated before this point.
if (cosineDistanceNew() != cosineDistanceOld()) {
throw new RuntimeException("New is wrong");
static final VectorSpecies<Byte> PREFERRED_BYTE_SPECIES;
static final VectorSpecies<Short> PREFERRED_SHORT_SPECIES;
static {
if (IntVector.SPECIES_PREFERRED.vectorBitSize() >= 256) {
VectorShape.forBitSize(IntVector.SPECIES_PREFERRED.vectorBitSize() >> 2));
VectorShape.forBitSize(IntVector.SPECIES_PREFERRED.vectorBitSize() >> 1));
} else {
private static final boolean IS_AMD64_WITHOUT_AVX2 =
&& IntVector.SPECIES_PREFERRED.vectorBitSize() < 256;
/** Computes the cosine of arrays {@code a} and {@code b} by delegating to {@code VectorUtil.cosine}. */
public float cosineDistanceVectorUtil() {
return VectorUtil.cosine(a, b);
/**
 * Computes the cosine of byte arrays {@code a} and {@code b} with the incubating Vector API.
 *
 * <p>Three integer accumulators are maintained: the dot product ({@code sum}) and the two
 * squared norms ({@code norm1}, {@code norm2}). Vector code is used only when the input has at
 * least 16 elements, the preferred vector size is at least 128 bits and
 * {@code IS_AMD64_WITHOUT_AVX2} is false; any remaining elements are handled by the scalar tail
 * loop at the end.
 *
 * <p>This variant differs from {@link #cosineDistanceNew()} only in its 128-bit path, which
 * loads 8 bytes per iteration but consumes just the first half of them.
 *
 * @return {@code sum / sqrt(norm1 * norm2)} as a float
 */
public float cosineDistanceNewNew() {
int i = 0;
int sum = 0;
int norm1 = 0;
int norm2 = 0;
final int vectorSize = IntVector.SPECIES_PREFERRED.vectorBitSize();
// only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit
// vectors
if (a.length >= 16 && vectorSize >= 128 && IS_AMD64_WITHOUT_AVX2 == false) {
if (vectorSize >= 256) {
// optimized 256/512 bit implementation, processes 8/16 bytes at a time
int upperBound = PREFERRED_BYTE_SPECIES.loopBound(a.length);
IntVector accSum = IntVector.zero(IntVector.SPECIES_PREFERRED);
IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_PREFERRED);
IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_PREFERRED);
for (; i < upperBound; i += PREFERRED_BYTE_SPECIES.length()) {
ByteVector va8 = ByteVector.fromArray(PREFERRED_BYTE_SPECIES, a, i);
ByteVector vb8 = ByteVector.fromArray(PREFERRED_BYTE_SPECIES, b, i);
// widen bytes to shorts; a byte * byte product is at most 2^14 and fits in a short
Vector<Short> va16 = va8.convertShape(VectorOperators.B2S, PREFERRED_SHORT_SPECIES, 0);
Vector<Short> vb16 = vb8.convertShape(VectorOperators.B2S, PREFERRED_SHORT_SPECIES, 0);
Vector<Short> prod16 = va16.mul(vb16);
Vector<Short> norm1_16 = va16.mul(va16);
Vector<Short> norm2_16 = vb16.mul(vb16);
// widen the 16-bit products to ints before accumulating
Vector<Integer> prod32 =
prod16.convertShape(VectorOperators.S2I, IntVector.SPECIES_PREFERRED, 0);
Vector<Integer> norm1_32 =
norm1_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_PREFERRED, 0);
Vector<Integer> norm2_32 =
norm2_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_PREFERRED, 0);
accSum = accSum.add(prod32);
accNorm1 = accNorm1.add(norm1_32);
accNorm2 = accNorm2.add(norm2_32);
// reduce
sum += accSum.reduceLanes(VectorOperators.ADD);
norm1 += accNorm1.reduceLanes(VectorOperators.ADD);
norm2 += accNorm2.reduceLanes(VectorOperators.ADD);
} else {
// 128-bit impl, which is tricky since we don't have SPECIES_32, it does "overlapping read"
// the bound is computed on (length - 8) so that every 8-byte load below stays in range
int upperBound = ByteVector.SPECIES_64.loopBound(a.length - ByteVector.SPECIES_64.length());
IntVector accSum = IntVector.zero(IntVector.SPECIES_128);
IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_128);
IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_128);
// advance by half the species length (4 bytes): only the first half of each load is used
for (; i < upperBound; i += ByteVector.SPECIES_64.length() >> 1) {
ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i);
ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i);
// process first half only
Vector<Short> va16 = va8.convert(VectorOperators.B2S, 0);
Vector<Short> vb16 = vb8.convert(VectorOperators.B2S, 0);
Vector<Short> norm1_16 = va16.mul(va16);
Vector<Short> norm2_16 = vb16.mul(vb16);
Vector<Short> prod16 = va16.mul(vb16);
// sum into accumulators
accNorm1 =
accNorm1.add(norm1_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 0));
accNorm2 =
accNorm2.add(norm2_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 0));
accSum = accSum.add(prod16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 0));
// reduce
sum += accSum.reduceLanes(VectorOperators.ADD);
norm1 += accNorm1.reduceLanes(VectorOperators.ADD);
norm2 += accNorm2.reduceLanes(VectorOperators.ADD);
// scalar tail: handles whatever the vector loops left over (or the whole input if they were
// skipped)
for (; i < a.length; i++) {
byte elem1 = a[i];
byte elem2 = b[i];
sum += elem1 * elem2;
norm1 += elem1 * elem1;
norm2 += elem2 * elem2;
return (float) (sum / Math.sqrt((double) norm1 * (double) norm2));
/**
 * Computes the cosine of byte arrays {@code a} and {@code b} with the incubating Vector API.
 *
 * <p>Three integer accumulators are maintained: the dot product ({@code sum}) and the two
 * squared norms ({@code norm1}, {@code norm2}). Vector code is used only when the input has at
 * least 16 elements, the preferred vector size is at least 128 bits and
 * {@code IS_AMD64_WITHOUT_AVX2} is false; any remaining elements are handled by the scalar tail
 * loop at the end.
 *
 * @return {@code sum / sqrt(norm1 * norm2)} as a float
 */
public float cosineDistanceNew() {
int i = 0;
int sum = 0;
int norm1 = 0;
int norm2 = 0;
final int vectorSize = IntVector.SPECIES_PREFERRED.vectorBitSize();
// only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit
// vectors
if (a.length >= 16 && vectorSize >= 128 && IS_AMD64_WITHOUT_AVX2 == false) {
// acts like:
// int sum = 0, norm1 = 0, norm2 = 0;
// for (...) {
//   sum += a[i] * b[i];
//   norm1 += a[i] * a[i];
//   norm2 += b[i] * b[i];
// }
if (vectorSize >= 256) {
// optimized 256/512 bit implementation, processes 8/16 bytes at a time
int upperBound = PREFERRED_BYTE_SPECIES.loopBound(a.length);
IntVector accSum = IntVector.zero(IntVector.SPECIES_PREFERRED);
IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_PREFERRED);
IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_PREFERRED);
for (; i < upperBound; i += PREFERRED_BYTE_SPECIES.length()) {
ByteVector va8 = ByteVector.fromArray(PREFERRED_BYTE_SPECIES, a, i);
ByteVector vb8 = ByteVector.fromArray(PREFERRED_BYTE_SPECIES, b, i);
// widen bytes to shorts; a byte * byte product is at most 2^14 and fits in a short
Vector<Short> va16 = va8.convertShape(VectorOperators.B2S, PREFERRED_SHORT_SPECIES, 0);
Vector<Short> vb16 = vb8.convertShape(VectorOperators.B2S, PREFERRED_SHORT_SPECIES, 0);
Vector<Short> prod16 = va16.mul(vb16);
Vector<Short> norm1_16 = va16.mul(va16);
Vector<Short> norm2_16 = vb16.mul(vb16);
// widen the 16-bit products to ints before accumulating
Vector<Integer> prod32 =
prod16.convertShape(VectorOperators.S2I, IntVector.SPECIES_PREFERRED, 0);
Vector<Integer> norm1_32 =
norm1_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_PREFERRED, 0);
Vector<Integer> norm2_32 =
norm2_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_PREFERRED, 0);
accSum = accSum.add(prod32);
accNorm1 = accNorm1.add(norm1_32);
accNorm2 = accNorm2.add(norm2_32);
// reduce
sum += accSum.reduceLanes(VectorOperators.ADD);
norm1 += accNorm1.reduceLanes(VectorOperators.ADD);
norm2 += accNorm2.reduceLanes(VectorOperators.ADD);
} else {
// 128-bit implementation, which must "split up" vectors due to widening conversions
int upperBound = ByteVector.SPECIES_64.loopBound(a.length);
// two int accumulators per quantity: one for each half of the widened short vector
IntVector accSum1 = IntVector.zero(IntVector.SPECIES_128);
IntVector accSum2 = IntVector.zero(IntVector.SPECIES_128);
IntVector accNorm1_1 = IntVector.zero(IntVector.SPECIES_128);
IntVector accNorm1_2 = IntVector.zero(IntVector.SPECIES_128);
IntVector accNorm2_1 = IntVector.zero(IntVector.SPECIES_128);
IntVector accNorm2_2 = IntVector.zero(IntVector.SPECIES_128);
for (; i < upperBound; i += ByteVector.SPECIES_64.length()) {
ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i);
ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i);
// expand each byte vector into short vector and perform multiplications
Vector<Short> va16 = va8.convertShape(VectorOperators.B2S, ShortVector.SPECIES_128, 0);
Vector<Short> vb16 = vb8.convertShape(VectorOperators.B2S, ShortVector.SPECIES_128, 0);
Vector<Short> prod16 = va16.mul(vb16);
Vector<Short> norm1_16 = va16.mul(va16);
Vector<Short> norm2_16 = vb16.mul(vb16);
// split each short vector into two int vectors and add
Vector<Integer> prod32_1 =
prod16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 0);
Vector<Integer> prod32_2 =
prod16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 1);
Vector<Integer> norm1_32_1 =
norm1_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 0);
Vector<Integer> norm1_32_2 =
norm1_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 1);
Vector<Integer> norm2_32_1 =
norm2_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 0);
Vector<Integer> norm2_32_2 =
norm2_16.convertShape(VectorOperators.S2I, IntVector.SPECIES_128, 1);
accSum1 = accSum1.add(prod32_1);
accSum2 = accSum2.add(prod32_2);
accNorm1_1 = accNorm1_1.add(norm1_32_1);
accNorm1_2 = accNorm1_2.add(norm1_32_2);
accNorm2_1 = accNorm2_1.add(norm2_32_1);
accNorm2_2 = accNorm2_2.add(norm2_32_2);
// reduce
sum += accSum1.add(accSum2).reduceLanes(VectorOperators.ADD);
norm1 += accNorm1_1.add(accNorm1_2).reduceLanes(VectorOperators.ADD);
norm2 += accNorm2_1.add(accNorm2_2).reduceLanes(VectorOperators.ADD);
// scalar tail: handles whatever the vector loops left over (or the whole input if they were
// skipped)
for (; i < a.length; i++) {
byte elem1 = a[i];
byte elem2 = b[i];
sum += elem1 * elem2;
norm1 += elem1 * elem1;
norm2 += elem2 * elem2;
return (float) (sum / Math.sqrt((double) norm1 * (double) norm2));
/**
 * Returns the cosine similarity between the two vectors {@code a} and {@code b}, computed with a
 * plain scalar loop. This is the reference result that {@code init()} compares the vectorized
 * implementation against.
 *
 * @return {@code sum / sqrt(norm1 * norm2)} as a float; NaN when either vector is all zeros,
 *     because the division is then 0 / 0.0
 */
public float cosineDistanceOld() {
// Note: this will not overflow if dim < 2^18, since max(byte * byte) = 2^14.
// NOTE(review): the accumulators are signed ints, so 2^17 terms of 2^14 already reach 2^31;
// the safe bound looks like dim < 2^17 -- confirm.
int sum = 0;
int norm1 = 0;
int norm2 = 0;
for (int i = 0; i < a.length; i++) {
byte elem1 = a[i];
byte elem2 = b[i];
sum += elem1 * elem2;
norm1 += elem1 * elem1;
norm2 += elem2 * elem2;
// multiply the norms as doubles so their product cannot overflow int
return (float) (sum / Math.sqrt((double) norm1 * (double) norm2));
@ -0,0 +1,27 @@
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.lucene.benchmark.jmh;
* Just redirect to JMH so that the package is within the initially launched module, otherwise
* {@code --module xyz} does not work.
public class Main {
public static void main(String[] args) throws Exception {
@ -0,0 +1,20 @@
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.lucene.benchmark.jmh.jmh_generated;
/** Just to keep the module descriptor happy (generated classes must be exported). */
public class DummyExport {}
@ -128,6 +128,7 @@ public class TestModularLayer extends AbstractLuceneDistributionTest {
@ -43,6 +43,7 @@ include "lucene:analysis:stempel"
include "lucene:analysis.tests"
include "lucene:backward-codecs"
include "lucene:benchmark"
include "lucene:benchmark-jmh"
include "lucene:classification"
include "lucene:codecs"
include "lucene:core"
@ -5,15 +5,18 @@ com.ibm.icu:icu4j:70.1 (1 constraints: dc040a31)
commons-codec:commons-codec:1.13 (1 constraints: d904f430)
io.sgr:s2-geometry-library-java:1.0.0 (1 constraints: 0305f035)
junit:junit:4.13.1 (1 constraints: 3b05453b)
net.sf.jopt-simple:jopt-simple:5.0.4 (1 constraints: be0ad6cc)
net.sourceforge.nekohtml:nekohtml:1.9.17 (1 constraints: 4405503b)
org.antlr:antlr4-runtime:4.11.1 (1 constraints: 39053f3b)
org.apache.commons:commons-compress:1.19 (1 constraints: df04fa30)
org.apache.commons:commons-math3:3.6.1 (1 constraints: bf0adbcc)
org.apache.opennlp:opennlp-tools:1.9.1 (1 constraints: 0d050c36)
org.carrot2:morfologik-fsa:2.1.9 (1 constraints: db0d9c36)
org.carrot2:morfologik-polish:2.1.9 (1 constraints: 0e050136)
org.carrot2:morfologik-stemming:2.1.9 (2 constraints: 1312040d)
org.hamcrest:hamcrest:2.2 (1 constraints: a8041f2c)
org.locationtech.spatial4j:spatial4j:0.8 (1 constraints: ac041f2c)
org.openjdk.jmh:jmh-core:1.37 (1 constraints: df04fc30)
org.ow2.asm:asm:7.2 (3 constraints: 2717d96b)
org.ow2.asm:asm-analysis:7.2 (1 constraints: e409d9a5)
org.ow2.asm:asm-commons:7.2 (1 constraints: ad042e2c)
Reference in New Issue