diff --git a/.gitattributes b/.gitattributes index e4f4bf8b496..a3135003e80 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1,6 @@ # Ignore all differences in line endings for the lock file. -versions.lock text eol=lf -versions.props text eol=lf +versions.lock text eol=lf +versions.toml text eol=lf # Gradle files are always in LF. *.gradle text eol=lf diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d568b18def4..566c199e148 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -58,7 +58,7 @@ In case your contribution fixes a bug, please create a new test case that fails ### IDE support -- *IntelliJ* - IntelliJ idea can import and build gradle-based projects out of the box. +- *IntelliJ* - IntelliJ idea can import and build gradle-based projects out of the box. It will default to running tests by calling the gradle wrapper, and while this works, it can be a bit slow. If instead you configure IntelliJ to use its own built-in test runner by (in the 2024 version) navigating to settings for Build Execution & Deployment/Build Tools/Gradle (under File/Settings menu on some platforms) and selecting "Build and Run using: IntelliJ IDEA" and "Run Tests using: IntelliJ IDEA", then some tests will run faster. However, some other tests will not run using this configuration. - *Eclipse* - Basic support ([help/IDEs.txt](https://github.com/apache/lucene/blob/main/help/IDEs.txt#L7)). - *Netbeans* - Not tested. diff --git a/README.md b/README.md index 7a167e7455d..fe523af81b2 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,8 @@ comprehensive documentation, visit: - Latest Releases: - Nightly: +- New contributors should start by reading [Contributing Guide](./CONTRIBUTING.md) - Build System Documentation: [help/](./help/) -- Developer Documentation: [dev-docs/](./dev-docs/) - Migration Guide: [lucene/MIGRATE.md](./lucene/MIGRATE.md) ## Building @@ -45,8 +45,6 @@ comprehensive documentation, visit: We'll assume that you know how to get and set up the JDK - if you don't, then we suggest starting at https://jdk.java.net/ and learning more about Java, before returning to this README. -See [Contributing Guide](./CONTRIBUTING.md) for details. - ## Contributing Bug fixes, improvements and new features are always welcome! @@ -54,6 +52,8 @@ Please review the [Contributing to Lucene Guide](./CONTRIBUTING.md) for information on contributing. + +- Additional Developer Documentation: [dev-docs/](./dev-docs/) + ## Discussion and Support - [Users Mailing List](https://lucene.apache.org/core/discussion.html#java-user-list-java-userluceneapacheorg) diff --git a/buildSrc/build.gradle b/build-tools/build-infra/build.gradle similarity index 56% rename from buildSrc/build.gradle rename to build-tools/build-infra/build.gradle index db63485924d..5cb1426cba9 100644 --- a/buildSrc/build.gradle +++ b/build-tools/build-infra/build.gradle @@ -15,30 +15,50 @@ * limitations under the License. */ +plugins { + id "java-gradle-plugin" + alias(deps.plugins.spotless) apply false + alias(deps.plugins.forbiddenapis) apply false +} + repositories { mavenCentral() } -ext { - // Minimum Java version required to compile buildSrc. - minJavaVersion = JavaVersion.VERSION_21 -} - +group = "org.apache" // Make sure the build environment is consistent. -apply from: file('../gradle/validation/check-environment.gradle') +apply from: file('../../gradle/conventions.gradle') +apply from: file('../../gradle/validation/check-environment.gradle') -// Load common buildSrc and script deps.
-apply from: file("scriptDepVersions.gradle") +// Add spotless/ tidy. +tasks.register("checkJdkInternalsExportedToGradle") {} +apply from: file('../../gradle/validation/spotless.gradle') +apply from: file('../../gradle/validation/forbidden-apis.gradle') + +java { + sourceCompatibility = JavaVersion.toVersion(deps.versions.minJava.get()) + targetCompatibility = JavaVersion.toVersion(deps.versions.minJava.get()) +} + +gradlePlugin { + automatedPublishing = false + + plugins { + buildInfra { + id = 'lucene.build-infra' + implementationClass = 'org.apache.lucene.gradle.buildinfra.BuildInfraPlugin' + } + } +} dependencies { implementation gradleApi() implementation localGroovy() - - implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}" + implementation deps.commons.codec } -if (!rootProject.hasJavaFlightRecorder) { +if (!hasJavaFlightRecorder) { logger.warn('Module jdk.jfr is not available; skipping compilation of Java Flight Recorder support.') tasks.named('compileJava').configure { exclude('**/ProfileResults.java') diff --git a/dev-tools/missing-doclet/build.gradle b/build-tools/build-infra/settings.gradle similarity index 68% rename from dev-tools/missing-doclet/build.gradle rename to build-tools/build-infra/settings.gradle index d7dcdf8d32f..92ad0654e87 100644 --- a/dev-tools/missing-doclet/build.gradle +++ b/build-tools/build-infra/settings.gradle @@ -15,18 +15,12 @@ * limitations under the License. */ -plugins { - id 'java-library' -} - -version = "1.0.0-SNAPSHOT" -group = "org.apache.lucene.tools" -description = 'Doclet-based javadoc validation' - -sourceCompatibility = JavaVersion.VERSION_21 -targetCompatibility = JavaVersion.VERSION_21 - -tasks.withType(JavaCompile) { - options.compilerArgs += ["--release", targetCompatibility.toString()] - options.encoding = "UTF-8" +rootProject.name = 'build-infra' + +dependencyResolutionManagement { + versionCatalogs { + deps { + from(files('../../versions.toml')) + } + } } diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/Checksum.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/Checksum.java similarity index 97% rename from buildSrc/src/main/java/org/apache/lucene/gradle/Checksum.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/Checksum.java index f9487fe5a96..7566102294f 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/Checksum.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/Checksum.java @@ -27,6 +27,11 @@ package org.apache.lucene.gradle; +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.util.Locale; import org.apache.commons.codec.digest.DigestUtils; import org.gradle.api.DefaultTask; import org.gradle.api.GradleException; @@ -39,16 +44,10 @@ import org.gradle.api.tasks.TaskAction; import org.gradle.work.Incremental; import org.gradle.work.InputChanges; -import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.util.Locale; - public class Checksum extends DefaultTask { private FileCollection files; private File outputDir; - private Algorithm algorithm; + private Algorithm algorithm = Algorithm.SHA512; public enum Algorithm { MD5(new DigestUtils(DigestUtils.getMd5Digest())), @@ -69,7 +68,6 @@ public class Checksum extends DefaultTask { public Checksum() { outputDir = new File(getProject().getBuildDir(), "checksums"); - algorithm = Algorithm.SHA256; } 
@InputFiles @@ -190,6 +188,8 @@ public class Checksum extends DefaultTask { private FileCollection filesFor(final Algorithm algo) { return getProject() - .fileTree(getOutputDir(), files -> files.include("**/*." + algo.toString().toLowerCase(Locale.ROOT))); + .fileTree( + getOutputDir(), + files -> files.include("**/*." + algo.toString().toLowerCase(Locale.ROOT))); } } diff --git a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java new file mode 100644 index 00000000000..be2873ced47 --- /dev/null +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.gradle; + +import java.io.BufferedReader; +import java.io.Closeable; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.io.Writer; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; +import org.gradle.api.internal.tasks.testing.logging.FullExceptionFormatter; +import org.gradle.api.internal.tasks.testing.logging.TestExceptionFormatter; +import org.gradle.api.logging.Logger; +import org.gradle.api.logging.Logging; +import org.gradle.api.tasks.testing.TestDescriptor; +import org.gradle.api.tasks.testing.TestListener; +import org.gradle.api.tasks.testing.TestOutputEvent; +import org.gradle.api.tasks.testing.TestOutputListener; +import org.gradle.api.tasks.testing.TestResult; +import org.gradle.api.tasks.testing.logging.TestLogging; + +/** + * An error reporting listener that queues test output streams and displays them on failure. + * + *
<p>
Heavily inspired by Elasticsearch's ErrorReportingTestListener (ASL 2.0 licensed). + */ +public class ErrorReportingTestListener implements TestOutputListener, TestListener { + private static final Logger LOGGER = Logging.getLogger(ErrorReportingTestListener.class); + + private final TestExceptionFormatter formatter; + private final Map outputHandlers = new ConcurrentHashMap<>(); + private final Path spillDir; + private final Path outputsDir; + private final boolean verboseMode; + + public ErrorReportingTestListener( + TestLogging testLogging, Path spillDir, Path outputsDir, boolean verboseMode) { + this.formatter = new FullExceptionFormatter(testLogging); + this.spillDir = spillDir; + this.outputsDir = outputsDir; + this.verboseMode = verboseMode; + } + + @Override + public void onOutput(TestDescriptor testDescriptor, TestOutputEvent outputEvent) { + handlerFor(testDescriptor).write(outputEvent); + } + + @Override + public void beforeSuite(TestDescriptor suite) { + // noop. + } + + @Override + public void beforeTest(TestDescriptor testDescriptor) { + // Noop. + } + + @Override + public void afterSuite(final TestDescriptor suite, TestResult result) { + if (suite.getParent() == null || suite.getName().startsWith("Gradle")) { + return; + } + + TestKey key = TestKey.of(suite); + try { + OutputHandler outputHandler = outputHandlers.get(key); + if (outputHandler != null) { + long length = outputHandler.length(); + if (length > 1024 * 1024 * 10) { + LOGGER.warn( + String.format( + Locale.ROOT, + "WARNING: Test %s wrote %,d bytes of output.", + suite.getName(), + length)); + } + } + + boolean echoOutput = Objects.equals(result.getResultType(), TestResult.ResultType.FAILURE); + boolean dumpOutput = echoOutput; + + // If the test suite failed, report output. + if (dumpOutput || echoOutput) { + Files.createDirectories(outputsDir); + Path outputLog = outputsDir.resolve(getOutputLogName(suite)); + + // Save the output of a failing test to disk. + try (Writer w = Files.newBufferedWriter(outputLog, StandardCharsets.UTF_8)) { + if (outputHandler != null) { + outputHandler.copyTo(w); + } + } + + if (echoOutput && !verboseMode) { + synchronized (this) { + System.out.println(); + System.out.println( + suite.getClassName() + + " > test suite's output saved to " + + outputLog + + ", copied below:"); + try (BufferedReader reader = + Files.newBufferedReader(outputLog, StandardCharsets.UTF_8)) { + char[] buf = new char[1024]; + int len; + while ((len = reader.read(buf)) >= 0) { + System.out.print(new String(buf, 0, len)); + } + System.out.println(); + } + } + } + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } finally { + OutputHandler handler = outputHandlers.remove(key); + if (handler != null) { + try { + handler.close(); + } catch (IOException e) { + LOGGER.error("Failed to close output handler for: " + key, e); + } + } + } + } + + private static Pattern SANITIZE = Pattern.compile("[^a-zA-Z .\\-_0-9]+"); + + public static String getOutputLogName(TestDescriptor suite) { + return SANITIZE.matcher("OUTPUT-" + suite.getName() + ".txt").replaceAll("_"); + } + + @Override + public void afterTest(TestDescriptor testDescriptor, TestResult result) { + // Include test failure exception stacktrace(s) in test output log. 
+ if (result.getResultType() == TestResult.ResultType.FAILURE) { + if (result.getExceptions().size() > 0) { + String message = formatter.format(testDescriptor, result.getExceptions()); + handlerFor(testDescriptor).write(message); + } + } + } + + private OutputHandler handlerFor(TestDescriptor descriptor) { + // Attach output of leaves (individual tests) to their parent. + if (!descriptor.isComposite()) { + descriptor = descriptor.getParent(); + } + return outputHandlers.computeIfAbsent(TestKey.of(descriptor), (key) -> new OutputHandler()); + } + + public static class TestKey { + private final String key; + + private TestKey(String key) { + this.key = key; + } + + public static TestKey of(TestDescriptor d) { + StringBuilder key = new StringBuilder(); + key.append(d.getClassName()); + key.append("::"); + key.append(d.getName()); + key.append("::"); + key.append(d.getParent() == null ? "-" : d.getParent().toString()); + return new TestKey(key.toString()); + } + + @Override + public boolean equals(Object o) { + return o != null && o.getClass() == this.getClass() && Objects.equals(((TestKey) o).key, key); + } + + @Override + public int hashCode() { + return key.hashCode(); + } + + @Override + public String toString() { + return key; + } + } + + private class OutputHandler implements Closeable { + // Max single-line buffer before automatic wrap occurs. + private static final int MAX_LINE_WIDTH = 1024 * 4; + + private final SpillWriter buffer; + + // internal stream. + private final PrefixedWriter sint; + // stdout + private final PrefixedWriter sout; + // stderr + private final PrefixedWriter serr; + + // last used stream (so that we can flush it properly and prefixes are not screwed up). + private PrefixedWriter last; + + public OutputHandler() { + buffer = + new SpillWriter( + () -> { + try { + return Files.createTempFile(spillDir, "spill-", ".tmp"); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }); + + Writer sink = buffer; + if (verboseMode) { + sink = new StdOutTeeWriter(buffer); + } + + sint = new PrefixedWriter(" > ", sink, MAX_LINE_WIDTH); + sout = new PrefixedWriter(" 1> ", sink, MAX_LINE_WIDTH); + serr = new PrefixedWriter(" 2> ", sink, MAX_LINE_WIDTH); + last = sint; + } + + public void write(TestOutputEvent event) { + write( + (event.getDestination() == TestOutputEvent.Destination.StdOut ? 
sout : serr), + event.getMessage()); + } + + public void write(String message) { + write(sint, message); + } + + public long length() throws IOException { + return buffer.length(); + } + + private void write(PrefixedWriter out, String message) { + try { + if (out != last) { + last.completeLine(); + last = out; + } + out.write(message); + } catch (IOException e) { + throw new UncheckedIOException("Unable to write to test output.", e); + } + } + + public void copyTo(Writer out) throws IOException { + flush(); + buffer.copyTo(out); + } + + public void flush() throws IOException { + sout.completeLine(); + serr.completeLine(); + buffer.flush(); + } + + @Override + public void close() throws IOException { + buffer.close(); + } + } +} diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java similarity index 96% rename from buildSrc/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java index db4f804f12e..5436afe70f8 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java @@ -67,6 +67,6 @@ public class GradlePropertiesGenerator { fileContent = fileContent.replace(entry.getKey(), String.valueOf(entry.getValue())); } Files.writeString( - destination, fileContent, StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW); + destination, fileContent, StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW); } } diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/PrefixedWriter.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/PrefixedWriter.java similarity index 91% rename from buildSrc/src/main/java/org/apache/lucene/gradle/PrefixedWriter.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/PrefixedWriter.java index 7281d496001..3dc663e8332 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/PrefixedWriter.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/PrefixedWriter.java @@ -20,12 +20,13 @@ import java.io.IOException; import java.io.Writer; /** - * Prefixes every new line with a given string, synchronizing multiple streams to emit consistent lines. + * Prefixes every new line with a given string, synchronizing multiple streams to emit consistent + * lines. */ public class PrefixedWriter extends Writer { Writer sink; - private final static char LF = '\n'; + private static final char LF = '\n'; private final String prefix; private final StringBuilder lineBuffer = new StringBuilder(); private final int maxLineLength; @@ -45,7 +46,7 @@ public class PrefixedWriter extends Writer { sink.write(LF); lineBuffer.setLength(0); - if (c != LF) { + if (c != LF) { lineBuffer.append((char) c); } } else { @@ -70,9 +71,7 @@ public class PrefixedWriter extends Writer { throw new UnsupportedOperationException(); } - /** - * Complete the current line (emit LF if not at the start of the line already). - */ + /** Complete the current line (emit LF if not at the start of the line already). 
*/ public void completeLine() throws IOException { if (lineBuffer.length() > 0) { write(LF); diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/ProfileResults.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java similarity index 76% rename from buildSrc/src/main/java/org/apache/lucene/gradle/ProfileResults.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java index 8e6ce1f2d8d..5f0e9331664 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/ProfileResults.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/ProfileResults.java @@ -20,13 +20,12 @@ package org.apache.lucene.gradle; import java.io.IOException; import java.nio.file.Paths; import java.util.AbstractMap.SimpleEntry; -import java.util.Arrays; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; - import jdk.jfr.consumer.RecordedClass; import jdk.jfr.consumer.RecordedEvent; import jdk.jfr.consumer.RecordedFrame; @@ -36,15 +35,12 @@ import jdk.jfr.consumer.RecordedThread; import jdk.jfr.consumer.RecordingFile; /** - * Processes an array of recording files (from tests), and prints a simple histogram. - * Inspired by the JFR example code. - * Whole stacks are deduplicated (with the default stacksize being 1): you can drill deeper - * by adjusting the parameters. + * Processes an array of recording files (from tests), and prints a simple histogram. Inspired by + * the JFR example code. Whole stacks are deduplicated (with the default stacksize being 1): you can + * drill deeper by adjusting the parameters. */ public class ProfileResults { - /** - * Formats a frame to a formatted line. This is deduplicated on! - */ + /** Formats a frame to a formatted line. This is deduplicated on! */ static String frameToString(RecordedFrame frame, boolean lineNumbers) { StringBuilder builder = new StringBuilder(); RecordedMethod method = frame.getMethod(); @@ -84,29 +80,32 @@ public class ProfileResults { /** * Driver method, for testing standalone. + * *
    * <pre>
    * java -Dtests.profile.count=5 buildSrc/src/main/java/org/apache/lucene/gradle/ProfileResults.java \
    *   ./lucene/core/build/tmp/tests-cwd/somefile.jfr ...
    * </pre>
*/ public static void main(String[] args) throws IOException { - printReport(Arrays.asList(args), - System.getProperty(MODE_KEY, MODE_DEFAULT), - Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)), - Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)), - Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT))); + printReport( + Arrays.asList(args), + System.getProperty(MODE_KEY, MODE_DEFAULT), + Integer.parseInt(System.getProperty(STACKSIZE_KEY, STACKSIZE_DEFAULT)), + Integer.parseInt(System.getProperty(COUNT_KEY, COUNT_DEFAULT)), + Boolean.parseBoolean(System.getProperty(LINENUMBERS_KEY, LINENUMBERS_DEFAULT))); } /** true if we care about this event */ static boolean isInteresting(String mode, RecordedEvent event) { String name = event.getEventType().getName(); - switch(mode) { + switch (mode) { case "cpu": - return (name.equals("jdk.ExecutionSample") || name.equals("jdk.NativeMethodSample")) && - !isGradlePollThread(event.getThread("sampledThread")); + return (name.equals("jdk.ExecutionSample") || name.equals("jdk.NativeMethodSample")) + && !isGradlePollThread(event.getThread("sampledThread")); case "heap": - return (name.equals("jdk.ObjectAllocationInNewTLAB") || name.equals("jdk.ObjectAllocationOutsideTLAB")) && - !isGradlePollThread(event.getThread("eventThread")); + return (name.equals("jdk.ObjectAllocationInNewTLAB") + || name.equals("jdk.ObjectAllocationOutsideTLAB")) + && !isGradlePollThread(event.getThread("eventThread")); default: throw new UnsupportedOperationException(event.toString()); } @@ -119,7 +118,7 @@ public class ProfileResults { /** value we accumulate for this event */ static long getValue(RecordedEvent event) { - switch(event.getEventType().getName()) { + switch (event.getEventType().getName()) { case "jdk.ObjectAllocationInNewTLAB": return event.getLong("tlabSize"); case "jdk.ObjectAllocationOutsideTLAB": @@ -133,10 +132,10 @@ public class ProfileResults { } } - /** format a value, if its huge, we show millions */ + /** format a value, if it's huge, we show millions */ static String formatValue(long value) { if (value > 1_000_000) { - return String.format("%dM", value / 1_000_000); + return String.format(Locale.ROOT, "%dM", value / 1_000_000); } else { return Long.toString(value); } @@ -144,15 +143,17 @@ public class ProfileResults { /** fixed width used for printing the different columns */ private static final int COLUMN_SIZE = 14; + private static final String COLUMN_PAD = "%-" + COLUMN_SIZE + "s"; + private static String pad(String input) { return String.format(Locale.ROOT, COLUMN_PAD, input); } - /** - * Process all the JFR files passed in args and print a merged summary. - */ - public static void printReport(List files, String mode, int stacksize, int count, boolean lineNumbers) throws IOException { + /** Process all the JFR files passed in args and print a merged summary. 
*/ + public static void printReport( + List files, String mode, int stacksize, int count, boolean lineNumbers) + throws IOException { if (!"cpu".equals(mode) && !"heap".equals(mode)) { throw new IllegalArgumentException("tests.profile.mode must be one of (cpu,heap)"); } @@ -178,14 +179,13 @@ public class ProfileResults { StringBuilder stack = new StringBuilder(); for (int i = 0; i < Math.min(stacksize, trace.getFrames().size()); i++) { if (stack.length() > 0) { - stack.append("\n") - .append(framePadding) - .append(" at "); + stack.append("\n").append(framePadding).append(" at "); } stack.append(frameToString(trace.getFrames().get(i), lineNumbers)); } String line = stack.toString(); - SimpleEntry entry = histogram.computeIfAbsent(line, u -> new SimpleEntry(line, 0L)); + SimpleEntry entry = + histogram.computeIfAbsent(line, u -> new SimpleEntry(line, 0L)); long value = getValue(event); entry.setValue(entry.getValue() + value); totalEvents++; @@ -195,12 +195,20 @@ public class ProfileResults { } } // print summary from histogram - System.out.printf(Locale.ROOT, "PROFILE SUMMARY from %d events (total: %s)\n", totalEvents, formatValue(sumValues)); + System.out.printf( + Locale.ROOT, + "PROFILE SUMMARY from %d events (total: %s)\n", + totalEvents, + formatValue(sumValues)); System.out.printf(Locale.ROOT, " tests.profile.mode=%s\n", mode); System.out.printf(Locale.ROOT, " tests.profile.count=%d\n", count); System.out.printf(Locale.ROOT, " tests.profile.stacksize=%d\n", stacksize); System.out.printf(Locale.ROOT, " tests.profile.linenumbers=%b\n", lineNumbers); - System.out.printf(Locale.ROOT, "%s%sSTACK\n", pad("PERCENT"), pad(mode.toUpperCase(Locale.ROOT) + " SAMPLES")); + System.out.printf( + Locale.ROOT, + "%s%sSTACK\n", + pad("PERCENT"), + pad(mode.toUpperCase(Locale.ROOT) + " SAMPLES")); List> entries = new ArrayList<>(histogram.values()); entries.sort((u, v) -> v.getValue().compareTo(u.getValue())); int seen = 0; @@ -208,8 +216,10 @@ public class ProfileResults { if (seen++ == count) { break; } - String percent = String.format("%2.2f%%", 100 * (c.getValue() / (float) sumValues)); - System.out.printf(Locale.ROOT, "%s%s%s\n", pad(percent), pad(formatValue(c.getValue())), c.getKey()); + String percent = + String.format(Locale.ROOT, "%2.2f%%", 100 * (c.getValue() / (float) sumValues)); + System.out.printf( + Locale.ROOT, "%s%s%s\n", pad(percent), pad(formatValue(c.getValue())), c.getKey()); } } } diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/SpillWriter.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/SpillWriter.java similarity index 98% rename from buildSrc/src/main/java/org/apache/lucene/gradle/SpillWriter.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/SpillWriter.java index f89977c2503..9539bddbbfe 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/SpillWriter.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/SpillWriter.java @@ -26,7 +26,7 @@ import java.nio.file.Path; import java.util.function.Supplier; public class SpillWriter extends Writer { - private final static int MAX_BUFFERED = 2 * 1024; + private static final int MAX_BUFFERED = 2 * 1024; private final StringWriter buffer = new StringWriter(MAX_BUFFERED); private final Supplier spillPathSupplier; diff --git a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/StdOutTeeWriter.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/StdOutTeeWriter.java new file mode 100644 index 00000000000..8bd2256e091 --- 
/dev/null +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/StdOutTeeWriter.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.gradle; + +import java.io.IOException; +import java.io.PrintStream; +import java.io.Writer; + +class StdOutTeeWriter extends Writer { + private final Writer delegate; + private final PrintStream out = System.out; + + public StdOutTeeWriter(Writer delegate) { + this.delegate = delegate; + } + + @Override + public void write(int c) throws IOException { + delegate.write(c); + out.write(c); + } + + @Override + public void write(char[] cbuf) throws IOException { + delegate.write(cbuf); + out.print(cbuf); + } + + @Override + public void write(String str) throws IOException { + delegate.write(str); + out.print(str); + } + + @Override + public void write(String str, int off, int len) throws IOException { + delegate.write(str, off, len); + out.append(str, off, len); + } + + @Override + public Writer append(CharSequence csq) throws IOException { + delegate.append(csq); + out.append(csq); + return this; + } + + @Override + public Writer append(CharSequence csq, int start, int end) throws IOException { + delegate.append(csq, start, end); + out.append(csq, start, end); + return this; + } + + @Override + public Writer append(char c) throws IOException { + delegate.append(c); + out.append(c); + return this; + } + + @Override + public void write(char[] cbuf, int off, int len) throws IOException { + delegate.write(cbuf, off, len); + out.print(new String(cbuf, off, len)); + } + + @Override + public void flush() throws IOException { + delegate.flush(); + out.flush(); + } + + @Override + public void close() throws IOException { + delegate.close(); + // Don't close the actual output. 
+ } +} diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java similarity index 71% rename from buildSrc/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java index e05640e25ee..aaf7059bb5f 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java @@ -16,12 +16,18 @@ */ package org.apache.lucene.gradle; +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; + import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; import java.net.HttpURLConnection; -import java.net.URL; import java.net.URI; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -31,12 +37,10 @@ import java.security.NoSuchAlgorithmException; import java.util.Locale; import java.util.concurrent.TimeUnit; -import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; - /** * Standalone class that can be used to download a gradle-wrapper.jar - *
<p>
- * Has no dependencies outside of standard java libraries + * + *
<p>
Has no dependencies outside of standard java libraries */ public class WrapperDownloader { public static void main(String[] args) { @@ -62,13 +66,15 @@ public class WrapperDownloader { } public void run(Path destination) throws IOException, NoSuchAlgorithmException { - Path checksumPath = destination.resolveSibling(destination.getFileName().toString() + ".sha256"); + Path checksumPath = + destination.resolveSibling(destination.getFileName().toString() + ".sha256"); if (!Files.exists(checksumPath)) { throw new IOException("Checksum file not found: " + checksumPath); } String expectedChecksum = Files.readString(checksumPath, StandardCharsets.UTF_8).trim(); - Path versionPath = destination.resolveSibling(destination.getFileName().toString() + ".version"); + Path versionPath = + destination.resolveSibling(destination.getFileName().toString() + ".version"); if (!Files.exists(versionPath)) { throw new IOException("Wrapper version file not found: " + versionPath); } @@ -87,7 +93,12 @@ public class WrapperDownloader { } } - URL url = URI.create("https://raw.githubusercontent.com/gradle/gradle/v" + wrapperVersion + "/gradle/wrapper/gradle-wrapper.jar").toURL(); + URL url = + URI.create( + "https://raw.githubusercontent.com/gradle/gradle/v" + + wrapperVersion + + "/gradle/wrapper/gradle-wrapper.jar") + .toURL(); System.err.println("Downloading gradle-wrapper.jar from " + url); // Zero-copy save the jar to a temp file @@ -103,8 +114,9 @@ public class WrapperDownloader { } catch (IOException e) { if (retries-- > 0) { // Retry after a short delay - System.err.println("Error connecting to server: " + e + ", will retry in " + retryDelay + " seconds."); - Thread.sleep(TimeUnit.SECONDS.toMillis(retryDelay)); + System.err.println( + "Error connecting to server: " + e + ", will retry in " + retryDelay + " seconds."); + sleep(TimeUnit.SECONDS.toMillis(retryDelay)); continue; } } @@ -115,8 +127,13 @@ public class WrapperDownloader { case HttpURLConnection.HTTP_BAD_GATEWAY: if (retries-- > 0) { // Retry after a short delay. 
- System.err.println("Server returned HTTP " + connection.getResponseCode() + ", will retry in " + retryDelay + " seconds."); - Thread.sleep(TimeUnit.SECONDS.toMillis(retryDelay)); + System.err.println( + "Server returned HTTP " + + connection.getResponseCode() + + ", will retry in " + + retryDelay + + " seconds."); + sleep(TimeUnit.SECONDS.toMillis(retryDelay)); continue; } } @@ -126,13 +143,15 @@ public class WrapperDownloader { } try (InputStream is = connection.getInputStream(); - OutputStream out = Files.newOutputStream(temp)){ + OutputStream out = Files.newOutputStream(temp)) { is.transferTo(out); } String checksum = checksum(digest, temp); if (!checksum.equalsIgnoreCase(expectedChecksum)) { - throw new IOException(String.format(Locale.ROOT, + throw new IOException( + String.format( + Locale.ROOT, "Checksum mismatch on downloaded gradle-wrapper.jar (was: %s, expected: %s).", checksum, expectedChecksum)); @@ -141,8 +160,12 @@ public class WrapperDownloader { Files.move(temp, destination, REPLACE_EXISTING); temp = null; } catch (IOException | InterruptedException e) { - throw new IOException("Could not download gradle-wrapper.jar (" + - e.getClass().getSimpleName() + ": " + e.getMessage() + ")."); + throw new IOException( + "Could not download gradle-wrapper.jar (" + + e.getClass().getSimpleName() + + ": " + + e.getMessage() + + ")."); } finally { if (temp != null) { Files.deleteIfExists(temp); @@ -150,6 +173,11 @@ public class WrapperDownloader { } } + @SuppressForbidden(reason = "Correct use of thread.sleep.") + private static void sleep(long millis) throws InterruptedException { + Thread.sleep(millis); + } + private String checksum(MessageDigest messageDigest, Path path) throws IOException { try { char[] hex = "0123456789abcdef".toCharArray(); @@ -160,7 +188,15 @@ public class WrapperDownloader { } return sb.toString(); } catch (IOException e) { - throw new IOException("Could not compute digest of file: " + path + " (" + e.getMessage() + ")"); + throw new IOException( + "Could not compute digest of file: " + path + " (" + e.getMessage() + ")"); } } + + @Retention(RetentionPolicy.CLASS) + @Target({ElementType.CONSTRUCTOR, ElementType.FIELD, ElementType.METHOD, ElementType.TYPE}) + public @interface SuppressForbidden { + /** A reason for suppressing should always be given. */ + String reason(); + } } diff --git a/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/buildinfra/BuildInfraPlugin.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/buildinfra/BuildInfraPlugin.java new file mode 100644 index 00000000000..58974090ea1 --- /dev/null +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/buildinfra/BuildInfraPlugin.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.gradle.buildinfra; + +import java.nio.file.Path; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.lucene.gradle.Checksum; +import org.apache.lucene.gradle.ErrorReportingTestListener; +import org.apache.lucene.gradle.datasets.ExtractReuters; +import org.gradle.api.Plugin; +import org.gradle.api.Project; +import org.gradle.api.tasks.testing.TestDescriptor; +import org.gradle.api.tasks.testing.logging.TestLogging; + +public class BuildInfraPlugin implements Plugin<Project> { + @Override + public void apply(Project project) { + project.getExtensions().create(BuildInfraExtension.NAME, BuildInfraExtension.class); + } + + public static class BuildInfraExtension { + public static final String NAME = "buildinfra"; + + public ErrorReportingTestListener newErrorReportingTestListener( + TestLogging testLogging, Path spillDir, Path outputsDir, boolean verboseMode) { + return new ErrorReportingTestListener(testLogging, spillDir, outputsDir, verboseMode); + } + + public DigestUtils sha1Digest() { + return new DigestUtils(DigestUtils.getSha1Digest()); + } + + public void extractReuters(String reutersDir, String outputDir) throws Exception { + ExtractReuters.main(new String[] {reutersDir, outputDir}); + } + + public String getOutputLogName(TestDescriptor suite) { + return ErrorReportingTestListener.getOutputLogName(suite); + } + + public Class<?> checksumClass() { + return Checksum.class; + } + } +} diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java similarity index 97% rename from buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java rename to build-tools/build-infra/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java index e95d0297e14..2e732348389 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java +++ b/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java @@ -30,8 +30,7 @@ import java.util.regex.Pattern; import java.util.stream.Stream; /** - * Split the Reuters SGML documents into Simple Text files containing: - * Title, Date, Dateline, Body + * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body */ public class ExtractReuters { private final Path reutersDir; @@ -67,7 +66,9 @@ public class ExtractReuters { private static final String[] META_CHARS = {"&", "<", ">", "\"", "'"}; - private static final String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"}; + private static final String[] META_CHARS_SERIALIZATIONS = { + "&amp;", "&lt;", "&gt;", "&quot;", "&apos;" + }; /** Override if you wish to change what is extracted */ protected void extractFile(Path sgmFile) throws IOException { diff --git a/build-tools/missing-doclet/build.gradle b/build-tools/missing-doclet/build.gradle new file mode 100644 index 00000000000..8036e949138 --- /dev/null +++ b/build-tools/missing-doclet/build.gradle @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +plugins { + id 'java-library' + alias(deps.plugins.spotless) apply false + alias(deps.plugins.forbiddenapis) apply false +} + +repositories { + mavenCentral() +} + +version = "1.0.0-SNAPSHOT" +group = "org.apache.lucene.tools" +description = 'Doclet-based javadoc validation' + +// Make sure the build environment is consistent. +apply from: file('../../gradle/conventions.gradle') +apply from: file('../../gradle/validation/check-environment.gradle') + +// Add spotless/ tidy. +tasks.register("checkJdkInternalsExportedToGradle") {} +apply from: file('../../gradle/validation/spotless.gradle') +apply from: file('../../gradle/validation/forbidden-apis.gradle') + +java { + sourceCompatibility = JavaVersion.toVersion(deps.versions.minJava.get()) + targetCompatibility = JavaVersion.toVersion(deps.versions.minJava.get()) +} + +tasks.withType(JavaCompile).configureEach { + options.compilerArgs += ["--release", java.targetCompatibility.toString()] + options.encoding = "UTF-8" +} diff --git a/dev-tools/missing-doclet/settings.gradle b/build-tools/missing-doclet/settings.gradle similarity index 85% rename from dev-tools/missing-doclet/settings.gradle rename to build-tools/missing-doclet/settings.gradle index 6a9c064171d..2c272500fa6 100644 --- a/dev-tools/missing-doclet/settings.gradle +++ b/build-tools/missing-doclet/settings.gradle @@ -15,3 +15,10 @@ * limitations under the License. 
*/ +dependencyResolutionManagement { + versionCatalogs { + deps { + from(files('../../versions.toml')) + } + } +} diff --git a/dev-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java b/build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java similarity index 64% rename from dev-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java rename to build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java index 2b07008ec59..d37c2bb1ea9 100644 --- a/dev-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java +++ b/build-tools/missing-doclet/src/main/java/org/apache/lucene/missingdoclet/MissingDoclet.java @@ -16,6 +16,9 @@ */ package org.apache.lucene.missingdoclet; +import com.sun.source.doctree.DocCommentTree; +import com.sun.source.doctree.ParamTree; +import com.sun.source.util.DocTrees; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -24,7 +27,6 @@ import java.util.Locale; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; - import javax.lang.model.element.Element; import javax.lang.model.element.ElementKind; import javax.lang.model.element.ExecutableElement; @@ -36,24 +38,19 @@ import javax.lang.model.util.ElementFilter; import javax.lang.model.util.Elements; import javax.lang.model.util.Elements.Origin; import javax.tools.Diagnostic; - -import com.sun.source.doctree.DocCommentTree; -import com.sun.source.doctree.ParamTree; -import com.sun.source.util.DocTrees; - import jdk.javadoc.doclet.Doclet; import jdk.javadoc.doclet.DocletEnvironment; import jdk.javadoc.doclet.Reporter; import jdk.javadoc.doclet.StandardDoclet; /** - * Checks for missing javadocs, where missing also means "only whitespace" or "license header". - * Has option --missing-level (package, class, method, parameter) so that we can improve over time. - * Has option --missing-ignore to ignore individual elements (such as split packages). - * It isn't recursive, just ignores exactly the elements you tell it. - * This should be removed when packaging is fixed to no longer be split across JARs. - * Has option --missing-method to apply "method" level to selected packages (fix one at a time). - * Matches package names exactly: so you'll need to list subpackages separately. + * Checks for missing javadocs, where missing also means "only whitespace" or "license header". Has + * option --missing-level (package, class, method, parameter) so that we can improve over time. Has + * option --missing-ignore to ignore individual elements (such as split packages). It isn't + * recursive, just ignores exactly the elements you tell it. This should be removed when packaging + * is fixed to no longer be split across JARs. Has option --missing-method to apply "method" level + * to selected packages (fix one at a time). Matches package names exactly: so you'll need to list + * subpackages separately. 
*/ public class MissingDoclet extends StandardDoclet { // checks that modules and packages have documentation @@ -71,120 +68,123 @@ public class MissingDoclet extends StandardDoclet { Elements elementUtils; Set ignored = Collections.emptySet(); Set methodPackages = Collections.emptySet(); - + @Override public Set getSupportedOptions() { Set options = new HashSet<>(super.getSupportedOptions()); - options.add(new Doclet.Option() { - @Override - public int getArgumentCount() { - return 1; - } + options.add( + new Doclet.Option() { + @Override + public int getArgumentCount() { + return 1; + } - @Override - public String getDescription() { - return "level to enforce for missing javadocs: [package, class, method, parameter]"; - } + @Override + public String getDescription() { + return "level to enforce for missing javadocs: [package, class, method, parameter]"; + } - @Override - public Kind getKind() { - return Option.Kind.STANDARD; - } + @Override + public Kind getKind() { + return Option.Kind.STANDARD; + } - @Override - public List getNames() { - return Collections.singletonList("--missing-level"); - } + @Override + public List getNames() { + return Collections.singletonList("--missing-level"); + } - @Override - public String getParameters() { - return "level"; - } + @Override + public String getParameters() { + return "level"; + } - @Override - public boolean process(String option, List arguments) { - switch (arguments.get(0)) { - case "package": - level = PACKAGE; + @Override + public boolean process(String option, List arguments) { + switch (arguments.get(0)) { + case "package": + level = PACKAGE; + return true; + case "class": + level = CLASS; + return true; + case "method": + level = METHOD; + return true; + case "parameter": + level = PARAMETER; + return true; + default: + return false; + } + } + }); + options.add( + new Doclet.Option() { + @Override + public int getArgumentCount() { + return 1; + } + + @Override + public String getDescription() { + return "comma separated list of element names to ignore (e.g. as a workaround for split packages)"; + } + + @Override + public Kind getKind() { + return Option.Kind.STANDARD; + } + + @Override + public List getNames() { + return Collections.singletonList("--missing-ignore"); + } + + @Override + public String getParameters() { + return "ignoredNames"; + } + + @Override + public boolean process(String option, List arguments) { + ignored = new HashSet<>(Arrays.asList(arguments.get(0).split(","))); return true; - case "class": - level = CLASS; + } + }); + options.add( + new Doclet.Option() { + @Override + public int getArgumentCount() { + return 1; + } + + @Override + public String getDescription() { + return "comma separated list of packages to check at 'method' level"; + } + + @Override + public Kind getKind() { + return Option.Kind.STANDARD; + } + + @Override + public List getNames() { + return Collections.singletonList("--missing-method"); + } + + @Override + public String getParameters() { + return "packages"; + } + + @Override + public boolean process(String option, List arguments) { + methodPackages = new HashSet<>(Arrays.asList(arguments.get(0).split(","))); return true; - case "method": - level = METHOD; - return true; - case "parameter": - level = PARAMETER; - return true; - default: - return false; - } - } - }); - options.add(new Doclet.Option() { - @Override - public int getArgumentCount() { - return 1; - } - - @Override - public String getDescription() { - return "comma separated list of element names to ignore (e.g. 
as a workaround for split packages)"; - } - - @Override - public Kind getKind() { - return Option.Kind.STANDARD; - } - - @Override - public List getNames() { - return Collections.singletonList("--missing-ignore"); - } - - @Override - public String getParameters() { - return "ignoredNames"; - } - - @Override - public boolean process(String option, List arguments) { - ignored = new HashSet<>(Arrays.asList(arguments.get(0).split(","))); - return true; - } - }); - options.add(new Doclet.Option() { - @Override - public int getArgumentCount() { - return 1; - } - - @Override - public String getDescription() { - return "comma separated list of packages to check at 'method' level"; - } - - @Override - public Kind getKind() { - return Option.Kind.STANDARD; - } - - @Override - public List getNames() { - return Collections.singletonList("--missing-method"); - } - - @Override - public String getParameters() { - return "packages"; - } - - @Override - public boolean process(String option, List arguments) { - methodPackages = new HashSet<>(Arrays.asList(arguments.get(0).split(","))); - return true; - } - }); + } + }); return options; } @@ -205,10 +205,8 @@ public class MissingDoclet extends StandardDoclet { return super.run(docEnv); } - - /** - * Returns effective check level for this element - */ + + /** Returns effective check level for this element */ private int level(Element element) { String pkg = elementUtils.getPackageOf(element).getQualifiedName().toString(); if (methodPackages.contains(pkg)) { @@ -217,24 +215,24 @@ public class MissingDoclet extends StandardDoclet { return level; } } - - /** - * Check an individual element. - * This checks packages and types from the doctrees. - * It will recursively check methods/fields from encountered types when the level is "method" + + /** + * Check an individual element. This checks packages and types from the doctrees. 
It will + * recursively check methods/fields from encountered types when the level is "method" */ private void check(Element element) { - switch(element.getKind()) { + switch (element.getKind()) { case MODULE: // don't check the unnamed module, it won't have javadocs - if (!((ModuleElement)element).isUnnamed()) { + if (!((ModuleElement) element).isUnnamed()) { checkComment(element); } break; case PACKAGE: checkComment(element); break; - // class-like elements, check them, then recursively check their children (fields and methods) + // class-like elements, check them, then recursively check their children (fields and + // methods) case CLASS: case INTERFACE: case ENUM: @@ -242,21 +240,24 @@ public class MissingDoclet extends StandardDoclet { case ANNOTATION_TYPE: if (level(element) >= CLASS) { checkComment(element); - if (element instanceof TypeElement te && element.getKind() == ElementKind.RECORD && level(element) >= METHOD) { + if (element instanceof TypeElement te + && element.getKind() == ElementKind.RECORD + && level(element) >= METHOD) { checkRecordParameters(te, docTrees.getDocCommentTree(element)); } for (var subElement : element.getEnclosedElements()) { - // don't recurse into enclosed types, otherwise we'll double-check since they are already in the included docTree - if (subElement.getKind() == ElementKind.METHOD || - subElement.getKind() == ElementKind.CONSTRUCTOR || - subElement.getKind() == ElementKind.FIELD || - subElement.getKind() == ElementKind.ENUM_CONSTANT) { + // don't recurse into enclosed types, otherwise we'll double-check since they are + // already in the included docTree + if (subElement.getKind() == ElementKind.METHOD + || subElement.getKind() == ElementKind.CONSTRUCTOR + || subElement.getKind() == ElementKind.FIELD + || subElement.getKind() == ElementKind.ENUM_CONSTANT) { check(subElement); } } } break; - // method-like elements, check them if we are configured to do so + // method-like elements, check them if we are configured to do so case METHOD: case CONSTRUCTOR: case FIELD: @@ -272,8 +273,8 @@ public class MissingDoclet extends StandardDoclet { /** * Return true if the method is synthetic enum (values/valueOf) or record accessor method. - * According to the doctree documentation, the "included" set never includes synthetic/mandated elements. - * UweSays: It should not happen but it happens! + * According to the doctree documentation, the "included" set never includes synthetic/mandated + * elements. UweSays: It should not happen but it happens! */ private boolean isSyntheticMethod(Element element) { // exclude all not explicitely declared methods @@ -293,20 +294,23 @@ public class MissingDoclet extends StandardDoclet { } return false; } - + /** - * Checks that an element doesn't have missing javadocs. - * In addition to truly "missing", check that comments aren't solely whitespace (generated by some IDEs), - * that they aren't a license header masquerading as a javadoc comment. + * Checks that an element doesn't have missing javadocs. In addition to truly "missing", check + * that comments aren't solely whitespace (generated by some IDEs), that they aren't a license + * header masquerading as a javadoc comment. */ private void checkComment(Element element) { // sanity check that the element is really "included", because we do some recursion into types if (!docEnv.isIncluded(element)) { return; } - // check that this element isn't on our ignore list. This is only used as a workaround for "split packages". 
- // ignoring a package isn't recursive (on purpose), we still check all the classes, etc. inside it. - // we just need to cope with the fact package-info.java isn't there because it is split across multiple jars. + // check that this element isn't on our ignore list. This is only used as a workaround for + // "split packages". + // ignoring a package isn't recursive (on purpose), we still check all the classes, etc. inside + // it. + // we just need to cope with the fact package-info.java isn't there because it is split across + // multiple jars. if (ignored.contains(element.toString())) { return; } @@ -319,14 +323,17 @@ public class MissingDoclet extends StandardDoclet { error(element, "javadocs are missing"); } } else { - var normalized = tree.getFirstSentence().get(0).toString() - .replace('\u00A0', ' ') - .trim() - .toLowerCase(Locale.ROOT); + var normalized = + tree.getFirstSentence() + .get(0) + .toString() + .replace('\u00A0', ' ') + .trim() + .toLowerCase(Locale.ROOT); if (normalized.isEmpty()) { error(element, "blank javadoc comment"); - } else if (normalized.startsWith("licensed to the apache software foundation") || - normalized.startsWith("copyright 2004 the apache software foundation")) { + } else if (normalized.startsWith("licensed to the apache software foundation") + || normalized.startsWith("copyright 2004 the apache software foundation")) { error(element, "comment is really a license"); } } @@ -336,13 +343,15 @@ public class MissingDoclet extends StandardDoclet { } private boolean hasInheritedJavadocs(Element element) { - boolean hasOverrides = element.getAnnotationMirrors().stream() - .anyMatch(ann -> ann.getAnnotationType().toString().equals(Override.class.getName())); + boolean hasOverrides = + element.getAnnotationMirrors().stream() + .anyMatch(ann -> ann.getAnnotationType().toString().equals(Override.class.getName())); if (hasOverrides) { // If an element has explicit @Overrides annotation, assume it does // have inherited javadocs somewhere. - // reporter.print(Diagnostic.Kind.NOTE, element, "javadoc empty but @Override declared, skipping."); + // reporter.print(Diagnostic.Kind.NOTE, element, "javadoc empty but @Override declared, + // skipping."); return true; } @@ -359,7 +368,8 @@ public class MissingDoclet extends StandardDoclet { // We could check supMethod for non-empty javadoc here. Don't know if this makes // sense though as all methods will be verified in the end so it'd fail on the // top of the hierarchy (if empty) anyway. 
- // reporter.print(Diagnostic.Kind.NOTE, element, "javadoc empty but method overrides another, skipping."); + // reporter.print(Diagnostic.Kind.NOTE, element, "javadoc empty but method overrides + // another, skipping."); return true; } } @@ -369,15 +379,14 @@ public class MissingDoclet extends StandardDoclet { return false; } - /* Find types from which methods in type may inherit javadoc, in the proper order.*/ private Stream superTypeForInheritDoc(Element type) { TypeElement clazz = (TypeElement) type; - List interfaces = clazz.getInterfaces() - .stream() - .filter(tm -> tm.getKind() == TypeKind.DECLARED) - .map(tm -> ((DeclaredType) tm).asElement()) - .collect(Collectors.toList()); + List interfaces = + clazz.getInterfaces().stream() + .filter(tm -> tm.getKind() == TypeKind.DECLARED) + .map(tm -> ((DeclaredType) tm).asElement()) + .collect(Collectors.toList()); Stream result = interfaces.stream(); result = Stream.concat(result, interfaces.stream().flatMap(this::superTypeForInheritDoc)); @@ -394,12 +403,12 @@ public class MissingDoclet extends StandardDoclet { /** Returns all {@code @param} parameters we see in the javadocs of the element */ private Set getDocParameters(DocCommentTree tree) { return Stream.ofNullable(tree) - .flatMap(t -> t.getBlockTags().stream()) - .filter(ParamTree.class::isInstance) - .map(tag -> ((ParamTree)tag).getName().getName().toString()) - .collect(Collectors.toSet()); + .flatMap(t -> t.getBlockTags().stream()) + .filter(ParamTree.class::isInstance) + .map(tag -> ((ParamTree) tag).getName().getName().toString()) + .collect(Collectors.toSet()); } - + /** Checks there is a corresponding "param" tag for each method parameter */ private void checkMethodParameters(ExecutableElement element, DocCommentTree tree) { // record each @param that we see @@ -412,7 +421,7 @@ public class MissingDoclet extends StandardDoclet { } } } - + /** Checks there is a corresponding "param" tag for each record component */ private void checkRecordParameters(TypeElement element, DocCommentTree tree) { // record each @param that we see @@ -425,7 +434,7 @@ public class MissingDoclet extends StandardDoclet { } } } - + /** logs a new error for the particular element */ private void error(Element element, String message) { var fullMessage = new StringBuilder(); diff --git a/build.gradle b/build.gradle index a4901f94e4f..6705923d79d 100644 --- a/build.gradle +++ b/build.gradle @@ -20,13 +20,18 @@ import java.time.format.DateTimeFormatter plugins { id "base" - id "com.palantir.consistent-versions" version "2.11.0" - id "org.owasp.dependencycheck" version "7.2.0" - id 'de.thetaphi.forbiddenapis' version '3.7' apply false - id "de.undercouch.download" version "5.2.0" apply false - id "net.ltgt.errorprone" version "3.1.0" apply false - id 'com.diffplug.spotless' version "6.5.2" apply false - id 'org.barfuin.gradle.jacocolog' version "3.1.0" apply false + id "lucene.build-infra" + + alias(deps.plugins.dependencychecks) + alias(deps.plugins.spotless) apply false + alias(deps.plugins.benmanes.versions) + alias(deps.plugins.forbiddenapis) apply false + alias(deps.plugins.versionCatalogUpdate) apply false + alias(deps.plugins.randomizedtesting) apply false + alias(deps.plugins.owasp.dependencycheck) + alias(deps.plugins.undercouch.download) apply false + alias(deps.plugins.errorprone) apply false + alias(deps.plugins.jacocolog) apply false } apply from: file('gradle/globals.gradle') @@ -73,7 +78,7 @@ ext { } // Minimum Java version required to compile and run Lucene. 
- minJavaVersion = JavaVersion.VERSION_21 + minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get()) // snapshot build marker used in scripts. snapshotBuild = version.contains("SNAPSHOT") @@ -98,17 +103,15 @@ configurations { dependencies { // Use a newer groovy that doesn't have illegal reflective accesses. - groovy "org.codehaus.groovy:groovy-all:3.0.21" + groovy deps.groovy } -apply from: file('buildSrc/scriptDepVersions.gradle') - // Include smaller chunks configuring dedicated build areas. // Some of these intersect or add additional functionality. // The order of inclusion of these files shouldn't matter (but may // if the build file is incorrectly written and evaluates something // eagerly). - +apply from: file('gradle/conventions.gradle') apply from: file('gradle/generation/local-settings.gradle') // Make sure the build environment is consistent. @@ -140,15 +143,25 @@ apply from: file('gradle/validation/precommit.gradle') apply from: file('gradle/validation/forbidden-apis.gradle') apply from: file('gradle/validation/jar-checks.gradle') apply from: file('gradle/validation/git-status.gradle') -apply from: file('gradle/validation/versions-props-sorted.gradle') apply from: file('gradle/validation/validate-source-patterns.gradle') apply from: file('gradle/validation/rat-sources.gradle') apply from: file('gradle/validation/owasp-dependency-check.gradle') apply from: file('gradle/validation/ecj-lint.gradle') apply from: file('gradle/validation/gradlew-scripts-tweaked.gradle') - +apply from: file('gradle/validation/dependencies.gradle') apply from: file('gradle/validation/spotless.gradle') +// Wire up included builds to some validation tasks. +rootProject.tasks.named("tidy").configure { + dependsOn gradle.includedBuilds*.task(":tidy") +} +rootProject.tasks.named("clean").configure { + dependsOn gradle.includedBuilds*.task(":clean") +} +rootProject.tasks.named("check").configure { + dependsOn gradle.includedBuilds*.task(":forbiddenApis") +} + // Source or data regeneration tasks apply from: file('gradle/generation/regenerate.gradle') apply from: file('gradle/generation/jflex.gradle') diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java b/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java deleted file mode 100644 index 64abfa5aec8..00000000000 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.gradle; - -import java.io.BufferedReader; -import java.io.Closeable; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.io.Writer; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Locale; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; - -import org.gradle.api.internal.tasks.testing.logging.FullExceptionFormatter; -import org.gradle.api.internal.tasks.testing.logging.TestExceptionFormatter; -import org.gradle.api.logging.Logger; -import org.gradle.api.logging.Logging; -import org.gradle.api.tasks.testing.TestDescriptor; -import org.gradle.api.tasks.testing.TestListener; -import org.gradle.api.tasks.testing.TestOutputEvent; -import org.gradle.api.tasks.testing.TestOutputListener; -import org.gradle.api.tasks.testing.TestResult; -import org.gradle.api.tasks.testing.logging.TestLogging; - -/** - * An error reporting listener that queues test output streams and displays them - * on failure. - *

- * Heavily inspired by Elasticsearch's ErrorReportingTestListener (ASL 2.0 licensed). - */ -public class ErrorReportingTestListener implements TestOutputListener, TestListener { - private static final Logger LOGGER = Logging.getLogger(ErrorReportingTestListener.class); - - private final TestExceptionFormatter formatter; - private final Map outputHandlers = new ConcurrentHashMap<>(); - private final Path spillDir; - private final Path outputsDir; - private final boolean verboseMode; - - public ErrorReportingTestListener(TestLogging testLogging, Path spillDir, Path outputsDir, boolean verboseMode) { - this.formatter = new FullExceptionFormatter(testLogging); - this.spillDir = spillDir; - this.outputsDir = outputsDir; - this.verboseMode = verboseMode; - } - - @Override - public void onOutput(TestDescriptor testDescriptor, TestOutputEvent outputEvent) { - handlerFor(testDescriptor).write(outputEvent); - } - - @Override - public void beforeSuite(TestDescriptor suite) { - // noop. - } - - @Override - public void beforeTest(TestDescriptor testDescriptor) { - // Noop. - } - - @Override - public void afterSuite(final TestDescriptor suite, TestResult result) { - if (suite.getParent() == null || suite.getName().startsWith("Gradle")) { - return; - } - - TestKey key = TestKey.of(suite); - try { - OutputHandler outputHandler = outputHandlers.get(key); - if (outputHandler != null) { - long length = outputHandler.length(); - if (length > 1024 * 1024 * 10) { - LOGGER.warn(String.format(Locale.ROOT, "WARNING: Test %s wrote %,d bytes of output.", - suite.getName(), - length)); - } - } - - boolean echoOutput = Objects.equals(result.getResultType(), TestResult.ResultType.FAILURE); - boolean dumpOutput = echoOutput; - - // If the test suite failed, report output. - if (dumpOutput || echoOutput) { - Files.createDirectories(outputsDir); - Path outputLog = outputsDir.resolve(getOutputLogName(suite)); - - // Save the output of a failing test to disk. - try (Writer w = Files.newBufferedWriter(outputLog, StandardCharsets.UTF_8)) { - if (outputHandler != null) { - outputHandler.copyTo(w); - } - } - - if (echoOutput && !verboseMode) { - synchronized (this) { - System.out.println(); - System.out.println(suite.getClassName() + " > test suite's output saved to " + outputLog + ", copied below:"); - try (BufferedReader reader = Files.newBufferedReader(outputLog, StandardCharsets.UTF_8)) { - char[] buf = new char[1024]; - int len; - while ((len = reader.read(buf)) >= 0) { - System.out.print(new String(buf, 0, len)); - } - System.out.println(); - } - } - } - } - } catch (IOException e) { - throw new UncheckedIOException(e); - } finally { - OutputHandler handler = outputHandlers.remove(key); - if (handler != null) { - try { - handler.close(); - } catch (IOException e) { - LOGGER.error("Failed to close output handler for: " + key, e); - } - } - } - } - - private static Pattern SANITIZE = Pattern.compile("[^a-zA-Z .\\-_0-9]+"); - - public static String getOutputLogName(TestDescriptor suite) { - return SANITIZE.matcher("OUTPUT-" + suite.getName() + ".txt").replaceAll("_"); - } - - @Override - public void afterTest(TestDescriptor testDescriptor, TestResult result) { - // Include test failure exception stacktrace(s) in test output log. 
- if (result.getResultType() == TestResult.ResultType.FAILURE) { - if (result.getExceptions().size() > 0) { - String message = formatter.format(testDescriptor, result.getExceptions()); - handlerFor(testDescriptor).write(message); - } - } - } - - private OutputHandler handlerFor(TestDescriptor descriptor) { - // Attach output of leaves (individual tests) to their parent. - if (!descriptor.isComposite()) { - descriptor = descriptor.getParent(); - } - return outputHandlers.computeIfAbsent(TestKey.of(descriptor), (key) -> new OutputHandler()); - } - - public static class TestKey { - private final String key; - - private TestKey(String key) { - this.key = key; - } - - public static TestKey of(TestDescriptor d) { - StringBuilder key = new StringBuilder(); - key.append(d.getClassName()); - key.append("::"); - key.append(d.getName()); - key.append("::"); - key.append(d.getParent() == null ? "-" : d.getParent().toString()); - return new TestKey(key.toString()); - } - - @Override - public boolean equals(Object o) { - return o != null && - o.getClass() == this.getClass() && - Objects.equals(((TestKey) o).key, key); - } - - @Override - public int hashCode() { - return key.hashCode(); - } - - @Override - public String toString() { - return key; - } - } - - private class OutputHandler implements Closeable { - // Max single-line buffer before automatic wrap occurs. - private static final int MAX_LINE_WIDTH = 1024 * 4; - - private final SpillWriter buffer; - - // internal stream. - private final PrefixedWriter sint; - // stdout - private final PrefixedWriter sout; - // stderr - private final PrefixedWriter serr; - - // last used stream (so that we can flush it properly and prefixes are not screwed up). - private PrefixedWriter last; - - public OutputHandler() { - buffer = new SpillWriter(() -> { - try { - return Files.createTempFile(spillDir, "spill-", ".tmp"); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - }); - - Writer sink = buffer; - if (verboseMode) { - sink = new StdOutTeeWriter(buffer); - } - - sint = new PrefixedWriter(" > ", sink, MAX_LINE_WIDTH); - sout = new PrefixedWriter(" 1> ", sink, MAX_LINE_WIDTH); - serr = new PrefixedWriter(" 2> ", sink, MAX_LINE_WIDTH); - last = sint; - } - - public void write(TestOutputEvent event) { - write((event.getDestination() == TestOutputEvent.Destination.StdOut ? sout : serr), event.getMessage()); - } - - public void write(String message) { - write(sint, message); - } - - public long length() throws IOException { - return buffer.length(); - } - - private void write(PrefixedWriter out, String message) { - try { - if (out != last) { - last.completeLine(); - last = out; - } - out.write(message); - } catch (IOException e) { - throw new UncheckedIOException("Unable to write to test output.", e); - } - } - - public void copyTo(Writer out) throws IOException { - flush(); - buffer.copyTo(out); - } - - public void flush() throws IOException { - sout.completeLine(); - serr.completeLine(); - buffer.flush(); - } - - @Override - public void close() throws IOException { - buffer.close(); - } - } -} diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/StdOutTeeWriter.java b/buildSrc/src/main/java/org/apache/lucene/gradle/StdOutTeeWriter.java deleted file mode 100644 index 20a4c8524f6..00000000000 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/StdOutTeeWriter.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.gradle; - -import java.io.IOException; -import java.io.PrintStream; -import java.io.Writer; - -class StdOutTeeWriter extends Writer { - private final Writer delegate; - private final PrintStream out = System.out; - - public StdOutTeeWriter(Writer delegate) { - this.delegate = delegate; - } - - @Override - public void write(int c) throws IOException { - delegate.write(c); - out.write(c); - } - - @Override - public void write(char[] cbuf) throws IOException { - delegate.write(cbuf); - out.print(cbuf); - } - - @Override - public void write(String str) throws IOException { - delegate.write(str); - out.print(str); - } - - @Override - public void write(String str, int off, int len) throws IOException { - delegate.write(str, off, len); - out.append(str, off, len); - } - - @Override - public Writer append(CharSequence csq) throws IOException { - delegate.append(csq); - out.append(csq); - return this; - } - - @Override - public Writer append(CharSequence csq, int start, int end) throws IOException { - delegate.append(csq, start, end); - out.append(csq, start, end); - return this; - } - - @Override - public Writer append(char c) throws IOException { - delegate.append(c); - out.append(c); - return this; - } - - @Override - public void write(char[] cbuf, int off, int len) throws IOException { - delegate.write(cbuf, off, len); - out.print(new String(cbuf, off, len)); - } - - @Override - public void flush() throws IOException { - delegate.flush(); - out.flush(); - } - - @Override - public void close() throws IOException { - delegate.close(); - // Don't close the actual output. 
- } -} diff --git a/dev-tools/scripts/addBackcompatIndexes.py b/dev-tools/scripts/addBackcompatIndexes.py index 7faacb8b8e3..80272ec0f0c 100755 --- a/dev-tools/scripts/addBackcompatIndexes.py +++ b/dev-tools/scripts/addBackcompatIndexes.py @@ -40,6 +40,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'cfs': 'index', 'nocfs': 'index', 'sorted': 'sorted', + 'int8_hnsw': 'int8_hnsw', 'moreterms': 'moreterms', 'dvupdates': 'dvupdates', 'emptyIndex': 'empty' @@ -60,6 +61,7 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'cfs': 'testCreateCFS', 'nocfs': 'testCreateNoCFS', 'sorted': 'testCreateSortedIndex', + 'int8_hnsw': 'testCreateInt8HNSWIndices', 'moreterms': 'testCreateMoreTermsIndex', 'dvupdates': 'testCreateIndexWithDocValuesUpdates', 'emptyIndex': 'testCreateEmptyIndex' @@ -204,6 +206,7 @@ def main(): current_version = scriptutil.Version.parse(scriptutil.find_current_version()) create_and_add_index(source, 'cfs', c.version, current_version, c.temp_dir) create_and_add_index(source, 'nocfs', c.version, current_version, c.temp_dir) + create_and_add_index(source, 'int8_hnsw', c.version, current_version, c.temp_dir) should_make_sorted = current_version.is_back_compat_with(c.version) \ and (c.version.major > 6 or (c.version.major == 6 and c.version.minor >= 2)) if should_make_sorted: diff --git a/dev-tools/scripts/smokeTestRelease.py b/dev-tools/scripts/smokeTestRelease.py index 87e70adbb1f..82946f4e5b3 100755 --- a/dev-tools/scripts/smokeTestRelease.py +++ b/dev-tools/scripts/smokeTestRelease.py @@ -582,8 +582,8 @@ def verifyUnpacked(java, artifact, unpackPath, gitRevision, version, testArgs): 'luke', 'memory', 'misc', 'monitor', 'queries', 'queryparser', 'replicator', 'sandbox', 'spatial-extras', 'spatial-test-fixtures', 'spatial3d', 'suggest', 'test-framework', 'licenses'] if isSrc: - expected_src_root_files = ['build.gradle', 'buildSrc', 'CONTRIBUTING.md', 'dev-docs', 'dev-tools', 'gradle', 'gradlew', - 'gradlew.bat', 'help', 'lucene', 'settings.gradle', 'versions.lock', 'versions.props'] + expected_src_root_files = ['build.gradle', 'build-tools', 'CONTRIBUTING.md', 'dev-docs', 'dev-tools', 'gradle', 'gradlew', + 'gradlew.bat', 'help', 'lucene', 'settings.gradle', 'versions.lock', 'versions.toml'] expected_src_lucene_files = ['build.gradle', 'documentation', 'distribution', 'dev-docs'] is_in_list(in_root_folder, expected_src_root_files) is_in_list(in_lucene_folder, expected_folders) diff --git a/buildSrc/scriptDepVersions.gradle b/gradle/conventions.gradle similarity index 64% rename from buildSrc/scriptDepVersions.gradle rename to gradle/conventions.gradle index 5c3be02f5ac..eb676dcb189 100644 --- a/buildSrc/scriptDepVersions.gradle +++ b/gradle/conventions.gradle @@ -15,19 +15,19 @@ * limitations under the License. */ -// Declare script dependency versions outside of palantir's -// version unification control. These are not our main dependencies -// but are reused in buildSrc and across applied scripts. - -ext { - scriptDepVersions = [ - "apache-rat": "0.14", - "asm": "9.7", - "commons-codec": "1.13", - "ecj": "3.36.0", - "flexmark": "0.61.24", - "javacc": "7.0.12", - "jflex": "1.8.2", - "jgit": "5.13.1.202206130422-r", - ] +configure(allprojects) { + tasks.register("tidy").configure { + description "Applies formatters and cleanups to sources." + group "verification" + } } + +// Locate script-relative resource folder. This is context-sensitive so pass +// the right buildscript (top-level). 
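(The scriptResources helper defined just below maps a build script's own path
to a sibling resource directory by stripping the ".gradle" suffix. A hedged
usage sketch with a hypothetical script path:

  // In gradle/generation/foo.gradle (hypothetical), this resolves to the
  // gradle/generation/foo/ directory sitting next to the script file:
  def resources = scriptResources(buildscript)
  def template = file("${resources}/some-template.txt")

Several scripts later in this patch open with exactly this call.)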
+configure(rootProject) { + ext { + scriptResources = { buildscript -> + return file(buildscript.sourceFile.absolutePath.replaceAll('.gradle$', "")) + } + } +} \ No newline at end of file diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle index 2d6ae8e13d5..44fd38117bb 100644 --- a/gradle/datasets/external-datasets.gradle +++ b/gradle/datasets/external-datasets.gradle @@ -1,5 +1,3 @@ -import org.apache.lucene.gradle.datasets.ExtractReuters - import java.nio.file.Files /* @@ -25,7 +23,7 @@ buildscript { } dependencies { - classpath "com.github.luben:zstd-jni:1.5.5-11" + classpath deps.zstd } } @@ -40,7 +38,7 @@ def unzstd(java.nio.file.Path src, java.nio.file.Path dst) { // TODO: not sure whether this should live in benchmarks, but for now let it be. configure(project(":lucene:benchmark")) { apply plugin: "java" - apply plugin: "de.undercouch.download" + apply plugin: deps.plugins.undercouch.download.get().pluginId ext { dataDir = file("work") @@ -164,7 +162,7 @@ configure(project(":lucene:benchmark")) { logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...") ext.dst.deleteDir() - ExtractReuters.main(untarPath.toString(), ext.dst.toString()) + buildinfra.extractReuters(untarPath.toString(), ext.dst.toString()) } } diff --git a/gradle/documentation/markdown.gradle b/gradle/documentation/markdown.gradle index 0f0767c54d5..1f9be37868e 100644 --- a/gradle/documentation/markdown.gradle +++ b/gradle/documentation/markdown.gradle @@ -34,11 +34,11 @@ buildscript { } dependencies { - classpath "com.vladsch.flexmark:flexmark:${scriptDepVersions['flexmark']}" - classpath "com.vladsch.flexmark:flexmark-ext-abbreviation:${scriptDepVersions['flexmark']}" - classpath "com.vladsch.flexmark:flexmark-ext-attributes:${scriptDepVersions['flexmark']}" - classpath "com.vladsch.flexmark:flexmark-ext-autolink:${scriptDepVersions['flexmark']}" - classpath "com.vladsch.flexmark:flexmark-ext-tables:${scriptDepVersions['flexmark']}" + classpath deps.flexmark.core + classpath deps.flexmark.ext.abbreviation + classpath deps.flexmark.ext.attributes + classpath deps.flexmark.ext.autolink + classpath deps.flexmark.ext.tables } } diff --git a/gradle/generation/antlr.gradle b/gradle/generation/antlr.gradle index 9e5d3fcb7c7..9ccb522fb4c 100644 --- a/gradle/generation/antlr.gradle +++ b/gradle/generation/antlr.gradle @@ -23,7 +23,7 @@ configure(project(":lucene:expressions")) { } dependencies { - antlr "org.antlr:antlr4" + antlr deps.antlr.core } task generateAntlrInternal() { diff --git a/gradle/generation/extract-jdk-apis.gradle b/gradle/generation/extract-jdk-apis.gradle index 1371fe9d391..3c8e1efa447 100644 --- a/gradle/generation/extract-jdk-apis.gradle +++ b/gradle/generation/extract-jdk-apis.gradle @@ -35,42 +35,44 @@ configure(project(":lucene:core")) { } dependencies { - apiextractor "org.ow2.asm:asm:${scriptDepVersions['asm']}" + apiextractor deps.asm.core } - mrjarJavaVersions.each { jdkVersion -> - def task = tasks.create(name: "generateJdkApiJar${jdkVersion}", type: JavaExec) { - description "Regenerate the API-only JAR file with public Panama Foreign & Vector API from JDK ${jdkVersion}" - group "generation" - - javaLauncher = javaToolchains.launcherFor { - languageVersion = JavaLanguageVersion.of(jdkVersion) - } - - onlyIf { - try { - javaLauncher.get() - return true - } catch (Exception e) { - logger.warn('Launcher for Java {} is not available; skipping regeneration of Panama Foreign & Vector API JAR.', jdkVersion) - logger.warn('Error: {}', e.cause?.message) 
- logger.warn("Please make sure to point env 'JAVA{}_HOME' to exactly JDK version {} or enable Gradle toolchain auto-download.", jdkVersion, jdkVersion) - return false + plugins.withType(JavaPlugin) { + mrjarJavaVersions.each { jdkVersion -> + def task = tasks.create(name: "generateJdkApiJar${jdkVersion}", type: JavaExec) { + description "Regenerate the API-only JAR file with public Panama Foreign & Vector API from JDK ${jdkVersion}" + group "generation" + + javaLauncher = javaToolchains.launcherFor { + languageVersion = JavaLanguageVersion.of(jdkVersion) } + + onlyIf { + try { + javaLauncher.get() + return true + } catch (Exception e) { + logger.warn('Launcher for Java {} is not available; skipping regeneration of Panama Foreign & Vector API JAR.', jdkVersion) + logger.warn('Error: {}', e.cause?.message) + logger.warn("Please make sure to point env 'JAVA{}_HOME' to exactly JDK version {} or enable Gradle toolchain auto-download.", jdkVersion, jdkVersion) + return false + } + } + + classpath = configurations.apiextractor + mainClass = file("${resources}/ExtractJdkApis.java") as String + systemProperties = [ + 'user.timezone': 'UTC', + 'file.encoding': 'UTF-8', + ] + args = [ + jdkVersion, + apijars.file("jdk${jdkVersion}.apijar"), + ] } - classpath = configurations.apiextractor - mainClass = file("${resources}/ExtractJdkApis.java") as String - systemProperties = [ - 'user.timezone': 'UTC', - 'file.encoding': 'UTF-8', - ] - args = [ - jdkVersion, - apijars.file("jdk${jdkVersion}.apijar"), - ] + regenerate.dependsOn task } - - regenerate.dependsOn task } } diff --git a/gradle/generation/icu.gradle b/gradle/generation/icu.gradle index 042f756bbdb..6e3d5f9062f 100644 --- a/gradle/generation/icu.gradle +++ b/gradle/generation/icu.gradle @@ -33,18 +33,11 @@ def resources = scriptResources(buildscript) // Configure different icu4j dependencies. configure(rootProject) { configurations { - // icu_xyz icu_current } dependencies { - // icu_xyz "com.ibm.icu:icu4j:xyz" - icu_current 'com.ibm.icu:icu4j' - } - - // Exclude explicit ICU configs from palantir's version unification. 
- versionRecommendations { - // excludeConfigurations "icu_xyz" + icu_current deps.icu4j } } diff --git a/gradle/generation/javacc.gradle b/gradle/generation/javacc.gradle index 5409e3883a4..559d07fc6d9 100644 --- a/gradle/generation/javacc.gradle +++ b/gradle/generation/javacc.gradle @@ -26,7 +26,7 @@ configure(rootProject) { } dependencies { - javacc "net.java.dev.javacc:javacc:${scriptDepVersions['javacc']}" + javacc deps.javacc } task javacc() { diff --git a/gradle/generation/jflex.gradle b/gradle/generation/jflex.gradle index 6c028b1279e..e1d538b19b6 100644 --- a/gradle/generation/jflex.gradle +++ b/gradle/generation/jflex.gradle @@ -25,7 +25,7 @@ configure(rootProject) { } dependencies { - jflex "de.jflex:jflex:${scriptDepVersions['jflex']}" + jflex deps.jflex } } diff --git a/gradle/generation/kuromoji.gradle b/gradle/generation/kuromoji.gradle index ec1c14c0a45..cfe2cd559ce 100644 --- a/gradle/generation/kuromoji.gradle +++ b/gradle/generation/kuromoji.gradle @@ -30,7 +30,7 @@ def recompileDictionary(project, dictionaryName, Closure closure) { } configure(project(":lucene:analysis:kuromoji")) { - apply plugin: "de.undercouch.download" + apply plugin: deps.plugins.undercouch.download.get().pluginId plugins.withType(JavaPlugin) { ext { diff --git a/gradle/generation/moman.gradle b/gradle/generation/moman.gradle index f825090c27c..de80705eefa 100644 --- a/gradle/generation/moman.gradle +++ b/gradle/generation/moman.gradle @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -apply plugin: "de.undercouch.download" +apply plugin: deps.plugins.undercouch.download.get().pluginId def resources = scriptResources(buildscript) diff --git a/gradle/generation/nori.gradle b/gradle/generation/nori.gradle index 63a857e4a73..db05babdf03 100644 --- a/gradle/generation/nori.gradle +++ b/gradle/generation/nori.gradle @@ -30,7 +30,7 @@ def recompileDictionary(project, dictionaryName, Closure closure) { } configure(project(":lucene:analysis:nori")) { - apply plugin: "de.undercouch.download" + apply plugin: deps.plugins.undercouch.download.get().pluginId plugins.withType(JavaPlugin) { ext { diff --git a/gradle/generation/regenerate.gradle b/gradle/generation/regenerate.gradle index 06640a8619b..d23cfd7d54f 100644 --- a/gradle/generation/regenerate.gradle +++ b/gradle/generation/regenerate.gradle @@ -1,7 +1,5 @@ import groovy.json.JsonOutput import groovy.json.JsonSlurper -import org.apache.commons.codec.digest.DigestUtils - import java.util.function.Function /* @@ -58,7 +56,7 @@ def computeChecksummedEntries = { Task sourceTask -> allFiles.files.forEach { file -> allEntries.put( sourceTask.project.rootDir.relativePath(file), - file.exists() ? new DigestUtils(DigestUtils.sha1Digest).digestAsHex(file).trim() : "--") + file.exists() ? 
buildinfra.sha1Digest().digestAsHex(file).trim() : "--") } return allEntries diff --git a/gradle/generation/snowball.gradle b/gradle/generation/snowball.gradle index 50dd7e0ad2a..86398b5129f 100644 --- a/gradle/generation/snowball.gradle +++ b/gradle/generation/snowball.gradle @@ -19,7 +19,7 @@ import org.apache.tools.ant.taskdefs.condition.Os def resources = scriptResources(buildscript) -apply plugin: "de.undercouch.download" +apply plugin: deps.plugins.undercouch.download.get().pluginId configure(project(":lucene:analysis:common")) { ext { diff --git a/gradle/globals.gradle b/gradle/globals.gradle index 662b58d4205..bcab6461ea9 100644 --- a/gradle/globals.gradle +++ b/gradle/globals.gradle @@ -27,7 +27,7 @@ allprojects { // Artifacts will have names after full gradle project path // so :solr:core will have solr-core.jar, etc. - project.archivesBaseName = project.path.replaceAll("^:", "").replace(':', '-') + project.base.archivesName = project.path.replaceAll("^:", "").replace(':', '-') ext { // Utility method to support passing overrides via -P or -D. @@ -59,12 +59,6 @@ allprojects { return propertyOrDefault(propName, envOrDefault(envName, defValue)); } - // Locate script-relative resource folder. This is context-sensitive so pass - // the right buildscript (top-level). - scriptResources = { buildscript -> - return file(buildscript.sourceFile.absolutePath.replaceAll('.gradle$', "")) - } - // Utility function similar to project.exec but not emitting // any output unless an error code is returned from the executed command. quietExec = { closure -> diff --git a/gradle/hacks/gradle-archives.gradle b/gradle/hacks/gradle-archives.gradle index a10a640a44c..cc8561c47a0 100644 --- a/gradle/hacks/gradle-archives.gradle +++ b/gradle/hacks/gradle-archives.gradle @@ -20,7 +20,11 @@ allprojects { tasks.withType(AbstractArchiveTask).configureEach { task -> duplicatesStrategy = DuplicatesStrategy.FAIL reproducibleFileOrder = true - dirMode = 0755 - fileMode = 0644 + dirPermissions { + it.unix(0755) + } + filePermissions { + it.unix(0644) + } } } diff --git a/gradle/ide/eclipse.gradle b/gradle/ide/eclipse.gradle index aea23b065ba..8e5c44cff9d 100644 --- a/gradle/ide/eclipse.gradle +++ b/gradle/ide/eclipse.gradle @@ -22,48 +22,49 @@ import org.gradle.plugins.ide.eclipse.model.ClasspathEntry def resources = scriptResources(buildscript) configure(rootProject) { - apply plugin: "eclipse" + plugins.withType(JavaPlugin) { + apply plugin: "eclipse" - def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion) - def relativize = { other -> rootProject.rootDir.relativePath(other).toString() } + def eclipseJavaVersion = propertyOrDefault("eclipse.javaVersion", rootProject.minJavaVersion) + def relativize = { other -> rootProject.rootDir.relativePath(other).toString() } - eclipse { - project { - name = "Apache Lucene ${version}" - } + eclipse { + project { + name = "Apache Lucene ${version}" + } - classpath { - defaultOutputDir = file('build/eclipse') + classpath { + defaultOutputDir = file('build/eclipse') - file { - beforeMerged { classpath -> classpath.entries.removeAll { it.kind == "src" } } + file { + beforeMerged { classpath -> classpath.entries.removeAll { it.kind == "src" } } - whenMerged { classpath -> - def projects = allprojects.findAll { prj -> - return prj.plugins.hasPlugin(JavaPlugin) - } - - Set sourceSetNames = ['main', 'test', "main${eclipseJavaVersion}" as String, "test${eclipseJavaVersion}" as String, 'tools'] as Set - Set sources = [] - Set jars = [] - 
projects.each { prj -> - prj.sourceSets.each { sourceSet -> - if (sourceSetNames.contains(sourceSet.name)) { - sources += sourceSet.java.srcDirs.findAll { dir -> dir.exists() }.collect { dir -> relativize(dir) } - sources += sourceSet.resources.srcDirs.findAll { dir -> dir.exists() }.collect { dir -> relativize(dir) } - } + whenMerged { classpath -> + def projects = allprojects.findAll { prj -> + return prj.plugins.hasPlugin(JavaPlugin) } - // This is hacky - we take the resolved compile classpath and just - // include JAR files from there. We should probably make it smarter - // by looking at real dependencies. But then: this Eclipse configuration - // doesn't really separate sources anyway so why bother. - jars += prj.configurations.compileClasspath.resolve() - jars += prj.configurations.testCompileClasspath.resolve() - } + Set sourceSetNames = ['main', 'test', "main${eclipseJavaVersion}" as String, "test${eclipseJavaVersion}" as String, 'tools'] as Set + Set sources = [] + Set jars = [] + projects.each { prj -> + prj.sourceSets.each { sourceSet -> + if (sourceSetNames.contains(sourceSet.name)) { + sources += sourceSet.java.srcDirs.findAll { dir -> dir.exists() }.collect { dir -> relativize(dir) } + sources += sourceSet.resources.srcDirs.findAll { dir -> dir.exists() }.collect { dir -> relativize(dir) } + } + } - classpath.entries += sources.sort().collect { name -> - def sourceFolder = new SourceFolder(name, "build/eclipse/" + name) + // This is hacky - we take the resolved compile classpath and just + // include JAR files from there. We should probably make it smarter + // by looking at real dependencies. But then: this Eclipse configuration + // doesn't really separate sources anyway so why bother. + jars += prj.configurations.compileClasspath.resolve() + jars += prj.configurations.testCompileClasspath.resolve() + } + + classpath.entries += sources.sort().collect { name -> + def sourceFolder = new SourceFolder(name, "build/eclipse/" + name) sourceFolder.setExcludes(["module-info.java"]) return sourceFolder } @@ -81,36 +82,38 @@ configure(rootProject) { } } - task luceneEclipseJdt(type: Sync) { - def errorMode = project.propertyOrDefault('eclipse.errors','warning'); - def ecjLintFile = rootProject.file('gradle/validation/ecj-lint/ecj.javadocs.prefs'); - - description = 'Generates the Eclipse JDT settings file.' - - inputs.file(ecjLintFile) - inputs.property('errorMode', errorMode) - inputs.property('eclipseJavaVersion', eclipseJavaVersion as String) - - from rootProject.file("${resources}/dot.settings") - into rootProject.file(".settings") - filter(ReplaceTokens, tokens: [ - 'ecj-lint-config': ecjLintFile.getText('UTF-8').replaceAll(/=error\b/, '=' + errorMode) - ]) - filteringCharset = 'UTF-8' - - doLast { - logger.lifecycle('Eclipse config for Java {} written with ECJ errors configured as {}. Change by passing -Peclipse.errors=ignore/warning/error.', eclipseJavaVersion, errorMode) - logger.lifecycle('To edit classes of MR-JARs for a specific Java version, use e.g., -Peclipse.javaVersion=19') + task luceneEclipseJdt(type: Sync) { + def errorMode = project.propertyOrDefault('eclipse.errors' ,'warning'); + def ecjLintFile = rootProject.file('gradle/validation/ecj-lint/ecj.javadocs.prefs'); + + description = 'Generates the Eclipse JDT settings file.' 
+
+      inputs.file(ecjLintFile)
+      inputs.property('errorMode', errorMode)
+      inputs.property('eclipseJavaVersion', eclipseJavaVersion as String)
+
+      from rootProject.file("${resources}/dot.settings")
+      into rootProject.file(".settings")
+      filter(ReplaceTokens, tokens: [
+          'ecj-lint-config': ecjLintFile.getText('UTF-8').replaceAll(/=error\b/, '=' + errorMode)
+      ])
+      filteringCharset = 'UTF-8'
+
+      doLast {
+        logger.lifecycle('Eclipse config for Java {} written with ECJ errors configured as {}. Change by passing -Peclipse.errors=ignore/warning/error.', eclipseJavaVersion, errorMode)
+        logger.lifecycle('To edit classes of MR-JARs for a specific Java version, use e.g., -Peclipse.javaVersion=19')
+      }
+    }
+
+    eclipseJdt {
+      enabled = false
+      dependsOn 'luceneEclipseJdt'
+    }
+
+    eclipseClasspath {
+      inputs.property('eclipseJavaVersion', eclipseJavaVersion as String)
+    }
   }
-
-  eclipseJdt {
-    enabled = false
-    dependsOn 'luceneEclipseJdt'
-  }
-
-  eclipseClasspath {
-    inputs.property('eclipseJavaVersion', eclipseJavaVersion as String)
-  }
 }
@@ -131,6 +134,6 @@ public class LibEntry implements ClasspathEntry {
     node.appendNode("classpathentry", Map.of(
         "kind", "lib",
         "path", path
-    ));
+    ))
   }
 }
diff --git a/gradle/maven/publications-maven.gradle b/gradle/maven/publications-maven.gradle
index 42b1a6d17a0..cfaa4130d0d 100644
--- a/gradle/maven/publications-maven.gradle
+++ b/gradle/maven/publications-maven.gradle
@@ -49,7 +49,7 @@ configure(rootProject.ext.mavenProjects) { Project project ->
 
   // This moves pom metadata configuration after all the scripts of all projects
   // have been evaluated. This is required because we set artifact groups
-  // and archivesBaseName in other scripts and some of the properties below don't
+  // and archivesName in other scripts and some of the properties below don't
   // accept lazy property providers (so everything must be in its final form).
   gradle.projectsEvaluated {
     publishing {
@@ -57,22 +57,10 @@ configure(rootProject.ext.mavenProjects) { Project project ->
         configure(publication) {
           from components.java
           groupId = project.group
-          artifactId = project.archivesBaseName
+          artifactId = project.base.archivesName.get()
 
           artifact sourcesJar
           artifact javadocJar
-
-          // LUCENE-9561:
-          // Remove dependencyManagement section created by a combination of
-          // Palantir and the publishing plugin.
-          //
-          // https://github.com/palantir/gradle-consistent-versions/issues/550
-          pom({
-            withXml {
-              def dm = asNode().dependencyManagement
-              if (dm) dm.replaceNode {}
-            }
-          })
         }
       }
     }
diff --git a/gradle/template.gradle.properties b/gradle/template.gradle.properties
index 60486b86967..a0b4fb91682 100644
--- a/gradle/template.gradle.properties
+++ b/gradle/template.gradle.properties
@@ -104,3 +104,6 @@ org.gradle.java.installations.auto-download=true
 # Set these to enable automatic JVM location discovery.
 org.gradle.java.installations.fromEnv=JAVA21_HOME,JAVA22_HOME,RUNTIME_JAVA_HOME
 #org.gradle.java.installations.paths=(custom paths)
+
+# Opt out of gradle enterprise build scan plugin entirely.
+# gradle.ge=false diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index d8ee3cedaf0..1f3a7d8b1a0 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -18,7 +18,6 @@ import org.apache.tools.ant.taskdefs.condition.Os import org.apache.tools.ant.types.Commandline import org.gradle.api.tasks.testing.logging.* -import org.apache.lucene.gradle.ErrorReportingTestListener def resources = scriptResources(buildscript) def verboseModeHookInstalled = false @@ -133,7 +132,12 @@ allprojects { jvmArgs '--add-modules', 'jdk.incubator.vector' } - jvmArgs '--enable-native-access=' + (project.path == ':lucene:core' ? 'ALL-UNNAMED' : 'org.apache.lucene.core') + jvmArgs '--enable-native-access=' + (project.path in [ + ':lucene:core', + ':lucene:codecs', + ":lucene:distribution.tests", + ":lucene:test-framework" + ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core') def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) @@ -196,7 +200,7 @@ allprojects { } def spillDir = getTemporaryDir().toPath() - def listener = new ErrorReportingTestListener(test.testLogging, spillDir, testOutputsDir.toPath(), verboseMode) + def listener = buildinfra.newErrorReportingTestListener(test.testLogging, spillDir, testOutputsDir.toPath(), verboseMode) addTestOutputListener(listener) addTestListener(listener) diff --git a/gradle/testing/failed-tests-at-end.gradle b/gradle/testing/failed-tests-at-end.gradle index 13a1d5c720f..c514046b029 100644 --- a/gradle/testing/failed-tests-at-end.gradle +++ b/gradle/testing/failed-tests-at-end.gradle @@ -15,8 +15,6 @@ * limitations under the License. */ -import org.apache.lucene.gradle.ErrorReportingTestListener - // Display all failed tests at the end of the build. 
def failedTests = [] @@ -28,7 +26,7 @@ allprojects { failedTests << [ "name": "${desc.className}.${desc.name}", "project": "${test.project.path}", - "output": file("${task.testOutputsDir}/${ErrorReportingTestListener.getOutputLogName(desc.parent)}"), + "output": file("${task.testOutputsDir}/${buildinfra.getOutputLogName(desc.parent)}"), "reproduce": "gradlew ${project.path}:test --tests \"${desc.className}.${desc.name}\" ${task.project.testOptionsForReproduceLine}" ] } @@ -39,7 +37,7 @@ allprojects { failedTests << [ "name": "${desc.name}", "project": "${test.project.path}", - "output": file("${task.testOutputsDir}/${ErrorReportingTestListener.getOutputLogName(desc)}"), + "output": file("${task.testOutputsDir}/${buildinfra.getOutputLogName(desc)}"), "reproduce": "gradlew ${project.path}:test --tests \"${desc.name}\" ${task.project.testOptionsForReproduceLine}" ] } diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index fd32d57e7c1..9ca8625b0ee 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -30,7 +30,7 @@ buildscript { } dependencies { - classpath 'com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.7.2' + classpath deps.randomizedtesting.runner } } @@ -126,10 +126,10 @@ allprojects { secManagerExclusions } dependencies { - secManagerExclusions ( "com.carrotsearch.randomizedtesting:randomizedtesting-runner", { + secManagerExclusions ( deps.randomizedtesting.runner, { exclude group: "junit" }) - secManagerExclusions ( "junit:junit", { + secManagerExclusions ( deps.junit, { exclude group: "org.hamcrest" }) } diff --git a/gradle/validation/check-environment.gradle b/gradle/validation/check-environment.gradle index 40971343825..1b0c9821182 100644 --- a/gradle/validation/check-environment.gradle +++ b/gradle/validation/check-environment.gradle @@ -22,7 +22,7 @@ import org.gradle.util.GradleVersion configure(rootProject) { ext { - expectedGradleVersion = '8.8' + expectedGradleVersion = deps.versions.minGradle.get() hasJavaFlightRecorder = ModuleLayer.boot().findModule('jdk.jfr').map(this.class.module::canRead).orElse(false) } @@ -32,6 +32,7 @@ configure(rootProject) { } def currentJavaVersion = JavaVersion.current() + def minJavaVersion = JavaVersion.toVersion(deps.versions.minJava.get()) if (currentJavaVersion < minJavaVersion) { throw new GradleException("At least Java ${minJavaVersion} is required, you are running Java ${currentJavaVersion} " + "[${System.getProperty('java.vm.name')} ${System.getProperty('java.vm.version')}]") diff --git a/gradle/validation/dependencies.gradle b/gradle/validation/dependencies.gradle new file mode 100644 index 00000000000..43dcf7583b8 --- /dev/null +++ b/gradle/validation/dependencies.gradle @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Configure sanity check for conflicting dependencies across certain configurations +allprojects { + apply plugin: deps.plugins.dependencychecks.get().pluginId + + def mainConfigurations = project.configurations.matching { + it.name in [ + "compileClasspath", + "runtimeClasspath" + ] + } + + def testConfigurations = project.configurations.matching { + it.name in [ + "annotationProcessor", + "testCompileClasspath", + "testRuntimeClasspath" + ] + } + + dependencyVersionChecks { + lockFileComment = "An inventory of resolved dependency versions. Do not edit this file directly." + + configurationGroups { + main_dependencies { + include mainConfigurations + } + + test_dependencies { + include testConfigurations + } + } + } + + dependencies { + constraints { + mainConfigurations.configureEach { Configuration conf -> + // no resolutions for conflicting dependencies at the moment. + } + } + } +} + +// Configure version catalog cleanups plugin. +configure(rootProject) { + apply plugin: deps.plugins.versionCatalogUpdate.get().pluginId + + versionCatalogUpdate { + sortByKey = true + + versionCatalogs { + deps { + catalogFile = file("versions.toml") + } + } + } + + tasks.matching { it.name == "tidy" }.configureEach { + it.dependsOn(":versionCatalogFormatDeps") + } + + tasks.matching { + it.path in [ + ":versionCatalogUpdateDeps" + ] + }.configureEach { + it.interactive = true + } + + tasks.register("updateDeps", { + dependsOn ":versionCatalogUpdateDeps" + }) +} \ No newline at end of file diff --git a/gradle/validation/ecj-lint.gradle b/gradle/validation/ecj-lint.gradle index 7a59ed5e10d..a98bd09e62e 100644 --- a/gradle/validation/ecj-lint.gradle +++ b/gradle/validation/ecj-lint.gradle @@ -23,7 +23,7 @@ configure(rootProject) { } dependencies { - ecjDeps "org.eclipse.jdt:ecj:${scriptDepVersions['ecj']}" + ecjDeps deps.ecj } } diff --git a/gradle/validation/error-prone.gradle b/gradle/validation/error-prone.gradle index 362e0c98cde..6972129d16d 100644 --- a/gradle/validation/error-prone.gradle +++ b/gradle/validation/error-prone.gradle @@ -37,24 +37,25 @@ if (skipReason) { allprojects { prj -> plugins.withType(JavaPlugin) { - // LUCENE-9650: Errorprone on master/gradle does not work when running as plugin - // inside a forked Javac process. Javac running inside Gradle works, because we have - // additional module system opens in place. - // This is a hack to keep the dependency (so that palantir's version check doesn't complain) - // but don't include the plugin (which fails on JDK16+). + // LUCENE-9650: Errorprone does not work when running as a plugin inside a forked Javac process. + // Javac running inside Gradle works, because we have additional module system opens in place. if (skipReason) { tasks.withType(JavaCompile) { task -> task.dependsOn ":errorProneSkipped" } + + // Error prone plugin adds error prone to test classpath. We need to add it here too (manually) so that + // versions.lock is consistent with or without error prone. 
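(A quick way to verify the invariant described above - that versions.lock stays
identical whether or not error-prone is skipped - is to regenerate the locks
under both conditions and check for drift; a hedged sketch using the tasks this
patch introduces:

  gradlew writeLocks
  git diff --exit-code versions.lock

The dependency block that follows keeps error-prone on the annotation-processor
classpath even in the skipped case, which is what makes this hold.)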
configurations { errorprone } dependencies { - errorprone("com.google.errorprone:error_prone_core") + errorprone deps.errorprone } + configurations.annotationProcessor.extendsFrom(configurations.errorprone) } else { - prj.apply plugin: 'net.ltgt.errorprone' + prj.apply plugin: deps.plugins.errorprone.get().pluginId dependencies { - errorprone("com.google.errorprone:error_prone_core") + errorprone deps.errorprone } tasks.withType(JavaCompile) { task -> diff --git a/gradle/validation/forbidden-apis.gradle b/gradle/validation/forbidden-apis.gradle index dd5f7de51c5..ebf5aff5d72 100644 --- a/gradle/validation/forbidden-apis.gradle +++ b/gradle/validation/forbidden-apis.gradle @@ -1,4 +1,4 @@ - /* +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -57,7 +57,7 @@ allprojects { prj -> } // Configure defaults for sourceSets.main - tasks.matching { it.name ==~ /forbiddenApisMain\d*/ }.all { + tasks.matching { it.name ==~ /forbiddenApisMain\d*/ }.configureEach { bundledSignatures += [ 'jdk-unsafe', 'jdk-deprecated', @@ -76,12 +76,12 @@ allprojects { prj -> // Configure defaults for the MR-JAR feature sourceSets by setting java version and ignore missing classes // TODO: // - Get hold of warning messages, see https://github.com/policeman-tools/forbidden-apis/issues/207 - tasks.matching { it.name ==~ /forbiddenApisMain\d+/ }.all { + tasks.matching { it.name ==~ /forbiddenApisMain\d+/ }.configureEach { failOnMissingClasses = false } // Configure defaults for sourceSets.test - tasks.matching { it.name in ["forbiddenApisTest", "forbiddenApisTestFixtures"] }.all { + tasks.matching { it.name in ["forbiddenApisTest", "forbiddenApisTestFixtures"] }.configureEach { bundledSignatures += [ 'jdk-unsafe', 'jdk-deprecated', @@ -105,7 +105,7 @@ allprojects { prj -> } // Configure defaults for sourceSets.tools (if present). - tasks.matching { it.name == "forbiddenApisTools" }.all { + tasks.matching { it.name == "forbiddenApisTools" }.configureEach { bundledSignatures += [ 'jdk-unsafe', 'jdk-deprecated', @@ -129,12 +129,24 @@ allprojects { prj -> // // This is the simplest workaround possible: just point at all the rule files and indicate // them as inputs. This way if a rule is modified, checks will be reapplied. - configure(tasks.matching { it.name.startsWith("forbiddenApis") }) { task -> + tasks.matching { it.name.startsWith("forbiddenApis") }.configureEach { task -> task.inputs.dir(file(resources)) } // Disable sysout signatures for these projects. 
- if (prj.path in [ + if (prj.name in ["missing-doclet", "build-infra"]) { + forbiddenApisMain.bundledSignatures -= [ + 'jdk-non-portable', + 'jdk-system-out' + ] + + forbiddenApisMain.exclude("**/Checksum*") + forbiddenApisMain.suppressAnnotations += [ + "**.*SuppressForbidden" + ] + } + + if (prj.name in ["missing-doclet"] || prj.path in [ ":lucene:demo", ":lucene:benchmark", ":lucene:test-framework" diff --git a/gradle/validation/git-status.gradle b/gradle/validation/git-status.gradle index 37c3d8dce39..31b80641683 100644 --- a/gradle/validation/git-status.gradle +++ b/gradle/validation/git-status.gradle @@ -33,7 +33,7 @@ buildscript { } dependencies { - classpath "org.eclipse.jgit:org.eclipse.jgit:${scriptDepVersions['jgit']}" + classpath deps.jgit } } diff --git a/gradle/validation/jar-checks.gradle b/gradle/validation/jar-checks.gradle index 7fa6cd3b487..5fe1bcbb3a6 100644 --- a/gradle/validation/jar-checks.gradle +++ b/gradle/validation/jar-checks.gradle @@ -20,8 +20,6 @@ // 2) notice file // 3) checksum validation/ generation. -import org.apache.commons.codec.digest.DigestUtils - // This should be false only for debugging. def failOnError = true @@ -136,7 +134,7 @@ subprojects { jarName : file.toPath().getFileName().toString(), path : file, module : resolvedArtifact.moduleVersion, - checksum : provider { new DigestUtils(DigestUtils.sha1Digest).digestAsHex(file).trim() }, + checksum : provider { buildinfra.sha1Digest().digestAsHex(file).trim() }, // We keep track of the files referenced by this dependency (sha, license, notice, etc.) // so that we can determine unused dangling files later on. referencedFiles: [] diff --git a/gradle/validation/precommit.gradle b/gradle/validation/precommit.gradle index 68f92b9aeb7..06d65bca8ca 100644 --- a/gradle/validation/precommit.gradle +++ b/gradle/validation/precommit.gradle @@ -23,8 +23,7 @@ configure(rootProject) { description = "All precommit checks" // Root-level validation tasks. - dependsOn ":verifyLocks" - dependsOn ":versionsPropsAreSorted" + dependsOn ":checkLocks" dependsOn ":checkWorkingCopyClean" } diff --git a/gradle/validation/rat-sources.gradle b/gradle/validation/rat-sources.gradle index 3c215c35223..138c3f5d906 100644 --- a/gradle/validation/rat-sources.gradle +++ b/gradle/validation/rat-sources.gradle @@ -18,22 +18,23 @@ import groovy.xml.NamespaceBuilder // Configure rat dependencies for use in the custom task. -configure(rootProject) { + +// Configure the rat validation task and all scanned directories. +allprojects { configurations { ratDeps } dependencies { - ratDeps "org.apache.rat:apache-rat:${scriptDepVersions['apache-rat']}" + ratDeps deps.rat } -} -// Configure the rat validation task and all scanned directories. -allprojects { - task("rat", type: RatTask) { + tasks.register("rat", RatTask).configure { group = 'Verification' description = 'Runs Apache Rat checks.' + dependsOn configurations.ratDeps + def defaultScanFileTree = project.fileTree(projectDir, { // Don't check under the project's build folder. exclude project.buildDir.name @@ -78,10 +79,10 @@ allprojects { // Exclude github stuff (templates, workflows). exclude ".github" - // The root project also includes patterns for the boostrap (buildSrc) and composite + // The root project also includes patterns for the include composite // projects. Include their sources in the scan. 
- include "buildSrc/src/**" - include "dev-tools/missing-doclet/src/**" + include "build-tools/build-infra/src/**" + include "build-tools/missing-doclet/src/**" // do not let RAT attempt to scan a python venv, it gets lost and confused... exclude "dev-tools/aws-jmh/build/**" @@ -142,7 +143,7 @@ class RatTask extends DefaultTask { def generateReport(File reportFile) { // Set up ant rat task. - def ratClasspath = project.rootProject.configurations.ratDeps.asPath + def ratClasspath = project.configurations.ratDeps.asPath ant.setLifecycleLogLevel(AntBuilder.AntMessagePriority.ERROR) ant.taskdef(resource: 'org/apache/rat/anttasks/antlib.xml', classpath: ratClasspath) diff --git a/gradle/validation/spotless.gradle b/gradle/validation/spotless.gradle index 72d64399d6c..a77829a7032 100644 --- a/gradle/validation/spotless.gradle +++ b/gradle/validation/spotless.gradle @@ -20,9 +20,9 @@ * spotless and Google Java Format. */ -def resources = scriptResources(buildscript) +// def resources = scriptResources(buildscript) -configure(project(":lucene").subprojects) { prj -> +configure(allprojects) { prj -> plugins.withType(JavaPlugin) { prj.apply plugin: 'com.diffplug.spotless' @@ -36,7 +36,7 @@ configure(project(":lucene").subprojects) { prj -> lineEndings 'UNIX' endWithNewline() - googleJavaFormat('1.18.1') + googleJavaFormat(deps.versions.googleJavaFormat.get()) // Apply to all Java sources target "src/**/*.java" @@ -100,23 +100,19 @@ configure(project(":lucene").subprojects) { prj -> // Emit a custom message about how to fix formatting errors. tasks.matching { task -> task.name == "spotlessJavaCheck" }.configureEach { - runToFixMessage.set("\nIMPORTANT: run the top-level './gradlew tidy' to format code automatically (see help/formatting.txt for more info).") + it.runToFixMessage.set("\nIMPORTANT: run the top-level './gradlew tidy' to format code automatically (see help/formatting.txt for more info).") } - // Add an alias to 'spotlessApply' simply called 'tidy' and wire up - // spotlessCheck to convention's check. - task tidy() { - description "Applies formatters and cleanups to sources." - group "verification" + // Hook up spotless to tidy and check tasks. 
+ tasks.matching { it.name == "tidy" }.configureEach { v -> + v.dependsOn tasks.matching { it.name == "spotlessApply" } } - tasks.matching { task -> task.name == "spotlessApply" }.configureEach { v -> - tidy.dependsOn v - v.dependsOn ":checkJdkInternalsExportedToGradle" + tasks.matching { it.name == "check" }.configureEach { v -> + v.dependsOn tasks.matching { it.name == "spotlessCheck" } } - tasks.matching { task -> task.name == "spotlessCheck" }.configureEach { v -> - check.dependsOn v + tasks.matching { task -> task.name in ["spotlessApply", "spotlessCheck"] }.configureEach { v -> v.dependsOn ":checkJdkInternalsExportedToGradle" } } diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle index 908169f61af..5adeaa0e806 100644 --- a/gradle/validation/validate-source-patterns.gradle +++ b/gradle/validation/validate-source-patterns.gradle @@ -33,7 +33,7 @@ buildscript { } dependencies { - classpath "org.apache.rat:apache-rat:${scriptDepVersions['apache-rat']}" + classpath deps.rat } } diff --git a/gradle/validation/versions-props-sorted.gradle b/gradle/validation/versions-props-sorted.gradle deleted file mode 100644 index 670d2e8f691..00000000000 --- a/gradle/validation/versions-props-sorted.gradle +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// This ensures 'versions.props' file is sorted lexicographically. - -configure(rootProject) { - task versionsPropsAreSorted() { - doFirst { - def versionsProps = file('versions.props') - def lines = versionsProps.readLines("UTF-8") - def sorted = lines.toSorted() - - if (!Objects.equals(lines, sorted)) { - def sortedFile = file("${buildDir}/versions.props") - sortedFile.write(sorted.join("\n"), "UTF-8") - throw new GradleException("${versionsProps} file is not sorted lexicographically. I wrote a sorted file to ${sortedFile} - please review and commit.") - } - } - } -} diff --git a/gradlew b/gradlew index 7f9fe6edf7c..e46ae37da34 100755 --- a/gradlew +++ b/gradlew @@ -158,7 +158,7 @@ fi GRADLE_WRAPPER_JAR="$APP_HOME/gradle/wrapper/gradle-wrapper.jar" if [ ! -e "$GRADLE_WRAPPER_JAR" ]; then - "$JAVACMD" $JAVA_OPTS "$APP_HOME/buildSrc/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java" "$GRADLE_WRAPPER_JAR" + "$JAVACMD" $JAVA_OPTS "$APP_HOME/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java" "$GRADLE_WRAPPER_JAR" WRAPPER_STATUS=$? if [ "$WRAPPER_STATUS" -eq 1 ]; then echo "ERROR: Something went wrong. Make sure you're using Java version of exactly 21." @@ -173,7 +173,7 @@ CLASSPATH=$GRADLE_WRAPPER_JAR # START OF LUCENE CUSTOMIZATION # Generate gradle.properties if they don't exist if [ ! 
-e "$APP_HOME/gradle.properties" ]; then - "$JAVACMD" $JAVA_OPTS "$APP_HOME/buildSrc/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java" "$APP_HOME/gradle/template.gradle.properties" "$APP_HOME/gradle.properties" + "$JAVACMD" $JAVA_OPTS "$APP_HOME/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java" "$APP_HOME/gradle/template.gradle.properties" "$APP_HOME/gradle.properties" GENERATOR_STATUS=$? if [ "$GENERATOR_STATUS" -ne 0 ]; then exit $GENERATOR_STATUS diff --git a/gradlew.bat b/gradlew.bat index 8202f430e1e..bb198344481 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -76,7 +76,7 @@ goto fail @rem LUCENE-9266: verify and download the gradle wrapper jar if we don't have one. set GRADLE_WRAPPER_JAR=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar IF NOT EXIST "%GRADLE_WRAPPER_JAR%" ( - "%JAVA_EXE%" %JAVA_OPTS% "%APP_HOME%/buildSrc/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java" "%GRADLE_WRAPPER_JAR%" + "%JAVA_EXE%" %JAVA_OPTS% "%APP_HOME%/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/WrapperDownloader.java" "%GRADLE_WRAPPER_JAR%" IF %ERRORLEVEL% EQU 1 goto failWithJvmMessage IF %ERRORLEVEL% NEQ 0 goto fail ) @@ -89,7 +89,7 @@ set CLASSPATH=%GRADLE_WRAPPER_JAR% IF NOT EXIST "%APP_HOME%\gradle.properties" ( @rem local expansion is needed to check ERRORLEVEL inside control blocks. setlocal enableDelayedExpansion - "%JAVA_EXE%" %JAVA_OPTS% "%APP_HOME%/buildSrc/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java" "%APP_HOME%\gradle\template.gradle.properties" "%APP_HOME%\gradle.properties" + "%JAVA_EXE%" %JAVA_OPTS% "%APP_HOME%/build-tools/build-infra/src/main/java/org/apache/lucene/gradle/GradlePropertiesGenerator.java" "%APP_HOME%\gradle\template.gradle.properties" "%APP_HOME%\gradle.properties" IF %ERRORLEVEL% NEQ 0 goto fail endlocal ) diff --git a/help/dependencies.txt b/help/dependencies.txt index e58baa7eb54..514cb946097 100644 --- a/help/dependencies.txt +++ b/help/dependencies.txt @@ -7,81 +7,79 @@ and each configuration can have dependencies attached to it. There are some standard conventions so, for example, the Java plugin adds standard configurations such as "api", "implementation", "testImplementation" and others. These configurations can also inherit -from each other; more about this typic can be found here: +from each other; more about this topic can be found here: https://docs.gradle.org/current/userguide/dependency_management_for_java_projects.html#dependency_management_for_java_projects https://docs.gradle.org/current/userguide/java_library_plugin.html#sec:java_library_separation https://docs.gradle.org/current/userguide/java_plugin.html#sec:java_plugin_and_dependency_management -Lucene typically uses three configurations and attach project -dependencies to them: +Lucene uses the following configurations and attach project dependencies +to them: -api - makes a dependency available for main classes, tests and any +moduleApi - makes the dependency available to main classes, tests and any other modules importing the project (exportable dependency), -implementation - makes a dependency available for main classes, tests - but will *not* export the dependency for other modules (so their +moduleImplementation - makes the dependency available to main classes, tests + but will *not* export the dependency to other modules (so their compilation classpath won't contain it). -testImplementation - makes a dependency only available for test classes. 
+moduleTestImplementation - makes the dependency available for test classes only. + +The "module" prefix is used to distinguish configurations which apply +to modular builds, compared to the regular classpath-configurations defined +by gradle's java plugin. Some Lucene modules may define regular classpath +entries to bypass the limitations of the module system (or gradle's). Adding a library dependency --------------------------- +Lucene dependencies and their versions are managed globally using version +catalogs (in versions.toml) [https://docs.gradle.org/current/userguide/platforms.html]. + Let's say we wish to add a dependency on library "foo.bar:baz" in version 1.2 to :lucene:core. Let's assume this library is only used internally by the project. The :lucene:core project is configured -by lucene/core/build.gradle and we would add (or modify) the dependency +by lucene/core/build.gradle, so we add (or modify) the dependency block as follows: dependencies { - implementation "foo.bar:baz" + moduleImplementation deps.baz } -The "implementation" here is a named configuration; we don't need to declare -it because it is declared for us by the java-library plugin. +The "moduleImplementation" here is a named configuration explained in the +section above. The "deps.baz" refers to the version catalog named "deps", +in which the dependency "baz" should be declared. If this is the first +reference to this library, then we have to add it to the "versions.toml" catalog: +the version goes under the "versions" section and the module coordinates +under the "libraries" section: -In "normal" gradle the version of the dependency would be present -directly inside the declaration but we use a plugin -(palantir-consistent-versions) to manage all dependency versions -from the top-level (so that conflicts can be resolved globally). +[versions] +baz = "1.2" +... +[libraries] +baz = { module = "foo.bar:baz", version.ref = "baz" } -If this is the first time "foo.bar:baz" is added to the project, we'd have -to add its version to "versions.props" file at the top level of the -checkout: +The version defined in the "versions" section is the preferred version of the library +we wish to use. Finally, run tidy to sort all entries in versions.toml: -foo.bar:baz=1.2 +gradlew tidy -and then regenerate the "versions.lock" file using the following -command: +Gradle will try to consolidate different versions across different +configurations to make sure they're compatible and may complain if it encounters +conflicting versions in the dependency tree. We want all dependencies to be consistent, +so we use an additional build plugin to ensure no accidental version changes +occur. Whenever we add or remove dependencies, we have to follow up with lock file +regeneration: -gradlew --write-locks +gradlew writeLocks +git diff versions.* -IMPORTANT: The versions.lock file will contain the actual version -of the dependency picked based on other project dependencies and -their transitive dependencies. This selected version may be -different from what each of these actually requires (the highest -version number will be typically selected). To see which dependencies -require which version of the library use: +IMPORTANT: The versions.lock file will contain a list of actual library versions +and the configurations they occurred in. -gradlew why --hash=... - -where the hash code comes from versions.lock file.
For example, at -the time of writing, jackson-databind has the following entry: - -com.fasterxml.jackson.core:jackson-databind:2.10.0 (3 constraints: 931a7796) - -and "gradlew why --hash=931a7796" prints: - -com.fasterxml.jackson.core:jackson-databind:2.10.0 - projects -> 2.10.0 - net.thisptr:jackson-jq -> 2.7.0 - org.carrot2:carrot2-mini -> 2.9.9.3 - -Once the dependency is added it always makes sense to see the -tree of all module dependencies and maybe exclude transitive -dependencies of foo.bar:baz that we won't need. +Once a new dependency is added, it always makes sense to regenerate the lock file +and look at which dependencies have changed (and why). Inspecting current dependencies ------------------------------- @@ -98,12 +96,12 @@ in just the "publicly visible" and "classpath-visible" configurations. The publicly visible project dependencies (classes shared by other modules importing our module) can be displayed with: -gradlew -p lucene\analysis\icu dependencies --configuration api +gradlew -p lucene\analysis\icu dependencies --configuration moduleApi And the "private" set of dependencies (real classpath) can be dumped with: -gradlew -p lucene\analysis\icu dependencies --configuration runtimeClasspath +gradlew -p lucene\analysis\icu dependencies --configuration moduleRuntimePath Excluding a transitive dependency @@ -115,7 +113,7 @@ crucial for the functioning of "foo.bar:baz". We can exclude it by adding an exclusion block to the original declaration: dependencies { - implementation("foo.bar:baz", { + implementation(deps.baz, { exclude group: "foo.bar", module: "irrelevant" }) } diff --git a/help/formatting.txt b/help/formatting.txt index de52cb6fd93..5b18c9fbe73 100644 --- a/help/formatting.txt +++ b/help/formatting.txt @@ -2,7 +2,7 @@ Code formatting =============== Starting with (LUCENE-9564) Java code is enforced to comply with -google-java-format conventions. In theory you shouldn't worry about +google-java-format conventions. In theory, you shouldn't worry about what the convention actually looks like - write the code in any way you like and then run: @@ -13,7 +13,7 @@ your code so that it complies with the convention and passes gradle 'check' task. IMPORTANT: There is *no* way to mark sections of the code as excluded -from formatting. This is by design and cannot be altered. In vast +from formatting. This is by design and cannot be altered. In the vast majority of cases the formatter will do a great job of cleaning up the code. Occasionally you may want to rewrite the code (introduce a local variable or reshape code paths) so that it's easier to read after diff --git a/help/publishing.txt b/help/publishing.txt index 545cc58d74c..0b237b039e3 100644 --- a/help/publishing.txt +++ b/help/publishing.txt @@ -54,7 +54,7 @@ Signing can be enabled by adding the "-Psign" option, for example: gradlew assembleRelease mavenToApacheReleases -Psign -By default gradle uses a Java-based implementation of PGP for signing, which requieres +By default, gradle uses a Java-based implementation of PGP for signing, which requires several "signing.*" properties via either ~/.gradle/gradle.properties or command-line options: https://docs.gradle.org/current/userguide/signing_plugin.html#sec:signatory_credentials @@ -92,9 +92,9 @@ signing.gnupg.passphrase=...
# Provide your passphrase to If in doubt, consult gradle's signing plugin documentation: https://docs.gradle.org/current/userguide/signing_plugin.html#sec:using_gpg_agent -"signing.gnupg.passphrase" is not recomended because there is no advantage to using an external GPG process if you use it. If you -are comfortable giving gradle your passphrase, then there is no reason to use an external GPG process via '-PuseGpg'. Just use the -"signing.*" options described previuosly to let gradle deal with your key directly. +"signing.gnupg.passphrase" is not recommended because there is no advantage to using an external GPG process if you use it. +If you are comfortable giving gradle your passphrase, then there is no reason to use an external GPG process via '-PuseGpg'. +Just use the "signing.*" options described previously to let gradle deal with your key directly. Because of how Gradle's signing plugin invokes GPG, using an external GPG process *only* works if your GPG configuration uses a GPG agent (required by gpg2) and if the "pinentry" for your GPG agent does not require access to the tty to prompt you for a password. diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index dbff31d8c18..baa53ed5d90 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -129,6 +129,10 @@ New Features * GITHUB#13233: Add RomanianNormalizationFilter (Trey Jones, Robert Muir) +* GITHUB#13449: Sparse index: optional skip list on top of doc values which is exposed via the + DocValuesSkipper abstraction. A new flag is added to FieldType.java that configures whether + to create a "skip index" for doc values. (Ignacio Vera) + Improvements --------------------- @@ -204,6 +208,10 @@ Changes in Backwards Compatibility Policy Other --------------------- +* GITHUB#13459: Merges all immutable attributes in FieldInfos.FieldNumbers into one Hashmap saving + memory when writing big indices. Fixes an exotic bug when calling clear where not all attributes + were cleared. (Ignacio Vera) + * LUCENE-10376: Roll up the loop in VInt/VLong in DataInput. (Guo Feng) * LUCENE-10253: The @BadApple annotation has been removed from the test @@ -239,7 +247,10 @@ Other API Changes --------------------- -(No changes) + +* GITHUB#13281: Mark COSINE VectorSimilarityFunction as deprecated. (Pulkit Gupta) + +* GITHUB#13469: Expose FlatVectorsFormat as a first-class format; can be configured using a custom Codec. (Michael Sokolov) New Features --------------------- @@ -262,15 +273,19 @@ Optimizations * GITHUB#13454: MultiTermQuery returns null ScoreSupplier in cases where no query terms are present in the index segment (Mayya Sharipova) +* GITHUB#13431: Replace TreeMap and use compiled Patterns in Japanese UserDictionary. (Bruno Roustant) + +* GITHUB#12941: Don't preserve auxiliary buffer contents in LSBRadixSorter if it grows. (Stefan Vodita) + Bug Fixes --------------------- -(No changes) + +* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in + some corner cases. (Greg Miller) Other ---------------------- -* GITHUB#13459: Merges all immutable attributes in FieldInfos.FieldNumbers into one Hashmap saving - memory when writing big indices. Fixes an exotic bug when calling clear where not all attributes - were cleared.
(Ignacio Vera) +-------------------- +(No changes) ======================== Lucene 9.11.0 ======================= diff --git a/lucene/analysis/icu/build.gradle b/lucene/analysis/icu/build.gradle index aba455f4e54..5550143224a 100644 --- a/lucene/analysis/icu/build.gradle +++ b/lucene/analysis/icu/build.gradle @@ -23,7 +23,7 @@ dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:analysis:common') - moduleApi 'com.ibm.icu:icu4j' + moduleApi deps.icu4j moduleTestImplementation project(':lucene:test-framework') } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index a62ffe5d8ac..391ed2ba44b 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -23,8 +23,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.Map; -import java.util.TreeMap; +import java.util.regex.Pattern; import org.apache.lucene.analysis.morph.Dictionary; import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; @@ -37,6 +36,10 @@ public final class UserDictionary implements Dictionary { public static final String INTERNAL_SEPARATOR = "\u0000"; + private static final Pattern LINE_COMMENT = Pattern.compile("^#.*$"); + private static final Pattern WHITESPACE = Pattern.compile("\\s"); + private static final Pattern SPACES = Pattern.compile(" +"); + // phrase text -> phrase ID private final TokenInfoFST fst; @@ -51,16 +54,16 @@ public final class UserDictionary implements Dictionary { public static UserDictionary open(Reader reader) throws IOException { BufferedReader br = new BufferedReader(reader); - String line = null; + String line; List<String[]> featureEntries = new ArrayList<>(); // text, segmentation, readings, POS while ((line = br.readLine()) != null) { // Remove comments - line = line.replaceAll("^#.*$", ""); + line = LINE_COMMENT.matcher(line).replaceAll(""); // Skip empty lines or comment lines - if (line.trim().length() == 0) { + if (line.trim().isEmpty()) { continue; } String[] values = CSVUtil.parse(line); @@ -99,10 +102,10 @@ public final class UserDictionary implements Dictionary { long ord = 0; for (String[] values : featureEntries) { - String surface = values[0].replaceAll("\\s", ""); - String concatenatedSegment = values[1].replaceAll("\\s", ""); - String[] segmentation = values[1].replaceAll(" *", " ").split(" "); - String[] readings = values[2].replaceAll(" *", " ").split(" "); + String surface = WHITESPACE.matcher(values[0]).replaceAll(""); + String concatenatedSegment = WHITESPACE.matcher(values[1]).replaceAll(""); + String[] segmentation = SPACES.split(values[1]); + String[] readings = SPACES.split(values[2]); String pos = values[3]; if (segmentation.length != readings.length) { @@ -141,7 +144,7 @@ public final class UserDictionary implements Dictionary { scratch.growNoCopy(token.length()); scratch.setLength(token.length()); for (int i = 0; i < token.length(); i++) { - scratch.setIntAt(i, (int) token.charAt(i)); + scratch.setIntAt(i, token.charAt(i)); } fstCompiler.add(scratch.get(), ord); segmentations.add(wordIdAndLength); @@ -151,7 +154,7 @@ public final class UserDictionary implements Dictionary { new TokenInfoFST( FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()),
false); this.morphAtts = new UserMorphData(data.toArray(new String[0])); - this.segmentations = segmentations.toArray(new int[segmentations.size()][]); + this.segmentations = segmentations.toArray(new int[0][]); } @Override @@ -168,33 +171,53 @@ public final class UserDictionary implements Dictionary { * @return array of {wordId, position, length} */ public int[][] lookup(char[] chars, int off, int len) throws IOException { - // TODO: can we avoid this treemap/toIndexArray? - TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...] - boolean found = false; // true if we found any results - + List<Match> matches = null; + int numResults = 0; final FST.BytesReader fstReader = fst.getBytesReader(); - + final int end = off + len; FST.Arc<Long> arc = new FST.Arc<>(); - int end = off + len; for (int startOffset = off; startOffset < end; startOffset++) { + int[] wordIdAndLength = null; arc = fst.getFirstArc(arc); int output = 0; - int remaining = end - startOffset; - for (int i = 0; i < remaining; i++) { + for (int i = 0, remaining = end - startOffset; i < remaining; i++) { int ch = chars[startOffset + i]; if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { break; // continue to next position } output += arc.output().intValue(); if (arc.isFinal()) { - final int finalOutput = output + arc.nextFinalOutput().intValue(); - result.put(startOffset - off, segmentations[finalOutput]); - found = true; + int finalOutput = output + arc.nextFinalOutput().intValue(); + wordIdAndLength = segmentations[finalOutput]; } } + if (wordIdAndLength != null) { + if (matches == null) { + matches = new ArrayList<>(); + } + matches.add(new Match(startOffset - off, wordIdAndLength)); + numResults += wordIdAndLength.length - 1; + } } - - return found ? toIndexArray(result) : EMPTY_RESULT; + if (numResults == 0) { + return EMPTY_RESULT; + } + int[][] result = new int[numResults][]; + int index = 0; + for (int i = 0; i < matches.size(); i++) { + Match match = matches.get(i); + int[] wordIdAndLength = match.wordIdAndLength(); + int wordId = wordIdAndLength[0]; + // convert length to index + int position = match.position(); + for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset + // add a {wordId, index, length} token to the results + int[] token = {wordId + j - 1, position, wordIdAndLength[j]}; + result[index++] = token; + position += wordIdAndLength[j]; + } + } + return result; } public TokenInfoFST getFST() { @@ -203,28 +226,9 @@ public final class UserDictionary implements Dictionary { private static final int[][] EMPTY_RESULT = new int[0][]; - /** - * Convert Map of index and wordIdAndLength to array of {wordId, index, length} - * - * @return array of {wordId, index, length} - */ - private int[][] toIndexArray(Map<Integer, int[]> input) { - ArrayList<int[]> result = new ArrayList<>(); - for (Map.Entry<Integer, int[]> entry : input.entrySet()) { - int[] wordIdAndLength = entry.getValue(); - int wordId = wordIdAndLength[0]; - // convert length to index - int current = entry.getKey(); - for (int j = 1; j < wordIdAndLength.length; j++) { // first entry is wordId offset - int[] token = {wordId + j - 1, current, wordIdAndLength[j]}; - result.add(token); - current += wordIdAndLength[j]; - } - } - return result.toArray(new int[result.size()][]); - } - public int[] lookupSegmentation(int phraseID) { return segmentations[phraseID]; } + + private record Match(int position, int[] wordIdAndLength) {} } diff --git a/lucene/analysis/morfologik/build.gradle b/lucene/analysis/morfologik/build.gradle index 4faee5d3424..7c8b5910395 100644 ---
a/lucene/analysis/morfologik/build.gradle +++ b/lucene/analysis/morfologik/build.gradle @@ -22,10 +22,10 @@ description = 'Analyzer for dictionary stemming, built-in Polish dictionary' dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:analysis:common') - moduleApi 'org.carrot2:morfologik-stemming' + moduleApi deps.morfologik.stemming - moduleImplementation 'org.carrot2:morfologik-polish' - moduleImplementation 'ua.net.nlp:morfologik-ukrainian-search' + moduleImplementation deps.morfologik.polish + moduleImplementation deps.morfologik.ukrainian moduleTestImplementation project(':lucene:test-framework') } diff --git a/lucene/analysis/opennlp/build.gradle b/lucene/analysis/opennlp/build.gradle index 2964e88d212..02449c9f819 100644 --- a/lucene/analysis/opennlp/build.gradle +++ b/lucene/analysis/opennlp/build.gradle @@ -22,7 +22,7 @@ description = 'OpenNLP Library Integration' dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:analysis:common') - moduleApi 'org.apache.opennlp:opennlp-tools' + moduleApi deps.opennlp.tools moduleTestImplementation project(':lucene:test-framework') } diff --git a/lucene/analysis/phonetic/build.gradle b/lucene/analysis/phonetic/build.gradle index 1b8eee8f845..81841ccfc3e 100644 --- a/lucene/analysis/phonetic/build.gradle +++ b/lucene/analysis/phonetic/build.gradle @@ -23,7 +23,7 @@ dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:analysis:common') - moduleApi 'commons-codec:commons-codec' + moduleApi deps.commons.codec moduleTestImplementation project(':lucene:test-framework') } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java index a3e09db8ae9..9f7f0b83e41 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene60/Lucene60FieldInfosFormat.java @@ -209,6 +209,7 @@ public final class Lucene60FieldInfosFormat extends FieldInfosFormat { storePayloads, indexOptions, docValuesType, + false, dvGen, attributes, pointDataDimensionCount, diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java index 40e955dfeb4..c5754e5d1e5 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene80/Lucene80DocValuesProducer.java @@ -28,6 +28,7 @@ import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.ImpactsEnum; @@ -1677,6 +1678,11 @@ final class Lucene80DocValuesProducer extends DocValuesProducer { } } + @Override + public DocValuesSkipper getSkipper(FieldInfo field) { + return null; + } + @Override public void checkIntegrity() throws IOException { CodecUtil.checksumEntireFile(data); diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java 
b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java index 22eb4558e3b..c4e3dca2873 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java @@ -186,6 +186,7 @@ public final class Lucene90FieldInfosFormat extends FieldInfosFormat { storePayloads, indexOptions, docValuesType, + false, dvGen, attributes, pointDataDimensionCount, diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60FieldInfosFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60FieldInfosFormat.java index 10b7fe9a92e..0b9e9f72672 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60FieldInfosFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene60/TestLucene60FieldInfosFormat.java @@ -27,4 +27,9 @@ public class TestLucene60FieldInfosFormat extends BaseFieldInfoFormatTestCase { protected Codec getCodec() { return new Lucene84RWCodec(); } + + @Override + protected boolean supportDocValuesSkipIndex() { + return false; + } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java index 5b9dd0e7d9b..376b72d79a0 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene80/BaseLucene80DocValuesFormatTestCase.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.backward_codecs.lucene80; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -59,18 +60,114 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.codecs.asserting.AssertingCodec; -import org.apache.lucene.tests.index.BaseCompressingDocValuesFormatTestCase; +import org.apache.lucene.tests.index.LegacyBaseDocValuesFormatTestCase; import org.apache.lucene.tests.index.RandomIndexWriter; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.packed.PackedInts; /** Tests Lucene80DocValuesFormat */ public abstract class BaseLucene80DocValuesFormatTestCase - extends BaseCompressingDocValuesFormatTestCase { + extends LegacyBaseDocValuesFormatTestCase { + + private static long dirSize(Directory d) throws IOException { + long size = 0; + for (String file : d.listAll()) { + size += d.fileLength(file); + } + return size; + } + + public void testUniqueValuesCompression() throws IOException { + try (final Directory dir = new ByteBuffersDirectory()) { + final IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + final IndexWriter iwriter = new IndexWriter(dir, iwc); + + final int uniqueValueCount = 
TestUtil.nextInt(random(), 1, 256); + final List<Long> values = new ArrayList<>(); + + final Document doc = new Document(); + final NumericDocValuesField dvf = new NumericDocValuesField("dv", 0); + doc.add(dvf); + for (int i = 0; i < 300; ++i) { + final long value; + if (values.size() < uniqueValueCount) { + value = random().nextLong(); + values.add(value); + } else { + value = RandomPicks.randomFrom(random(), values); + } + dvf.setLongValue(value); + iwriter.addDocument(doc); + } + iwriter.forceMerge(1); + final long size1 = dirSize(dir); + for (int i = 0; i < 20; ++i) { + dvf.setLongValue(RandomPicks.randomFrom(random(), values)); + iwriter.addDocument(doc); + } + iwriter.forceMerge(1); + final long size2 = dirSize(dir); + // make sure the new longs did not cost 8 bytes each + assertTrue(size2 < size1 + 8 * 20); + } + } + + public void testDateCompression() throws IOException { + try (final Directory dir = new ByteBuffersDirectory()) { + final IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + final IndexWriter iwriter = new IndexWriter(dir, iwc); + + final long base = 13; // prime + final long day = 1000L * 60 * 60 * 24; + + final Document doc = new Document(); + final NumericDocValuesField dvf = new NumericDocValuesField("dv", 0); + doc.add(dvf); + for (int i = 0; i < 300; ++i) { + dvf.setLongValue(base + random().nextInt(1000) * day); + iwriter.addDocument(doc); + } + iwriter.forceMerge(1); + final long size1 = dirSize(dir); + for (int i = 0; i < 50; ++i) { + dvf.setLongValue(base + random().nextInt(1000) * day); + iwriter.addDocument(doc); + } + iwriter.forceMerge(1); + final long size2 = dirSize(dir); + // make sure the new longs cost less than if they had only been packed + assertTrue(size2 < size1 + (PackedInts.bitsRequired(day) * 50) / 8); + } + } + + public void testSingleBigValueCompression() throws IOException { + try (final Directory dir = new ByteBuffersDirectory()) { + final IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + final IndexWriter iwriter = new IndexWriter(dir, iwc); + + final Document doc = new Document(); + final NumericDocValuesField dvf = new NumericDocValuesField("dv", 0); + doc.add(dvf); + for (int i = 0; i < 20000; ++i) { + dvf.setLongValue(i & 1023); + iwriter.addDocument(doc); + } + iwriter.forceMerge(1); + final long size1 = dirSize(dir); + dvf.setLongValue(Long.MAX_VALUE); + iwriter.addDocument(doc); + iwriter.forceMerge(1); + final long size2 = dirSize(dir); + // make sure the new value did not grow the bpv for every other value + assertTrue(size2 < size1 + (20000 * (63 - 10)) / 8); + } + } // TODO: these big methods can easily blow up some of the other ram-hungry codecs... // for now just keep them here, as we want to test this for this format.
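The DocValuesSkipper abstraction that this change introduces (see the GITHUB#13449 CHANGES entry above; reader and writer support follows below in the SimpleText and Lucene90 diffs) is meant to be consumed by advancing to a target document and comparing a block's value range against a query range. The following is a minimal sketch under those assumptions, using only the methods visible in this patch and treating level 0 as the finest level; the class and method names are illustrative, not part of the patch.

import java.io.IOException;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.search.DocIdSetIterator;

final class DocValuesSkipperSketch {
  /** Returns the first doc >= target whose skip block may hold a value in [min, max]. */
  static int nextCandidate(DocValuesSkipper skipper, int target, long min, long max)
      throws IOException {
    while (true) {
      skipper.advance(target); // position the skipper on the block containing target
      int blockMinDoc = skipper.minDocID(0);
      if (blockMinDoc == DocIdSetIterator.NO_MORE_DOCS) {
        return DocIdSetIterator.NO_MORE_DOCS; // ran off the end of the segment
      }
      // If the block's [minValue, maxValue] overlaps [min, max], its documents are
      // candidates and still need per-document verification by the caller.
      if (skipper.maxValue(0) >= min && skipper.minValue(0) <= max) {
        return Math.max(target, blockMinDoc);
      }
      target = skipper.maxDocID(0) + 1; // the whole block is non-competitive: skip it
    }
  }
}

This per-block bookkeeping (min/max value, doc bounds, doc count) is exactly what the writeSkipIndex code added further below records for each interval.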
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWHnswScalarQuantizationVectorsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWHnswScalarQuantizationVectorsFormat.java index a1d7648d077..8eac95dd9ef 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWHnswScalarQuantizationVectorsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWHnswScalarQuantizationVectorsFormat.java @@ -52,11 +52,6 @@ class Lucene99RWHnswScalarQuantizationVectorsFormat null); } - @Override - public int getMaxDimensions(String fieldName) { - return 1024; - } - static class Lucene99RWScalarQuantizedFormat extends Lucene99ScalarQuantizedVectorsFormat { private static final FlatVectorsFormat rawVectorFormat = new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()); diff --git a/lucene/benchmark-jmh/build.gradle b/lucene/benchmark-jmh/build.gradle index 2a0e0bffd01..1751a43d7a7 100644 --- a/lucene/benchmark-jmh/build.gradle +++ b/lucene/benchmark-jmh/build.gradle @@ -25,8 +25,8 @@ dependencies { moduleImplementation project(':lucene:core') moduleImplementation project(':lucene:expressions') - moduleImplementation "org.openjdk.jmh:jmh-core:1.37" - annotationProcessor "org.openjdk.jmh:jmh-generator-annprocess:1.37" + moduleImplementation deps.jmh.core + annotationProcessor deps.jmh.annprocess } diff --git a/lucene/benchmark/build.gradle b/lucene/benchmark/build.gradle index 97f17bd3cfb..7ec8a8b3a0d 100644 --- a/lucene/benchmark/build.gradle +++ b/lucene/benchmark/build.gradle @@ -31,17 +31,17 @@ dependencies { moduleImplementation project(':lucene:spatial-extras') moduleImplementation project(':lucene:queryparser') - moduleImplementation "org.apache.commons:commons-compress" - moduleImplementation "com.ibm.icu:icu4j" - moduleImplementation "org.locationtech.spatial4j:spatial4j" - moduleImplementation ("net.sourceforge.nekohtml:nekohtml", { + moduleImplementation deps.commons.compress + moduleImplementation deps.icu4j + moduleImplementation deps.spatial4j + moduleImplementation(deps.nekohtml, { exclude module: "xml-apis" // LUCENE-10337: Exclude xercesImpl from module path because it has split packages with the JDK (!) exclude module: "xercesImpl" }) // LUCENE-10337: Include xercesImpl on regular classpath where it won't cause conflicts. 
- implementation ("xerces:xercesImpl", { + implementation (deps.xerces, { exclude module: "xml-apis" }) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java index 916b696020d..f58ff0873ca 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java @@ -16,13 +16,16 @@ */ package org.apache.lucene.codecs.simpletext; +import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.DOCCOUNT; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.END; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.FIELD; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.LENGTH; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXLENGTH; +import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXVALUE; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MINVALUE; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.NUMVALUES; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.ORDPATTERN; +import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.ORIGIN; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.PATTERN; import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.TYPE; @@ -40,6 +43,7 @@ import java.util.function.IntFunction; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; @@ -59,12 +63,15 @@ import org.apache.lucene.util.StringHelper; class SimpleTextDocValuesReader extends DocValuesProducer { static class OneField { + int docCount; long dataStartFilePointer; String pattern; String ordPattern; int maxLength; boolean fixedLength; + long origin; long minValue; + long maxValue; long numValues; } @@ -99,17 +106,34 @@ class SimpleTextDocValuesReader extends DocValuesProducer { DocValuesType dvType = DocValuesType.valueOf(stripPrefix(TYPE)); assert dvType != DocValuesType.NONE; - if (dvType == DocValuesType.NUMERIC) { + + if (dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED_NUMERIC) { readLine(); assert startsWith(MINVALUE) : "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext; field.minValue = Long.parseLong(stripPrefix(MINVALUE)); readLine(); + assert startsWith(MAXVALUE) + : "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext; + field.maxValue = Long.parseLong(stripPrefix(MAXVALUE)); + } + + readLine(); + assert startsWith(DOCCOUNT) + : "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext; + field.docCount = Integer.parseInt(stripPrefix(DOCCOUNT)); + + if (dvType == DocValuesType.NUMERIC) { + readLine(); + assert startsWith(ORIGIN) + : "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext; + field.origin = Long.parseLong(stripPrefix(ORIGIN)); + readLine(); assert startsWith(PATTERN); field.pattern = stripPrefix(PATTERN); field.dataStartFilePointer = 
data.getFilePointer(); data.seek(data.getFilePointer() + (1 + field.pattern.length() + 2) * (long) maxDoc); - } else if (dvType == DocValuesType.BINARY) { + } else if (dvType == DocValuesType.BINARY || dvType == DocValuesType.SORTED_NUMERIC) { readLine(); assert startsWith(MAXLENGTH); field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH)); @@ -225,7 +249,7 @@ class SimpleTextDocValuesReader extends DocValuesProducer { throw new CorruptIndexException("failed to parse BigDecimal value", in, pe); } SimpleTextUtil.readLine(in, scratch); // read the line telling us if it's real or not - return BigInteger.valueOf(field.minValue).add(bd.toBigIntegerExact()).longValue(); + return BigInteger.valueOf(field.origin).add(bd.toBigIntegerExact()).longValue(); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -824,4 +848,82 @@ class SimpleTextDocValuesReader extends DocValuesProducer { } } } + + @Override + public DocValuesSkipper getSkipper(FieldInfo fieldInfo) { + final boolean numeric = + fieldInfo.getDocValuesType() == DocValuesType.NUMERIC + || fieldInfo.getDocValuesType() == DocValuesType.SORTED_NUMERIC; + final OneField field = fields.get(fieldInfo.name); + + // SegmentCoreReaders already verifies this field is + // valid: + assert field != null; + + return new DocValuesSkipper() { + int doc = -1; + + @Override + public int numLevels() { + return 1; + } + + @Override + public long minValue(int level) { + return minValue(); + } + + @Override + public long maxValue(int level) { + return maxValue(); + } + + @Override + public int docCount(int level) { + return docCount(); + } + + @Override + public long minValue() { + return numeric ? field.minValue : 0; + } + + @Override + public long maxValue() { + return numeric ? field.maxValue : field.numValues - 1; + } + + @Override + public int docCount() { + return field.docCount; + } + + @Override + public int minDocID(int level) { + if (doc == -1) { + return -1; + } else if (doc >= maxDoc || field.docCount == 0) { + return DocIdSetIterator.NO_MORE_DOCS; + } else { + return 0; + } + } + + @Override + public int maxDocID(int level) { + if (doc == -1) { + return -1; + } else if (doc >= maxDoc || field.docCount == 0) { + return DocIdSetIterator.NO_MORE_DOCS; + } else { + return maxDoc - 1; + } + } + + @Override + public void advance(int target) { + doc = target; + } + }; + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java index a49041d2730..760895624da 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesWriter.java @@ -46,8 +46,13 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { static final BytesRef END = new BytesRef("END"); static final BytesRef FIELD = new BytesRef("field "); static final BytesRef TYPE = new BytesRef(" type "); + static final BytesRef DOCCOUNT = new BytesRef(" doccount "); // used for numerics - static final BytesRef MINVALUE = new BytesRef(" minvalue "); + static final BytesRef ORIGIN = new BytesRef(" origin "); // for deltas + + static final BytesRef MINVALUE = new BytesRef(" minvalue "); + static final BytesRef MAXVALUE = new BytesRef(" maxvalue "); + static final BytesRef PATTERN = new BytesRef(" pattern "); // used for bytes static final BytesRef LENGTH = new BytesRef("length "); @@ -97,13 +102,27 @@ class
SimpleTextDocValuesWriter extends DocValuesConsumer { maxValue = Math.max(maxValue, v); numValues++; } + + // write absolute min and max for skipper + SimpleTextUtil.write(data, MINVALUE); + SimpleTextUtil.write(data, Long.toString(minValue), scratch); + SimpleTextUtil.writeNewline(data); + + SimpleTextUtil.write(data, MAXVALUE); + SimpleTextUtil.write(data, Long.toString(maxValue), scratch); + SimpleTextUtil.writeNewline(data); + + SimpleTextUtil.write(data, DOCCOUNT); + SimpleTextUtil.write(data, Integer.toString(numValues), scratch); + SimpleTextUtil.writeNewline(data); + if (numValues != numDocs) { minValue = Math.min(minValue, 0); maxValue = Math.max(maxValue, 0); } // write our minimum value to the .dat, all entries are deltas from that - SimpleTextUtil.write(data, MINVALUE); + SimpleTextUtil.write(data, ORIGIN); SimpleTextUtil.write(data, Long.toString(minValue), scratch); SimpleTextUtil.writeNewline(data); @@ -161,6 +180,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { assert fieldSeen(field.name); assert field.getDocValuesType() == DocValuesType.BINARY; + writeFieldEntry(field, DocValuesType.BINARY); doAddBinaryField(field, valuesProducer); } @@ -168,10 +188,15 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { throws IOException { int maxLength = 0; BinaryDocValues values = valuesProducer.getBinary(field); + int docCount = 0; for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + ++docCount; maxLength = Math.max(maxLength, values.binaryValue().toString().length()); } - writeFieldEntry(field, DocValuesType.BINARY); + + SimpleTextUtil.write(data, DOCCOUNT); + SimpleTextUtil.write(data, Integer.toString(docCount), scratch); + SimpleTextUtil.writeNewline(data); // write maxLength SimpleTextUtil.write(data, MAXLENGTH); @@ -232,6 +257,15 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { assert field.getDocValuesType() == DocValuesType.SORTED; writeFieldEntry(field, DocValuesType.SORTED); + int docCount = 0; + SortedDocValues values = valuesProducer.getSorted(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + ++docCount; + } + SimpleTextUtil.write(data, DOCCOUNT); + SimpleTextUtil.write(data, Integer.toString(docCount), scratch); + SimpleTextUtil.writeNewline(data); + int valueCount = 0; int maxLength = -1; TermsEnum terms = valuesProducer.getSorted(field).termsEnum(); @@ -301,7 +335,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { assert valuesSeen == valueCount; - SortedDocValues values = valuesProducer.getSorted(field); + values = valuesProducer.getSorted(field); for (int i = 0; i < numDocs; ++i) { if (values.docID() < i) { values.nextDoc(); @@ -321,6 +355,28 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { throws IOException { assert fieldSeen(field.name); assert field.getDocValuesType() == DocValuesType.SORTED_NUMERIC; + writeFieldEntry(field, DocValuesType.SORTED_NUMERIC); + + long minValue = Long.MAX_VALUE; + long maxValue = Long.MIN_VALUE; + SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + for (int i = 0; i < values.docValueCount(); ++i) { + long v = values.nextValue(); + minValue = Math.min(minValue, v); + maxValue = Math.max(maxValue, v); + } + } + + // write absolute min and max for 
skipper + SimpleTextUtil.write(data, MINVALUE); + SimpleTextUtil.write(data, Long.toString(minValue), scratch); + SimpleTextUtil.writeNewline(data); + + SimpleTextUtil.write(data, MAXVALUE); + SimpleTextUtil.write(data, Long.toString(maxValue), scratch); + SimpleTextUtil.writeNewline(data); + doAddBinaryField( field, new EmptyDocValuesProducer() { @@ -395,6 +451,15 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { assert field.getDocValuesType() == DocValuesType.SORTED_SET; writeFieldEntry(field, DocValuesType.SORTED_SET); + int docCount = 0; + SortedSetDocValues values = valuesProducer.getSortedSet(field); + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + ++docCount; + } + SimpleTextUtil.write(data, DOCCOUNT); + SimpleTextUtil.write(data, Integer.toString(docCount), scratch); + SimpleTextUtil.writeNewline(data); + long valueCount = 0; int maxLength = 0; TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum(); @@ -430,7 +495,7 @@ class SimpleTextDocValuesWriter extends DocValuesConsumer { // length int maxOrdListLength = 0; StringBuilder sb2 = new StringBuilder(); - SortedSetDocValues values = valuesProducer.getSortedSet(field); + values = valuesProducer.getSortedSet(field); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { sb2.setLength(0); for (int i = 0; i < values.docValueCount(); i++) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index 21cfe9b613f..655938ac67a 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -60,6 +60,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { static final BytesRef PAYLOADS = new BytesRef(" payloads "); static final BytesRef NORMS = new BytesRef(" norms "); static final BytesRef DOCVALUES = new BytesRef(" doc values "); + static final BytesRef DOCVALUES_SKIP_INDEX = new BytesRef(" doc values skip index "); static final BytesRef DOCVALUES_GEN = new BytesRef(" doc values gen "); static final BytesRef INDEXOPTIONS = new BytesRef(" index options "); static final BytesRef NUM_ATTS = new BytesRef(" attributes "); @@ -122,6 +123,11 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { String dvType = readString(DOCVALUES.length, scratch); final DocValuesType docValuesType = docValuesType(dvType); + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), DOCVALUES_SKIP_INDEX); + boolean docValueSkipper = + Boolean.parseBoolean(readString(DOCVALUES_SKIP_INDEX.length, scratch)); + SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), DOCVALUES_GEN); final long dvGen = Long.parseLong(readString(DOCVALUES_GEN.length, scratch)); @@ -184,6 +190,7 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { storePayloads, indexOptions, docValuesType, + docValueSkipper, dvGen, Collections.unmodifiableMap(atts), dimensionalCount, @@ -276,6 +283,10 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { SimpleTextUtil.write(out, getDocValuesType(fi.getDocValuesType()), scratch); SimpleTextUtil.writeNewline(out); + SimpleTextUtil.write(out, DOCVALUES_SKIP_INDEX); + SimpleTextUtil.write(out, Boolean.toString(fi.hasDocValuesSkipIndex()),
scratch); + SimpleTextUtil.writeNewline(out); + SimpleTextUtil.write(out, DOCVALUES_GEN); SimpleTextUtil.write(out, Long.toString(fi.getDocValuesGen()), scratch); SimpleTextUtil.writeNewline(out); diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java index 5c1504679ec..dd3cdf7d768 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextDocValuesFormat.java @@ -37,6 +37,12 @@ import org.apache.lucene.util.BytesRef; public class TestSimpleTextDocValuesFormat extends BaseDocValuesFormatTestCase { private final Codec codec = new SimpleTextCodec(); + @Override + protected boolean skipperHasAccurateDocBounds() { + // This format always returns minDocID = 0 and maxDocID = maxDoc - 1 + return false; + } + @Override protected Codec getCodec() { return codec; diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java index 2ee6b8fd2d2..24f14448178 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java @@ -111,6 +111,7 @@ public class TestBlockWriter extends LuceneTestCase { true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, + false, -1, Collections.emptyMap(), 0, diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java index bf2b0133240..7a6524d77cd 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java @@ -198,6 +198,7 @@ public class TestSTBlockReader extends LuceneTestCase { true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, + false, -1, Collections.emptyMap(), 0, diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index f093415d579..9df3adf7aef 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -76,7 +76,8 @@ module org.apache.lucene.core { org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat, - org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; + org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat, + org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; provides org.apache.lucene.index.SortFieldProvider with diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java index 2731bccd067..2c90448a39c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesProducer.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs; import java.io.Closeable; import 
java.io.IOException; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.NumericDocValues; @@ -73,6 +74,13 @@ public abstract class DocValuesProducer implements Closeable { */ public abstract SortedSetDocValues getSortedSet(FieldInfo field) throws IOException; + /** + * Returns a {@link DocValuesSkipper} for this field. The returned instance need not be + * thread-safe: it will only be used by a single thread. The return value is undefined if {@link + * FieldInfo#hasDocValuesSkipIndex()} doesn't return {@code true}. + */ + public abstract DocValuesSkipper getSkipper(FieldInfo field) throws IOException; + /** * Checks consistency of this producer * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsFormat.java index 39d4bf01c6e..a5e6f7df5b8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsFormat.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.hnsw; import java.io.IOException; +import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -27,14 +28,23 @@ import org.apache.lucene.index.SegmentWriteState; * * @lucene.experimental */ -public abstract class FlatVectorsFormat { +public abstract class FlatVectorsFormat extends KnnVectorsFormat { /** Sole constructor */ - protected FlatVectorsFormat() {} + protected FlatVectorsFormat(String name) { + super(name); + } /** Returns a {@link FlatVectorsWriter} to write the vectors to the index. */ + @Override public abstract FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException; /** Returns a {@link KnnVectorsReader} to read the vectors from the index. 
*/ + @Override public abstract FlatVectorsReader fieldsReader(SegmentReadState state) throws IOException; + + @Override + public int getMaxDimensions(String fieldName) { + return 1024; + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsReader.java index 04e379c10fe..9d776567883 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsReader.java @@ -17,12 +17,11 @@ package org.apache.lucene.codecs.hnsw; -import java.io.Closeable; import java.io.IOException; -import org.apache.lucene.index.ByteVectorValues; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.hnsw.RandomVectorScorer; /** @@ -39,7 +38,7 @@ import org.apache.lucene.util.hnsw.RandomVectorScorer; * * @lucene.experimental */ -public abstract class FlatVectorsReader implements Closeable, Accountable { +public abstract class FlatVectorsReader extends KnnVectorsReader implements Accountable { /** Scorer for flat vectors */ protected final FlatVectorsScorer vectorScorer; @@ -56,6 +55,18 @@ public abstract class FlatVectorsReader implements Closeable, Accountable { return vectorScorer; } + @Override + public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + // don't scan stored field data. If we didn't index it, produce no search results + } + + @Override + public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs) + throws IOException { + // don't scan stored field data. If we didn't index it, produce no search results + } + /** * Returns a {@link RandomVectorScorer} for the given field and target vector. * @@ -77,28 +88,4 @@ public abstract class FlatVectorsReader implements Closeable, Accountable { */ public abstract RandomVectorScorer getRandomVectorScorer(String field, byte[] target) throws IOException; - - /** - * Checks consistency of this reader. - * - *

Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value - * against large data files. - * - * @lucene.internal - */ - public abstract void checkIntegrity() throws IOException; - - /** - * Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if - * the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is - * never {@code null}. - */ - public abstract FloatVectorValues getFloatVectorValues(String field) throws IOException; - - /** - * Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if - * the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is - * never {@code null}. - */ - public abstract ByteVectorValues getByteVectorValues(String field) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsWriter.java index 96af676762f..3a7803011aa 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorsWriter.java @@ -17,14 +17,11 @@ package org.apache.lucene.codecs.hnsw; -import java.io.Closeable; import java.io.IOException; import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; -import org.apache.lucene.index.Sorter; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier; /** @@ -32,7 +29,7 @@ import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier; * * @lucene.experimental */ -public abstract class FlatVectorsWriter implements Accountable, Closeable { +public abstract class FlatVectorsWriter extends KnnVectorsWriter { /** Scorer for flat vectors */ protected final FlatVectorsScorer vectorsScorer; @@ -60,6 +57,11 @@ public abstract class FlatVectorsWriter implements Closeable, Accountable { public abstract FlatFieldVectorsWriter<?> addField( FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException; + @Override + public FlatFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException { + return addField(fieldInfo, null); + } + /** * Write the field for merging, providing a scorer over the newly merged flat vectors. This way * any additional merging logic can be implemented by the user of this class.
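The GITHUB#13469 CHANGES entry above notes that a FlatVectorsFormat can now "be configured using a custom Codec". A minimal sketch of that wiring, assuming the usual per-field codec override pattern; only the format and codec class names come from this change. Note that the FlatVectorsReader.search(...) overrides above are deliberate no-ops, so approximate kNN search on such a field yields no hits, and callers are expected to score the flat vectors directly (e.g. via getRandomVectorScorer).

import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat;

final class FlatVectorsCodec extends Lucene99Codec {
  // A flat (graph-free) format, usable here now that FlatVectorsFormat extends
  // KnnVectorsFormat; it is also registered in module-info.java above.
  private final KnnVectorsFormat flatFormat = new Lucene99ScalarQuantizedVectorsFormat();

  @Override
  public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
    return flatFormat; // route every vector field to the flat format
  }
}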
@@ -72,15 +74,4 @@ public abstract class FlatVectorsWriter implements Accountable, Closeable { */ public abstract CloseableRandomVectorScorerSupplier mergeOneFieldToIndex( FieldInfo fieldInfo, MergeState mergeState) throws IOException; - - /** Write field for merging */ - public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - IOUtils.close(mergeOneFieldToIndex(fieldInfo, mergeState)); - } - - /** Called once at the end before close */ - public abstract void finish() throws IOException; - - /** Flush all buffered data on disk * */ - public abstract void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java index d64dce985c0..63e4891960c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene90; import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.NUMERIC_BLOCK_SHIFT; import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.NUMERIC_BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_INTERVAL_SIZE; import java.io.IOException; import java.util.Arrays; @@ -43,6 +44,7 @@ import org.apache.lucene.search.SortedSetSelector; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.ByteBuffersIndexOutput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -129,16 +131,17 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { throws IOException { meta.writeInt(field.number); meta.writeByte(Lucene90DocValuesFormat.NUMERIC); - - writeValues( - field, + DocValuesProducer producer = new EmptyDocValuesProducer() { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { return DocValues.singleton(valuesProducer.getNumeric(field)); } - }, - false); + }; + if (field.hasDocValuesSkipIndex()) { + writeSkipIndex(field, producer); + } + writeValues(field, producer, false); } private static class MinMaxTracker { @@ -183,6 +186,84 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { } } + private static class SkipAccumulator { + int minDocID; + int maxDocID; + int docCount; + long minValue; + long maxValue; + + SkipAccumulator(int docID) { + minDocID = docID; + minValue = Long.MAX_VALUE; + maxValue = Long.MIN_VALUE; + docCount = 0; + } + + void accumulate(long value) { + minValue = Math.min(minValue, value); + maxValue = Math.max(maxValue, value); + } + + void nextDoc(int docID) { + maxDocID = docID; + ++docCount; + } + + void writeTo(DataOutput output) throws IOException { + output.writeInt(maxDocID); + output.writeInt(minDocID); + output.writeLong(maxValue); + output.writeLong(minValue); + output.writeInt(docCount); + } + } + + private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer) + throws IOException { + assert field.hasDocValuesSkipIndex(); + // TODO: Add disk compression once we introduce levels + long start =
data.getFilePointer(); + SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); + long globalMaxValue = Long.MIN_VALUE; + long globalMinValue = Long.MAX_VALUE; + int globalDocCount = 0; + int maxDocId = -1; + SkipAccumulator accumulator = null; + int counter = 0; + for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { + if (counter == 0) { + accumulator = new SkipAccumulator(doc); + } + accumulator.nextDoc(doc); + for (int i = 0, end = values.docValueCount(); i < end; ++i) { + accumulator.accumulate(values.nextValue()); + } + if (++counter == SKIP_INDEX_INTERVAL_SIZE) { + globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue); + globalMinValue = Math.min(globalMinValue, accumulator.minValue); + globalDocCount += accumulator.docCount; + maxDocId = accumulator.maxDocID; + accumulator.writeTo(data); + counter = 0; + } + } + + if (counter > 0) { + globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue); + globalMinValue = Math.min(globalMinValue, accumulator.minValue); + globalDocCount += accumulator.docCount; + maxDocId = accumulator.maxDocID; + accumulator.writeTo(data); + } + meta.writeLong(start); // record the start in meta + meta.writeLong(data.getFilePointer() - start); // record the length + meta.writeLong(globalMaxValue); + meta.writeLong(globalMinValue); + meta.writeInt(globalDocCount); + meta.writeInt(maxDocId); + } + private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, boolean ords) throws IOException { SortedNumericDocValues values = valuesProducer.getSortedNumeric(field); @@ -489,13 +570,12 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException { meta.writeInt(field.number); meta.writeByte(Lucene90DocValuesFormat.SORTED); - doAddSortedField(field, valuesProducer); + doAddSortedField(field, valuesProducer, false); } - private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer) - throws IOException { - writeValues( - field, + private void doAddSortedField( + FieldInfo field, DocValuesProducer valuesProducer, boolean addTypeByte) throws IOException { + DocValuesProducer producer = new EmptyDocValuesProducer() { @Override public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { @@ -534,8 +614,14 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { }; return DocValues.singleton(sortedOrds); } - }, - true); + }; + if (field.hasDocValuesSkipIndex()) { + writeSkipIndex(field, producer); + } + if (addTypeByte) { + meta.writeByte((byte) 0); // multiValued (0 = singleValued) + } + writeValues(field, producer, true); addTermsDict(DocValues.singleton(valuesProducer.getSorted(field))); } @@ -702,6 +788,12 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { private void doAddSortedNumericField( FieldInfo field, DocValuesProducer valuesProducer, boolean ords) throws IOException { + if (field.hasDocValuesSkipIndex()) { + writeSkipIndex(field, valuesProducer); + } + if (ords) { + meta.writeByte((byte) 1); // multiValued (1 = multiValued) + } long[] stats = writeValues(field, valuesProducer, ords); int numDocsWithField = Math.toIntExact(stats[0]); long numValues = stats[1]; @@ -753,7 +845,7 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { meta.writeByte(Lucene90DocValuesFormat.SORTED_SET); if (isSingleValued(valuesProducer.getSortedSet(field))) { - meta.writeByte((byte) 0); // multiValued 
(0 = singleValued) + doAddSortedField( field, new EmptyDocValuesProducer() { @@ -762,10 +854,10 @@ final class Lucene90DocValuesConsumer extends DocValuesConsumer { return SortedSetSelector.wrap( valuesProducer.getSortedSet(field), SortedSetSelector.Type.MIN); } - }); + }, + true); return; } - meta.writeByte((byte) 1); // multiValued (1 = multiValued) doAddSortedNumericField( field, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesFormat.java index 7a59baf90f3..847a5341584 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesFormat.java @@ -181,4 +181,7 @@ public final class Lucene90DocValuesFormat extends DocValuesFormat { static final int TERMS_DICT_REVERSE_INDEX_SHIFT = 10; static final int TERMS_DICT_REVERSE_INDEX_SIZE = 1 << TERMS_DICT_REVERSE_INDEX_SHIFT; static final int TERMS_DICT_REVERSE_INDEX_MASK = TERMS_DICT_REVERSE_INDEX_SIZE - 1; + + static final int SKIP_INDEX_INTERVAL_SHIFT = 12; + static final int SKIP_INDEX_INTERVAL_SIZE = 1 << SKIP_INDEX_INTERVAL_SHIFT; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index 4078a2dc977..ad467fbc87f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.ImpactsEnum; @@ -39,6 +40,7 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -59,6 +61,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { private final Map sorted; private final Map sortedSets; private final Map sortedNumerics; + private final Map skippers; private final IndexInput data; private final int maxDoc; private int version = -1; @@ -80,6 +83,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { sorted = new HashMap<>(); sortedSets = new HashMap<>(); sortedNumerics = new HashMap<>(); + skippers = new HashMap<>(); merging = false; // read in the entries from the metadata file. 
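The skip index written above is a single flat level for now (the producer's numLevels() returns 1): for every SKIP_INDEX_INTERVAL_SIZE = 1 << 12 = 4096 documents with a value, SkipAccumulator.writeTo emits a fixed 28-byte block of maxDocID and minDocID (ints), maxValue and minValue (longs), and docCount (int), and the metadata records the global bounds. A minimal sketch of scanning that layout, mirroring the producer's advance(int) shown in the next hunks; the SkipBlock record and advanceTo helper are illustrative, not part of the patch, and the DataInput is assumed to be positioned at the start of the skip data:

```java
import java.io.IOException;
import org.apache.lucene.store.DataInput;

class SkipIndexScanSketch {
  // One serialized block, in the exact order SkipAccumulator.writeTo writes it:
  // maxDocID, minDocID (4 bytes each), maxValue, minValue (8 bytes each), docCount (4 bytes).
  record SkipBlock(int maxDocID, int minDocID, long maxValue, long minValue, int docCount) {}

  // Scans forward until a block ends at or after `target`; callers must first check
  // target against the global maxDocId stored in the metadata, as the producer does.
  static SkipBlock advanceTo(DataInput in, int target) throws IOException {
    while (true) {
      int maxDocID = in.readInt();
      if (maxDocID >= target) {
        return new SkipBlock(maxDocID, in.readInt(), in.readLong(), in.readLong(), in.readInt());
      }
      in.skipBytes(24); // minDocID (4) + maxValue (8) + minValue (8) + docCount (4)
    }
  }
}
```

Reading only the leading maxDocID and skipping the remaining 24 bytes of a non-matching block is what lets the producer's advance(int) walk the skip data sequentially without decoding every entry.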
@@ -147,6 +151,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { Map sorted, Map sortedSets, Map sortedNumerics, + Map skippers, IndexInput data, int maxDoc, int version, @@ -156,6 +161,7 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { this.sorted = sorted; this.sortedSets = sortedSets; this.sortedNumerics = sortedNumerics; + this.skippers = skippers; this.data = data.clone(); this.maxDoc = maxDoc; this.version = version; @@ -165,7 +171,16 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { @Override public DocValuesProducer getMergeInstance() { return new Lucene90DocValuesProducer( - numerics, binaries, sorted, sortedSets, sortedNumerics, data, maxDoc, version, true); + numerics, + binaries, + sorted, + sortedSets, + sortedNumerics, + skippers, + data, + maxDoc, + version, + true); } private void readFields(IndexInput meta, FieldInfos infos) throws IOException { @@ -175,6 +190,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } byte type = meta.readByte(); + if (info.hasDocValuesSkipIndex()) { + skippers.put(info.name, readDocValueSkipperMeta(meta)); + } if (type == Lucene90DocValuesFormat.NUMERIC) { numerics.put(info.name, readNumeric(meta)); } else if (type == Lucene90DocValuesFormat.BINARY) { @@ -197,6 +215,17 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { return entry; } + private DocValuesSkipperEntry readDocValueSkipperMeta(IndexInput meta) throws IOException { + long offset = meta.readLong(); + long length = meta.readLong(); + long maxValue = meta.readLong(); + long minValue = meta.readLong(); + int docCount = meta.readInt(); + int maxDocID = meta.readInt(); + + return new DocValuesSkipperEntry(offset, length, minValue, maxValue, docCount, maxDocID); + } + private void readNumeric(IndexInput meta, NumericEntry entry) throws IOException { entry.docsWithFieldOffset = meta.readLong(); entry.docsWithFieldLength = meta.readLong(); @@ -326,6 +355,9 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { data.close(); } + private record DocValuesSkipperEntry( + long offset, long length, long minValue, long maxValue, int docCount, int maxDocId) {} + private static class NumericEntry { long[] table; int blockShift; @@ -1749,4 +1781,88 @@ final class Lucene90DocValuesProducer extends DocValuesProducer { return mul * values.get(index & mask) + delta; } } + + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + final DocValuesSkipperEntry entry = skippers.get(field.name); + + final IndexInput input = data.slice("doc value skipper", entry.offset, entry.length); + // Prefetch the first page of data. Following pages are expected to get prefetched through + // read-ahead. 
+ if (input.length() > 0) { + input.prefetch(0, 1); + } + return new DocValuesSkipper() { + int minDocID = -1; + int maxDocID = -1; + long minValue, maxValue; + int docCount; + + @Override + public void advance(int target) throws IOException { + if (target > entry.maxDocId) { + minDocID = DocIdSetIterator.NO_MORE_DOCS; + maxDocID = DocIdSetIterator.NO_MORE_DOCS; + } else { + while (true) { + maxDocID = input.readInt(); + if (maxDocID >= target) { + minDocID = input.readInt(); + maxValue = input.readLong(); + minValue = input.readLong(); + docCount = input.readInt(); + break; + } else { + input.skipBytes(24); + } + } + } + } + + @Override + public int numLevels() { + return 1; + } + + @Override + public int minDocID(int level) { + return minDocID; + } + + @Override + public int maxDocID(int level) { + return maxDocID; + } + + @Override + public long minValue(int level) { + return minValue; + } + + @Override + public long maxValue(int level) { + return maxValue; + } + + @Override + public int docCount(int level) { + return docCount; + } + + @Override + public long minValue() { + return entry.minValue; + } + + @Override + public long maxValue() { + return entry.maxValue; + } + + @Override + public int docCount() { + return entry.docCount; + } + }; + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 341e28c36f5..481148cbd0d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -163,8 +163,10 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; boolean isParentField = format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; + boolean hasDocValuesSkipIndex = + format >= FORMAT_DOCVALUE_SKIPPER ? 
(bits & DOCVALUES_SKIPPER) != 0 : false; - if ((bits & 0xE0) != 0) { + if ((bits & 0xC0) != 0) { throw new CorruptIndexException( "unused bits are set \"" + Integer.toBinaryString(bits) + "\"", input); } @@ -173,6 +175,13 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { "parent field bit is set but shouldn't \"" + Integer.toBinaryString(bits) + "\"", input); } + if (format < FORMAT_DOCVALUE_SKIPPER && (bits & DOCVALUES_SKIPPER) != 0) { + throw new CorruptIndexException( + "doc values skipper bit is set but shouldn't \"" + + Integer.toBinaryString(bits) + + "\"", + input); + } final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -208,6 +217,7 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { storePayloads, indexOptions, docValuesType, + hasDocValuesSkipIndex, dvGen, attributes, pointDataDimensionCount, @@ -394,6 +404,7 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; if (fi.isParentField()) bits |= PARENT_FIELD_FIELD; + if (fi.hasDocValuesSkipIndex()) bits |= DOCVALUES_SKIPPER; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -423,7 +434,8 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { static final int FORMAT_START = 0; // this doesn't actually change the file format but uses up one more bit an existing bit pattern static final int FORMAT_PARENT_FIELD = 1; - static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD; + static final int FORMAT_DOCVALUE_SKIPPER = 2; + static final int FORMAT_CURRENT = FORMAT_DOCVALUE_SKIPPER; // Field flags static final byte STORE_TERMVECTOR = 0x1; @@ -431,4 +443,5 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { static final byte STORE_PAYLOADS = 0x4; static final byte SOFT_DELETES_FIELD = 0x8; static final byte PARENT_FIELD_FIELD = 0x10; + static final byte DOCVALUES_SKIPPER = 0x20; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java index dc6fe4e7178..78e0cf000fa 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsFormat.java @@ -67,6 +67,7 @@ import org.apache.lucene.store.IndexOutput; */ public final class Lucene99FlatVectorsFormat extends FlatVectorsFormat { + static final String NAME = "Lucene99FlatVectorsFormat"; static final String META_CODEC_NAME = "Lucene99FlatVectorsFormatMeta"; static final String VECTOR_DATA_CODEC_NAME = "Lucene99FlatVectorsFormatData"; static final String META_EXTENSION = "vemf"; @@ -80,6 +81,7 @@ public final class Lucene99FlatVectorsFormat extends FlatVectorsFormat { /** Constructs a format */ public Lucene99FlatVectorsFormat(FlatVectorsScorer vectorsScorer) { + super(NAME); this.vectorsScorer = vectorsScorer; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java index 288a6ae6df9..6232489c08d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99FlatVectorsWriter.java @@ -119,6 +119,11 @@ public final class Lucene99FlatVectorsWriter extends 
FlatVectorsWriter { return newField; } + @Override + public FlatFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { + return addField(fieldInfo, null); + } + @Override public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { for (FieldWriter field : fields) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java index 60908cab5c6..552260894a8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java @@ -89,6 +89,7 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat { */ public Lucene99ScalarQuantizedVectorsFormat( Float confidenceInterval, int bits, boolean compress) { + super(NAME); if (confidenceInterval != null && confidenceInterval != DYNAMIC_CONFIDENCE_INTERVAL && (confidenceInterval < MINIMUM_CONFIDENCE_INTERVAL diff --git a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java index 0e8352da445..2496278fe7a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/perfield/PerFieldDocValuesFormat.java @@ -28,6 +28,7 @@ import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.MergeState; @@ -346,6 +347,12 @@ public abstract class PerFieldDocValuesFormat extends DocValuesFormat { return producer == null ? null : producer.getSortedSet(field); } + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + DocValuesProducer producer = fields.get(field.name); + return producer == null ? null : producer.getSkipper(field); + } + @Override public void close() throws IOException { IOUtils.close(formats.values()); diff --git a/lucene/core/src/java/org/apache/lucene/document/DocValuesRangeIterator.java b/lucene/core/src/java/org/apache/lucene/document/DocValuesRangeIterator.java new file mode 100644 index 00000000000..fbefe128cca --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/DocValuesRangeIterator.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.document; + +import java.io.IOException; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TwoPhaseIterator; + +/** + * Wrapper around a {@link TwoPhaseIterator} for a doc-values range query that speeds things up by + * taking advantage of a {@link DocValuesSkipper}. + */ +final class DocValuesRangeIterator extends TwoPhaseIterator { + + enum Match { + /** None of the documents in the range match */ + NO, + /** Document values need to be checked to verify matches */ + MAYBE, + /** All documents in the range that have a value match */ + IF_DOC_HAS_VALUE, + /** All docs in the range match */ + YES; + } + + private final Approximation approximation; + private final TwoPhaseIterator innerTwoPhase; + + DocValuesRangeIterator( + TwoPhaseIterator twoPhase, DocValuesSkipper skipper, long lowerValue, long upperValue) { + super(new Approximation(twoPhase.approximation(), skipper, lowerValue, upperValue)); + this.approximation = (Approximation) approximation(); + this.innerTwoPhase = twoPhase; + } + + static class Approximation extends DocIdSetIterator { + + private final DocIdSetIterator innerApproximation; + private final DocValuesSkipper skipper; + private final long lowerValue; + private final long upperValue; + + private int doc = -1; + + // Track a decision for all doc IDs between the current doc ID and upTo inclusive. + Match match = Match.MAYBE; + int upTo = -1; + + Approximation( + DocIdSetIterator innerApproximation, + DocValuesSkipper skipper, + long lowerValue, + long upperValue) { + this.innerApproximation = innerApproximation; + this.skipper = skipper; + this.lowerValue = lowerValue; + this.upperValue = upperValue; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(docID() + 1); + } + + @Override + public int advance(int target) throws IOException { + while (true) { + if (target > upTo) { + skipper.advance(target); + // If target doesn't have a value and is between two blocks, it is possible that advance() + // moved to a block that doesn't contain `target`. + target = Math.max(target, skipper.minDocID(0)); + if (target == NO_MORE_DOCS) { + return doc = NO_MORE_DOCS; + } + upTo = skipper.maxDocID(0); + match = match(0); + + // If we have a YES or NO decision, see if we still have the same decision on a higher + // level (= on a wider range of doc IDs) + int nextLevel = 1; + while (match != Match.MAYBE + && nextLevel < skipper.numLevels() + && match == match(nextLevel)) { + upTo = skipper.maxDocID(nextLevel); + nextLevel++; + } + } + switch (match) { + case YES: + return doc = target; + case MAYBE: + case IF_DOC_HAS_VALUE: + if (target > innerApproximation.docID()) { + target = innerApproximation.advance(target); + } + if (target <= upTo) { + return doc = target; + } + // Otherwise we are breaking the invariant that `doc` must always be <= upTo, so let + // the loop run one more iteration to advance the skipper. 
+ break; + case NO: + if (upTo == DocIdSetIterator.NO_MORE_DOCS) { + return doc = NO_MORE_DOCS; + } + target = upTo + 1; + break; + default: + throw new AssertionError("Unknown enum constant: " + match); + } + } + } + + @Override + public long cost() { + return innerApproximation.cost(); + } + + private Match match(int level) { + long minValue = skipper.minValue(level); + long maxValue = skipper.maxValue(level); + if (minValue > upperValue || maxValue < lowerValue) { + return Match.NO; + } else if (minValue >= lowerValue && maxValue <= upperValue) { + if (skipper.docCount(level) == skipper.maxDocID(level) - skipper.minDocID(level) + 1) { + return Match.YES; + } else { + return Match.IF_DOC_HAS_VALUE; + } + } else { + return Match.MAYBE; + } + } + } + + @Override + public final boolean matches() throws IOException { + return switch (approximation.match) { + case YES -> true; + case IF_DOC_HAS_VALUE -> true; + case MAYBE -> innerTwoPhase.matches(); + case NO -> throw new IllegalStateException("Unpositioned approximation"); + }; + } + + @Override + public float matchCost() { + return innerTwoPhase.matchCost(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/document/FieldType.java b/lucene/core/src/java/org/apache/lucene/document/FieldType.java index 5b37955cc14..db4b37f6711 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FieldType.java +++ b/lucene/core/src/java/org/apache/lucene/document/FieldType.java @@ -22,6 +22,7 @@ import java.util.Objects; import org.apache.lucene.analysis.Analyzer; // javadocs import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.VectorEncoding; @@ -40,6 +41,7 @@ public class FieldType implements IndexableFieldType { private IndexOptions indexOptions = IndexOptions.NONE; private boolean frozen; private DocValuesType docValuesType = DocValuesType.NONE; + private boolean docValuesSkipIndex; private int dimensionCount; private int indexDimensionCount; private int dimensionNumBytes; @@ -59,6 +61,7 @@ public class FieldType implements IndexableFieldType { this.omitNorms = ref.omitNorms(); this.indexOptions = ref.indexOptions(); this.docValuesType = ref.docValuesType(); + this.docValuesSkipIndex = ref.hasDocValuesSkipIndex(); this.dimensionCount = ref.pointDimensionCount(); this.indexDimensionCount = ref.pointIndexDimensionCount(); this.dimensionNumBytes = ref.pointNumBytes(); @@ -504,6 +507,22 @@ public class FieldType implements IndexableFieldType { docValuesType = type; } + @Override + public boolean hasDocValuesSkipIndex() { + return docValuesSkipIndex; + } + + /** + * Set whether to enable a skip index for doc values on this field. This is typically useful on + * fields that are part of the {@link IndexWriterConfig#setIndexSort index sort}, or that + * correlate with fields that are part of the index sort, so that values can be expected to be + * clustered in the doc ID space. + */ + public void setDocValuesSkipIndex(boolean docValuesSkipIndex) { + checkIfFrozen(); + this.docValuesSkipIndex = docValuesSkipIndex; + } + @Override public int hashCode() { final int prime = 31; @@ -512,6 +531,7 @@ public class FieldType implements IndexableFieldType { result = prime * result + indexDimensionCount; result = prime * result + dimensionNumBytes; result = prime * result + ((docValuesType == null) ? 
0 : docValuesType.hashCode()); + result = prime * result + Boolean.hashCode(docValuesSkipIndex); result = prime * result + indexOptions.hashCode(); result = prime * result + (omitNorms ? 1231 : 1237); result = prime * result + (storeTermVectorOffsets ? 1231 : 1237); @@ -533,6 +553,7 @@ if (indexDimensionCount != other.indexDimensionCount) return false; if (dimensionNumBytes != other.dimensionNumBytes) return false; if (docValuesType != other.docValuesType) return false; + if (docValuesSkipIndex != other.docValuesSkipIndex) return false; if (indexOptions != other.indexOptions) return false; if (omitNorms != other.omitNorms) return false; if (storeTermVectorOffsets != other.storeTermVectorOffsets) return false; diff --git a/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java index 6c3c3825e33..95ed6eb0711 100644 --- a/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/NumericDocValuesField.java @@ -35,9 +35,27 @@ public class NumericDocValuesField extends Field { /** Type for numeric DocValues. */ public static final FieldType TYPE = new FieldType(); + private static final FieldType INDEXED_TYPE; + static { TYPE.setDocValuesType(DocValuesType.NUMERIC); TYPE.freeze(); + + INDEXED_TYPE = new FieldType(TYPE); + INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.freeze(); + } + + /** + * Creates a new {@link NumericDocValuesField} with the specified 64-bit long value that also + * creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * + * @param name field name + * @param value 64-bit long value + * @throws IllegalArgumentException if the field name is null + */ + public static NumericDocValuesField indexedField(String name, long value) { + return new NumericDocValuesField(name, value, INDEXED_TYPE); } /** @@ -60,7 +78,11 @@ * @throws IllegalArgumentException if the field name is null */ public NumericDocValuesField(String name, Long value) { - super(name, TYPE); + this(name, value, TYPE); + } + + private NumericDocValuesField(String name, Long value, FieldType fieldType) { + super(name, fieldType); fieldsData = value; } diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java index 61476e87a2b..2ed6956b717 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedDocValuesField.java @@ -41,9 +41,27 @@ public class SortedDocValuesField extends Field { /** Type for sorted bytes DocValues */ public static final FieldType TYPE = new FieldType(); + private static final FieldType INDEXED_TYPE; + static { TYPE.setDocValuesType(DocValuesType.SORTED); TYPE.freeze(); + + INDEXED_TYPE = new FieldType(TYPE); + INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.freeze(); + } + + /** + * Creates a new {@link SortedDocValuesField} with the specified binary value that also + * creates a {@link FieldType#hasDocValuesSkipIndex() skip index}.
+ * + * @param name field name + * @param bytes binary content + * @throws IllegalArgumentException if the field name is null + */ + public static SortedDocValuesField indexedField(String name, BytesRef bytes) { + return new SortedDocValuesField(name, bytes, INDEXED_TYPE); } /** @@ -54,7 +72,11 @@ public class SortedDocValuesField extends Field { * @throws IllegalArgumentException if the field name is null */ public SortedDocValuesField(String name, BytesRef bytes) { - super(name, TYPE); + this(name, bytes, TYPE); + } + + private SortedDocValuesField(String name, BytesRef bytes, FieldType fieldType) { + super(name, fieldType); fieldsData = bytes; } diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java index 091634cfb40..2d635462a22 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesField.java @@ -43,9 +43,27 @@ public class SortedNumericDocValuesField extends Field { /** Type for sorted numeric DocValues. */ public static final FieldType TYPE = new FieldType(); + private static final FieldType INDEXED_TYPE; + static { TYPE.setDocValuesType(DocValuesType.SORTED_NUMERIC); TYPE.freeze(); + + INDEXED_TYPE = new FieldType(TYPE); + INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.freeze(); + } + + /** + * Creates a new {@link SortedNumericDocValuesField} with the specified 64-bit long value that + * also creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * + * @param name field name + * @param value 64-bit long value + * @throws IllegalArgumentException if the field name is null + */ + public static SortedNumericDocValuesField indexedField(String name, long value) { + return new SortedNumericDocValuesField(name, value, INDEXED_TYPE); } /** @@ -56,8 +74,12 @@ public class SortedNumericDocValuesField extends Field { * @throws IllegalArgumentException if the field name is null */ public SortedNumericDocValuesField(String name, long value) { - super(name, TYPE); - fieldsData = Long.valueOf(value); + this(name, Long.valueOf(value), TYPE); + } + + private SortedNumericDocValuesField(String name, Long value, FieldType fieldType) { + super(name, fieldType); + fieldsData = value; } /** diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java index abbbeae571c..31ec0319db8 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java @@ -19,6 +19,7 @@ package org.apache.lucene.document; import java.io.IOException; import java.util.Objects; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SortedNumericDocValues; @@ -109,9 +110,17 @@ final class SortedNumericDocValuesRangeQuery extends Query { if (context.reader().getFieldInfos().fieldInfo(field) == null) { return null; } + + DocValuesSkipper skipper = context.reader().getDocValuesSkipper(field); + if (skipper != null) { + if (skipper.minValue() > upperValue || skipper.maxValue() < lowerValue) { + return null; + } + } + SortedNumericDocValues values = 
DocValues.getSortedNumeric(context.reader(), field); final NumericDocValues singleton = DocValues.unwrapSingleton(values); - final TwoPhaseIterator iterator; + TwoPhaseIterator iterator; if (singleton != null) { iterator = new TwoPhaseIterator(singleton) { @@ -149,6 +158,9 @@ } }; } + if (skipper != null) { + iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue); + } final var scorer = new ConstantScoreScorer(score(), scoreMode, iterator); return new DefaultScorerSupplier(scorer); } diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java index 9bec5459664..74ae5dc8043 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesField.java @@ -42,9 +42,27 @@ public class SortedSetDocValuesField extends Field { /** Type for sorted bytes DocValues */ public static final FieldType TYPE = new FieldType(); + private static final FieldType INDEXED_TYPE; + static { TYPE.setDocValuesType(DocValuesType.SORTED_SET); TYPE.freeze(); + + INDEXED_TYPE = new FieldType(TYPE); + INDEXED_TYPE.setDocValuesSkipIndex(true); + INDEXED_TYPE.freeze(); + } + + /** + * Creates a new {@link SortedSetDocValuesField} with the specified binary value that also + * creates a {@link FieldType#hasDocValuesSkipIndex() skip index}. + * + * @param name field name + * @param bytes binary content + * @throws IllegalArgumentException if the field name is null + */ + public static SortedSetDocValuesField indexedField(String name, BytesRef bytes) { + return new SortedSetDocValuesField(name, bytes, INDEXED_TYPE); } /** @@ -55,7 +73,11 @@ * @throws IllegalArgumentException if the field name is null */ public SortedSetDocValuesField(String name, BytesRef bytes) { - super(name, TYPE); + this(name, bytes, TYPE); + } + + private SortedSetDocValuesField(String name, BytesRef bytes, FieldType fieldType) { + super(name, fieldType); fieldsData = bytes; } diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java index 0c9959b8756..d03daac0689 100644 --- a/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/document/SortedSetDocValuesRangeQuery.java @@ -19,6 +19,7 @@ package org.apache.lucene.document; import java.io.IOException; import java.util.Objects; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; @@ -113,6 +114,7 @@ final class SortedSetDocValuesRangeQuery extends Query { if (context.reader().getFieldInfos().fieldInfo(field) == null) { return null; } + DocValuesSkipper skipper = context.reader().getDocValuesSkipper(field); SortedSetDocValues values = DocValues.getSortedSet(context.reader(), field); // implement ScorerSupplier, since we do some expensive stuff to make a scorer @@ -149,12 +151,15 @@ } // no terms matched in this segment - if (minOrd > maxOrd) { + if (minOrd > maxOrd + || (skipper != null + &&
(minOrd > skipper.maxValue() || maxOrd < skipper.minValue()))) { return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty()); } final SortedDocValues singleton = DocValues.unwrapSingleton(values); - final TwoPhaseIterator iterator; + TwoPhaseIterator iterator; if (singleton != null) { iterator = new TwoPhaseIterator(singleton) { @@ -192,6 +197,9 @@ } }; } + if (skipper != null) { + iterator = new DocValuesRangeIterator(iterator, skipper, minOrd, maxOrd); + } return new ConstantScoreScorer(score(), scoreMode, iterator); } diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index f4e3d1248d8..0f6020f7873 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -45,11 +45,14 @@ import java.util.function.Supplier; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.TermVectorsReader; +import org.apache.lucene.codecs.hnsw.FlatVectorsReader; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; @@ -365,6 +368,9 @@ public final class CheckIndex implements Closeable { /** Total number of sortedset fields */ public long totalSortedSetFields; + /** Total number of skipping indexes tested.
*/ + public long totalSkippingIndex; + /** Exception thrown during doc values test (null on success) */ public Throwable error; } @@ -2739,6 +2745,14 @@ public final class CheckIndex implements Closeable { return status; } + private static boolean vectorsReaderSupportsSearch(CodecReader codecReader, String fieldName) { + KnnVectorsReader vectorsReader = codecReader.getVectorReader(); + if (vectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader perFieldReader) { + vectorsReader = perFieldReader.getFieldReader(fieldName); + } + return (vectorsReader instanceof FlatVectorsReader) == false; + } + private static void checkFloatVectorValues( FloatVectorValues values, FieldInfo fieldInfo, @@ -2751,11 +2765,15 @@ public final class CheckIndex implements Closeable { // search the first maxNumSearches vectors to exercise the graph if (values.docID() % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); - codecReader.getVectorReader().search(fieldInfo.name, values.vectorValue(), collector, null); - TopDocs docs = collector.topDocs(); - if (docs.scoreDocs.length == 0) { - throw new CheckIndexException( - "Field \"" + fieldInfo.name + "\" failed to search k nearest neighbors"); + if (vectorsReaderSupportsSearch(codecReader, fieldInfo.name)) { + codecReader + .getVectorReader() + .search(fieldInfo.name, values.vectorValue(), collector, null); + TopDocs docs = collector.topDocs(); + if (docs.scoreDocs.length == 0) { + throw new CheckIndexException( + "Field \"" + fieldInfo.name + "\" failed to search k nearest neighbors"); + } } } int valueLength = values.vectorValue().length; @@ -2791,9 +2809,10 @@ public final class CheckIndex implements Closeable { throws IOException { int docCount = 0; int everyNdoc = Math.max(values.size() / 64, 1); + boolean supportsSearch = vectorsReaderSupportsSearch(codecReader, fieldInfo.name); while (values.nextDoc() != NO_MORE_DOCS) { // search the first maxNumSearches vectors to exercise the graph - if (values.docID() % everyNdoc == 0) { + if (supportsSearch && values.docID() % everyNdoc == 0) { KnnCollector collector = new TopKnnCollector(10, Integer.MAX_VALUE); codecReader.getVectorReader().search(fieldInfo.name, values.vectorValue(), collector, null); TopDocs docs = collector.topDocs(); @@ -3228,13 +3247,14 @@ public final class CheckIndex implements Closeable { infoStream, String.format( Locale.ROOT, - "OK [%d docvalues fields; %d BINARY; %d NUMERIC; %d SORTED; %d SORTED_NUMERIC; %d SORTED_SET] [took %.3f sec]", + "OK [%d docvalues fields; %d BINARY; %d NUMERIC; %d SORTED; %d SORTED_NUMERIC; %d SORTED_SET; %d SKIPPING INDEX] [took %.3f sec]", status.totalValueFields, status.totalBinaryFields, status.totalNumericFields, status.totalSortedFields, status.totalSortedNumericFields, status.totalSortedSetFields, + status.totalSkippingIndex, nsToSec(System.nanoTime() - startNS))); } catch (Throwable e) { if (failFast) { @@ -3254,6 +3274,94 @@ public final class CheckIndex implements Closeable { DocValuesIterator get(FieldInfo fi) throws IOException; } + private static void checkDocValueSkipper(FieldInfo fi, DocValuesSkipper skipper) + throws IOException { + String fieldName = fi.name; + if (skipper.maxDocID(0) != -1) { + throw new CheckIndexException( + "binary dv iterator for field: " + + fieldName + + " should start at docID=-1, but got " + + skipper.maxDocID(0)); + } + if (skipper.docCount() > 0 && skipper.minValue() > skipper.maxValue()) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " 
reports wrong global value range, got " + + skipper.minValue() + + " > " + + skipper.maxValue()); + } + int docCount = 0; + int doc; + while (true) { + doc = skipper.maxDocID(0) + 1; + skipper.advance(doc); + if (skipper.maxDocID(0) == NO_MORE_DOCS) { + break; + } + int levels = skipper.numLevels(); + for (int level = 0; level < levels; level++) { + if (skipper.minDocID(level) < doc) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " reports wrong minDocID, got " + + skipper.minDocID(level) + + " < " + + doc); + } + if (skipper.minDocID(level) > skipper.maxDocID(level)) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " reports wrong doc range, got " + + skipper.minDocID(level) + + " > " + + skipper.maxDocID(level)); + } + if (skipper.minValue() > skipper.minValue(level)) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " : global minValue " + + skipper.minValue() + + " , got " + + skipper.minValue(level)); + } + if (skipper.maxValue() < skipper.maxValue(level)) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " : global maxValue " + + skipper.maxValue() + + " , got " + + skipper.maxValue(level)); + } + if (skipper.minValue(level) > skipper.maxValue(level)) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " reports wrong value range, got " + + skipper.minValue(level) + + " > " + + skipper.maxValue(level)); + } + } + docCount += skipper.docCount(0); + } + if (skipper.docCount() != docCount) { + throw new CheckIndexException( + "skipper dv iterator for field: " + + fieldName + + " inconsistent docCount, got " + + skipper.docCount() + + " != " + + docCount); + } + } + private static void checkDVIterator(FieldInfo fi, DocValuesIteratorSupplier producer) throws IOException { String field = fi.name; @@ -3627,6 +3735,10 @@ public final class CheckIndex implements Closeable { private static void checkDocValues( FieldInfo fi, DocValuesProducer dvReader, DocValuesStatus status) throws Exception { + if (fi.hasDocValuesSkipIndex()) { + status.totalSkippingIndex++; + checkDocValueSkipper(fi, dvReader.getSkipper(fi)); + } switch (fi.getDocValuesType()) { case SORTED: status.totalSortedFields++; diff --git a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java index 980f2dd9582..bec27c5176e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java @@ -196,6 +196,16 @@ public abstract class CodecReader extends LeafReader { return getDocValuesReader().getSortedSet(fi); } + @Override + public final DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + ensureOpen(); + FieldInfo fi = getFieldInfos().fieldInfo(field); + if (fi == null || fi.hasDocValuesSkipIndex() == false) { + return null; + } + return getDocValuesReader().getSkipper(fi); + } + @Override public final NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java index c238cea9d60..3504c7429a5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java @@ -108,4 +108,9 @@ abstract class DocValuesLeafReader 
extends LeafReader { public final CacheHelper getReaderCacheHelper() { throw new UnsupportedOperationException(); } + + @Override + public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + throw new UnsupportedOperationException(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java new file mode 100644 index 00000000000..15d3c67c34d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesSkipper.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import org.apache.lucene.search.DocIdSetIterator; + +/** + * Skipper for {@link DocValues}. + * + * <p>A skipper has a position that can only be advanced via {@link #advance(int)}. The next advance + * position must be greater than {@link #maxDocID(int)} at level 0. A skipper's position, along with + * a {@code level}, determines the interval at which the skipper is currently situated. + */ +public abstract class DocValuesSkipper { + + /** + * Advance this skipper so that all levels contain the next document on or after {@code target}. + * + * <p>

NOTE: The behavior is undefined if {@code target} is less than or equal to {@code + * maxDocID(0)}. + * + *

NOTE: {@code minDocID(0)} may return a doc ID that is greater than {@code target} if + * the target document doesn't have a value. + */ + public abstract void advance(int target) throws IOException; + + /** Return the number of levels. This number may change when moving to a different interval. */ + public abstract int numLevels(); + + /** + * Return the minimum doc ID of the interval on the given level, inclusive. This returns {@code + * -1} if {@link #advance(int)} has not been called yet and {@link DocIdSetIterator#NO_MORE_DOCS} + * if the iterator is exhausted. This method is non-increasing when {@code level} increases. Said + * otherwise {@code minDocID(level+1) <= minDocId(level)}. + */ + public abstract int minDocID(int level); + + /** + * Return the maximum doc ID of the interval on the given level, inclusive. This returns {@code + * -1} if {@link #advance(int)} has not been called yet and {@link DocIdSetIterator#NO_MORE_DOCS} + * if the iterator is exhausted. This method is non-decreasing when {@code level} decreases. Said + * otherwise {@code maxDocID(level+1) >= maxDocId(level)}. + */ + public abstract int maxDocID(int level); + + /** + * Return the minimum value of the interval at the given level, inclusive. + * + *

NOTE: It is only guaranteed that values in this interval are greater than or equal + * the returned value. There is no guarantee that one document actually has this value. + */ + public abstract long minValue(int level); + + /** + * Return the maximum value of the interval at the given level, inclusive. + * + *

NOTE: It is only guaranteed that values in this interval are less than or equal the + * returned value. There is no guarantee that one document actually has this value. + */ + public abstract long maxValue(int level); + + /** + * Return the number of documents that have a value in the interval associated with the given + * level. + */ + public abstract int docCount(int level); + + /** + * Return the global minimum value. + * + *

NOTE: It is only guaranteed that values are greater than or equal the returned value. + * There is no guarantee that one document actually has this value. + */ + public abstract long minValue(); + + /** + * Return the global maximum value. + * + *

NOTE: It is only guaranteed that values are less than or equal the returned value. + * There is no guarantee that one document actually has this value. + */ + public abstract long maxValue(); + + /** Return the global number of documents with a value for the field. */ + public abstract int docCount(); +} diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java index 6a14772481b..f40132907a6 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesType.java @@ -22,31 +22,37 @@ package org.apache.lucene.index; */ public enum DocValuesType { /** No doc values for this field. */ - NONE, + NONE(false), /** A per-document Number */ - NUMERIC, + NUMERIC(true), /** * A per-document byte[]. Values may be larger than 32766 bytes, but different codecs may enforce * their own limits. */ - BINARY, + BINARY(false), /** * A pre-sorted byte[]. Fields with this type only store distinct byte values and store an * additional offset pointer per document to dereference the shared byte[]. The stored byte[] is * presorted and allows access via document id, ordinal and by-value. Values must be {@code <= * 32766} bytes. */ - SORTED, + SORTED(true), /** * A pre-sorted Number[]. Fields with this type store numeric values in sorted order according to * {@link Long#compare(long, long)}. */ - SORTED_NUMERIC, + SORTED_NUMERIC(true), /** * A pre-sorted Set<byte[]>. Fields with this type only store distinct byte values and store * additional offset pointers per document to dereference the shared byte[]s. The stored byte[] is * presorted and allows access via document id, ordinal and by-value. Values must be {@code <= * 32766} bytes. */ - SORTED_SET, + SORTED_SET(true); + + final boolean supportsSkipIndex; // pkg-private for use in FieldInfo + + DocValuesType(boolean supportsSkipIndex) { + this.supportsSkipIndex = supportsSkipIndex; + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java index 89a034e1c41..8e6df43c43a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/index/EmptyDocValuesProducer.java @@ -51,6 +51,11 @@ public abstract class EmptyDocValuesProducer extends DocValuesProducer { throw new UnsupportedOperationException(); } + @Override + public DocValuesSkipper getSkipper(FieldInfo field) { + throw new UnsupportedOperationException(); + } + @Override public void checkIntegrity() { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 610ea689e3c..862c6b37993 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -28,13 +28,16 @@ import java.util.Objects; * threads accessing this object. 
*/ public final class FieldInfo { + /** Field's name */ public final String name; /** Internal field number */ public final int number; - private DocValuesType docValuesType; + private DocValuesType docValuesType = DocValuesType.NONE; + + private final boolean docValuesSkipIndex; // True if any document indexed term vectors private boolean storeTermVector; @@ -80,6 +83,7 @@ public final class FieldInfo { boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, + boolean hasDocValuesSkipIndex, long dvGen, Map attributes, int pointDimensionCount, @@ -95,6 +99,7 @@ public final class FieldInfo { this.docValuesType = Objects.requireNonNull( docValues, "DocValuesType must not be null (field: \"" + name + "\")"); + this.docValuesSkipIndex = hasDocValuesSkipIndex; this.indexOptions = Objects.requireNonNull( indexOptions, "IndexOptions must not be null (field: \"" + name + "\")"); @@ -152,6 +157,13 @@ public final class FieldInfo { if (docValuesType == null) { throw new IllegalArgumentException("DocValuesType must not be null (field: '" + name + "')"); } + if (docValuesType.supportsSkipIndex == false && docValuesSkipIndex) { + throw new IllegalArgumentException( + "field '" + + name + + "' cannot have docValuesSkipIndex set to true with doc values type " + + docValuesType); + } if (dvGen != -1 && docValuesType == DocValuesType.NONE) { throw new IllegalArgumentException( "field '" @@ -235,6 +247,7 @@ public final class FieldInfo { verifySameStoreTermVectors(fieldName, this.storeTermVector, o.storeTermVector); } verifySameDocValuesType(fieldName, this.docValuesType, o.docValuesType); + verifySameDocValuesSkipIndex(fieldName, this.docValuesSkipIndex, o.docValuesSkipIndex); verifySamePointsOptions( fieldName, this.pointDimensionCount, @@ -289,6 +302,24 @@ public final class FieldInfo { } } + /** + * Verify that the provided docValues type are the same + * + * @throws IllegalArgumentException if they are not the same + */ + static void verifySameDocValuesSkipIndex( + String fieldName, boolean hasDocValuesSkipIndex1, boolean hasDocValuesSkipIndex2) { + if (hasDocValuesSkipIndex1 != hasDocValuesSkipIndex2) { + throw new IllegalArgumentException( + "cannot change field \"" + + fieldName + + "\" from docValuesSkipIndex=" + + hasDocValuesSkipIndex1 + + " to inconsistent docValuesSkipIndex=" + + hasDocValuesSkipIndex2); + } + } + /** * Verify that the provided store term vectors options are the same * @@ -557,6 +588,11 @@ public final class FieldInfo { return docValuesType; } + /** Returns true if, and only if, this field has a skip index. */ + public boolean hasDocValuesSkipIndex() { + return docValuesSkipIndex; + } + /** Sets the docValues generation of this field. 
*/ void setDocValuesGen(long dvGen) { this.dvGen = dvGen; diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 628cdd12844..fd0338bfe68 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.index; +import static org.apache.lucene.index.FieldInfo.verifySameDocValuesSkipIndex; import static org.apache.lucene.index.FieldInfo.verifySameDocValuesType; import static org.apache.lucene.index.FieldInfo.verifySameIndexOptions; import static org.apache.lucene.index.FieldInfo.verifySameOmitNorms; @@ -364,6 +365,7 @@ public class FieldInfos implements Iterable { IndexOptions indexOptions, IndexOptionsProperties indexOptionsProperties, DocValuesType docValuesType, + boolean docValuesSkipIndex, FieldDimensions fieldDimensions, FieldVectorProperties fieldVectorProperties) {} @@ -442,6 +444,7 @@ ? new IndexOptionsProperties(fi.hasVectors(), fi.omitsNorms()) : null, fi.getDocValuesType(), + fi.hasDocValuesSkipIndex(), new FieldDimensions( fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), @@ -521,6 +524,9 @@ DocValuesType currentDVType = fieldProperties.docValuesType; verifySameDocValuesType(fieldName, currentDVType, fi.getDocValuesType()); + boolean currentDocValuesSkipIndex = fieldProperties.docValuesSkipIndex; + verifySameDocValuesSkipIndex( + fieldName, currentDocValuesSkipIndex, fi.hasDocValuesSkipIndex()); FieldDimensions dims = fieldProperties.fieldDimensions; verifySamePointsOptions( @@ -576,6 +582,7 @@ false, IndexOptions.NONE, dvType, + false, -1, new HashMap<>(), 0, @@ -602,6 +609,15 @@ + fieldDvType + "]."); } + boolean hasDocValuesSkipIndex = fieldProperties.docValuesSkipIndex; + if (hasDocValuesSkipIndex) { + throw new IllegalArgumentException( + "Can't update [" + + dvType + + "] doc values; the field [" + + fieldName + + "] must be a doc values only field, but it has a doc values skip index"); } FieldDimensions fdimensions = fieldProperties.fieldDimensions; if (fdimensions != null && fdimensions.dimensionCount != 0) { throw new IllegalArgumentException( @@ -660,6 +676,7 @@ false, IndexOptions.NONE, dvType, + false, -1, new HashMap<>(), 0, @@ -780,6 +797,7 @@ fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), + fi.hasDocValuesSkipIndex(), dvGen, // original attributes is UnmodifiableMap new HashMap<>(fi.attributes()), diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java index 38d0ac48d46..4935237178a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java @@ -441,6 +441,12 @@ public abstract class FilterLeafReader extends LeafReader { return in.getSortedSetDocValues(field); } + @Override + public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + ensureOpen(); + return in.getDocValuesSkipper(field); + } + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); diff --git 
a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java index 5d32e5dbcf9..006828e98a2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java @@ -86,6 +86,9 @@ public interface IndexableFieldType { /** DocValues {@link DocValuesType}: how the field's value will be indexed into docValues. */ DocValuesType docValuesType(); + /** Whether a skip index for doc values should be created on this field. */ + boolean hasDocValuesSkipIndex(); + /** * If this is positive (representing the number of point dimensions), the field is indexed as a * point. diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index 701fa16f6b3..4e61afbc2b4 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -680,6 +680,7 @@ final class IndexingChain implements Accountable { false, s.indexOptions, s.docValuesType, + s.hasDocValuesSkipIndex, -1, s.attributes, s.pointDimensionCount, @@ -831,7 +832,12 @@ final class IndexingChain implements Accountable { verifyUnIndexedFieldType(fieldName, fieldType); } if (fieldType.docValuesType() != DocValuesType.NONE) { - schema.setDocValues(fieldType.docValuesType()); + schema.setDocValues(fieldType.docValuesType(), fieldType.hasDocValuesSkipIndex()); + } else if (fieldType.hasDocValuesSkipIndex()) { + throw new IllegalArgumentException( + "field '" + + schema.name + + "' cannot have docValuesSkipIndex set to true without doc values"); } if (fieldType.pointDimensionCount() != 0) { schema.setPoints( @@ -1432,6 +1438,7 @@ final class IndexingChain implements Accountable { private boolean storeTermVector = false; private IndexOptions indexOptions = IndexOptions.NONE; private DocValuesType docValuesType = DocValuesType.NONE; + private boolean hasDocValuesSkipIndex = false; private int pointDimensionCount = 0; private int pointIndexDimensionCount = 0; private int pointNumBytes = 0; @@ -1497,11 +1504,13 @@ final class IndexingChain implements Accountable { } } - void setDocValues(DocValuesType newDocValuesType) { + void setDocValues(DocValuesType newDocValuesType, boolean newHasDocValuesSkipIndex) { if (docValuesType == DocValuesType.NONE) { this.docValuesType = newDocValuesType; + this.hasDocValuesSkipIndex = newHasDocValuesSkipIndex; } else { assertSame("doc values type", docValuesType, newDocValuesType); + assertSame("doc values skip index", hasDocValuesSkipIndex, newHasDocValuesSkipIndex); } } @@ -1549,6 +1558,7 @@ final class IndexingChain implements Accountable { assertSame("omit norms", fi.omitsNorms(), omitNorms); assertSame("store term vector", fi.hasVectors(), storeTermVector); assertSame("doc values type", fi.getDocValuesType(), docValuesType); + assertSame("doc values skip index", fi.hasDocValuesSkipIndex(), hasDocValuesSkipIndex); assertSame( "vector similarity function", fi.getVectorSimilarityFunction(), vectorSimilarityFunction); assertSame("vector encoding", fi.getVectorEncoding(), vectorEncoding); diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java index 5c3c9f8da5e..0f39d1ae1e8 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java @@ -202,6 
+202,13 @@ public abstract non-sealed class LeafReader extends IndexReader { */ public abstract NumericDocValues getNormValues(String field) throws IOException; + /** + * Returns a {@link DocValuesSkipper} allowing skipping ranges of doc IDs that are not of + * interest, or {@code null} if a skip index was not indexed. The returned instance should be + * confined to the thread that created it. + */ + public abstract DocValuesSkipper getDocValuesSkipper(String field) throws IOException; + /** * Returns {@link FloatVectorValues} for this field, or null if no {@link FloatVectorValues} were * indexed. The returned instance should only be used by a single thread. diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 19c2fd6bd4e..1f1e2dba9c1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -399,6 +399,13 @@ public class ParallelLeafReader extends LeafReader { return reader == null ? null : reader.getSortedSetDocValues(field); } + @Override + public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + ensureOpen(); + LeafReader reader = fieldToReader.get(field); + return reader == null ? null : reader.getDocValuesSkipper(field); + } + @Override public NumericDocValues getNormValues(String field) throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java index 513f897d725..4be3e41ad19 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingDeletes.java @@ -97,9 +97,8 @@ class PendingDeletes { + info.info.name + " maxDoc=" + info.info.maxDoc(); - final boolean didDelete = mutableBits.get(docID); + final boolean didDelete = mutableBits.getAndClear(docID); if (didDelete) { - mutableBits.clear(docID); pendingDeleteCount++; } return didDelete; diff --git a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java index accdb57d1ba..557d31ad441 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java +++ b/lucene/core/src/java/org/apache/lucene/index/PendingSoftDeletes.java @@ -53,8 +53,7 @@ final class PendingSoftDeletes extends PendingDeletes { FixedBitSet mutableBits = getMutableBits(); // hardDeletes if (hardDeletes.delete(docID)) { - if (mutableBits.get(docID)) { // delete it here too! - mutableBits.clear(docID); + if (mutableBits.getAndClear(docID)) { // delete it here too! assert hardDeletes.delete(docID) == false; } else { // if it was deleted subtract the delCount @@ -135,16 +134,14 @@ final class PendingSoftDeletes extends PendingDeletes { : null; while ((docID = iterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { if (hasValue == null || hasValue.hasValue()) { - if (bits.get(docID)) { // doc is live - clear it - bits.clear(docID); + if (bits.getAndClear(docID)) { // doc is live - clear it newDeletes++; // now that we know we deleted it and we fully control the hard deletes we can do correct // accounting // below. 
} } else { - if (bits.get(docID) == false) { - bits.set(docID); + if (bits.getAndSet(docID) == false) { newDeletes--; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index ed635c68038..9d4f79a5586 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -713,6 +713,7 @@ final class ReadersAndUpdates { fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), + fi.hasDocValuesSkipIndex(), fi.getDocValuesGen(), new HashMap<>(fi.attributes()), fi.getPointDimensionCount(), diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java index 63baddb4174..1d9878fe0db 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentDocValuesProducer.java @@ -124,6 +124,13 @@ class SegmentDocValuesProducer extends DocValuesProducer { return dvProducer.getSortedSet(field); } + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + DocValuesProducer dvProducer = dvProducersByField.get(field.name); + assert dvProducer != null; + return dvProducer.getSkipper(field); + } + @Override public void checkIntegrity() throws IOException { for (DocValuesProducer producer : dvProducers) { diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java index 497fa1162bd..4d05d241e69 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java @@ -240,6 +240,11 @@ public final class SlowCodecReaderWrapper { return reader.getSortedSetDocValues(field.name); } + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + return reader.getDocValuesSkipper(field.name); + } + @Override public void checkIntegrity() throws IOException { // We already checkIntegrity the entire reader up front diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java index 5809a9aa4f4..fc6c1d9b294 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCompositeCodecReaderWrapper.java @@ -494,6 +494,11 @@ final class SlowCompositeCodecReaderWrapper extends CodecReader { } return new MultiSortedSetDocValues(values, docStarts, map, totalCost); } + + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + throw new UnsupportedOperationException("This method is for searching, not for merging"); + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java index 4d1cb4c8cdb..ff88e30de4a 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortingCodecReader.java @@ -637,6 +637,12 @@ public final class SortingCodecReader extends FilterCodecReader { public void close() throws IOException { delegate.close(); } + + @Override + public DocValuesSkipper getSkipper(FieldInfo field)
throws IOException { + // We can hardly return information about min/max values if doc IDs have been reordered. + return null; + } }; } @@ -736,10 +742,14 @@ public final class SortingCodecReader extends FilterCodecReader { if (timesCached > 1) { assert norms == false : "[" + field + "] norms must not be cached twice"; boolean isSortField = false; - for (SortField sf : metaData.getSort().getSort()) { - if (field.equals(sf.getField())) { - isSortField = true; - break; + // For things that aren't sort fields, it's possible for sort to be null here + // In the event that we accidentally cache twice, it's better not to throw an NPE + if (metaData.getSort() != null) { + for (SortField sf : metaData.getSort().getSort()) { + if (field.equals(sf.getField())) { + isSortField = true; + break; + } } } assert timesCached == 2 diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java b/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java index 23bd0fd0ec6..b625f0740a5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java +++ b/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java @@ -66,7 +66,10 @@ public enum VectorSimilarityFunction { * vectors to unit length, and instead use {@link VectorSimilarityFunction#DOT_PRODUCT}. You * should only use this function if you need to preserve the original vectors and cannot normalize * them in advance. The similarity score is normalised to assure it is positive. + * + * @deprecated Use MAXIMUM_INNER_PRODUCT or DOT_PRODUCT instead */ + @Deprecated COSINE { @Override public float compare(float[] v1, float[] v2) { diff --git a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java index 5f7b26f95d4..1ca979d6794 100644 --- a/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/knn/MultiLeafKnnCollector.java @@ -41,6 +41,7 @@ public final class MultiLeafKnnCollector implements KnnCollector { private final float greediness; // the queue of the local similarities to periodically update with the global queue private final FloatHeap updatesQueue; + private final float[] updatesScratch; // interval to synchronize the local and global queues, as a number of visited vectors private final int interval = 0xff; // 255 private boolean kResultsCollected = false; @@ -62,6 +63,7 @@ public final class MultiLeafKnnCollector implements KnnCollector { this.globalSimilarityQueue = globalSimilarityQueue; this.nonCompetitiveQueue = new FloatHeap(Math.max(1, Math.round((1 - greediness) * k))); this.updatesQueue = new FloatHeap(k); + this.updatesScratch = new float[k]; } @Override @@ -103,9 +105,18 @@ public final class MultiLeafKnnCollector implements KnnCollector { if (kResultsCollected) { // as we've collected k results, we can start do periodic updates with the global queue if (firstKResultsCollected || (subCollector.visitedCount() & interval) == 0) { - cachedGlobalMinSim = globalSimilarityQueue.offer(updatesQueue.getHeap()); - updatesQueue.clear(); - globalSimUpdated = true; + // BlockingFloatHeap#offer requires input to be sorted in ascending order, so we can't + // pass in the underlying updatesQueue array as-is since it is only partially ordered + // (see GH#13462): + int len = updatesQueue.size(); + if (len > 0) { + for (int i = 0; i < len; i++) { + updatesScratch[i] = updatesQueue.poll(); + } + assert
updatesQueue.size() == 0; + cachedGlobalMinSim = globalSimilarityQueue.offer(updatesScratch, len); + globalSimUpdated = true; + } } } return localSimUpdated || globalSimUpdated; diff --git a/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java index 2b2c625a9ff..b1a53a966a0 100644 --- a/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java +++ b/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java @@ -92,7 +92,7 @@ public final class LSBRadixSorter { return; } - buffer = ArrayUtil.grow(buffer, len); + buffer = ArrayUtil.growNoCopy(buffer, len); int[] arr = array; diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java index a9acca64d69..43e5077695c 100644 --- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java @@ -70,7 +70,9 @@ public final class VectorUtil { * Returns the cosine similarity between the two vectors. * * @throws IllegalArgumentException if the vectors' dimensions differ. + * @deprecated use dot-product on normalized vectors instead */ + @Deprecated public static float cosine(float[] a, float[] b) { if (a.length != b.length) { throw new IllegalArgumentException("vector dimensions differ: " + a.length + "!=" + b.length); @@ -80,7 +82,12 @@ public final class VectorUtil { return r; } - /** Returns the cosine similarity between the two vectors. */ + /** + * Returns the cosine similarity between the two vectors. + * + * @deprecated use dot-product on normalized vectors instead + */ + @Deprecated public static float cosine(byte[] a, byte[] b) { if (a.length != b.length) { throw new IllegalArgumentException("vector dimensions differ: " + a.length + "!=" + b.length); diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java index d2d326b3a15..b2b109769f5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java @@ -43,7 +43,9 @@ public class BKDReader extends PointValues { final int version; final long minLeafBlockFP; - final IndexInput packedIndex; + private final long indexStartPointer; + private final int numIndexBytes; + private final IndexInput indexIn; // if true, the tree is a legacy balanced tree private final boolean isTreeBalanced; @@ -95,8 +97,7 @@ public class BKDReader extends PointValues { pointCount = metaIn.readVLong(); docCount = metaIn.readVInt(); - int numIndexBytes = metaIn.readVInt(); - long indexStartPointer; + numIndexBytes = metaIn.readVInt(); if (version >= BKDWriter.VERSION_META_FILE) { minLeafBlockFP = metaIn.readLong(); indexStartPointer = metaIn.readLong(); @@ -105,7 +106,7 @@ public class BKDReader extends PointValues { minLeafBlockFP = indexIn.readVLong(); indexIn.seek(indexStartPointer); } - this.packedIndex = indexIn.slice("packedIndex", indexStartPointer, numIndexBytes); + this.indexIn = indexIn; this.in = dataIn; // for only one leaf, balanced and unbalanced trees can be handled the same way // we set it to unbalanced.
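Editor's note on the two VectorUtil deprecations above: a minimal sketch of the suggested migration from cosine similarity to dot product over normalized vectors. The class name and sample vectors are illustrative, not part of the patch; VectorUtil.l2normalize and VectorUtil.dotProduct are existing lucene-core APIs.

import org.apache.lucene.util.VectorUtil;

public class CosineToDotProductSketch {
  public static void main(String[] args) {
    float[] a = {3f, 4f};
    float[] b = {1f, 2f};
    // Deprecated path:
    float cos = VectorUtil.cosine(a, b);
    // Suggested replacement: normalize each vector once (e.g. at index time and
    // query time), then score with the cheaper dot product.
    float[] an = VectorUtil.l2normalize(a.clone());
    float[] bn = VectorUtil.l2normalize(b.clone());
    float dot = VectorUtil.dotProduct(an, bn);
    System.out.println(cos + " vs " + dot); // equal up to floating-point rounding
  }
}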
@@ -158,7 +159,7 @@ public class BKDReader extends PointValues { @Override public PointTree getPointTree() throws IOException { return new BKDPointTree( - packedIndex.clone(), + indexIn.slice("packedIndex", indexStartPointer, numIndexBytes), this.in.clone(), config, numLeaves, diff --git a/lucene/core/src/java/org/apache/lucene/util/hnsw/BlockingFloatHeap.java b/lucene/core/src/java/org/apache/lucene/util/hnsw/BlockingFloatHeap.java index a81eaf2fee0..6bbf6fdb741 100644 --- a/lucene/core/src/java/org/apache/lucene/util/hnsw/BlockingFloatHeap.java +++ b/lucene/core/src/java/org/apache/lucene/util/hnsw/BlockingFloatHeap.java @@ -72,12 +72,13 @@ public final class BlockingFloatHeap { *

<p>Values must be sorted in ascending order. * * @param values a set of values to insert, must be sorted in ascending order + * @param len number of values from the {@code values} array to insert + * @return the new 'top' element in the queue. */ - public float offer(float[] values) { + public float offer(float[] values, int len) { lock.lock(); try { - for (int i = values.length - 1; i >= 0; i--) { + for (int i = len - 1; i >= 0; i--) { if (size < maxSize) { push(values[i]); } else { diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index ba0de7b2464..cb5fee62aee 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -15,3 +15,4 @@ org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat +org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java index 4747c86834e..35ef3e0f826 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/TestWordlistLoader.java @@ -17,8 +17,13 @@ package org.apache.lucene.analysis; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.StringReader; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.List; import org.apache.lucene.tests.util.LuceneTestCase; public class TestWordlistLoader extends LuceneTestCase { @@ -77,4 +82,16 @@ public class TestWordlistLoader extends LuceneTestCase { assertTrue(wordset.contains("six")); assertTrue(wordset.contains("seven")); } + + public void testGetLines() throws IOException { + String s = "One \n#Comment \n \n Two \n Three \n"; + Charset charset = StandardCharsets.UTF_8; + byte[] sByteArr = s.getBytes(charset); + InputStream sInputStream = new ByteArrayInputStream(sByteArr); + List<String> lines = WordlistLoader.getLines(sInputStream, charset); + assertEquals(3, lines.size()); + assertEquals("One", lines.get(0)); + assertEquals("Two", lines.get(1)); + assertEquals("Three", lines.get(2)); + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/TestCodecLoadingDeadlock.java b/lucene/core/src/test/org/apache/lucene/codecs/TestCodecLoadingDeadlock.java index 854e271eb12..0e0c3941bec 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/TestCodecLoadingDeadlock.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/TestCodecLoadingDeadlock.java @@ -16,8 +16,12 @@ */ package org.apache.lucene.codecs; +import com.carrotsearch.randomizedtesting.LifecycleScope; import com.carrotsearch.randomizedtesting.RandomizedContext; import com.carrotsearch.randomizedtesting.RandomizedRunner; +import com.carrotsearch.randomizedtesting.RandomizedTest; +import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; @@ -34,6 +38,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.util.NamedThreadFactory; +import
org.apache.lucene.util.SuppressForbidden; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; @@ -46,6 +51,7 @@ public class TestCodecLoadingDeadlock extends Assert { private static final int MAX_TIME_SECONDS = 30; @Test + @SuppressForbidden(reason = "Uses Path.toFile because ProcessBuilder requires it.") public void testDeadlock() throws Exception { // pick random codec names for stress test in separate process: final Random rnd = RandomizedContext.current().getRandom(); @@ -67,12 +73,23 @@ public class TestCodecLoadingDeadlock extends Assert { args.addAll(List.of(getClass().getName(), codecName, pfName, dvfName)); // Fork a separate JVM to reinitialize classes. - final Process p = new ProcessBuilder(args).inheritIO().start(); - if (p.waitFor(MAX_TIME_SECONDS * 2, TimeUnit.SECONDS)) { - assertEquals("Process died abnormally?", 0, p.waitFor()); - } else { - p.destroyForcibly().waitFor(); - fail("Process did not exit after 60 secs?"); + final Path output = RandomizedTest.newTempFile(LifecycleScope.TEST); + final Process p = + new ProcessBuilder(args).redirectErrorStream(true).redirectOutput(output.toFile()).start(); + boolean success = false; + try { + if (p.waitFor(MAX_TIME_SECONDS * 2, TimeUnit.SECONDS)) { + assertEquals("Process died abnormally?", 0, p.waitFor()); + success = true; + } else { + p.destroyForcibly().waitFor(); + fail("Process did not exit after 60 secs?"); + } + } finally { + if (!success) { + System.out.println("Subprocess emitted the following output:"); + System.out.write(Files.readAllBytes(output)); + } } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90FieldInfosFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90FieldInfosFormat.java index e723b5146c2..7b24f27e3f7 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90FieldInfosFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90FieldInfosFormat.java @@ -25,4 +25,9 @@ public class TestLucene90FieldInfosFormat extends BaseFieldInfoFormatTestCase { protected Codec getCodec() { return TestUtil.getDefaultCodec(); } + + @Override + protected boolean supportDocValuesSkipIndex() { + return false; + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java new file mode 100644 index 00000000000..107c39eed07 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene99; + +import static java.lang.String.format; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.oneOf; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.util.VectorUtil; +import org.apache.lucene.util.quantization.QuantizedByteVectorValues; +import org.apache.lucene.util.quantization.ScalarQuantizer; +import org.junit.Before; + +public class TestLucene99ScalarQuantizedVectorsFormat extends BaseKnnVectorsFormatTestCase { + + KnnVectorsFormat format; + Float confidenceInterval; + int bits; + + @Before + @Override + public void setUp() throws Exception { + bits = random().nextBoolean() ? 4 : 7; + confidenceInterval = random().nextBoolean() ? random().nextFloat(0.90f, 1.0f) : null; + if (random().nextBoolean()) { + confidenceInterval = 0f; + } + format = + new Lucene99ScalarQuantizedVectorsFormat(confidenceInterval, bits, random().nextBoolean()); + super.setUp(); + } + + @Override + protected Codec getCodec() { + return new Lucene99Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return format; + } + }; + } + + public void testSearch() throws Exception { + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc = new Document(); + // randomly reuse a vector, this ensures the underlying codec doesn't rely on the array + // reference + doc.add( + new KnnFloatVectorField("f", new float[] {0, 1}, VectorSimilarityFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.commit(); + try (IndexReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + if (r instanceof CodecReader codecReader) { + KnnVectorsReader knnVectorsReader = codecReader.getVectorReader(); + // if this search found any results it would raise NPE attempting to collect them in our + // null collector + knnVectorsReader.search("f", new float[] {1, 0}, null, null); + } else { + fail("reader is not CodecReader"); + } + } + } + } + + public void testQuantizedVectorsWriteAndRead() throws Exception { + // create lucene directory with codec + int numVectors = 1 + random().nextInt(50); + VectorSimilarityFunction similarityFunction = randomSimilarity(); + boolean normalize = similarityFunction == VectorSimilarityFunction.COSINE; + int dim = random().nextInt(64) + 1; + if (dim % 2 == 1) { + dim++; + } + List<float[]> vectors = new ArrayList<>(numVectors); + for (int i = 0; i < numVectors; i++) { + vectors.add(randomVector(dim)); + } + ScalarQuantizer scalarQuantizer = + confidenceInterval
!= null && confidenceInterval == 0f + ? ScalarQuantizer.fromVectorsAutoInterval( + new Lucene99ScalarQuantizedVectorsWriter.FloatVectorWrapper(vectors, normalize), + similarityFunction, + numVectors, + (byte) bits) + : ScalarQuantizer.fromVectors( + new Lucene99ScalarQuantizedVectorsWriter.FloatVectorWrapper(vectors, normalize), + confidenceInterval == null + ? Lucene99ScalarQuantizedVectorsFormat.calculateDefaultConfidenceInterval(dim) + : confidenceInterval, + numVectors, + (byte) bits); + float[] expectedCorrections = new float[numVectors]; + byte[][] expectedVectors = new byte[numVectors][]; + for (int i = 0; i < numVectors; i++) { + float[] vector = vectors.get(i); + if (normalize) { + float[] copy = new float[vector.length]; + System.arraycopy(vector, 0, copy, 0, copy.length); + VectorUtil.l2normalize(copy); + vector = copy; + } + + expectedVectors[i] = new byte[dim]; + expectedCorrections[i] = + scalarQuantizer.quantize(vector, expectedVectors[i], similarityFunction); + } + float[] randomlyReusedVector = new float[dim]; + + try (Directory dir = newDirectory(); + IndexWriter w = + new IndexWriter( + dir, + new IndexWriterConfig() + .setMaxBufferedDocs(numVectors + 1) + .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH) + .setMergePolicy(NoMergePolicy.INSTANCE))) { + for (int i = 0; i < numVectors; i++) { + Document doc = new Document(); + // randomly reuse a vector, this ensures the underlying codec doesn't rely on the array + // reference + final float[] v; + if (random().nextBoolean()) { + System.arraycopy(vectors.get(i), 0, randomlyReusedVector, 0, dim); + v = randomlyReusedVector; + } else { + v = vectors.get(i); + } + doc.add(new KnnFloatVectorField("f", v, similarityFunction)); + w.addDocument(doc); + } + w.commit(); + try (IndexReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + if (r instanceof CodecReader codecReader) { + KnnVectorsReader knnVectorsReader = codecReader.getVectorReader(); + if (knnVectorsReader instanceof PerFieldKnnVectorsFormat.FieldsReader fieldsReader) { + knnVectorsReader = fieldsReader.getFieldReader("f"); + } + if (knnVectorsReader instanceof Lucene99ScalarQuantizedVectorsReader quantizedReader) { + assertNotNull(quantizedReader.getQuantizationState("f")); + QuantizedByteVectorValues quantizedByteVectorValues = + quantizedReader.getQuantizedVectorValues("f"); + int docId = -1; + while ((docId = quantizedByteVectorValues.nextDoc()) != NO_MORE_DOCS) { + byte[] vector = quantizedByteVectorValues.vectorValue(); + float offset = quantizedByteVectorValues.getScoreCorrectionConstant(); + for (int i = 0; i < dim; i++) { + assertEquals(vector[i], expectedVectors[docId][i]); + } + assertEquals(offset, expectedCorrections[docId], 0.00001f); + } + } else { + fail("reader is not Lucene99ScalarQuantizedVectorsReader"); + } + } else { + fail("reader is not CodecReader"); + } + } + } + } + + public void testToString() { + FilterCodec customCodec = + new FilterCodec("foo", Codec.getDefault()) { + @Override + public KnnVectorsFormat knnVectorsFormat() { + return new Lucene99ScalarQuantizedVectorsFormat(0.9f, (byte) 4, false); + } + }; + String expectedPattern = + "Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s()))"; + var defaultScorer = format(Locale.ROOT, expectedPattern, 
"DefaultFlatVectorScorer"); + var memSegScorer = + format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer"); + assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer))); + } + + public void testLimits() { + expectThrows( + IllegalArgumentException.class, + () -> new Lucene99ScalarQuantizedVectorsFormat(1.1f, 7, false)); + expectThrows( + IllegalArgumentException.class, + () -> new Lucene99ScalarQuantizedVectorsFormat(null, -1, false)); + expectThrows( + IllegalArgumentException.class, + () -> new Lucene99ScalarQuantizedVectorsFormat(null, 5, false)); + expectThrows( + IllegalArgumentException.class, + () -> new Lucene99ScalarQuantizedVectorsFormat(null, 9, false)); + } + + @Override + public void testRandomWithUpdatesAndGraph() { + // graph not supported + } + + @Override + public void testSearchWithVisitedLimit() { + // search not supported + } +} diff --git a/lucene/core/src/test/org/apache/lucene/document/TestDocValuesRangeIterator.java b/lucene/core/src/test/org/apache/lucene/document/TestDocValuesRangeIterator.java new file mode 100644 index 00000000000..5b24608320a --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/document/TestDocValuesRangeIterator.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.document; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestDocValuesRangeIterator extends LuceneTestCase { + + public void testSingleLevel() throws IOException { + doTestBasics(false); + } + + public void testMultipleLevels() throws IOException { + doTestBasics(true); + } + + private void doTestBasics(boolean doLevels) throws IOException { + long queryMin = 10; + long queryMax = 20; + + // Fake numeric doc values so that: + // docs 0-256 all match + // docs in 256-512 are all greater than queryMax + // docs in 512-768 are all less than queryMin + // docs in 768-1024 have some docs that match the range, others not + // docs in 1024-2048 follow a similar pattern as docs in 0-1024 except that not all docs have a + // value + NumericDocValues values = + new NumericDocValues() { + + int doc = -1; + + @Override + public boolean advanceExact(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target < 1024) { + // dense up to 1024 + return doc = target; + } else if (doc < 2047) { + // 50% docs have a value up to 2048 + return doc = target + (target & 1); + } else { + return doc = DocIdSetIterator.NO_MORE_DOCS; + } + } + + @Override + public long longValue() throws IOException { + int d = doc % 1024; + if (d < 128) { + return (queryMin + queryMax) >> 1; + } else if (d < 256) { + return queryMax + 1; + } else if (d < 512) { + return queryMin - 1; + } else { + return switch ((d / 2) % 3) { + case 0 -> queryMin - 1; + case 1 -> queryMax + 1; + case 2 -> (queryMin + queryMax) >> 1; + default -> throw new AssertionError(); + }; + } + } + + @Override + public long cost() { + return 42; + } + }; + + AtomicBoolean twoPhaseCalled = new AtomicBoolean(); + TwoPhaseIterator twoPhase = + new TwoPhaseIterator(values) { + + @Override + public boolean matches() throws IOException { + twoPhaseCalled.set(true); + long v = values.longValue(); + return v >= queryMin && v <= queryMax; + } + + @Override + public float matchCost() { + return 2f; // 2 comparisons + } + }; + + DocValuesSkipper skipper = + new DocValuesSkipper() { + + int doc = -1; + + @Override + public void advance(int target) throws IOException { + doc = target; + } + + @Override + public int numLevels() { + return doLevels ? 
3 : 1; + } + + @Override + public int minDocID(int level) { + int rangeLog = 9 - numLevels() + level; + + // the level is the log2 of the interval + if (doc < 0) { + return -1; + } else if (doc >= 2048) { + return DocIdSetIterator.NO_MORE_DOCS; + } else { + int mask = (1 << rangeLog) - 1; + // prior multiple of 2^level + return doc & ~mask; + } + } + + @Override + public int maxDocID(int level) { + int rangeLog = 9 - numLevels() + level; + + int minDocID = minDocID(level); + return switch (minDocID) { + case -1 -> -1; + case DocIdSetIterator.NO_MORE_DOCS -> DocIdSetIterator.NO_MORE_DOCS; + default -> minDocID + (1 << rangeLog) - 1; + }; + } + + @Override + public long minValue(int level) { + int d = doc % 1024; + if (d < 128) { + return queryMin; + } else if (d < 256) { + return queryMax + 1; + } else if (d < 768) { + return queryMin - 1; + } else { + return queryMin - 1; + } + } + + @Override + public long maxValue(int level) { + int d = doc % 1024; + if (d < 128) { + return queryMax; + } else if (d < 256) { + return queryMax + 1; + } else if (d < 768) { + return queryMin - 1; + } else { + return queryMax + 1; + } + } + + @Override + public int docCount(int level) { + int rangeLog = 9 - numLevels() + level; + + if (doc < 1024) { + return 1 << rangeLog; + } else { + // half docs have a value + return 1 << rangeLog >> 1; + } + } + + @Override + public long minValue() { + return Long.MIN_VALUE; + } + + @Override + public long maxValue() { + return Long.MAX_VALUE; + } + + @Override + public int docCount() { + return 1024 + 1024 / 2; + } + }; + + DocValuesRangeIterator rangeIterator = + new DocValuesRangeIterator(twoPhase, skipper, queryMin, queryMax); + DocValuesRangeIterator.Approximation rangeApproximation = + (DocValuesRangeIterator.Approximation) rangeIterator.approximation(); + + assertEquals(100, rangeApproximation.advance(100)); + assertEquals(DocValuesRangeIterator.Match.YES, rangeApproximation.match); + assertEquals(255, rangeApproximation.upTo); + assertTrue(rangeIterator.matches()); + assertTrue(values.docID() < rangeApproximation.docID()); // we did not advance doc values + assertFalse(twoPhaseCalled.get()); + + assertEquals(768, rangeApproximation.advance(300)); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match); + if (doLevels) { + assertEquals(831, rangeApproximation.upTo); + } else { + assertEquals(1023, rangeApproximation.upTo); + } + for (int i = 0; i < 10; ++i) { + assertEquals(values.docID(), rangeApproximation.docID()); + assertEquals(twoPhase.matches(), rangeIterator.matches()); + assertTrue(twoPhaseCalled.get()); + twoPhaseCalled.set(false); + rangeApproximation.nextDoc(); + } + + assertEquals(1100, rangeApproximation.advance(1099)); + assertEquals(DocValuesRangeIterator.Match.IF_DOC_HAS_VALUE, rangeApproximation.match); + assertEquals(1024 + 256 - 1, rangeApproximation.upTo); + assertEquals(values.docID(), rangeApproximation.docID()); + assertTrue(rangeIterator.matches()); + assertFalse(twoPhaseCalled.get()); + + assertEquals(1024 + 768, rangeApproximation.advance(1024 + 300)); + assertEquals(DocValuesRangeIterator.Match.MAYBE, rangeApproximation.match); + if (doLevels) { + assertEquals(1024 + 831, rangeApproximation.upTo); + } else { + assertEquals(2047, rangeApproximation.upTo); + } + for (int i = 0; i < 10; ++i) { + assertEquals(values.docID(), rangeApproximation.docID()); + assertEquals(twoPhase.matches(), rangeIterator.matches()); + assertTrue(twoPhaseCalled.get()); + twoPhaseCalled.set(false); + rangeApproximation.nextDoc(); + } + + 
assertEquals(DocIdSetIterator.NO_MORE_DOCS, rangeApproximation.advance(2048)); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java b/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java index 96e545e6011..496c1b78ff2 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java @@ -105,6 +105,9 @@ public class TestCheckIndex extends BaseTestCheckIndex { // doc value doc.add(new NumericDocValuesField("dv", random().nextLong())); + // doc value with skip index + doc.add(NumericDocValuesField.indexedField("dv_skip", random().nextLong())); + // point value byte[] point = new byte[4]; NumericUtils.intToSortableBytes(random().nextInt(), point, 0); @@ -154,7 +157,7 @@ public class TestCheckIndex extends BaseTestCheckIndex { assertNull(segStatus.liveDocStatus.error); // confirm field infos testing status - assertEquals(8, segStatus.fieldInfoStatus.totFields); + assertEquals(9, segStatus.fieldInfoStatus.totFields); assertTrue(output.toString(UTF_8).contains("test: field infos")); assertNull(segStatus.fieldInfoStatus.error); @@ -184,7 +187,8 @@ public class TestCheckIndex extends BaseTestCheckIndex { assertNull(segStatus.termVectorStatus.error); // confirm doc values testing status - assertEquals(2, segStatus.docValuesStatus.totalNumericFields); + assertEquals(3, segStatus.docValuesStatus.totalNumericFields); + assertEquals(1, segStatus.docValuesStatus.totalSkippingIndex); assertTrue(output.toString(UTF_8).contains("test: docvalues")); assertNull(segStatus.docValuesStatus.error); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java index eff9cbe763f..1759271012d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestCodecs.java @@ -106,6 +106,7 @@ public class TestCodecs extends LuceneTestCase { storePayloads, indexOptions, DocValuesType.NONE, + false, -1, new HashMap<>(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java index e647b6d2b33..e19855bbdda 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldInfos.java @@ -250,6 +250,7 @@ public class TestFieldInfos extends LuceneTestCase { false, IndexOptions.NONE, DocValuesType.NONE, + false, -1, new HashMap<>(), 0, @@ -271,6 +272,7 @@ public class TestFieldInfos extends LuceneTestCase { false, IndexOptions.NONE, DocValuesType.NONE, + false, -1, new HashMap<>(), 0, @@ -294,6 +296,7 @@ public class TestFieldInfos extends LuceneTestCase { false, IndexOptions.NONE, DocValuesType.NONE, + false, -1, new HashMap<>(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java index b526f04f39d..15d6dddcb58 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestFieldsReader.java @@ -58,6 +58,7 @@ public class TestFieldsReader extends LuceneTestCase { false, ift.indexOptions(), ift.docValuesType(), + ift.hasDocValuesSkipIndex(), -1, new HashMap<>(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index e2001a64728..46eaff59920 
100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -5027,4 +5027,62 @@ public class TestIndexWriter extends LuceneTestCase { } } } + + public void testDocValuesMixedSkippingIndex() throws Exception { + try (Directory dir = newDirectory()) { + try (IndexWriter writer = + new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) { + Document doc1 = new Document(); + doc1.add(SortedNumericDocValuesField.indexedField("test", random().nextLong())); + writer.addDocument(doc1); + + Document doc2 = new Document(); + doc2.add(new SortedNumericDocValuesField("test", random().nextLong())); + IllegalArgumentException ex = + expectThrows(IllegalArgumentException.class, () -> writer.addDocument(doc2)); + assertEquals( + "Inconsistency of field data structures across documents for field [test] of doc [1]. doc values skip index: expected 'true', but it has 'false'.", + ex.getMessage()); + } + } + try (Directory dir = newDirectory()) { + try (IndexWriter writer = + new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) { + Document doc1 = new Document(); + doc1.add(new SortedSetDocValuesField("test", TestUtil.randomBinaryTerm(random()))); + writer.addDocument(doc1); + + Document doc2 = new Document(); + doc2.add(SortedSetDocValuesField.indexedField("test", TestUtil.randomBinaryTerm(random()))); + IllegalArgumentException ex = + expectThrows(IllegalArgumentException.class, () -> writer.addDocument(doc2)); + assertEquals( + "Inconsistency of field data structures across documents for field [test] of doc [1]. doc values skip index: expected 'false', but it has 'true'.", + ex.getMessage()); + } + } + } + + public void testDocValuesSkippingIndexWithoutDocValues() throws Exception { + for (DocValuesType docValuesType : + new DocValuesType[] {DocValuesType.NONE, DocValuesType.BINARY}) { + FieldType fieldType = new FieldType(); + fieldType.setStored(true); + fieldType.setDocValuesType(docValuesType); + fieldType.setDocValuesSkipIndex(true); + fieldType.freeze(); + try (Directory dir = newMockDirectory()) { + try (IndexWriter writer = + new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random())))) { + Document doc1 = new Document(); + doc1.add(new Field("test", new byte[10], fieldType)); + IllegalArgumentException ex = + expectThrows(IllegalArgumentException.class, () -> writer.addDocument(doc1)); + assertTrue( + ex.getMessage() + .startsWith("field 'test' cannot have docValuesSkipIndex set to true")); + } + } + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java index 886417fabc8..339368da1f8 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java @@ -95,6 +95,11 @@ public class TestIndexableField extends LuceneTestCase { return DocValuesType.NONE; } + @Override + public boolean hasDocValuesSkipIndex() { + return false; + } + @Override public int pointDimensionCount() { return 0; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index 3c6d57c5d2f..cb57e983691 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -191,6 +191,7 @@ public class 
TestPendingSoftDeletes extends TestPendingDeletes { false, IndexOptions.NONE, DocValuesType.NUMERIC, + false, 0, Collections.emptyMap(), 0, @@ -230,6 +231,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { false, IndexOptions.NONE, DocValuesType.NUMERIC, + false, 1, Collections.emptyMap(), 0, @@ -295,6 +297,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { false, IndexOptions.NONE, DocValuesType.NUMERIC, + false, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, @@ -365,6 +368,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { false, IndexOptions.NONE, DocValuesType.NUMERIC, + false, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, @@ -403,6 +407,7 @@ public class TestPendingSoftDeletes extends TestPendingDeletes { false, IndexOptions.NONE, DocValuesType.NUMERIC, + false, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java index f6aee2f3f27..609dd0359ab 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java @@ -101,6 +101,11 @@ public class TestSegmentToThreadMapping extends LuceneTestCase { return null; } + @Override + public DocValuesSkipper getDocValuesSkipper(String field) { + return null; + } + @Override public PointValues getPointValues(String field) { return null; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java index 401bcb3e5c6..ad955e668ed 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java @@ -1289,6 +1289,7 @@ public class TestSortOptimization extends LuceneTestCase { false, IndexOptions.NONE, fi.getDocValuesType(), + fi.hasDocValuesSkipIndex(), fi.getDocValuesGen(), fi.attributes(), 0, diff --git a/lucene/core/src/test/org/apache/lucene/search/knn/TestMultiLeafKnnCollector.java b/lucene/core/src/test/org/apache/lucene/search/knn/TestMultiLeafKnnCollector.java new file mode 100644 index 00000000000..8466293f61a --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/knn/TestMultiLeafKnnCollector.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search.knn; + +import org.apache.lucene.search.TopKnnCollector; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.hnsw.BlockingFloatHeap; + +public class TestMultiLeafKnnCollector extends LuceneTestCase { + + /** Validates a fix for GH#13462 */ + public void testGlobalScoreCoordination() { + int k = 7; + BlockingFloatHeap globalHeap = new BlockingFloatHeap(k); + MultiLeafKnnCollector collector1 = + new MultiLeafKnnCollector(k, globalHeap, new TopKnnCollector(k, Integer.MAX_VALUE)); + MultiLeafKnnCollector collector2 = + new MultiLeafKnnCollector(k, globalHeap, new TopKnnCollector(k, Integer.MAX_VALUE)); + + // Collect k (7) hits in collector1 with scores [100, 106]: + for (int i = 0; i < k; i++) { + collector1.collect(0, 100f + i); + } + + // The global heap should be updated since k hits were collected, and have a min score of + // 100: + assertEquals(100f, globalHeap.peek(), 0f); + assertEquals(100f, collector1.minCompetitiveSimilarity(), 0f); + + // Collect k (7) hits in collector2 with only two that are competitive (200 and 300), + // which also forces an update of the global heap with collector2's hits. This is a tricky + // case where the heap will not be fully ordered, so it ensures global queue updates don't + // incorrectly short-circuit (see GH#13462): + collector2.collect(0, 10f); + collector2.collect(0, 11f); + collector2.collect(0, 12f); + collector2.collect(0, 13f); + collector2.collect(0, 200f); + collector2.collect(0, 14f); + collector2.collect(0, 300f); + + // At this point, our global heap should contain [102, 103, 104, 105, 106, 200, 300] since + // values 200 and 300 from collector2 should have pushed out 100 and 101 from collector1. + // The min value on the global heap should be 102: + assertEquals(102f, globalHeap.peek(), 0f); + assertEquals(102f, collector2.minCompetitiveSimilarity(), 0f); + } +} diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java index 9498676219d..2c7c20d35f3 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java @@ -91,7 +91,7 @@ public class SimpleSortedSetFacetsExample { SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader, config); - // Aggregatses the facet counts + // Aggregates the facet counts FacetsCollector fc = new FacetsCollector(); // MatchAllDocsQuery is for "browsing" (counts facets diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java index 62f8989ed86..9f2feb705e7 100644 --- a/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/StringValueFacetCountsExample.java @@ -95,7 +95,7 @@ public class StringValueFacetCountsExample { StringDocValuesReaderState publishState = new StringDocValuesReaderState(indexReader, "Publish Year"); - // Aggregatses the facet counts + // Aggregates the facet counts FacetsCollector fc = new FacetsCollector(); // MatchAllDocsQuery is for "browsing" (counts facets diff --git a/lucene/distribution.tests/build.gradle b/lucene/distribution.tests/build.gradle index 23e4352e0e5..4513198c267 100644 --- 
a/lucene/distribution.tests/build.gradle +++ b/lucene/distribution.tests/build.gradle @@ -29,16 +29,16 @@ configurations { dependencies { binaryDistribution project(path: ":lucene:distribution", configuration: "binaryDirForTests") - moduleTestImplementation "com.carrotsearch:procfork" - moduleTestImplementation("com.carrotsearch.randomizedtesting:randomizedtesting-runner", { + moduleTestImplementation deps.procfork + moduleTestImplementation(deps.randomizedtesting.runner, { exclude group: "junit" }) - moduleTestImplementation("junit:junit", { + moduleTestImplementation(deps.junit, { exclude group: "org.hamcrest" }) - moduleTestImplementation "org.hamcrest:hamcrest" - moduleTestImplementation "org.assertj:assertj-core" + moduleTestImplementation deps.hamcrest + moduleTestImplementation deps.assertj } test { diff --git a/lucene/distribution/build.gradle b/lucene/distribution/build.gradle index 9da62f00175..d1f70a1abf9 100644 --- a/lucene/distribution/build.gradle +++ b/lucene/distribution/build.gradle @@ -15,8 +15,6 @@ * limitations under the License. */ -import org.apache.lucene.gradle.Checksum - import java.nio.charset.StandardCharsets import java.nio.file.Files @@ -60,9 +58,7 @@ dependencies { // Compute checksums for release archives. -task computeChecksums(type: Checksum) { - algorithm = Checksum.Algorithm.SHA512 - +task computeChecksums(type: buildinfra.checksumClass()) { files = objects.fileCollection() [ tasks.assembleSourceTgz, diff --git a/lucene/expressions/build.gradle b/lucene/expressions/build.gradle index f9fcee6aacd..f129006a8ba 100644 --- a/lucene/expressions/build.gradle +++ b/lucene/expressions/build.gradle @@ -24,10 +24,10 @@ dependencies { moduleImplementation project(':lucene:codecs') - moduleImplementation 'org.antlr:antlr4-runtime' + moduleImplementation deps.antlr.runtime - moduleImplementation 'org.ow2.asm:asm' - moduleImplementation 'org.ow2.asm:asm-commons' + moduleImplementation deps.asm.core + moduleImplementation deps.asm.commons moduleTestImplementation project(':lucene:test-framework') } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 17ba68777fb..f60c7966f98 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -21,6 +21,7 @@ import java.util.Collections; import java.util.Iterator; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; @@ -95,6 +96,7 @@ public class TermVectorLeafReader extends LeafReader { terms.hasPayloads(), indexOptions, DocValuesType.NONE, + false, -1, Collections.emptyMap(), 0, @@ -141,6 +143,11 @@ public class TermVectorLeafReader extends LeafReader { return null; } + @Override + public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + return null; + } + @Override public NumericDocValues getNormValues(String field) throws IOException { return null; // Is this needed? See MemoryIndex for a way to do it. 
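Editor's note: the getDocValuesSkipper implementations above follow the same contract as the other doc-values getters, returning null when a field has no skip index. A minimal end-to-end sketch under that assumption (the field name "price" and class name are illustrative; NumericDocValuesField.indexedField and DocValuesSkipper come from this patch):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class DocValuesSkipperSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
      Document doc = new Document();
      // Doc values plus a skip index over them:
      doc.add(NumericDocValuesField.indexedField("price", 42L));
      w.addDocument(doc);
      w.commit();
      try (DirectoryReader reader = DirectoryReader.open(w)) {
        LeafReader leaf = reader.leaves().get(0).reader();
        // Null for fields that were indexed without a skip index:
        DocValuesSkipper skipper = leaf.getDocValuesSkipper("price");
        System.out.println(skipper == null ? "no skip index" : "skip index present");
      }
    }
  }
}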
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/PointInSetIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/PointInSetIncludingScoreQuery.java index 1ece42078cf..717acc22800 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/PointInSetIncludingScoreQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/PointInSetIncludingScoreQuery.java @@ -276,8 +276,7 @@ abstract class PointInSetIncludingScoreQuery extends Query implements Accountabl if (cmp == 0) { // Query point equals index point, so collect and return if (multipleValuesPerDocument) { - if (result.get(docID) == false) { - result.set(docID); + if (result.getAndSet(docID) == false) { scores[docID] = nextScore; } } else { diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java index a3fa1685844..c132c9f1a56 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java @@ -281,9 +281,8 @@ class TermsIncludingScoreQuery extends Query implements Accountable { matchingDocs.set(doc); }*/ // But this behaves the same as MVInnerScorer and only then the tests will pass: - if (!matchingDocs.get(doc)) { + if (!matchingDocs.getAndSet(doc)) { scores[doc] = score; - matchingDocs.set(doc); } } } diff --git a/lucene/luke/build.gradle b/lucene/luke/build.gradle index 150424003db..9d1dcdf3907 100644 --- a/lucene/luke/build.gradle +++ b/lucene/luke/build.gradle @@ -21,7 +21,7 @@ apply plugin: 'java-library' description = 'Luke - Lucene Toolbox' ext { - standaloneDistDir = file("$buildDir/${archivesBaseName}-${project.version}") + standaloneDistDir = file("$buildDir/${project.base.archivesName.get()}-${project.version}") } dependencies { @@ -72,7 +72,7 @@ tasks.withType(ProcessResources).configureEach { task -> task standaloneJar(type: Jar) { dependsOn classes - archiveFileName = "${archivesBaseName}-${project.version}-standalone.jar" + archiveFileName = "${project.base.archivesName.get()}-${project.version}-standalone.jar" from(sourceSets.main.output) @@ -127,10 +127,10 @@ assemble.dependsOn standaloneAssemble task standalonePackage(type: Tar) { from standaloneAssemble - into "${archivesBaseName}-${project.version}/" + into "${project.base.archivesName.get()}-${project.version}/" compression = Compression.GZIP - archiveFileName = "${archivesBaseName}-${project.version}-standalone.tgz" + archiveFileName = "${project.base.archivesName.get()}-${project.version}-standalone.tgz" } // Utility to launch Luke (and fork it from the build). 
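The two join hunks earlier in this patch share one idiom: the bitset's getAndSet sets a bit and returns its previous value, collapsing the former get()-then-set() pair into a single call while keeping first-hit-wins semantics for the recorded score. A small sketch (assuming FixedBitSet, which is the bitset type these queries use; the wrapper method is illustrative):

    import org.apache.lucene.util.FixedBitSet;

    class GetAndSetSketch {
      static void record(FixedBitSet matchingDocs, float[] scores, int doc, float score) {
        // getAndSet sets the bit and returns its old value in one call, so the
        // score is recorded only the first time this doc is seen -- the same
        // behavior as the replaced get()/set() pair, with one fewer bitset probe.
        if (matchingDocs.getAndSet(doc) == false) {
          scores[doc] = score;
        }
      }
    }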
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 2bd50c5355c..ff65bca6a16 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -728,6 +728,7 @@ public class MemoryIndex { storePayloads, indexOptions, fieldType.docValuesType(), + false, -1, Collections.emptyMap(), fieldType.pointDimensionCount(), @@ -782,6 +783,7 @@ public class MemoryIndex { info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, + false, -1, info.fieldInfo.attributes(), info.fieldInfo.getPointDimensionCount(), @@ -1622,6 +1624,12 @@ public class MemoryIndex { } } + @Override + public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + // Skipping isn't needed on a 1-doc index. + return null; + } + @Override public PointValues getPointValues(String fieldName) { Info info = fields.get(fieldName); diff --git a/lucene/spatial-extras/build.gradle b/lucene/spatial-extras/build.gradle index baa772f3053..c2ab9e631cb 100644 --- a/lucene/spatial-extras/build.gradle +++ b/lucene/spatial-extras/build.gradle @@ -27,18 +27,18 @@ dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:spatial3d') - moduleApi 'org.locationtech.spatial4j:spatial4j' - moduleApi 'io.sgr:s2-geometry-library-java' + moduleApi deps.spatial4j + moduleApi deps.s2.geometry moduleTestImplementation project(':lucene:test-framework') moduleTestImplementation project(':lucene:spatial-test-fixtures') - moduleTestImplementation 'org.locationtech.jts:jts-core' + moduleTestImplementation deps.jts // We add patched modules to this configuration because otherwise IDEs would not see the // dependency at all, even in classpath mode (they don't see --patch-module commands we // add to the compiler and test tasks). 
- moduleTestPatchOnly 'org.locationtech.spatial4j:spatial4j::tests' - spatial4jTestPatch 'org.locationtech.spatial4j:spatial4j::tests' + moduleTestPatchOnly(variantOf(deps.spatial4j) { classifier("tests") }) + spatial4jTestPatch(variantOf(deps.spatial4j) { classifier("tests") }) } sourceSets.test.extensions.configure("modularPaths", { diff --git a/lucene/test-framework/build.gradle b/lucene/test-framework/build.gradle index 7eff02317b2..7954b1578d7 100644 --- a/lucene/test-framework/build.gradle +++ b/lucene/test-framework/build.gradle @@ -22,13 +22,13 @@ description = 'Framework for testing Lucene-based applications' dependencies { moduleApi project(':lucene:core') - moduleApi ("com.carrotsearch.randomizedtesting:randomizedtesting-runner", { + moduleApi (deps.randomizedtesting.runner, { exclude group: "junit" }) - moduleApi ("junit:junit", { + moduleApi (deps.junit, { exclude group: "org.hamcrest" }) - moduleApi ('org.hamcrest:hamcrest') + moduleApi deps.hamcrest moduleImplementation project(':lucene:codecs') } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java index f33521fc10a..3355d925eca 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/asserting/AssertingDocValuesFormat.java @@ -23,6 +23,7 @@ import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.NumericDocValues; @@ -280,6 +281,14 @@ public class AssertingDocValuesFormat extends DocValuesFormat { return AssertingLeafReader.AssertingSortedSetDocValues.create(values, maxDoc); } + @Override + public DocValuesSkipper getSkipper(FieldInfo field) throws IOException { + assert field.hasDocValuesSkipIndex(); + DocValuesSkipper skipper = in.getSkipper(field); + assert skipper != null; + return new AssertingLeafReader.AssertingDocValuesSkipper(skipper); + } + @Override public void close() throws IOException { in.close(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java index 8f55dfb5322..fc02ae82de2 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/AssertingLeafReader.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Objects; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; @@ -1155,6 +1156,109 @@ public class AssertingLeafReader extends FilterLeafReader { } } + /** Wraps a DocValuesSkipper but with additional asserts */ + public static class AssertingDocValuesSkipper extends DocValuesSkipper { + + private final Thread creationThread = Thread.currentThread(); + private final DocValuesSkipper in; + + /** Sole constructor */ + public 
AssertingDocValuesSkipper(DocValuesSkipper in) { + this.in = in; + assert minDocID(0) == -1; + assert maxDocID(0) == -1; + } + + @Override + public void advance(int target) throws IOException { + assertThread("Doc values skipper", creationThread); + assert target > maxDocID(0) + : "Illegal to call advance() on a target that is not beyond the current interval"; + in.advance(target); + assert in.minDocID(0) <= in.maxDocID(0); + } + + private boolean iterating() { + return maxDocID(0) != -1 + && minDocID(0) != -1 + && maxDocID(0) != DocIdSetIterator.NO_MORE_DOCS + && minDocID(0) != DocIdSetIterator.NO_MORE_DOCS; + } + + @Override + public int numLevels() { + assertThread("Doc values skipper", creationThread); + return in.numLevels(); + } + + @Override + public int minDocID(int level) { + assertThread("Doc values skipper", creationThread); + Objects.checkIndex(level, numLevels()); + int minDocID = in.minDocID(level); + assert minDocID <= in.maxDocID(level); + if (level > 0) { + assert minDocID <= in.minDocID(level - 1); + } + return minDocID; + } + + @Override + public int maxDocID(int level) { + assertThread("Doc values skipper", creationThread); + Objects.checkIndex(level, numLevels()); + int maxDocID = in.maxDocID(level); + + assert maxDocID >= in.minDocID(level); + if (level > 0) { + assert maxDocID >= in.maxDocID(level - 1); + } + return maxDocID; + } + + @Override + public long minValue(int level) { + assertThread("Doc values skipper", creationThread); + assert iterating() : "Unpositioned iterator"; + Objects.checkIndex(level, numLevels()); + return in.minValue(level); + } + + @Override + public long maxValue(int level) { + assertThread("Doc values skipper", creationThread); + assert iterating() : "Unpositioned iterator"; + Objects.checkIndex(level, numLevels()); + return in.maxValue(level); + } + + @Override + public int docCount(int level) { + assertThread("Doc values skipper", creationThread); + assert iterating() : "Unpositioned iterator"; + Objects.checkIndex(level, numLevels()); + return in.docCount(level); + } + + @Override + public long minValue() { + assertThread("Doc values skipper", creationThread); + return in.minValue(); + } + + @Override + public long maxValue() { + assertThread("Doc values skipper", creationThread); + return in.maxValue(); + } + + @Override + public int docCount() { + assertThread("Doc values skipper", creationThread); + return in.docCount(); + } + } + /** Wraps a SortedSetDocValues but with additional asserts */ public static class AssertingPointValues extends PointValues { private final Thread creationThread = Thread.currentThread(); @@ -1483,6 +1587,19 @@ public class AssertingLeafReader extends FilterLeafReader { } } + @Override + public DocValuesSkipper getDocValuesSkipper(String field) throws IOException { + DocValuesSkipper skipper = super.getDocValuesSkipper(field); + FieldInfo fi = getFieldInfos().fieldInfo(field); + if (skipper != null) { + assert fi.hasDocValuesSkipIndex(); + return new AssertingDocValuesSkipper(skipper); + } else { + assert fi == null || fi.hasDocValuesSkipIndex() == false; + return null; + } + } + @Override public NumericDocValues getNormValues(String field) throws IOException { NumericDocValues dv = super.getNormValues(field); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java index 07c4507ae92..16f33460991 100644 --- 
a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseDocValuesFormatTestCase.java @@ -19,672 +19,64 @@ package org.apache.lucene.tests.index; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; -import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeSet; -import java.util.concurrent.CountDownLatch; -import java.util.function.LongSupplier; import java.util.function.Supplier; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.simpletext.SimpleTextCodec; -import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.FloatDocValuesField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; -import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum.SeekStatus; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.util.TestUtil; -import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.automaton.CompiledAutomaton; -import 
org.apache.lucene.util.automaton.Operations; -import org.apache.lucene.util.automaton.RegExp; /** - * Abstract class to do basic tests for a docvalues format. NOTE: This test focuses on the docvalues - * impl, nothing else. The [stretch] goal is for this test to be so thorough in testing a new - * DocValuesFormat that if this test passes, then all Lucene tests should also pass. Ie, if there is - * some bug in a given DocValuesFormat that this test fails to catch then this test needs to be - * improved! + * Extends {@link LegacyBaseDocValuesFormatTestCase} and adds checks for {@link DocValuesSkipper}. */ -public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTestCase { +public abstract class BaseDocValuesFormatTestCase extends LegacyBaseDocValuesFormatTestCase { - @Override - protected void addRandomFields(Document doc) { - if (usually()) { - doc.add(new NumericDocValuesField("ndv", random().nextInt(1 << 12))); - doc.add(new BinaryDocValuesField("bdv", newBytesRef(TestUtil.randomSimpleString(random())))); - doc.add( - new SortedDocValuesField("sdv", newBytesRef(TestUtil.randomSimpleString(random(), 2)))); - } - int numValues = random().nextInt(5); - for (int i = 0; i < numValues; ++i) { - doc.add( - new SortedSetDocValuesField( - "ssdv", newBytesRef(TestUtil.randomSimpleString(random(), 2)))); - } - numValues = random().nextInt(5); - for (int i = 0; i < numValues; ++i) { - doc.add( - new SortedNumericDocValuesField( - "sndv", TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE))); - } + /** + * Override and return {@code false} if the {@link DocValuesSkipper} produced by this format + * sometimes returns documents in {@link DocValuesSkipper#minDocID(int)} or {@link + * DocValuesSkipper#maxDocID(int)} that may not have a value. + */ + protected boolean skipperHasAccurateDocBounds() { + return true; } - public void testOneNumber() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm" - + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. 
" + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new NumericDocValuesField("dv", 5)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - Document hitDoc = storedFields.document(hits.scoreDocs[i].doc); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); - int docID = hits.scoreDocs[i].doc; - assertEquals(docID, dv.advance(docID)); - assertEquals(5, dv.longValue()); - } - - ireader.close(); - directory.close(); + /** + * Override and return {@code false} if the {@link DocValuesSkipper} produced by this format + * sometimes returns values in {@link DocValuesSkipper#minValue(int)} or {@link + * DocValuesSkipper#maxValue(int)} that none of the documents in the range have. + */ + protected boolean skipperHasAccurateValueBounds() { + return true; } - public void testOneFloat() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. " + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new FloatDocValuesField("dv", 5.7f)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int docID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(docID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); - assertEquals(docID, dv.advance(docID)); - assertEquals(Float.floatToRawIntBits(5.7f), dv.longValue()); - } - - ireader.close(); - directory.close(); - } - - public void testTwoNumbers() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. 
" + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new NumericDocValuesField("dv1", 5)); - doc.add(new NumericDocValuesField("dv2", 17)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int docID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(docID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv1"); - assertEquals(docID, dv.advance(docID)); - assertEquals(5, dv.longValue()); - dv = ireader.leaves().get(0).reader().getNumericDocValues("dv2"); - assertEquals(docID, dv.advance(docID)); - assertEquals(17, dv.longValue()); - } - - ireader.close(); - directory.close(); - } - - public void testTwoBinaryValues() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. " + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new BinaryDocValuesField("dv1", newBytesRef(longTerm))); - doc.add(new BinaryDocValuesField("dv2", newBytesRef(text))); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int hitDocID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(hitDocID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv1"); - assertEquals(hitDocID, dv.advance(hitDocID)); - BytesRef scratch = dv.binaryValue(); - assertEquals(newBytesRef(longTerm), scratch); - dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv2"); - assertEquals(hitDocID, dv.advance(hitDocID)); - scratch = dv.binaryValue(); - assertEquals(newBytesRef(text), scratch); - } - - ireader.close(); - directory.close(); - } - - public void testVariouslyCompressibleBinaryValues() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - int numDocs = 1 + random().nextInt(100); - - HashMap writtenValues = new HashMap<>(); - - // Small vocabulary ranges will be highly compressible - int vocabRange = 1 
+ random().nextInt(Byte.MAX_VALUE - 1); - - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - - // Generate random-sized byte array with random choice of bytes in vocab range - byte[] value = new byte[500 + random().nextInt(1024)]; - for (int j = 0; j < value.length; j++) { - value[j] = (byte) random().nextInt(vocabRange); - } - BytesRef bytesRef = newBytesRef(value); - writtenValues.put(i, bytesRef); - doc.add(newTextField("id", Integer.toString(i), Field.Store.YES)); - doc.add(new BinaryDocValuesField("dv1", bytesRef)); - iwriter.addDocument(doc); - } - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - for (int i = 0; i < numDocs; i++) { - String id = Integer.toString(i); - Query query = new TermQuery(new Term("id", id)); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - int hitDocID = hits.scoreDocs[0].doc; - Document hitDoc = storedFields.document(hitDocID); - assertEquals(id, hitDoc.get("id")); - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv1"); - assertEquals(hitDocID, dv.advance(hitDocID)); - BytesRef scratch = dv.binaryValue(); - assertEquals(writtenValues.get(i), scratch); - } - - ireader.close(); - directory.close(); - } - - public void testTwoFieldsMixed() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. 
" + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new NumericDocValuesField("dv1", 5)); - doc.add(new BinaryDocValuesField("dv2", newBytesRef("hello world"))); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int docID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(docID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv1"); - assertEquals(docID, dv.advance(docID)); - assertEquals(5, dv.longValue()); - BinaryDocValues dv2 = ireader.leaves().get(0).reader().getBinaryDocValues("dv2"); - assertEquals(docID, dv2.advance(docID)); - assertEquals(newBytesRef("hello world"), dv2.binaryValue()); - } - - ireader.close(); - directory.close(); - } - - public void testThreeFieldsMixed() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. 
" + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new SortedDocValuesField("dv1", newBytesRef("hello hello"))); - doc.add(new NumericDocValuesField("dv2", 5)); - doc.add(new BinaryDocValuesField("dv3", newBytesRef("hello world"))); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - StoredFields storedFields = isearcher.storedFields(); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int docID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(docID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv1"); - assertEquals(docID, dv.advance(docID)); - int ord = dv.ordValue(); - BytesRef scratch = dv.lookupOrd(ord); - assertEquals(newBytesRef("hello hello"), scratch); - NumericDocValues dv2 = ireader.leaves().get(0).reader().getNumericDocValues("dv2"); - assertEquals(docID, dv2.advance(docID)); - assertEquals(5, dv2.longValue()); - BinaryDocValues dv3 = ireader.leaves().get(0).reader().getBinaryDocValues("dv3"); - assertEquals(docID, dv3.advance(docID)); - assertEquals(newBytesRef("hello world"), dv3.binaryValue()); - } - - ireader.close(); - directory.close(); - } - - public void testThreeFieldsMixed2() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. 
" + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new BinaryDocValuesField("dv1", newBytesRef("hello world"))); - doc.add(new SortedDocValuesField("dv2", newBytesRef("hello hello"))); - doc.add(new NumericDocValuesField("dv3", 5)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - BytesRef scratch; - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int docID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(docID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv2"); - assertEquals(docID, dv.advance(docID)); - int ord = dv.ordValue(); - scratch = dv.lookupOrd(ord); - assertEquals(newBytesRef("hello hello"), scratch); - NumericDocValues dv2 = ireader.leaves().get(0).reader().getNumericDocValues("dv3"); - assertEquals(docID, dv2.advance(docID)); - assertEquals(5, dv2.longValue()); - BinaryDocValues dv3 = ireader.leaves().get(0).reader().getBinaryDocValues("dv1"); - assertEquals(docID, dv3.advance(docID)); - assertEquals(newBytesRef("hello world"), dv3.binaryValue()); - } - - ireader.close(); - directory.close(); - } - - public void testTwoDocumentsNumeric() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new NumericDocValuesField("dv", 1)); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new NumericDocValuesField("dv", 2)); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(1, dv.longValue()); - assertEquals(1, dv.nextDoc()); - assertEquals(2, dv.longValue()); - - ireader.close(); - directory.close(); - } - - public void testTwoDocumentsMerged() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(newField("id", "0", StringField.TYPE_STORED)); - doc.add(new NumericDocValuesField("dv", -10)); - iwriter.addDocument(doc); - iwriter.commit(); - doc = new Document(); - doc.add(newField("id", "1", StringField.TYPE_STORED)); - doc.add(new NumericDocValuesField("dv", 99)); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - 
maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); - StoredFields storedFields = ireader.leaves().get(0).reader().storedFields(); - for (int i = 0; i < 2; i++) { - Document doc2 = storedFields.document(i); - long expected; - if (doc2.get("id").equals("0")) { - expected = -10; - } else { - expected = 99; - } - assertEquals(i, dv.nextDoc()); - assertEquals(expected, dv.longValue()); - } - - ireader.close(); - directory.close(); - } - - public void testBigNumericRange() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new NumericDocValuesField("dv", Long.MIN_VALUE)); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new NumericDocValuesField("dv", Long.MAX_VALUE)); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(Long.MIN_VALUE, dv.longValue()); - assertEquals(1, dv.nextDoc()); - assertEquals(Long.MAX_VALUE, dv.longValue()); - - ireader.close(); - directory.close(); - } - - public void testBigNumericRange2() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new NumericDocValuesField("dv", -8841491950446638677L)); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new NumericDocValuesField("dv", 9062230939892376225L)); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(-8841491950446638677L, dv.longValue()); - assertEquals(1, dv.nextDoc()); - assertEquals(9062230939892376225L, dv.longValue()); - - ireader.close(); - directory.close(); - } - - public void testBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. 
" + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new BinaryDocValuesField("dv", newBytesRef("hello world"))); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - StoredFields storedFields = isearcher.storedFields(); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - // Iterate through the results: - for (int i = 0; i < hits.scoreDocs.length; i++) { - int hitDocID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(hitDocID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); - assertEquals(hitDocID, dv.advance(hitDocID)); - assertEquals(newBytesRef("hello world"), dv.binaryValue()); - } - - ireader.close(); - directory.close(); - } - - public void testBytesTwoDocumentsMerged() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(newField("id", "0", StringField.TYPE_STORED)); - doc.add(new BinaryDocValuesField("dv", newBytesRef("hello world 1"))); - iwriter.addDocument(doc); - iwriter.commit(); - doc = new Document(); - doc.add(newField("id", "1", StringField.TYPE_STORED)); - doc.add(new BinaryDocValuesField("dv", newBytesRef("hello 2"))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); - StoredFields storedFields = ireader.leaves().get(0).reader().storedFields(); - for (int i = 0; i < 2; i++) { - Document doc2 = storedFields.document(i); - String expected; - if (doc2.get("id").equals("0")) { - expected = "hello world 1"; - } else { - expected = "hello 2"; - } - assertEquals(i, dv.nextDoc()); - assertEquals(expected, dv.binaryValue().utf8ToString()); - } - - ireader.close(); - directory.close(); - } - - public void testBytesMergeAwayAllValues() throws IOException { + public void testSortedMergeAwayAllValuesWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -696,200 +88,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new BinaryDocValuesField("field", newBytesRef("hi"))); - iwriter.addDocument(doc); - iwriter.commit(); - iwriter.deleteDocuments(new Term("id", "1")); - iwriter.forceMerge(1); - - DirectoryReader ireader = iwriter.getReader(); - iwriter.close(); - - BinaryDocValues dv = getOnlyLeafReader(ireader).getBinaryDocValues("field"); - assertEquals(NO_MORE_DOCS, dv.nextDoc()); - - ireader.close(); - directory.close(); - } - - 
public void testSortedBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - String longTerm = - "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; - String text = "This is the text to be indexed. " + longTerm; - doc.add(newTextField("fieldname", text, Field.Store.YES)); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world"))); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - IndexSearcher isearcher = new IndexSearcher(ireader); - - assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); - Query query = new TermQuery(new Term("fieldname", "text")); - TopDocs hits = isearcher.search(query, 1); - assertEquals(1, hits.totalHits.value); - BytesRef scratch; - // Iterate through the results: - StoredFields storedFields = isearcher.storedFields(); - for (int i = 0; i < hits.scoreDocs.length; i++) { - int docID = hits.scoreDocs[i].doc; - Document hitDoc = storedFields.document(docID); - assertEquals(text, hitDoc.get("fieldname")); - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); - assertEquals(docID, dv.advance(docID)); - scratch = dv.lookupOrd(dv.ordValue()); - assertEquals(newBytesRef("hello world"), scratch); - } - - ireader.close(); - directory.close(); - } - - public void testSortedBytesTwoDocuments() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1"))); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2"))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); - BytesRef scratch; - assertEquals(0, dv.nextDoc()); - scratch = dv.lookupOrd(dv.ordValue()); - assertEquals("hello world 1", scratch.utf8ToString()); - assertEquals(1, dv.nextDoc()); - scratch = dv.lookupOrd(dv.ordValue()); - assertEquals("hello world 2", scratch.utf8ToString()); - - ireader.close(); - directory.close(); - } - - public void testSortedBytesThreeDocuments() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1"))); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 
2"))); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1"))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); - assertEquals(2, dv.getValueCount()); - assertEquals(0, dv.nextDoc()); - assertEquals(0, dv.ordValue()); - BytesRef scratch = dv.lookupOrd(0); - assertEquals("hello world 1", scratch.utf8ToString()); - assertEquals(1, dv.nextDoc()); - assertEquals(1, dv.ordValue()); - scratch = dv.lookupOrd(1); - assertEquals("hello world 2", scratch.utf8ToString()); - assertEquals(2, dv.nextDoc()); - assertEquals(0, dv.ordValue()); - - ireader.close(); - directory.close(); - } - - public void testSortedBytesTwoDocumentsMerged() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(newField("id", "0", StringField.TYPE_STORED)); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1"))); - iwriter.addDocument(doc); - iwriter.commit(); - doc = new Document(); - doc.add(newField("id", "1", StringField.TYPE_STORED)); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2"))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); - assertEquals(2, dv.getValueCount()); // 2 ords - assertEquals(0, dv.nextDoc()); - BytesRef scratch = dv.lookupOrd(dv.ordValue()); - assertEquals(newBytesRef("hello world 1"), scratch); - scratch = dv.lookupOrd(1); - assertEquals(newBytesRef("hello world 2"), scratch); - StoredFields storedFields = ireader.leaves().get(0).reader().storedFields(); - for (int i = 0; i < 2; i++) { - Document doc2 = storedFields.document(i); - String expected; - if (doc2.get("id").equals("0")) { - expected = "hello world 1"; - } else { - expected = "hello world 2"; - } - if (dv.docID() < i) { - assertEquals(i, dv.nextDoc()); - } - scratch = dv.lookupOrd(dv.ordValue()); - assertEquals(expected, scratch.utf8ToString()); - } - - ireader.close(); - directory.close(); - } - - public void testSortedMergeAwayAllValues() throws IOException { - Directory directory = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random()); - IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); - iwconfig.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); - - Document doc = new Document(); - doc.add(new StringField("id", "0", Field.Store.NO)); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new SortedDocValuesField("field", newBytesRef("hello"))); + doc.add(SortedDocValuesField.indexedField("field", newBytesRef("hello"))); iwriter.addDocument(doc); iwriter.commit(); iwriter.deleteDocuments(new Term("id", "1")); @@ -901,6 +100,11 @@ public abstract class 
BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); + TermsEnum termsEnum = dv.termsEnum(); assertFalse(termsEnum.seekExact(new BytesRef("lucene"))); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene"))); @@ -910,1319 +114,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes directory.close(); } - public void testBytesWithNewline() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new BinaryDocValuesField("dv", newBytesRef("hello\nworld\r1"))); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(newBytesRef("hello\nworld\r1"), dv.binaryValue()); - - ireader.close(); - directory.close(); - } - - public void testMissingSortedBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2"))); - iwriter.addDocument(doc); - // 2nd doc missing the DV field - iwriter.addDocument(new Document()); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); - assertEquals(0, dv.nextDoc()); - BytesRef scratch = dv.lookupOrd(dv.ordValue()); - assertEquals(newBytesRef("hello world 2"), scratch); - assertEquals(NO_MORE_DOCS, dv.nextDoc()); - ireader.close(); - directory.close(); - } - - public void testSortedTermsEnum() throws IOException { - Directory directory = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random()); - IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); - iwconfig.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); - - Document doc = new Document(); - doc.add(new SortedDocValuesField("field", newBytesRef("hello"))); - iwriter.addDocument(doc); - - doc = new Document(); - doc.add(new SortedDocValuesField("field", newBytesRef("world"))); - iwriter.addDocument(doc); - - doc = new Document(); - doc.add(new SortedDocValuesField("field", newBytesRef("beer"))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - - DirectoryReader ireader = iwriter.getReader(); - iwriter.close(); - - SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field"); - assertEquals(3, dv.getValueCount()); - - TermsEnum 
termsEnum = dv.termsEnum(); - - // next() - assertEquals("beer", termsEnum.next().utf8ToString()); - assertEquals(0, termsEnum.ord()); - assertEquals("hello", termsEnum.next().utf8ToString()); - assertEquals(1, termsEnum.ord()); - assertEquals("world", termsEnum.next().utf8ToString()); - assertEquals(2, termsEnum.ord()); - - // seekCeil() - assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(newBytesRef("ha!"))); - assertEquals("hello", termsEnum.term().utf8ToString()); - assertEquals(1, termsEnum.ord()); - assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(newBytesRef("beer"))); - assertEquals("beer", termsEnum.term().utf8ToString()); - assertEquals(0, termsEnum.ord()); - assertEquals(SeekStatus.END, termsEnum.seekCeil(newBytesRef("zzz"))); - assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(newBytesRef("aba"))); - assertEquals(0, termsEnum.ord()); - - // seekExact() - assertTrue(termsEnum.seekExact(newBytesRef("beer"))); - assertEquals("beer", termsEnum.term().utf8ToString()); - assertEquals(0, termsEnum.ord()); - assertTrue(termsEnum.seekExact(newBytesRef("hello"))); - assertEquals(Codec.getDefault().toString(), "hello", termsEnum.term().utf8ToString()); - assertEquals(1, termsEnum.ord()); - assertTrue(termsEnum.seekExact(newBytesRef("world"))); - assertEquals("world", termsEnum.term().utf8ToString()); - assertEquals(2, termsEnum.ord()); - assertFalse(termsEnum.seekExact(newBytesRef("bogus"))); - - // seek(ord) - termsEnum.seekExact(0); - assertEquals("beer", termsEnum.term().utf8ToString()); - assertEquals(0, termsEnum.ord()); - termsEnum.seekExact(1); - assertEquals("hello", termsEnum.term().utf8ToString()); - assertEquals(1, termsEnum.ord()); - termsEnum.seekExact(2); - assertEquals("world", termsEnum.term().utf8ToString()); - assertEquals(2, termsEnum.ord()); - - // NORMAL automaton - termsEnum = - dv.intersect( - new CompiledAutomaton( - Operations.determinize( - new RegExp(".*l.*").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT))); - assertEquals("hello", termsEnum.next().utf8ToString()); - assertEquals(1, termsEnum.ord()); - assertEquals("world", termsEnum.next().utf8ToString()); - assertEquals(2, termsEnum.ord()); - assertNull(termsEnum.next()); - - // SINGLE automaton - termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton())); - assertEquals("hello", termsEnum.next().utf8ToString()); - assertEquals(1, termsEnum.ord()); - assertNull(termsEnum.next()); - - ireader.close(); - directory.close(); - } - - public void testEmptySortedBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef(""))); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new SortedDocValuesField("dv", newBytesRef(""))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(0, dv.ordValue()); - assertEquals(1, dv.nextDoc()); - assertEquals(0, dv.ordValue()); - BytesRef scratch = dv.lookupOrd(0); - assertEquals("", 
scratch.utf8ToString()); - - ireader.close(); - directory.close(); - } - - public void testEmptyBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new BinaryDocValuesField("dv", newBytesRef(""))); - iwriter.addDocument(doc); - doc = new Document(); - doc.add(new BinaryDocValuesField("dv", newBytesRef(""))); - iwriter.addDocument(doc); - iwriter.forceMerge(1); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals("", dv.binaryValue().utf8ToString()); - assertEquals(1, dv.nextDoc()); - assertEquals("", dv.binaryValue().utf8ToString()); - - ireader.close(); - directory.close(); - } - - public void testVeryLargeButLegalBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - byte[] bytes = new byte[32766]; - random().nextBytes(bytes); - BytesRef b = newBytesRef(bytes); - doc.add(new BinaryDocValuesField("dv", b)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(newBytesRef(bytes), dv.binaryValue()); - - ireader.close(); - directory.close(); - } - - public void testVeryLargeButLegalSortedBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - byte[] bytes = new byte[32766]; - random().nextBytes(bytes); - BytesRef b = newBytesRef(bytes); - doc.add(new SortedDocValuesField("dv", b)); - iwriter.addDocument(doc); - iwriter.close(); - - // Now search the index: - IndexReader ireader = - maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true - assert ireader.leaves().size() == 1; - SortedDocValues dv = DocValues.getSorted(ireader.leaves().get(0).reader(), "dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(newBytesRef(bytes), dv.lookupOrd(dv.ordValue())); - ireader.close(); - directory.close(); - } - - public void testCodecUsesOwnBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); - Document doc = new Document(); - doc.add(new BinaryDocValuesField("dv", newBytesRef("boo!"))); - iwriter.addDocument(doc); - iwriter.close(); - 
-    // Now search the index:
-    IndexReader ireader =
-        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
-    assert ireader.leaves().size() == 1;
-    BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv");
-    assertEquals(0, dv.nextDoc());
-    assertEquals("boo!", dv.binaryValue().utf8ToString());
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testCodecUsesOwnSortedBytes() throws IOException {
-    Analyzer analyzer = new MockAnalyzer(random());
-
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new SortedDocValuesField("dv", newBytesRef("boo!")));
-    iwriter.addDocument(doc);
-    iwriter.close();
-
-    // Now search the index:
-    IndexReader ireader =
-        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
-    assert ireader.leaves().size() == 1;
-    SortedDocValues dv = DocValues.getSorted(ireader.leaves().get(0).reader(), "dv");
-    byte[] mybytes = new byte[20];
-    assertEquals(0, dv.nextDoc());
-    assertEquals("boo!", dv.lookupOrd(dv.ordValue()).utf8ToString());
-    assertFalse(dv.lookupOrd(dv.ordValue()).bytes == mybytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  /*
-   * Simple test case to show how to use the API
-   */
-  public void testDocValuesSimple() throws IOException {
-    Directory dir = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
-    conf.setMergePolicy(newLogMergePolicy());
-    IndexWriter writer = new IndexWriter(dir, conf);
-    for (int i = 0; i < 5; i++) {
-      Document doc = new Document();
-      doc.add(new NumericDocValuesField("docId", i));
-      doc.add(new TextField("docId", "" + i, Field.Store.NO));
-      writer.addDocument(doc);
-    }
-    writer.commit();
-    writer.forceMerge(1, true);
-
-    writer.close();
-
-    DirectoryReader reader = maybeWrapWithMergingReader(DirectoryReader.open(dir));
-    assertEquals(1, reader.leaves().size());
-
-    IndexSearcher searcher = new IndexSearcher(reader);
-
-    BooleanQuery.Builder query = new BooleanQuery.Builder();
-    query.add(new TermQuery(new Term("docId", "0")), BooleanClause.Occur.SHOULD);
-    query.add(new TermQuery(new Term("docId", "1")), BooleanClause.Occur.SHOULD);
-    query.add(new TermQuery(new Term("docId", "2")), BooleanClause.Occur.SHOULD);
-    query.add(new TermQuery(new Term("docId", "3")), BooleanClause.Occur.SHOULD);
-    query.add(new TermQuery(new Term("docId", "4")), BooleanClause.Occur.SHOULD);
-
-    TopDocs search = searcher.search(query.build(), 10);
-    assertEquals(5, search.totalHits.value);
-    ScoreDoc[] scoreDocs = search.scoreDocs;
-    NumericDocValues docValues = getOnlyLeafReader(reader).getNumericDocValues("docId");
-    for (int i = 0; i < scoreDocs.length; i++) {
-      assertEquals(i, scoreDocs[i].doc);
-      assertEquals(i, docValues.advance(i));
-      assertEquals(i, docValues.longValue());
-    }
-    reader.close();
-    dir.close();
-  }
-
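testDocValuesSimple above is the canonical read pattern: every doc-values type is consumed as a forward-only iterator, so callers position it on a document first and only then read a value. A minimal standalone sketch of that contract (hedged: `reader` is an assumed open DirectoryReader over an index containing a NumericDocValuesField named "docId"; names are illustrative):

    for (LeafReaderContext ctx : reader.leaves()) {
      NumericDocValues dv = ctx.reader().getNumericDocValues("docId");
      if (dv == null) {
        continue; // no document in this segment has the field
      }
      // Visit only documents that have a value, in increasing docID order.
      for (int doc = dv.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dv.nextDoc()) {
        long value = dv.longValue(); // only valid while positioned on doc
      }
    }
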
-  public void testRandomSortedBytes() throws IOException {
-    Directory dir = newDirectory();
-    IndexWriterConfig cfg = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter w = new RandomIndexWriter(random(), dir, cfg);
-    int numDocs = atLeast(100);
-    BytesRefHash hash = new BytesRefHash();
-    Map<String, String> docToString = new HashMap<>();
-    int maxLength = TestUtil.nextInt(random(), 1, 50);
-    for (int i = 0; i < numDocs; i++) {
-      Document doc = new Document();
-      doc.add(newTextField("id", "" + i, Field.Store.YES));
-      String string = TestUtil.randomRealisticUnicodeString(random(), 1, maxLength);
-      BytesRef br = newBytesRef(string);
-      doc.add(new SortedDocValuesField("field", br));
-      hash.add(br);
-      docToString.put("" + i, string);
-      w.addDocument(doc);
-    }
-    if (rarely()) {
-      w.commit();
-    }
-    int numDocsNoValue = atLeast(10);
-    for (int i = 0; i < numDocsNoValue; i++) {
-      Document doc = new Document();
-      doc.add(newTextField("id", "noValue", Field.Store.YES));
-      w.addDocument(doc);
-    }
-    if (rarely()) {
-      w.commit();
-    }
-    for (int i = 0; i < numDocs; i++) {
-      Document doc = new Document();
-      String id = "" + (i + numDocs);
-      doc.add(newTextField("id", id, Field.Store.YES));
-      String string = TestUtil.randomRealisticUnicodeString(random(), 1, maxLength);
-      BytesRef br = newBytesRef(string);
-      hash.add(br);
-      docToString.put(id, string);
-      doc.add(new SortedDocValuesField("field", br));
-      w.addDocument(doc);
-    }
-    w.commit();
-    IndexReader reader = w.getReader();
-    SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field");
-    int[] sort = hash.sort();
-    BytesRef expected = newBytesRef();
-    assertEquals(hash.size(), docValues.getValueCount());
-    for (int i = 0; i < hash.size(); i++) {
-      hash.get(sort[i], expected);
-      final BytesRef actual = docValues.lookupOrd(i);
-      assertEquals(expected.utf8ToString(), actual.utf8ToString());
-      int ord = docValues.lookupTerm(expected);
-      assertEquals(i, ord);
-    }
-    Set<Entry<String, String>> entrySet = docToString.entrySet();
-
-    for (Entry<String, String> entry : entrySet) {
-      // pk lookup
-      PostingsEnum termPostingsEnum =
-          TestUtil.docs(random(), reader, "id", newBytesRef(entry.getKey()), null, 0);
-      int docId = termPostingsEnum.nextDoc();
-      expected = newBytesRef(entry.getValue());
-      docValues = MultiDocValues.getSortedValues(reader, "field");
-      assertEquals(docId, docValues.advance(docId));
-      final BytesRef actual = docValues.lookupOrd(docValues.ordValue());
-      assertEquals(expected, actual);
-    }
-
-    reader.close();
-    w.close();
-    dir.close();
-  }
-
-  private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception {
-    doTestNumericsVsStoredFields(density, longs, 256);
-  }
-
-  private void doTestNumericsVsStoredFields(double density, LongSupplier longs, int minDocs)
-      throws Exception {
-    Directory dir = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-    Document doc = new Document();
-    Field idField = new StringField("id", "", Field.Store.NO);
-    Field storedField = newStringField("stored", "", Field.Store.YES);
-    Field dvField = new NumericDocValuesField("dv", 0);
-    doc.add(idField);
-    doc.add(storedField);
-    doc.add(dvField);
-
-    // index some docs
-    int numDocs = atLeast((int) (minDocs * 1.172));
-    // numDocs should be always > 256 so that in case of a codec that optimizes
-    // for numbers of values <= 256, all storage layouts are tested
-    assert numDocs > 256;
-    for (int i = 0; i < numDocs; i++) {
-      if (random().nextDouble() > density) {
-        writer.addDocument(new Document());
-        continue;
-      }
-      idField.setStringValue(Integer.toString(i));
-      long value = longs.getAsLong();
-      storedField.setStringValue(Long.toString(value));
-      dvField.setLongValue(value);
-      writer.addDocument(doc);
-      if (random().nextInt(31) == 0) {
-        writer.commit();
-      }
-    }
-
-    // delete some docs
-    int numDeletions = random().nextInt(numDocs / 10);
-    for (int i = 0; i < numDeletions; i++) {
-      int id = random().nextInt(numDocs);
-      writer.deleteDocuments(new Term("id", Integer.toString(id)));
-    }
-
-    // merge some segments and ensure that at least one of them has more than
-    // max(256, minDocs) values
-    writer.forceMerge(numDocs / Math.max(256, minDocs));
-
-    writer.close();
-    // compare
-    assertDVIterate(dir);
-    dir.close();
-  }
-
-  // Asserts equality of stored value vs. DocValue by iterating DocValues one at a time
-  protected void assertDVIterate(Directory dir) throws IOException {
-    DirectoryReader ir = maybeWrapWithMergingReader(DirectoryReader.open(dir));
-    TestUtil.checkReader(ir);
-    for (LeafReaderContext context : ir.leaves()) {
-      LeafReader r = context.reader();
-      NumericDocValues docValues = DocValues.getNumeric(r, "dv");
-      docValues.nextDoc();
-      StoredFields storedFields = r.storedFields();
-      for (int i = 0; i < r.maxDoc(); i++) {
-        String storedValue = storedFields.document(i).get("stored");
-        if (storedValue == null) {
-          assertTrue(docValues.docID() > i);
-        } else {
-          assertEquals(i, docValues.docID());
-          assertEquals(Long.parseLong(storedValue), docValues.longValue());
-          docValues.nextDoc();
-        }
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
-    }
-    ir.close();
-  }
-
-  protected void compareStoredFieldWithSortedNumericsDV(
-      DirectoryReader directoryReader, String storedField, String dvField) throws IOException {
-    for (LeafReaderContext leaf : directoryReader.leaves()) {
-      LeafReader reader = leaf.reader();
-      StoredFields storedFields = reader.storedFields();
-      SortedNumericDocValues docValues = reader.getSortedNumericDocValues(dvField);
-      if (docValues == null) {
-        // no stored values at all
-        for (int doc = 0; doc < reader.maxDoc(); doc++) {
-          assertArrayEquals(new String[0], storedFields.document(doc).getValues(storedField));
-        }
-        continue;
-      }
-      for (int doc = 0; doc < reader.maxDoc(); doc++) {
-        String[] storedValues = storedFields.document(doc).getValues(storedField);
-        if (storedValues.length == 0) {
-          assertFalse(docValues.advanceExact(doc));
-          continue;
-        }
-        switch (random().nextInt(3)) {
-          case 0 -> assertEquals(doc, docValues.nextDoc());
-          case 1 -> assertEquals(doc, docValues.advance(doc));
-          default -> assertTrue(docValues.advanceExact(doc));
-        }
-        assertEquals(doc, docValues.docID());
-        int repeats = 1 + random().nextInt(3);
-        for (int r = 0; r < repeats; r++) {
-          if (r > 0 || random().nextBoolean()) {
-            assertTrue(docValues.advanceExact(doc));
-          }
-          assertEquals(storedValues.length, docValues.docValueCount());
-          for (int v = 0; v < docValues.docValueCount(); v++) {
-            assertEquals(storedValues[v], Long.toString(docValues.nextValue()));
-          }
-        }
-      }
-      // jump with advanceExact
-      int iters = 1 + random().nextInt(3);
-      for (int i = 0; i < iters; i++) {
-        docValues = reader.getSortedNumericDocValues(dvField);
-        for (int doc = random().nextInt(leaf.reader().maxDoc()); doc < reader.maxDoc(); doc++) {
-          String[] storedValues = storedFields.document(doc).getValues(storedField);
-          if (docValues.advanceExact(doc)) {
-            assertEquals(doc, docValues.docID());
-            int repeats = 1 + random().nextInt(3);
-            for (int r = 0; r < repeats; r++) {
-              if (r > 0 || random().nextBoolean()) {
-                assertTrue(docValues.advanceExact(doc));
-              }
-              assertEquals(storedValues.length, docValues.docValueCount());
-              for (int v = 0; v < docValues.docValueCount(); v++) {
-                assertEquals(storedValues[v], Long.toString(docValues.nextValue()));
-              }
-            }
-          } else {
-            assertArrayEquals(new String[0], storedValues);
-          }
-          doc += random().nextInt(5); // skip some docs
-        }
-      }
-      // jump with advance
-      for (int i = 0; i < iters; i++) {
-        docValues = reader.getSortedNumericDocValues(dvField);
-        int doc = random().nextInt(leaf.reader().maxDoc());
-        while (doc != NO_MORE_DOCS) {
-          int nextDoc = docValues.advance(doc);
-          // no stored fields in between
-          for (int d = doc; d < (nextDoc == NO_MORE_DOCS ? reader.maxDoc() : nextDoc); d++) {
-            String[] storedValues = storedFields.document(d).getValues(storedField);
-            assertArrayEquals(new String[0], storedValues);
-          }
-          doc = nextDoc;
-          if (doc != NO_MORE_DOCS) {
-            String[] storedValues = storedFields.document(doc).getValues(storedField);
-            int repeats = 1 + random().nextInt(3);
-            for (int r = 0; r < repeats; r++) {
-              if (r > 0 || random().nextBoolean()) {
-                assertTrue(docValues.advanceExact(doc));
-              }
-              assertEquals(storedValues.length, docValues.docValueCount());
-              for (int v = 0; v < docValues.docValueCount(); v++) {
-                assertEquals(storedValues[v], Long.toString(docValues.nextValue()));
-              }
-            }
-            doc = nextDoc + 1;
-            doc += random().nextInt(5); // skip some docs
-          }
-        }
-      }
-    }
-  }
-
-  private void doTestSortedNumericsVsStoredFields(LongSupplier counts, LongSupplier values)
-      throws Exception {
-    Directory dir = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-
-    // index some docs
-    int numDocs = atLeast(300);
-    // numDocs should be always > 256 so that in case of a codec that optimizes
-    // for numbers of values <= 256, all storage layouts are tested
-    assert numDocs > 256;
-    for (int i = 0; i < numDocs; i++) {
-      Document doc = new Document();
-      doc.add(new StringField("id", Integer.toString(i), Field.Store.NO));
-
-      int valueCount = (int) counts.getAsLong();
-      long[] valueArray = new long[valueCount];
-      for (int j = 0; j < valueCount; j++) {
-        long value = values.getAsLong();
-        valueArray[j] = value;
-        doc.add(new SortedNumericDocValuesField("dv", value));
-      }
-      Arrays.sort(valueArray);
-      for (int j = 0; j < valueCount; j++) {
-        doc.add(new StoredField("stored", Long.toString(valueArray[j])));
-      }
-      writer.addDocument(doc);
-      if (random().nextInt(31) == 0) {
-        writer.commit();
-      }
-    }
-
-    // delete some docs
-    int numDeletions = random().nextInt(numDocs / 10);
-    for (int i = 0; i < numDeletions; i++) {
-      int id = random().nextInt(numDocs);
-      writer.deleteDocuments(new Term("id", Integer.toString(id)));
-    }
-    try (DirectoryReader reader = maybeWrapWithMergingReader(DirectoryReader.open(dir))) {
-      TestUtil.checkReader(reader);
-      compareStoredFieldWithSortedNumericsDV(reader, "stored", "dv");
-    }
-    // merge some segments and ensure that at least one of them has more than
-    // 256 values
-    writer.forceMerge(numDocs / 256);
-    try (DirectoryReader reader = maybeWrapWithMergingReader(DirectoryReader.open(dir))) {
-      TestUtil.checkReader(reader);
-      compareStoredFieldWithSortedNumericsDV(reader, "stored", "dv");
-    }
-    IOUtils.close(writer, dir);
-  }
-
-  public void testBooleanNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(1, () -> random().nextInt(2));
-    }
-  }
-
-  public void testSparseBooleanNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(random().nextDouble(), () -> random().nextInt(2));
-    }
-  }
-
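The helpers above all drive SortedNumericDocValues the same way: position on a document, read docValueCount(), then call nextValue() exactly that many times, with values coming back in non-decreasing order (which is why doTestSortedNumericsVsStoredFields sorts valueArray before writing the stored fields). A minimal read-side sketch (hedged: `leaf` is an assumed LeafReader and `docID` an in-bounds document; the field name is illustrative):

    SortedNumericDocValues dv = leaf.getSortedNumericDocValues("dv");
    if (dv != null && dv.advanceExact(docID)) {
      int count = dv.docValueCount(); // number of values for this document
      for (int i = 0; i < count; i++) {
        long value = dv.nextValue(); // non-decreasing within the document
      }
    }
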
-  public void testByteNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(
-          1, () -> TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
-    }
-  }
-
-  public void testSparseByteNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(
-          random().nextDouble(), () -> TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
-    }
-  }
-
-  public void testShortNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(
-          1, () -> TestUtil.nextInt(random(), Short.MIN_VALUE, Short.MAX_VALUE));
-    }
-  }
-
-  public void testSparseShortNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(
-          random().nextDouble(),
-          () -> TestUtil.nextInt(random(), Short.MIN_VALUE, Short.MAX_VALUE));
-    }
-  }
-
-  public void testIntNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(1, random()::nextInt);
-    }
-  }
-
-  public void testSparseIntNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(random().nextDouble(), random()::nextInt);
-    }
-  }
-
-  public void testLongNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(1, random()::nextLong);
-    }
-  }
-
-  public void testSparseLongNumericsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestNumericsVsStoredFields(random().nextDouble(), random()::nextLong);
-    }
-  }
-
-  private void doTestBinaryVsStoredFields(double density, Supplier<byte[]> bytes) throws Exception {
-    Directory dir = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-    Document doc = new Document();
-    Field idField = new StringField("id", "", Field.Store.NO);
-    Field storedField = new StoredField("stored", new byte[0]);
-    Field dvField = new BinaryDocValuesField("dv", newBytesRef());
-    doc.add(idField);
-    doc.add(storedField);
-    doc.add(dvField);
-
-    // index some docs
-    int numDocs = atLeast(300);
-    for (int i = 0; i < numDocs; i++) {
-      if (random().nextDouble() > density) {
-        writer.addDocument(new Document());
-        continue;
-      }
-      idField.setStringValue(Integer.toString(i));
-      byte[] buffer = bytes.get();
-      storedField.setBytesValue(buffer);
-      dvField.setBytesValue(buffer);
-      writer.addDocument(doc);
-      if (random().nextInt(31) == 0) {
-        writer.commit();
-      }
-    }
-
-    // delete some docs
-    int numDeletions = random().nextInt(numDocs / 10);
-    for (int i = 0; i < numDeletions; i++) {
-      int id = random().nextInt(numDocs);
-      writer.deleteDocuments(new Term("id", Integer.toString(id)));
-    }
-
-    // compare
-    DirectoryReader ir = writer.getReader();
-    TestUtil.checkReader(ir);
-    for (LeafReaderContext context : ir.leaves()) {
-      LeafReader r = context.reader();
-      StoredFields storedFields = r.storedFields();
-      BinaryDocValues docValues = DocValues.getBinary(r, "dv");
-      docValues.nextDoc();
-      for (int i = 0; i < r.maxDoc(); i++) {
-        BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored");
-        if (binaryValue == null) {
-          assertTrue(docValues.docID() > i);
-        } else {
-          assertEquals(i, docValues.docID());
-          assertEquals(binaryValue, docValues.binaryValue());
-          docValues.nextDoc();
-        }
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
-    }
-    ir.close();
-
-    // compare again
-    writer.forceMerge(1);
-    ir = writer.getReader();
-    TestUtil.checkReader(ir);
-    for (LeafReaderContext context : ir.leaves()) {
-      LeafReader r = context.reader();
-      StoredFields storedFields = r.storedFields();
-      BinaryDocValues docValues = DocValues.getBinary(r, "dv");
-      docValues.nextDoc();
-      for (int i = 0; i < r.maxDoc(); i++) {
-        BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored");
-        if (binaryValue == null) {
-          assertTrue(docValues.docID() > i);
-        } else {
-          assertEquals(i, docValues.docID());
-          assertEquals(binaryValue, docValues.binaryValue());
-          docValues.nextDoc();
-        }
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
-    }
-    ir.close();
-    writer.close();
-    dir.close();
-  }
-
-  public void testBinaryFixedLengthVsStoredFields() throws Exception {
-    doTestBinaryFixedLengthVsStoredFields(1);
-  }
-
-  public void testSparseBinaryFixedLengthVsStoredFields() throws Exception {
-    doTestBinaryFixedLengthVsStoredFields(random().nextDouble());
-  }
-
-  private void doTestBinaryFixedLengthVsStoredFields(double density) throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      int fixedLength = TestUtil.nextInt(random(), 0, 10);
-      doTestBinaryVsStoredFields(
-          density,
-          () -> {
-            byte[] buffer = new byte[fixedLength];
-            random().nextBytes(buffer);
-            return buffer;
-          });
-    }
-  }
-
-  public void testBinaryVariableLengthVsStoredFields() throws Exception {
-    doTestBinaryVariableLengthVsStoredFields(1);
-  }
-
-  public void testSparseBinaryVariableLengthVsStoredFields() throws Exception {
-    doTestBinaryVariableLengthVsStoredFields(random().nextDouble());
-  }
-
-  public void doTestBinaryVariableLengthVsStoredFields(double density) throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestBinaryVsStoredFields(
-          density,
-          () -> {
-            final int length = random().nextInt(10);
-            byte[] buffer = new byte[length];
-            random().nextBytes(buffer);
-            return buffer;
-          });
-    }
-  }
-
-  protected void doTestSortedVsStoredFields(int numDocs, double density, Supplier<byte[]> bytes)
-      throws Exception {
-    Directory dir = newFSDirectory(createTempDir("dvduel"));
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-    Document doc = new Document();
-    Field idField = new StringField("id", "", Field.Store.NO);
-    Field storedField = new StoredField("stored", new byte[0]);
-    Field dvField = new SortedDocValuesField("dv", newBytesRef());
-    doc.add(idField);
-    doc.add(storedField);
-    doc.add(dvField);
-
-    // index some docs
-    for (int i = 0; i < numDocs; i++) {
-      if (random().nextDouble() > density) {
-        writer.addDocument(new Document());
-        continue;
-      }
-      idField.setStringValue(Integer.toString(i));
-      byte[] buffer = bytes.get();
-      storedField.setBytesValue(buffer);
-      dvField.setBytesValue(buffer);
-      writer.addDocument(doc);
-      if (random().nextInt(31) == 0) {
-        writer.commit();
-      }
-    }
-
-    // delete some docs
-    int numDeletions = random().nextInt(numDocs / 10);
-    for (int i = 0; i < numDeletions; i++) {
-      int id = random().nextInt(numDocs);
-      writer.deleteDocuments(new Term("id", Integer.toString(id)));
-    }
-
-    // compare
-    DirectoryReader ir = writer.getReader();
-    TestUtil.checkReader(ir);
-    for (LeafReaderContext context : ir.leaves()) {
-      LeafReader r = context.reader();
-      StoredFields storedFields = r.storedFields();
-      SortedDocValues docValues = DocValues.getSorted(r, "dv");
-      docValues.nextDoc();
-      for (int i = 0; i < r.maxDoc(); i++) {
-        BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored");
-        if (binaryValue == null) {
-          assertTrue(docValues.docID() > i);
-        } else {
-          assertEquals(i, docValues.docID());
-          assertEquals(binaryValue, docValues.lookupOrd(docValues.ordValue()));
-          docValues.nextDoc();
-        }
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
-    }
-    ir.close();
-    writer.forceMerge(1);
-
-    // compare again
-    ir = writer.getReader();
-    TestUtil.checkReader(ir);
-    for (LeafReaderContext context : ir.leaves()) {
-      LeafReader r = context.reader();
-      StoredFields storedFields = r.storedFields();
-      SortedDocValues docValues = DocValues.getSorted(r, "dv");
-      docValues.nextDoc();
-      for (int i = 0; i < r.maxDoc(); i++) {
-        BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored");
-        if (binaryValue == null) {
-          assertTrue(docValues.docID() > i);
-        } else {
-          assertEquals(i, docValues.docID());
-          assertEquals(binaryValue, docValues.lookupOrd(docValues.ordValue()));
-          docValues.nextDoc();
-        }
-      }
-      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
-    }
-    ir.close();
-    writer.close();
-    dir.close();
-  }
-
-  public void testSortedFixedLengthVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      int fixedLength = TestUtil.nextInt(random(), 1, 10);
-      doTestSortedVsStoredFields(atLeast(300), 1, fixedLength, fixedLength);
-    }
-  }
-
-  public void testSparseSortedFixedLengthVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      int fixedLength = TestUtil.nextInt(random(), 1, 10);
-      doTestSortedVsStoredFields(atLeast(300), random().nextDouble(), fixedLength, fixedLength);
-    }
-  }
-
-  public void testSortedVariableLengthVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedVsStoredFields(atLeast(300), 1, 1, 10);
-    }
-  }
-
-  public void testSparseSortedVariableLengthVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedVsStoredFields(atLeast(300), random().nextDouble(), 1, 10);
-    }
-  }
-
-  protected void doTestSortedVsStoredFields(
-      int numDocs, double density, int minLength, int maxLength) throws Exception {
-    doTestSortedVsStoredFields(
-        numDocs,
-        density,
-        () -> {
-          int length = TestUtil.nextInt(random(), minLength, maxLength);
-          byte[] buffer = new byte[length];
-          random().nextBytes(buffer);
-          return buffer;
-        });
-  }
-
-  public void testSortedSetOneValue() throws IOException {
-    Directory directory = newDirectory();
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
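testSortedSetOneValue shows the whole round trip for set-valued strings: each per-document value is an ord into a segment-wide sorted term dictionary, resolved with lookupOrd. A minimal read-side sketch (hedged: `dv` is an assumed SortedSetDocValues for some field, `docID` an in-bounds document):

    if (dv.advanceExact(docID)) {
      for (int i = 0; i < dv.docValueCount(); i++) {
        long ord = dv.nextOrd();            // ords are returned in increasing order
        BytesRef value = dv.lookupOrd(ord); // resolves against the shared dictionary
      }
    }
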
-  public void testSortedSetTwoFields() throws IOException {
-    Directory directory = newDirectory();
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    doc.add(new SortedSetDocValuesField("field2", newBytesRef("world")));
-    iwriter.addDocument(doc);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field2");
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("world"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoDocumentsMerged() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-    iwriter.commit();
-
-    doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
-    iwriter.addDocument(doc);
-    iwriter.forceMerge(1);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(2, dv.getValueCount());
-
-    assertEquals(0, dv.nextDoc());
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    assertEquals(1, dv.nextDoc());
-    assertEquals(1, dv.docValueCount());
-    assertEquals(1, dv.nextOrd());
-
-    bytes = dv.lookupOrd(1);
-    assertEquals(newBytesRef("world"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoValues() throws IOException {
-    Directory directory = newDirectory();
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
-    iwriter.addDocument(doc);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(2, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-    assertEquals(1, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    bytes = dv.lookupOrd(1);
-    assertEquals(newBytesRef("world"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoValuesUnordered() throws IOException {
-    Directory directory = newDirectory();
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(2, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-    assertEquals(1, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    bytes = dv.lookupOrd(1);
-    assertEquals(newBytesRef("world"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetThreeValuesTwoDocs() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
-    iwriter.addDocument(doc);
-    iwriter.commit();
-
-    doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("beer")));
-    iwriter.addDocument(doc);
-    iwriter.forceMerge(1);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(3, dv.getValueCount());
-
-    assertEquals(0, dv.nextDoc());
-    assertEquals(2, dv.docValueCount());
-    assertEquals(1, dv.nextOrd());
-    assertEquals(2, dv.nextOrd());
-
-    assertEquals(1, dv.nextDoc());
-    assertEquals(2, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-    assertEquals(1, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("beer"), bytes);
-
-    bytes = dv.lookupOrd(1);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    bytes = dv.lookupOrd(2);
-    assertEquals(newBytesRef("world"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoDocumentsLastMissing() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-
-    doc = new Document();
-    iwriter.addDocument(doc);
-    iwriter.forceMerge(1);
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(1, dv.getValueCount());
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoDocumentsLastMissingMerge() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-    iwriter.commit();
-
-    doc = new Document();
-    iwriter.addDocument(doc);
-    iwriter.forceMerge(1);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(1, dv.getValueCount());
-    assertEquals(0, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoDocumentsFirstMissing() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    iwriter.addDocument(doc);
-
-    doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-
-    iwriter.forceMerge(1);
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(1, dv.getValueCount());
-    assertEquals(1, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetTwoDocumentsFirstMissingMerge() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    iwriter.addDocument(doc);
-    iwriter.commit();
-
-    doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    iwriter.addDocument(doc);
-    iwriter.forceMerge(1);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(1, dv.getValueCount());
-    assertEquals(1, dv.nextDoc());
-
-    assertEquals(1, dv.docValueCount());
-    assertEquals(0, dv.nextOrd());
-
-    BytesRef bytes = dv.lookupOrd(0);
-    assertEquals(newBytesRef("hello"), bytes);
-
-    ireader.close();
-    directory.close();
-  }
-
-  public void testSortedSetMergeAwayAllValues() throws IOException {
+  public void testSortedSetMergeAwayAllValuesWithSkipper() throws IOException {
     Directory directory = newDirectory();
     Analyzer analyzer = new MockAnalyzer(random());
     IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
@@ -2234,7 +126,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
     iwriter.addDocument(doc);
     doc = new Document();
     doc.add(new StringField("id", "1", Field.Store.NO));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    doc.add(SortedSetDocValuesField.indexedField("field", newBytesRef("hello")));
     iwriter.addDocument(doc);
     iwriter.commit();
     iwriter.deleteDocuments(new Term("id", "1"));
@@ -2246,6 +138,11 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
     SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
     assertEquals(0, dv.getValueCount());
+    DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field");
+    assertEquals(0, skipper.docCount());
+    skipper.advance(0);
+    assertEquals(NO_MORE_DOCS, skipper.minDocID(0));
+
     TermsEnum termsEnum = dv.termsEnum();
     assertFalse(termsEnum.seekExact(new BytesRef("lucene")));
     assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene")));
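The added assertions above are the skipper-specific twist on the old testSortedSetMergeAwayAllValues: SortedSetDocValuesField.indexedField writes a doc-values skip index alongside the values, and once the only document carrying the field is deleted and merged away, the skipper must describe an empty field. A restated sketch of that check, using only methods exercised in this hunk (hedged: getDocValuesSkipper can return null for fields indexed without a skip index; `leafReader` is assumed):

    DocValuesSkipper skipper = leafReader.getDocValuesSkipper("field");
    if (skipper != null) {
      assertEquals(0, skipper.docCount());             // no document has a value
      skipper.advance(0);                              // seek the first block at or after doc 0
      assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); // level 0 is already exhausted
    }
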
@@ -2255,951 +152,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
     directory.close();
   }
 
-  public void testSortedSetTermsEnum() throws IOException {
-    Directory directory = newDirectory();
-    Analyzer analyzer = new MockAnalyzer(random());
-    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
-    iwconfig.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
-
-    Document doc = new Document();
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
-    doc.add(new SortedSetDocValuesField("field", newBytesRef("beer")));
-    iwriter.addDocument(doc);
-
-    DirectoryReader ireader = iwriter.getReader();
-    iwriter.close();
-
-    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
-    assertEquals(3, dv.getValueCount());
-
-    TermsEnum termsEnum = dv.termsEnum();
-
-    // next()
-    assertEquals("beer", termsEnum.next().utf8ToString());
-    assertEquals(0, termsEnum.ord());
-    assertEquals("hello", termsEnum.next().utf8ToString());
-    assertEquals(1, termsEnum.ord());
-    assertEquals("world", termsEnum.next().utf8ToString());
-    assertEquals(2, termsEnum.ord());
-
-    // seekCeil()
-    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(newBytesRef("ha!")));
-    assertEquals("hello", termsEnum.term().utf8ToString());
-    assertEquals(1, termsEnum.ord());
-    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(newBytesRef("beer")));
-    assertEquals("beer", termsEnum.term().utf8ToString());
-    assertEquals(0, termsEnum.ord());
-    assertEquals(SeekStatus.END, termsEnum.seekCeil(newBytesRef("zzz")));
-
-    // seekExact()
-    assertTrue(termsEnum.seekExact(newBytesRef("beer")));
-    assertEquals("beer", termsEnum.term().utf8ToString());
-    assertEquals(0, termsEnum.ord());
-    assertTrue(termsEnum.seekExact(newBytesRef("hello")));
-    assertEquals("hello", termsEnum.term().utf8ToString());
-    assertEquals(1, termsEnum.ord());
-    assertTrue(termsEnum.seekExact(newBytesRef("world")));
-    assertEquals("world", termsEnum.term().utf8ToString());
-    assertEquals(2, termsEnum.ord());
-    assertFalse(termsEnum.seekExact(newBytesRef("bogus")));
-
-    // seek(ord)
-    termsEnum.seekExact(0);
-    assertEquals("beer", termsEnum.term().utf8ToString());
-    assertEquals(0, termsEnum.ord());
-    termsEnum.seekExact(1);
-    assertEquals("hello", termsEnum.term().utf8ToString());
-    assertEquals(1, termsEnum.ord());
-    termsEnum.seekExact(2);
-    assertEquals("world", termsEnum.term().utf8ToString());
-    assertEquals(2, termsEnum.ord());
-
-    // NORMAL automaton
-    termsEnum =
-        dv.intersect(
-            new CompiledAutomaton(
-                Operations.determinize(
-                    new RegExp(".*l.*").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)));
-    assertEquals("hello", termsEnum.next().utf8ToString());
-    assertEquals(1, termsEnum.ord());
-    assertEquals("world", termsEnum.next().utf8ToString());
-    assertEquals(2, termsEnum.ord());
-    assertNull(termsEnum.next());
-
-    // SINGLE automaton
-    termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
-    assertEquals("hello", termsEnum.next().utf8ToString());
-    assertEquals(1, termsEnum.ord());
-    assertNull(termsEnum.next());
-
-    ireader.close();
-    directory.close();
-  }
-
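testSortedSetTermsEnum above demonstrates that the ord dictionary is navigable like an inverted-index term dictionary: next() walks it in ord order, seekCeil and seekExact jump by term, seekExact(long) jumps by ord, and intersect restricts iteration to terms accepted by a compiled automaton. A minimal sketch of the two seek styles (hedged: `dv` is an assumed SortedSetDocValues whose dictionary contains the term "hello"):

    TermsEnum te = dv.termsEnum();
    if (te.seekExact(newBytesRef("hello"))) {
      long ord = te.ord();               // ord of "hello" in this segment's dictionary
      BytesRef back = dv.lookupOrd(ord); // and back to the term again
    }
    // Full scan: a fresh enum walks all terms in unsigned-byte (== ord) order.
    TermsEnum all = dv.termsEnum();
    for (BytesRef term = all.next(); term != null; term = all.next()) {
      long ord = all.ord(); // increases by one per step
    }
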
-  protected void compareStoredFieldWithSortedSetDV(
-      DirectoryReader directoryReader, String storedField, String dvField) throws IOException {
-    for (LeafReaderContext leaf : directoryReader.leaves()) {
-      LeafReader reader = leaf.reader();
-      StoredFields storedFields = reader.storedFields();
-      SortedSetDocValues docValues = reader.getSortedSetDocValues(dvField);
-      if (docValues == null) {
-        // no stored values at all
-        for (int doc = 0; doc < reader.maxDoc(); doc++) {
-          assertArrayEquals(new String[0], storedFields.document(doc).getValues(storedField));
-        }
-        continue;
-      }
-      // sequentially
-      for (int doc = 0; doc < reader.maxDoc(); doc++) {
-        String[] storedValues = storedFields.document(doc).getValues(storedField);
-        if (storedValues.length == 0) {
-          assertFalse(docValues.advanceExact(doc));
-          continue;
-        }
-        switch (random().nextInt(3)) {
-          case 0 -> assertEquals(doc, docValues.nextDoc());
-          case 1 -> assertEquals(doc, docValues.advance(doc));
-          default -> assertTrue(docValues.advanceExact(doc));
-        }
-        assertEquals(doc, docValues.docID());
-        assertEquals(storedValues.length, docValues.docValueCount());
-        int repeats = 1 + random().nextInt(3);
-        for (int r = 0; r < repeats; r++) {
-          if (r > 0 || random().nextBoolean()) {
-            assertTrue(docValues.advanceExact(doc));
-          }
-          for (int v = 0; v < docValues.docValueCount(); v++) {
-            long ord = docValues.nextOrd();
-            assertEquals(storedValues[v], docValues.lookupOrd(ord).utf8ToString());
-          }
-        }
-      }
-      // jump with advanceExact
-      int iters = 1 + random().nextInt(3);
-      for (int i = 0; i < iters; i++) {
-        docValues = reader.getSortedSetDocValues(dvField);
-        for (int doc = random().nextInt(leaf.reader().maxDoc()); doc < reader.maxDoc(); doc++) {
-          String[] storedValues = storedFields.document(doc).getValues(storedField);
-          if (docValues.advanceExact(doc)) {
-            assertEquals(doc, docValues.docID());
-            assertEquals(storedValues.length, docValues.docValueCount());
-            int repeats = 1 + random().nextInt(3);
-            for (int r = 0; r < repeats; r++) {
-              if (r > 0 || random().nextBoolean()) {
-                assertTrue(docValues.advanceExact(doc));
-              }
-              for (int v = 0; v < docValues.docValueCount(); v++) {
-                long ord = docValues.nextOrd();
-                assertEquals(storedValues[v], docValues.lookupOrd(ord).utf8ToString());
-              }
-            }
-          } else {
-            assertArrayEquals(new String[0], storedValues);
-          }
-          doc += random().nextInt(5); // skip some docs
-        }
-      }
-      // jump with advance
-      for (int i = 0; i < iters; i++) {
-        docValues = reader.getSortedSetDocValues(dvField);
-        int doc = random().nextInt(leaf.reader().maxDoc());
-        while (doc != NO_MORE_DOCS) {
-          int nextDoc = docValues.advance(doc);
-          // no stored fields in between
-          for (int d = doc; d < (nextDoc == NO_MORE_DOCS ? reader.maxDoc() : nextDoc); d++) {
-            String[] storedValues = storedFields.document(d).getValues(storedField);
-            assertArrayEquals(new String[0], storedValues);
-          }
-          doc = nextDoc;
-          if (doc != NO_MORE_DOCS) {
-            int repeats = 1 + random().nextInt(3);
-            String[] storedValues = storedFields.document(doc).getValues(storedField);
-            for (int r = 0; r < repeats; r++) {
-              if (r > 0 || random().nextBoolean()) {
-                assertTrue(docValues.advanceExact(doc));
-              }
-              for (int v = 0; v < docValues.docValueCount(); v++) {
-                long ord = docValues.nextOrd();
-                assertEquals(storedValues[v], docValues.lookupOrd(ord).utf8ToString());
-              }
-            }
-            doc = nextDoc + 1;
-            doc += random().nextInt(5); // skip some docs
-          }
-        }
-      }
-    }
-  }
-
-  protected void doTestSortedSetVsStoredFields(
-      int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues)
-      throws Exception {
-    Directory dir = newFSDirectory(createTempDir("dvduel"));
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-
-    Set<String> valueSet = new HashSet<>();
-    for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) {
-      final int length = TestUtil.nextInt(random(), minLength, maxLength);
-      valueSet.add(TestUtil.randomSimpleString(random(), length));
-    }
-    String[] uniqueValues = valueSet.toArray(new String[0]);
-
-    // index some docs
-    if (VERBOSE) {
-      System.out.println("\nTEST: now add numDocs=" + numDocs);
-    }
-    for (int i = 0; i < numDocs; i++) {
-      Document doc = new Document();
-      Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
-      doc.add(idField);
-      int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc);
-      // create a random set of strings
-      Set<String> values = new TreeSet<>();
-      for (int v = 0; v < numValues; v++) {
-        values.add(RandomPicks.randomFrom(random(), uniqueValues));
-      }
-
-      // add ordered to the stored field
-      for (String v : values) {
-        doc.add(new StoredField("stored", v));
-      }
-
-      // add in any order to the dv field
-      ArrayList<String> unordered = new ArrayList<>(values);
-      Collections.shuffle(unordered, random());
-      for (String v : unordered) {
-        doc.add(new SortedSetDocValuesField("dv", newBytesRef(v)));
-      }
-
-      writer.addDocument(doc);
-      if (random().nextInt(31) == 0) {
-        writer.commit();
-      }
-    }
-    // delete some docs
-    int numDeletions = random().nextInt(numDocs / 10);
-    for (int i = 0; i < numDeletions; i++) {
-      int id = random().nextInt(numDocs);
-      writer.deleteDocuments(new Term("id", Integer.toString(id)));
-    }
-
-    try (DirectoryReader reader = writer.getReader()) {
-      TestUtil.checkReader(reader);
-      compareStoredFieldWithSortedSetDV(reader, "stored", "dv");
-    }
-    writer.forceMerge(1);
-    try (DirectoryReader reader = writer.getReader()) {
-      TestUtil.checkReader(reader);
-      compareStoredFieldWithSortedSetDV(reader, "stored", "dv");
-    }
-    IOUtils.close(writer, dir);
-  }
-
-  public void testSortedSetFixedLengthVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      int fixedLength = TestUtil.nextInt(random(), 1, 10);
-      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100);
-    }
-  }
-
-  public void testSortedNumericsSingleValuedVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedNumericsVsStoredFields(() -> 1, random()::nextLong);
-    }
-  }
-
-  public void testSortedNumericsSingleValuedMissingVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedNumericsVsStoredFields(() -> random().nextBoolean() ? 0 : 1, random()::nextLong);
-    }
-  }
-
-  public void testSortedNumericsMultipleValuesVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedNumericsVsStoredFields(
-          () -> TestUtil.nextLong(random(), 0, 50), random()::nextLong);
-    }
-  }
-
-  public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception {
-    final long[] values = new long[TestUtil.nextInt(random(), 2, 6)];
-    for (int i = 0; i < values.length; ++i) {
-      values[i] = random().nextLong();
-    }
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedNumericsVsStoredFields(
-          () -> TestUtil.nextLong(random(), 0, 6), () -> values[random().nextInt(values.length)]);
-    }
-  }
-
-  public void testSortedSetVariableLengthVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100);
-    }
-  }
-
-  public void testSortedSetFixedLengthSingleValuedVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      int fixedLength = TestUtil.nextInt(random(), 1, 10);
-      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100);
-    }
-  }
-
-  public void testSortedSetVariableLengthSingleValuedVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100);
-    }
-  }
-
-  public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6);
-    }
-  }
-
-  public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6);
-    }
-  }
-
-  public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000);
-    }
-  }
-
-  public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000);
-    }
-  }
-
-  public void testGCDCompression() throws Exception {
-    doTestGCDCompression(1);
-  }
-
-  public void testSparseGCDCompression() throws Exception {
-    doTestGCDCompression(random().nextDouble());
-  }
-
-  private void doTestGCDCompression(double density) throws Exception {
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      final long min = -(((long) random().nextInt(1 << 30)) << 32);
-      final long mul = random().nextInt() & 0xFFFFFFFFL;
-      final LongSupplier longs = () -> min + mul * random().nextInt(1 << 20);
-      doTestNumericsVsStoredFields(density, longs);
-    }
-  }
-
-  public void testZeros() throws Exception {
-    doTestNumericsVsStoredFields(1, () -> 0);
-  }
-
-  public void testSparseZeros() throws Exception {
-    doTestNumericsVsStoredFields(random().nextDouble(), () -> 0);
-  }
-
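doTestGCDCompression above manufactures values of the form min + mul * k precisely so that a codec's GCD compression path can trigger: when all values share a common divisor, storing the quotient (value - min) / gcd needs far fewer bits than the raw longs. A back-of-the-envelope sketch of the encode/decode identity these duels rely on (hedged: illustrative arithmetic only, not the codec's actual implementation), plus the overflow edge that the following testZeroOrMin targets:

    long min = -(1L << 40), gcd = 4096;
    long value = min + 12_345 * gcd;
    long encoded = (value - min) / gcd; // small quotient: 12_345
    long decoded = min + encoded * gcd; // == value again

    // Edge case: gcd(0, Long.MIN_VALUE) is 2^63, which does not fit in a long,
    // so a naive |x| stays negative and formats must anticipate it:
    assert Math.abs(Long.MIN_VALUE) == Long.MIN_VALUE;
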
-  public void testZeroOrMin() throws Exception {
-    // try to make GCD compression fail if the format did not anticipate that
-    // the GCD of 0 and MIN_VALUE is negative
-    int numIterations = atLeast(1);
-    for (int i = 0; i < numIterations; i++) {
-      final LongSupplier longs = () -> random().nextBoolean() ? 0 : Long.MIN_VALUE;
-      doTestNumericsVsStoredFields(1, longs);
-    }
-  }
-
-  public void testTwoNumbersOneMissing() throws IOException {
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(null);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new StringField("id", "0", Field.Store.YES));
-    doc.add(new NumericDocValuesField("dv1", 0));
-    iw.addDocument(doc);
-    doc = new Document();
-    doc.add(new StringField("id", "1", Field.Store.YES));
-    iw.addDocument(doc);
-    iw.forceMerge(1);
-    iw.close();
-
-    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
-    assertEquals(1, ir.leaves().size());
-    LeafReader ar = ir.leaves().get(0).reader();
-    NumericDocValues dv = ar.getNumericDocValues("dv1");
-    assertEquals(0, dv.nextDoc());
-    assertEquals(0, dv.longValue());
-    assertEquals(NO_MORE_DOCS, dv.nextDoc());
-    ir.close();
-    directory.close();
-  }
-
-  public void testTwoNumbersOneMissingWithMerging() throws IOException {
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(null);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new StringField("id", "0", Field.Store.YES));
-    doc.add(new NumericDocValuesField("dv1", 0));
-    iw.addDocument(doc);
-    iw.commit();
-    doc = new Document();
-    doc.add(new StringField("id", "1", Field.Store.YES));
-    iw.addDocument(doc);
-    iw.forceMerge(1);
-    iw.close();
-
-    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
-    assertEquals(1, ir.leaves().size());
-    LeafReader ar = ir.leaves().get(0).reader();
-    NumericDocValues dv = ar.getNumericDocValues("dv1");
-    assertEquals(0, dv.nextDoc());
-    assertEquals(0, dv.longValue());
-    assertEquals(NO_MORE_DOCS, dv.nextDoc());
-    ir.close();
-    directory.close();
-  }
-
-  public void testThreeNumbersOneMissingWithMerging() throws IOException {
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(null);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new StringField("id", "0", Field.Store.YES));
-    doc.add(new NumericDocValuesField("dv1", 0));
-    iw.addDocument(doc);
-    doc = new Document();
-    doc.add(new StringField("id", "1", Field.Store.YES));
-    iw.addDocument(doc);
-    iw.commit();
-    doc = new Document();
-    doc.add(new StringField("id", "2", Field.Store.YES));
-    doc.add(new NumericDocValuesField("dv1", 5));
-    iw.addDocument(doc);
-    iw.forceMerge(1);
-    iw.close();
-
-    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
-    assertEquals(1, ir.leaves().size());
-    LeafReader ar = ir.leaves().get(0).reader();
-    NumericDocValues dv = ar.getNumericDocValues("dv1");
-    assertEquals(0, dv.nextDoc());
-    assertEquals(0, dv.longValue());
-    assertEquals(2, dv.nextDoc());
-    assertEquals(5, dv.longValue());
-    ir.close();
-    directory.close();
-  }
-
-  public void testTwoBytesOneMissing() throws IOException {
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(null);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new StringField("id", "0", Field.Store.YES));
-    doc.add(new BinaryDocValuesField("dv1", newBytesRef()));
-    iw.addDocument(doc);
-    doc = new Document();
-    doc.add(new StringField("id", "1", Field.Store.YES));
-    iw.addDocument(doc);
-    iw.forceMerge(1);
-    iw.close();
-
-    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
-    assertEquals(1, ir.leaves().size());
-    LeafReader ar = ir.leaves().get(0).reader();
-    BinaryDocValues dv = ar.getBinaryDocValues("dv1");
-    assertEquals(0, dv.nextDoc());
-    assertEquals(newBytesRef(), dv.binaryValue());
-    assertEquals(NO_MORE_DOCS, dv.nextDoc());
-    ir.close();
-    directory.close();
-  }
-
-  public void testTwoBytesOneMissingWithMerging() throws IOException {
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(null);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new StringField("id", "0", Field.Store.YES));
-    doc.add(new BinaryDocValuesField("dv1", newBytesRef()));
-    iw.addDocument(doc);
-    iw.commit();
-    doc = new Document();
-    doc.add(new StringField("id", "1", Field.Store.YES));
-    iw.addDocument(doc);
-    iw.forceMerge(1);
-    iw.close();
-
-    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
-    assertEquals(1, ir.leaves().size());
-    LeafReader ar = ir.leaves().get(0).reader();
-    BinaryDocValues dv = ar.getBinaryDocValues("dv1");
-    assertEquals(0, dv.nextDoc());
-    assertEquals(newBytesRef(), dv.binaryValue());
-    assertEquals(NO_MORE_DOCS, dv.nextDoc());
-    ir.close();
-    directory.close();
-  }
-
-  public void testThreeBytesOneMissingWithMerging() throws IOException {
-    Directory directory = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(null);
-    conf.setMergePolicy(newLogMergePolicy());
-    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
-    Document doc = new Document();
-    doc.add(new StringField("id", "0", Field.Store.YES));
-    doc.add(new BinaryDocValuesField("dv1", newBytesRef()));
-    iw.addDocument(doc);
-    doc = new Document();
-    doc.add(new StringField("id", "1", Field.Store.YES));
-    iw.addDocument(doc);
-    iw.commit();
-    doc = new Document();
-    doc.add(new StringField("id", "2", Field.Store.YES));
-    doc.add(new BinaryDocValuesField("dv1", newBytesRef("boo")));
-    iw.addDocument(doc);
-    iw.forceMerge(1);
-    iw.close();
-
-    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
-    assertEquals(1, ir.leaves().size());
-    LeafReader ar = ir.leaves().get(0).reader();
-    BinaryDocValues dv = ar.getBinaryDocValues("dv1");
-    assertEquals(0, dv.nextDoc());
-    assertEquals(newBytesRef(), dv.binaryValue());
-    assertEquals(2, dv.nextDoc());
-    assertEquals(newBytesRef("boo"), dv.binaryValue());
-    assertEquals(NO_MORE_DOCS, dv.nextDoc());
-    ir.close();
-    directory.close();
-  }
-
-  /** Tests dv against stored fields with threads (binary/numeric/sorted, no missing) */
-  public void testThreads() throws Exception {
-    Directory dir = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-    Document doc = new Document();
-    Field idField = new StringField("id", "", Field.Store.NO);
-    Field storedBinField = new StoredField("storedBin", new byte[0]);
-    Field dvBinField = new BinaryDocValuesField("dvBin", newBytesRef());
-    Field dvSortedField = new SortedDocValuesField("dvSorted", newBytesRef());
-    Field storedNumericField = new StoredField("storedNum", "");
-    Field dvNumericField = new NumericDocValuesField("dvNum", 0);
-    doc.add(idField);
-    doc.add(storedBinField);
-    doc.add(dvBinField);
-    doc.add(dvSortedField);
-    doc.add(storedNumericField);
-    doc.add(dvNumericField);
-
-    // index some docs
-    int numDocs = atLeast(300);
-    for (int i = 0; i < numDocs; i++) {
-      idField.setStringValue(Integer.toString(i));
-      int length = TestUtil.nextInt(random(), 0, 8);
-      byte[] buffer = new byte[length];
-      random().nextBytes(buffer);
-      storedBinField.setBytesValue(buffer);
-      dvBinField.setBytesValue(buffer);
-      dvSortedField.setBytesValue(buffer);
-      long numericValue = random().nextLong();
-      storedNumericField.setStringValue(Long.toString(numericValue));
-      dvNumericField.setLongValue(numericValue);
-      writer.addDocument(doc);
-      if (random().nextInt(31) == 0) {
-        writer.commit();
-      }
-    }
-
-    // delete some docs
-    int numDeletions = random().nextInt(numDocs / 10);
-    for (int i = 0; i < numDeletions; i++) {
-      int id = random().nextInt(numDocs);
-      writer.deleteDocuments(new Term("id", Integer.toString(id)));
-    }
-    writer.close();
-
-    // compare
-    final DirectoryReader ir = maybeWrapWithMergingReader(DirectoryReader.open(dir));
-    int numThreads = TestUtil.nextInt(random(), 2, 7);
-    Thread[] threads = new Thread[numThreads];
-    final CountDownLatch startingGun = new CountDownLatch(1);
-
-    for (int i = 0; i < threads.length; i++) {
-      threads[i] =
-          new Thread() {
-            @Override
-            public void run() {
-              try {
-                startingGun.await();
-                for (LeafReaderContext context : ir.leaves()) {
-                  LeafReader r = context.reader();
-                  StoredFields storedFields = r.storedFields();
-                  BinaryDocValues binaries = r.getBinaryDocValues("dvBin");
-                  SortedDocValues sorted = r.getSortedDocValues("dvSorted");
-                  NumericDocValues numerics = r.getNumericDocValues("dvNum");
-                  for (int j = 0; j < r.maxDoc(); j++) {
-                    BytesRef binaryValue = storedFields.document(j).getBinaryValue("storedBin");
-                    assertEquals(j, binaries.nextDoc());
-                    BytesRef scratch = binaries.binaryValue();
-                    assertEquals(binaryValue, scratch);
-                    assertEquals(j, sorted.nextDoc());
-                    scratch = sorted.lookupOrd(sorted.ordValue());
-                    assertEquals(binaryValue, scratch);
-                    String expected = storedFields.document(j).get("storedNum");
-                    assertEquals(j, numerics.nextDoc());
-                    assertEquals(Long.parseLong(expected), numerics.longValue());
-                  }
-                }
-                TestUtil.checkReader(ir);
-              } catch (Exception e) {
-                throw new RuntimeException(e);
-              }
-            }
-          };
-      threads[i].start();
-    }
-    startingGun.countDown();
-    for (Thread t : threads) {
-      t.join();
-    }
-    ir.close();
-    dir.close();
-  }
-
-  /** Tests dv against stored fields with threads (all types + missing) */
-  @Nightly
-  public void testThreads2() throws Exception {
-    Directory dir = newDirectory();
-    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-    Field idField = new StringField("id", "", Field.Store.NO);
-    Field storedBinField = new StoredField("storedBin", new byte[0]);
-    Field dvBinField = new BinaryDocValuesField("dvBin", newBytesRef());
-    Field dvSortedField = new SortedDocValuesField("dvSorted", newBytesRef());
-    Field storedNumericField = new StoredField("storedNum", "");
-    Field dvNumericField = new NumericDocValuesField("dvNum", 0);
-
-    // index some docs
-    int numDocs = TestUtil.nextInt(random(), 1025, 2047);
-    for (int i = 0; i < numDocs; i++) {
i++) { - idField.setStringValue(Integer.toString(i)); - int length = TestUtil.nextInt(random(), 0, 8); - byte[] buffer = new byte[length]; - random().nextBytes(buffer); - storedBinField.setBytesValue(buffer); - dvBinField.setBytesValue(buffer); - dvSortedField.setBytesValue(buffer); - long numericValue = random().nextLong(); - storedNumericField.setStringValue(Long.toString(numericValue)); - dvNumericField.setLongValue(numericValue); - Document doc = new Document(); - doc.add(idField); - if (random().nextInt(4) > 0) { - doc.add(storedBinField); - doc.add(dvBinField); - doc.add(dvSortedField); - } - if (random().nextInt(4) > 0) { - doc.add(storedNumericField); - doc.add(dvNumericField); - } - int numSortedSetFields = random().nextInt(3); - Set<String> values = new TreeSet<>(); - for (int j = 0; j < numSortedSetFields; j++) { - values.add(TestUtil.randomSimpleString(random())); - } - for (String v : values) { - doc.add(new SortedSetDocValuesField("dvSortedSet", newBytesRef(v))); - doc.add(new StoredField("storedSortedSet", v)); - } - int numSortedNumericFields = random().nextInt(3); - Set<Long> numValues = new TreeSet<>(); - for (int j = 0; j < numSortedNumericFields; j++) { - numValues.add(TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE)); - } - for (Long l : numValues) { - doc.add(new SortedNumericDocValuesField("dvSortedNumeric", l)); - doc.add(new StoredField("storedSortedNumeric", Long.toString(l))); - } - writer.addDocument(doc); - if (random().nextInt(31) == 0) { - writer.commit(); - } - } - - // delete some docs - int numDeletions = random().nextInt(numDocs / 10); - for (int i = 0; i < numDeletions; i++) { - int id = random().nextInt(numDocs); - writer.deleteDocuments(new Term("id", Integer.toString(id))); - } - writer.close(); - - // compare - final DirectoryReader ir = maybeWrapWithMergingReader(DirectoryReader.open(dir)); - int numThreads = TestUtil.nextInt(random(), 2, 7); - Thread[] threads = new Thread[numThreads]; - final CountDownLatch startingGun = new CountDownLatch(1); - - for (int i = 0; i < threads.length; i++) { - threads[i] = - new Thread() { - @Override - public void run() { - try { - startingGun.await(); - for (LeafReaderContext context : ir.leaves()) { - LeafReader r = context.reader(); - StoredFields storedFields = r.storedFields(); - BinaryDocValues binaries = r.getBinaryDocValues("dvBin"); - SortedDocValues sorted = r.getSortedDocValues("dvSorted"); - NumericDocValues numerics = r.getNumericDocValues("dvNum"); - SortedSetDocValues sortedSet = r.getSortedSetDocValues("dvSortedSet"); - SortedNumericDocValues sortedNumeric = - r.getSortedNumericDocValues("dvSortedNumeric"); - for (int j = 0; j < r.maxDoc(); j++) { - BytesRef binaryValue = storedFields.document(j).getBinaryValue("storedBin"); - if (binaryValue != null) { - if (binaries != null) { - assertEquals(j, binaries.nextDoc()); - BytesRef scratch = binaries.binaryValue(); - assertEquals(binaryValue, scratch); - assertEquals(j, sorted.nextDoc()); - scratch = sorted.lookupOrd(sorted.ordValue()); - assertEquals(binaryValue, scratch); - } - } - - String number = storedFields.document(j).get("storedNum"); - if (number != null) { - if (numerics != null) { - assertEquals(j, numerics.advance(j)); - assertEquals(Long.parseLong(number), numerics.longValue()); - } - } - - String[] values = storedFields.document(j).getValues("storedSortedSet"); - if (values.length > 0) { - assertNotNull(sortedSet); - assertEquals(j, sortedSet.nextDoc()); - assertEquals(values.length, sortedSet.docValueCount()); - for (String s : values) { - 
long ord = sortedSet.nextOrd(); - BytesRef value = sortedSet.lookupOrd(ord); - assertEquals(s, value.utf8ToString()); - } - } - - String[] numValues = storedFields.document(j).getValues("storedSortedNumeric"); - if (numValues.length > 0) { - assertNotNull(sortedNumeric); - assertEquals(j, sortedNumeric.nextDoc()); - assertEquals(numValues.length, sortedNumeric.docValueCount()); - for (String numValue : numValues) { - long v = sortedNumeric.nextValue(); - assertEquals(numValue, Long.toString(v)); - } - } - } - } - TestUtil.checkReader(ir); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - }; - threads[i].start(); - } - startingGun.countDown(); - for (Thread t : threads) { - t.join(); - } - ir.close(); - dir.close(); - } - - @Nightly - public void testThreads3() throws Exception { - Directory dir = newFSDirectory(createTempDir()); - IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); - RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); - - int numSortedSets = random().nextInt(21); - int numBinaries = random().nextInt(21); - int numSortedNums = random().nextInt(21); - - int numDocs = TestUtil.nextInt(random(), 2025, 2047); - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - - for (int j = 0; j < numSortedSets; j++) { - doc.add( - new SortedSetDocValuesField( - "ss" + j, newBytesRef(TestUtil.randomSimpleString(random())))); - doc.add( - new SortedSetDocValuesField( - "ss" + j, newBytesRef(TestUtil.randomSimpleString(random())))); - } - - for (int j = 0; j < numBinaries; j++) { - doc.add( - new BinaryDocValuesField("b" + j, newBytesRef(TestUtil.randomSimpleString(random())))); - } - - for (int j = 0; j < numSortedNums; j++) { - doc.add( - new SortedNumericDocValuesField( - "sn" + j, TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE))); - doc.add( - new SortedNumericDocValuesField( - "sn" + j, TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE))); - } - writer.addDocument(doc); - } - writer.close(); - - // now check with threads - for (int i = 0; i < 10; i++) { - final DirectoryReader r = maybeWrapWithMergingReader(DirectoryReader.open(dir)); - final CountDownLatch startingGun = new CountDownLatch(1); - Thread[] threads = new Thread[TestUtil.nextInt(random(), 4, 10)]; - for (int tid = 0; tid < threads.length; tid++) { - threads[tid] = - new Thread() { - @Override - public void run() { - try { - ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); - PrintStream infoStream = new PrintStream(bos, false, UTF_8); - startingGun.await(); - for (LeafReaderContext leaf : r.leaves()) { - DocValuesStatus status = - CheckIndex.testDocValues((CodecReader) leaf.reader(), infoStream, true); - if (status.error != null) { - throw status.error; - } - } - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - }; - } - for (Thread thread : threads) { - thread.start(); - } - startingGun.countDown(); - for (Thread thread : threads) { - thread.join(); - } - r.close(); - } - - dir.close(); - } - - // LUCENE-5218 - public void testEmptyBinaryValueOnPageSizes() throws Exception { - // Test larger and larger power-of-two sized values, - // followed by empty string value: - for (int i = 0; i < 20; i++) { - if (i > 14 && codecAcceptsHugeBinaryValues("field") == false) { - break; - } - Directory dir = newDirectory(); - RandomIndexWriter w = new RandomIndexWriter(random(), dir); - BytesRef bytes = newBytesRef(new byte[1 << i], 0, 1 << i); - for (int j = 0; j < 4; j++) { - Document doc = new Document(); - 
doc.add(new BinaryDocValuesField("field", bytes)); - w.addDocument(doc); - } - Document doc = new Document(); - doc.add(new StoredField("id", "5")); - doc.add(new BinaryDocValuesField("field", newBytesRef())); - w.addDocument(doc); - IndexReader r = w.getReader(); - w.close(); - - BinaryDocValues values = MultiDocValues.getBinaryValues(r, "field"); - for (int j = 0; j < 5; j++) { - assertEquals(j, values.nextDoc()); - BytesRef result = values.binaryValue(); - assertTrue(result.length == 0 || result.length == 1 << i); - } - r.close(); - dir.close(); - } - } - - public void testOneSortedNumber() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", 5)); - writer.addDocument(doc); - writer.close(); - - // Now search the index: - IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory)); - assert reader.leaves().size() == 1; - SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(1, dv.docValueCount()); - assertEquals(5, dv.nextValue()); - - reader.close(); - directory.close(); - } - - public void testOneSortedNumberOneMissing() throws IOException { - Directory directory = newDirectory(); - IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(null)); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", 5)); - writer.addDocument(doc); - writer.addDocument(new Document()); - writer.close(); - - // Now search the index: - IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory)); - assert reader.leaves().size() == 1; - SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(1, dv.docValueCount()); - assertEquals(5, dv.nextValue()); - assertEquals(NO_MORE_DOCS, dv.nextDoc()); - - reader.close(); - directory.close(); - } - - public void testNumberMergeAwayAllValues() throws IOException { + public void testNumberMergeAwayAllValuesWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -3211,7 +164,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new NumericDocValuesField("field", 5)); + doc.add(NumericDocValuesField.indexedField("field", 5)); iwriter.addDocument(doc); iwriter.commit(); iwriter.deleteDocuments(new Term("id", "1")); @@ -3223,109 +176,16 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes NumericDocValues dv = getOnlyLeafReader(ireader).getNumericDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); + ireader.close(); directory.close(); } - public void testTwoSortedNumber() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", 11)); - doc.add(new SortedNumericDocValuesField("dv", -5)); - 
writer.addDocument(doc); - writer.close(); - - // Now search the index: - IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory)); - assert reader.leaves().size() == 1; - SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(2, dv.docValueCount()); - assertEquals(-5, dv.nextValue()); - assertEquals(11, dv.nextValue()); - - reader.close(); - directory.close(); - } - - public void testTwoSortedNumberSameValue() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", 11)); - doc.add(new SortedNumericDocValuesField("dv", 11)); - writer.addDocument(doc); - writer.close(); - - // Now search the index: - IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory)); - assert reader.leaves().size() == 1; - SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(2, dv.docValueCount()); - assertEquals(11, dv.nextValue()); - assertEquals(11, dv.nextValue()); - - reader.close(); - directory.close(); - } - - public void testTwoSortedNumberOneMissing() throws IOException { - Directory directory = newDirectory(); - IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(null)); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", 11)); - doc.add(new SortedNumericDocValuesField("dv", -5)); - writer.addDocument(doc); - writer.addDocument(new Document()); - writer.close(); - - // Now search the index: - IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory)); - assert reader.leaves().size() == 1; - SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(2, dv.docValueCount()); - assertEquals(-5, dv.nextValue()); - assertEquals(11, dv.nextValue()); - assertEquals(NO_MORE_DOCS, dv.nextDoc()); - - reader.close(); - directory.close(); - } - - public void testSortedNumberMerge() throws IOException { - Directory directory = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(null); - iwc.setMergePolicy(newLogMergePolicy()); - IndexWriter writer = new IndexWriter(directory, iwc); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", 11)); - writer.addDocument(doc); - writer.commit(); - doc = new Document(); - doc.add(new SortedNumericDocValuesField("dv", -5)); - writer.addDocument(doc); - writer.forceMerge(1); - writer.close(); - - // Now search the index: - IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory)); - assert reader.leaves().size() == 1; - SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv"); - assertEquals(0, dv.nextDoc()); - assertEquals(1, dv.docValueCount()); - assertEquals(11, dv.nextValue()); - assertEquals(1, dv.nextDoc()); - assertEquals(1, dv.docValueCount()); - assertEquals(-5, dv.nextValue()); - - reader.close(); - directory.close(); - } - - public void testSortedNumberMergeAwayAllValues() throws IOException { + public void testSortedNumberMergeAwayAllValuesWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -3337,7 +197,7 @@ public abstract 
class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes iwriter.addDocument(doc); doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new SortedNumericDocValuesField("field", 5)); + doc.add(SortedNumericDocValuesField.indexedField("field", 5)); iwriter.addDocument(doc); iwriter.commit(); iwriter.deleteDocuments(new Term("id", "1")); @@ -3349,148 +209,17 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes SortedNumericDocValues dv = getOnlyLeafReader(ireader).getSortedNumericDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); - ireader.close(); - directory.close(); - } - - public void testSortedEnumAdvanceIndependently() throws IOException { - Directory directory = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random()); - IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); - iwconfig.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); - - Document doc = new Document(); - SortedDocValuesField field = new SortedDocValuesField("field", newBytesRef("2")); - doc.add(field); - iwriter.addDocument(doc); - field.setBytesValue(newBytesRef("1")); - iwriter.addDocument(doc); - field.setBytesValue(newBytesRef("3")); - iwriter.addDocument(doc); - - iwriter.commit(); - iwriter.forceMerge(1); - - DirectoryReader ireader = iwriter.getReader(); - iwriter.close(); - - SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field"); - doTestSortedSetEnumAdvanceIndependently(DocValues.singleton(dv)); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); ireader.close(); directory.close(); } - public void testSortedSetEnumAdvanceIndependently() throws IOException { - Directory directory = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random()); - IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); - iwconfig.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); - - Document doc = new Document(); - SortedSetDocValuesField field1 = new SortedSetDocValuesField("field", newBytesRef("2")); - SortedSetDocValuesField field2 = new SortedSetDocValuesField("field", newBytesRef("3")); - doc.add(field1); - doc.add(field2); - iwriter.addDocument(doc); - field1.setBytesValue(newBytesRef("1")); - iwriter.addDocument(doc); - field2.setBytesValue(newBytesRef("2")); - iwriter.addDocument(doc); - - iwriter.commit(); - iwriter.forceMerge(1); - - DirectoryReader ireader = iwriter.getReader(); - iwriter.close(); - - SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field"); - doTestSortedSetEnumAdvanceIndependently(dv); - - ireader.close(); - directory.close(); - } - - protected void doTestSortedSetEnumAdvanceIndependently(SortedSetDocValues dv) throws IOException { - if (dv.getValueCount() < 2) { - return; - } - List<BytesRef> terms = new ArrayList<>(); - TermsEnum te = dv.termsEnum(); - terms.add(BytesRef.deepCopyOf(te.next())); - terms.add(BytesRef.deepCopyOf(te.next())); - - // Make sure that calls to next() does not modify the term of the other enum - TermsEnum enum1 = dv.termsEnum(); - TermsEnum enum2 = dv.termsEnum(); - BytesRefBuilder term1 = new BytesRefBuilder(); - BytesRefBuilder term2 = new BytesRefBuilder(); - - term1.copyBytes(enum1.next()); - term2.copyBytes(enum2.next()); - 
term1.copyBytes(enum1.next()); - - assertEquals(term1.get(), enum1.term()); - assertEquals(term2.get(), enum2.term()); - - // Same for seekCeil - enum1 = dv.termsEnum(); - enum2 = dv.termsEnum(); - term1 = new BytesRefBuilder(); - term2 = new BytesRefBuilder(); - - term2.copyBytes(enum2.next()); - BytesRefBuilder seekTerm = new BytesRefBuilder(); - seekTerm.append(terms.get(0)); - seekTerm.append((byte) 0); - enum1.seekCeil(seekTerm.get()); - term1.copyBytes(enum1.term()); - - assertEquals(term1.get(), enum1.term()); - assertEquals(term2.get(), enum2.term()); - - // Same for seekCeil on an exact value - enum1 = dv.termsEnum(); - enum2 = dv.termsEnum(); - term1 = new BytesRefBuilder(); - term2 = new BytesRefBuilder(); - - term2.copyBytes(enum2.next()); - enum1.seekCeil(terms.get(1)); - term1.copyBytes(enum1.term()); - - assertEquals(term1.get(), enum1.term()); - assertEquals(term2.get(), enum2.term()); - - // Same for seekExact - enum1 = dv.termsEnum(); - enum2 = dv.termsEnum(); - term1 = new BytesRefBuilder(); - term2 = new BytesRefBuilder(); - - term2.copyBytes(enum2.next()); - final boolean found = enum1.seekExact(terms.get(1)); - assertTrue(found); - term1.copyBytes(enum1.term()); - - // Same for seek by ord - enum1 = dv.termsEnum(); - enum2 = dv.termsEnum(); - term1 = new BytesRefBuilder(); - term2 = new BytesRefBuilder(); - - term2.copyBytes(enum2.next()); - enum1.seekExact(1); - term1.copyBytes(enum1.term()); - - assertEquals(term1.get(), enum1.term()); - assertEquals(term2.get(), enum2.term()); - } - // same as testSortedMergeAwayAllValues but on more than 1024 docs to have sparse encoding on - public void testSortedMergeAwayAllValuesLargeSegment() throws IOException { + public void testSortedMergeAwayAllValuesLargeSegmentWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -3499,7 +228,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes Document doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new SortedDocValuesField("field", newBytesRef("hello"))); + doc.add(SortedDocValuesField.indexedField("field", newBytesRef("hello"))); iwriter.addDocument(doc); final int numEmptyDocs = atLeast(1024); for (int i = 0; i < numEmptyDocs; ++i) { @@ -3515,6 +244,11 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); + TermsEnum termsEnum = dv.termsEnum(); assertFalse(termsEnum.seekExact(new BytesRef("lucene"))); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene"))); @@ -3525,7 +259,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes } // same as testSortedSetMergeAwayAllValues but on more than 1024 docs to have sparse encoding on - public void testSortedSetMergeAwayAllValuesLargeSegment() throws IOException { + public void testSortedSetMergeAwayAllValuesLargeSegmentWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -3534,7 +268,7 @@ public abstract class 
BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes Document doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new SortedSetDocValuesField("field", newBytesRef("hello"))); + doc.add(SortedSetDocValuesField.indexedField("field", newBytesRef("hello"))); iwriter.addDocument(doc); final int numEmptyDocs = atLeast(1024); for (int i = 0; i < numEmptyDocs; ++i) { @@ -3550,6 +284,11 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); + TermsEnum termsEnum = dv.termsEnum(); assertFalse(termsEnum.seekExact(new BytesRef("lucene"))); assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene"))); @@ -3560,7 +299,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes } // same as testNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding on - public void testNumericMergeAwayAllValuesLargeSegment() throws IOException { + public void testNumericMergeAwayAllValuesLargeSegmentWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -3569,7 +308,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes Document doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new NumericDocValuesField("field", 42L)); + doc.add(NumericDocValuesField.indexedField("field", 42L)); iwriter.addDocument(doc); final int numEmptyDocs = atLeast(1024); for (int i = 0; i < numEmptyDocs; ++i) { @@ -3585,13 +324,18 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes NumericDocValues dv = getOnlyLeafReader(ireader).getNumericDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); + ireader.close(); directory.close(); } // same as testSortedNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding // on - public void testSortedNumericMergeAwayAllValuesLargeSegment() throws IOException { + public void testSortedNumericMergeAwayAllValuesLargeSegmentWithSkipper() throws IOException { Directory directory = newDirectory(); Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); @@ -3600,7 +344,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes Document doc = new Document(); doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new SortedNumericDocValuesField("field", 42L)); + doc.add(SortedNumericDocValuesField.indexedField("field", 42L)); iwriter.addDocument(doc); final int numEmptyDocs = atLeast(1024); for (int i = 0; i < numEmptyDocs; ++i) { @@ -3616,195 +360,465 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes SortedNumericDocValues dv = getOnlyLeafReader(ireader).getSortedNumericDocValues("field"); assertEquals(NO_MORE_DOCS, dv.nextDoc()); - ireader.close(); - directory.close(); - } - - // same as 
testBinaryMergeAwayAllValues but on more than 1024 docs to have sparse encoding on - public void testBinaryMergeAwayAllValuesLargeSegment() throws IOException { - Directory directory = newDirectory(); - Analyzer analyzer = new MockAnalyzer(random()); - IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); - iwconfig.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); - - Document doc = new Document(); - doc.add(new StringField("id", "1", Field.Store.NO)); - doc.add(new BinaryDocValuesField("field", newBytesRef("hello"))); - iwriter.addDocument(doc); - final int numEmptyDocs = atLeast(1024); - for (int i = 0; i < numEmptyDocs; ++i) { - iwriter.addDocument(new Document()); - } - iwriter.commit(); - iwriter.deleteDocuments(new Term("id", "1")); - iwriter.forceMerge(1); - - DirectoryReader ireader = iwriter.getReader(); - iwriter.close(); - - BinaryDocValues dv = getOnlyLeafReader(ireader).getBinaryDocValues("field"); - assertEquals(NO_MORE_DOCS, dv.nextDoc()); + DocValuesSkipper skipper = getOnlyLeafReader(ireader).getDocValuesSkipper("field"); + assertEquals(0, skipper.docCount()); + skipper.advance(0); + assertEquals(NO_MORE_DOCS, skipper.minDocID(0)); ireader.close(); directory.close(); } - public void testRandomAdvanceNumeric() throws IOException { - final long longRange; - if (random().nextBoolean()) { - longRange = TestUtil.nextInt(random(), 1, 1024); - } else { - longRange = TestUtil.nextLong(random(), 1, Long.MAX_VALUE); - } - doTestRandomAdvance( - new FieldCreator() { - @Override - public Field next() { - return new NumericDocValuesField("field", TestUtil.nextLong(random(), 0, longRange)); - } - - @Override - public DocIdSetIterator iterator(IndexReader r) throws IOException { - return MultiDocValues.getNumericValues(r, "field"); - } - }); + public void testNumericDocValuesWithSkipperSmall() throws Exception { + doTestNumericDocValuesWithSkipper(random().nextInt(1, 1000)); } - public void testRandomAdvanceBinary() throws IOException { - doTestRandomAdvance( - new FieldCreator() { - @Override - public Field next() { - byte[] bytes = new byte[random().nextInt(10)]; - random().nextBytes(bytes); - return new BinaryDocValuesField("field", newBytesRef(bytes)); - } - - @Override - public DocIdSetIterator iterator(IndexReader r) throws IOException { - return MultiDocValues.getBinaryValues(r, "field"); - } - }); + public void testNumericDocValuesWithSkipperMedium() throws Exception { + doTestNumericDocValuesWithSkipper(random().nextInt(1000, 20000)); } - /** - * Tests where a DVField uses a high number of packed bits to store its ords. 
See: - * https://issues.apache.org/jira/browse/LUCENE-10159 - */ @Nightly - public void testHighOrdsSortedSetDV() throws Exception { - assumeFalse( - "This test with SimpleTextCodec requires a lot of memory", - getCodec() instanceof SimpleTextCodec); - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(); - iwc.setRAMBufferSizeMB(8 + random().nextInt(64)); - IndexWriter writer = new IndexWriter(dir, iwc); - // many docs with some of them have very high ords - int numDocs = 20_000 + random().nextInt(10_000); - for (int i = 1; i < numDocs; i++) { - final int numOrds; - if (random().nextInt(100) <= 5) { - numOrds = 1000 + random().nextInt(500); - } else { - numOrds = random().nextInt(10); - } + public void testNumericDocValuesWithSkipperBig() throws Exception { + doTestNumericDocValuesWithSkipper(random().nextInt(50000, 100000)); + } + + private void doTestNumericDocValuesWithSkipper(int totalDocs) throws Exception { + assertDocValuesWithSkipper( + totalDocs, + new TestDocValueSkipper() { + @Override + public void populateDoc(Document doc) { + doc.add(NumericDocValuesField.indexedField("test", random().nextLong())); + } + + @Override + public DocValuesWrapper docValuesWrapper(LeafReader leafReader) throws IOException { + NumericDocValues numericDocValues = leafReader.getNumericDocValues("test"); + return new DocValuesWrapper() { + + @Override + public int advance(int target) throws IOException { + return numericDocValues.advance(target); + } + + @Override + public boolean advanceExact(int target) throws IOException { + return numericDocValues.advanceExact(target); + } + + @Override + public long maxValue() throws IOException { + return numericDocValues.longValue(); + } + + @Override + public long minValue() throws IOException { + return numericDocValues.longValue(); + } + + @Override + public int docID() { + return numericDocValues.docID(); + } + }; + } + + @Override + public DocValuesSkipper docValuesSkipper(LeafReader leafReader) throws IOException { + return leafReader.getDocValuesSkipper("test"); + } + }); + } + + public void testSortedNumericDocValuesWithSkipperSmall() throws Exception { + doTestSortedNumericDocValuesWithSkipper(random().nextInt(1, 1000)); + } + + public void testSortedNumericDocValuesWithSkipperMedium() throws Exception { + doTestSortedNumericDocValuesWithSkipper(random().nextInt(1000, 20000)); + } + + @Nightly + public void testSortedNumericDocValuesWithSkipperBig() throws Exception { + doTestSortedNumericDocValuesWithSkipper(random().nextInt(50000, 100000)); + } + + private void doTestSortedNumericDocValuesWithSkipper(int totalDocs) throws Exception { + assertDocValuesWithSkipper( + totalDocs, + new TestDocValueSkipper() { + @Override + public void populateDoc(Document doc) { + for (int j = 0; j < random().nextInt(1, 5); j++) { + doc.add(SortedNumericDocValuesField.indexedField("test", random().nextLong())); + } + } + + @Override + public DocValuesWrapper docValuesWrapper(LeafReader leafReader) throws IOException { + SortedNumericDocValues sortedNumericDocValues = + leafReader.getSortedNumericDocValues("test"); + return new DocValuesWrapper() { + long max; + long min; + + @Override + public int advance(int target) throws IOException { + int doc = sortedNumericDocValues.advance(target); + if (doc != NO_MORE_DOCS) { + readValues(); + } + return doc; + } + + @Override + public boolean advanceExact(int target) throws IOException { + if (sortedNumericDocValues.advanceExact(target)) { + readValues(); + return true; + } + return false; + } + + 
private void readValues() throws IOException { + max = Long.MIN_VALUE; + min = Long.MAX_VALUE; + for (int i = 0; i < sortedNumericDocValues.docValueCount(); i++) { + long value = sortedNumericDocValues.nextValue(); + max = Math.max(max, value); + min = Math.min(min, value); + } + } + + @Override + public long maxValue() { + return max; + } + + @Override + public long minValue() { + return min; + } + + @Override + public int docID() { + return sortedNumericDocValues.docID(); + } + }; + } + + @Override + public DocValuesSkipper docValuesSkipper(LeafReader leafReader) throws IOException { + return leafReader.getDocValuesSkipper("test"); + } + }); + } + + public void testSortedDocValuesWithSkipperSmall() throws Exception { + doTestSortedDocValuesWithSkipper(random().nextInt(1, 1000)); + } + + public void testSortedDocValuesWithSkipperMedium() throws Exception { + doTestSortedDocValuesWithSkipper(random().nextInt(1000, 20000)); + } + + @Nightly + public void testSortedDocValuesWithSkipperBig() throws Exception { + doTestSortedDocValuesWithSkipper(random().nextInt(50000, 100000)); + } + + private void doTestSortedDocValuesWithSkipper(int totalDocs) throws Exception { + assertDocValuesWithSkipper( + totalDocs, + new TestDocValueSkipper() { + @Override + public void populateDoc(Document doc) { + doc.add(SortedDocValuesField.indexedField("test", TestUtil.randomBinaryTerm(random()))); + } + + @Override + public DocValuesWrapper docValuesWrapper(LeafReader leafReader) throws IOException { + SortedDocValues sortedDocValues = leafReader.getSortedDocValues("test"); + return new DocValuesWrapper() { + + @Override + public int advance(int target) throws IOException { + return sortedDocValues.advance(target); + } + + @Override + public boolean advanceExact(int target) throws IOException { + return sortedDocValues.advanceExact(target); + } + + @Override + public long maxValue() throws IOException { + return sortedDocValues.ordValue(); + } + + @Override + public long minValue() throws IOException { + return sortedDocValues.ordValue(); + } + + @Override + public int docID() { + return sortedDocValues.docID(); + } + }; + } + + @Override + public DocValuesSkipper docValuesSkipper(LeafReader leafReader) throws IOException { + return leafReader.getDocValuesSkipper("test"); + } + }); + } + + public void testSortedSetDocValuesWithSkipperSmall() throws Exception { + doTestSortedSetDocValuesWithSkipper(random().nextInt(1, 1000)); + } + + public void testSortedSetDocValuesWithSkipperMedium() throws Exception { + doTestSortedSetDocValuesWithSkipper(random().nextInt(10000, 20000)); + } + + @Nightly + public void testSortedSetDocValuesWithSkipperBig() throws Exception { + doTestSortedSetDocValuesWithSkipper(random().nextInt(50000, 100000)); + } + + private void doTestSortedSetDocValuesWithSkipper(int totalDocs) throws Exception { + assertDocValuesWithSkipper( + totalDocs, + new TestDocValueSkipper() { + @Override + public void populateDoc(Document doc) { + for (int j = 0; j < random().nextInt(1, 5); j++) { + doc.add( + SortedSetDocValuesField.indexedField( + "test", TestUtil.randomBinaryTerm(random()))); + } + } + + @Override + public DocValuesWrapper docValuesWrapper(LeafReader leafReader) throws IOException { + SortedSetDocValues sortedSetDocValues = leafReader.getSortedSetDocValues("test"); + return new DocValuesWrapper() { + long max; + long min; + + @Override + public int advance(int target) throws IOException { + int doc = sortedSetDocValues.advance(target); + if (doc != NO_MORE_DOCS) { + readValues(); + } + return 
doc; + } + + @Override + public boolean advanceExact(int target) throws IOException { + if (sortedSetDocValues.advanceExact(target)) { + readValues(); + return true; + } + return false; + } + + private void readValues() throws IOException { + max = Long.MIN_VALUE; + min = Long.MAX_VALUE; + for (int i = 0; i < sortedSetDocValues.docValueCount(); i++) { + long value = sortedSetDocValues.nextOrd(); + max = Math.max(max, value); + min = Math.min(min, value); + } + } + + @Override + public long maxValue() { + return max; + } + + @Override + public long minValue() { + return min; + } + + @Override + public int docID() { + return sortedSetDocValues.docID(); + } + }; + } + + @Override + public DocValuesSkipper docValuesSkipper(LeafReader leafReader) throws IOException { + return leafReader.getDocValuesSkipper("test"); + } + }); + } + + private void assertDocValuesWithSkipper(int totalDocs, TestDocValueSkipper testDocValueSkipper) + throws Exception { + Supplier<Boolean> booleanSupplier; + switch (random().nextInt(3)) { + case 0 -> booleanSupplier = () -> true; + case 1 -> booleanSupplier = () -> random().nextBoolean(); + case 2 -> booleanSupplier = () -> random().nextBoolean() && random().nextBoolean(); + default -> throw new AssertionError(); + } + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + int numDocs = 0; + for (int i = 0; i < totalDocs; i++) { Document doc = new Document(); - for (int ord = 0; ord < numOrds; ord++) { - doc.add( - new SortedSetDocValuesField("sorted_set_dv", TestUtil.randomBinaryTerm(random(), 2))); + if (booleanSupplier.get()) { + testDocValueSkipper.populateDoc(doc); + numDocs++; } writer.addDocument(doc); - } - writer.forceMerge(1, true); - try (DirectoryReader reader = DirectoryReader.open(writer)) { - TestUtil.checkReader(reader); - } - IOUtils.close(writer, dir); - } - - private interface FieldCreator { - Field next(); - - DocIdSetIterator iterator(IndexReader r) throws IOException; - } - - private void doTestRandomAdvance(FieldCreator fieldCreator) throws IOException { - - Analyzer analyzer = new MockAnalyzer(random()); - - Directory directory = newDirectory(); - IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setMergePolicy(newLogMergePolicy()); - RandomIndexWriter w = new RandomIndexWriter(random(), directory, conf); - int numChunks = atLeast(10); - int id = 0; - Set<Integer> missingSet = new HashSet<>(); - for (int i = 0; i < numChunks; i++) { - // change sparseness for each chunk - double sparseChance = random().nextDouble(); - int docCount = atLeast(1000); - for (int j = 0; j < docCount; j++) { - Document doc = new Document(); - doc.add(new StoredField("id", id)); - if (random().nextDouble() > sparseChance) { - doc.add(fieldCreator.next()); - } else { - missingSet.add(id); - } - id++; - w.addDocument(doc); + if (rarely()) { + writer.commit(); } } + writer.flush(); if (random().nextBoolean()) { - w.forceMerge(1); + writer.forceMerge(1); } - // Now search the index: - IndexReader r = w.getReader(); - StoredFields storedFields = r.storedFields(); - BitSet missing = new FixedBitSet(r.maxDoc()); - for (int docID = 0; docID < r.maxDoc(); docID++) { - Document doc = storedFields.document(docID); - if (missingSet.contains(doc.getField("id").numericValue())) { - missing.set(docID); + IndexReader r = writer.getReader(); + int readDocs = 0; + for (LeafReaderContext readerContext : r.leaves()) { + LeafReader reader = readerContext.reader(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); + 
PrintStream infoStream = new PrintStream(bos, false, UTF_8); + DocValuesStatus status = CheckIndex.testDocValues((CodecReader) reader, infoStream, true); + if (status.error != null) { + throw new Exception(status.error); + } + readDocs += + assertDocValuesSkipSequential( + testDocValueSkipper.docValuesWrapper(reader), + testDocValueSkipper.docValuesSkipper(reader)); + for (int i = 0; i < 10; i++) { + assertDocValuesSkipRandom( + testDocValueSkipper.docValuesWrapper(reader), + testDocValueSkipper.docValuesSkipper(reader), + reader.maxDoc()); } } - - int numIters = atLeast(10); - for (int iter = 0; iter < numIters; iter++) { - DocIdSetIterator values = fieldCreator.iterator(r); - assertEquals(-1, values.docID()); - - while (true) { - int docID; - if (random().nextBoolean()) { - docID = values.nextDoc(); - } else { - int range; - if (random().nextInt(10) == 7) { - // big jump - range = r.maxDoc() - values.docID(); - } else { - // small jump - range = 25; - } - int inc = TestUtil.nextInt(random(), 1, range); - docID = values.advance(values.docID() + inc); - } - if (docID == NO_MORE_DOCS) { - break; - } - assertFalse(missing.get(docID)); - } - } - - IOUtils.close(r, w, directory); + assertEquals(numDocs, readDocs); + IOUtils.close(r, writer, directory); } - protected boolean codecAcceptsHugeBinaryValues(String field) { - return true; + private int assertDocValuesSkipSequential(DocValuesWrapper iterator, DocValuesSkipper skipper) + throws IOException { + if (skipper == null) { + return 0; + } + + assertEquals(-1, iterator.docID()); + assertEquals(-1, skipper.minDocID(0)); + assertEquals(-1, skipper.maxDocID(0)); + + iterator.advance(0); + int docCount = 0; + while (true) { + int previousMaxDoc = skipper.maxDocID(0); + skipper.advance(previousMaxDoc + 1); + assertTrue(skipper.minDocID(0) > previousMaxDoc); + if (skipperHasAccurateDocBounds()) { + assertEquals(iterator.docID(), skipper.minDocID(0)); + } else { + assertTrue( + "Expected: " + iterator.docID() + " but got " + skipper.minDocID(0), + skipper.minDocID(0) <= iterator.docID()); + } + + if (skipper.minDocID(0) == NO_MORE_DOCS) { + assertEquals(NO_MORE_DOCS, skipper.maxDocID(0)); + break; + } + assertTrue(skipper.docCount(0) > 0); + + int maxDoc = -1; + long minVal = Long.MAX_VALUE; + long maxVal = Long.MIN_VALUE; + for (int i = 0; i < skipper.docCount(0); ++i) { + assertNotEquals(NO_MORE_DOCS, iterator.docID()); + maxDoc = Math.max(maxDoc, iterator.docID()); + minVal = Math.min(minVal, iterator.minValue()); + maxVal = Math.max(maxVal, iterator.maxValue()); + iterator.advance(iterator.docID() + 1); + } + if (skipperHasAccurateDocBounds()) { + assertEquals(maxDoc, skipper.maxDocID(0)); + } else { + assertTrue( + "Expected: " + maxDoc + " but got " + skipper.maxDocID(0), + skipper.maxDocID(0) >= maxDoc); + } + if (skipperHasAccurateValueBounds()) { + assertEquals(minVal, skipper.minValue(0)); + assertEquals(maxVal, skipper.maxValue(0)); + } else { + assertTrue( + "Expected: " + minVal + " but got " + skipper.minValue(0), + minVal >= skipper.minValue(0)); + assertTrue( + "Expected: " + maxVal + " but got " + skipper.maxValue(0), + maxVal <= skipper.maxValue(0)); + } + docCount += skipper.docCount(0); + } + + assertEquals(docCount, skipper.docCount()); + return docCount; + } + + private static void assertDocValuesSkipRandom( + DocValuesWrapper iterator, DocValuesSkipper skipper, int maxDoc) throws IOException { + if (skipper == null) { + return; + } + while (true) { + int doc = random().nextInt(skipper.maxDocID(0), maxDoc + 1) + 1; + 
skipper.advance(doc); + if (skipper.minDocID(0) == NO_MORE_DOCS) { + assertEquals(NO_MORE_DOCS, skipper.maxDocID(0)); + return; + } + if (iterator.advanceExact(doc)) { + assertTrue(iterator.docID() >= skipper.minDocID(0)); + assertTrue(iterator.docID() <= skipper.maxDocID(0)); + assertTrue(iterator.minValue() >= skipper.minValue(0)); + assertTrue(iterator.maxValue() <= skipper.maxValue(0)); + } + } + } + + private interface TestDocValueSkipper { + + void populateDoc(Document doc); + + DocValuesWrapper docValuesWrapper(LeafReader leafReader) throws IOException; + + DocValuesSkipper docValuesSkipper(LeafReader leafReader) throws IOException; + } + + private interface DocValuesWrapper { + + int advance(int target) throws IOException; + + boolean advanceExact(int target) throws IOException; + + long maxValue() throws IOException; + + long minValue() throws IOException; + + int docID(); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java index a8af8b03a6b..7d4f2839c6d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java @@ -19,6 +19,7 @@ package org.apache.lucene.tests.index; import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.io.IOException; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -60,6 +61,13 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes private static final IndexPackageAccess INDEX_PACKAGE_ACCESS = TestSecrets.getIndexPackageAccess(); + /** + * Override and return {@code false} if the format does not support setting doc values skip index. 
+ */ + protected boolean supportDocValuesSkipIndex() { + return true; + } + /** Test field infos read/write with a single field */ public void testOneField() throws Exception { Directory dir = newDirectory(); @@ -295,6 +303,15 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes storePayloads = random().nextBoolean(); } } + boolean hasDocValuesSkipIndex = false; + if (EnumSet.of( + DocValuesType.NUMERIC, + DocValuesType.SORTED, + DocValuesType.SORTED_NUMERIC, + DocValuesType.SORTED_SET) + .contains(fieldType.docValuesType())) { + hasDocValuesSkipIndex = fieldType.hasDocValuesSkipIndex(); + } FieldInfo fi = new FieldInfo( field, @@ -304,6 +321,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes storePayloads, fieldType.indexOptions(), fieldType.docValuesType(), + hasDocValuesSkipIndex, -1, new HashMap<>(), fieldType.pointDimensionCount(), @@ -349,8 +367,15 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes } if (r.nextBoolean()) { - DocValuesType values[] = DocValuesType.values(); + DocValuesType[] values = DocValuesType.values(); + DocValuesType current = values[r.nextInt(values.length)]; type.setDocValuesType(values[r.nextInt(values.length)]); + if (current == DocValuesType.NUMERIC + || current == DocValuesType.SORTED_NUMERIC + || current == DocValuesType.SORTED + || current == DocValuesType.SORTED_SET) { + type.setDocValuesSkipIndex(supportDocValuesSkipIndex() && random().nextBoolean()); + } } if (r.nextBoolean()) { @@ -389,6 +414,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes assertEquals(expected.number, actual.number); assertEquals(expected.name, actual.name); assertEquals(expected.getDocValuesType(), actual.getDocValuesType()); + assertEquals(expected.hasDocValuesSkipIndex(), actual.hasDocValuesSkipIndex()); assertEquals(expected.getIndexOptions(), actual.getIndexOptions()); assertEquals(expected.hasNorms(), actual.hasNorms()); assertEquals(expected.hasPayloads(), actual.hasPayloads()); @@ -429,6 +455,7 @@ public abstract class BaseFieldInfoFormatTestCase extends BaseIndexFileFormatTes false, TextField.TYPE_STORED.indexOptions(), DocValuesType.NONE, + false, -1, new HashMap<>(), 0, diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java index 10e748363d7..6f2bc38ff64 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseIndexFileFormatTestCase.java @@ -357,6 +357,7 @@ public abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), + proto.hasDocValuesSkipIndex(), proto.getDocValuesGen(), new HashMap<>(), proto.getPointDimensionCount(), diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java new file mode 100644 index 00000000000..2e797c2ced4 --- /dev/null +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/LegacyBaseDocValuesFormatTestCase.java @@ -0,0 +1,3810 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.tests.index; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.CountDownLatch; +import java.util.function.LongSupplier; +import java.util.function.Supplier; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.simpletext.SimpleTextCodec; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FloatDocValuesField; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.BinaryDocValues; +import org.apache.lucene.index.CheckIndex; +import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus; +import org.apache.lucene.index.CodecReader; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.MultiDocValues; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SortedDocValues; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import 
org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; + +/** + * Abstract class to do basic tests for a docvalues format. NOTE: This test focuses on the docvalues + * impl, nothing else. The [stretch] goal is for this test to be so thorough in testing a new + * DocValuesFormat that if this test passes, then all Lucene tests should also pass. Ie, if there is + * some bug in a given DocValuesFormat that this test fails to catch then this test needs to be + * improved! + */ +public abstract class LegacyBaseDocValuesFormatTestCase extends BaseIndexFileFormatTestCase { + + @Override + protected void addRandomFields(Document doc) { + if (usually()) { + doc.add(new NumericDocValuesField("ndv", random().nextInt(1 << 12))); + doc.add(new BinaryDocValuesField("bdv", newBytesRef(TestUtil.randomSimpleString(random())))); + doc.add( + new SortedDocValuesField("sdv", newBytesRef(TestUtil.randomSimpleString(random(), 2)))); + } + int numValues = random().nextInt(5); + for (int i = 0; i < numValues; ++i) { + doc.add( + new SortedSetDocValuesField( + "ssdv", newBytesRef(TestUtil.randomSimpleString(random(), 2)))); + } + numValues = random().nextInt(5); + for (int i = 0; i < numValues; ++i) { + doc.add( + new SortedNumericDocValuesField( + "sndv", TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE))); + } + } + + public void testOneNumber() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + String longTerm = + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm" + + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. 
" + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new NumericDocValuesField("dv", 5)); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + Document hitDoc = storedFields.document(hits.scoreDocs[i].doc); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); + int docID = hits.scoreDocs[i].doc; + assertEquals(docID, dv.advance(docID)); + assertEquals(5, dv.longValue()); + } + + ireader.close(); + directory.close(); + } + + public void testOneFloat() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + String longTerm = + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. " + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new FloatDocValuesField("dv", 5.7f)); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int docID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(docID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + + NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); + assertEquals(docID, dv.advance(docID)); + assertEquals(Float.floatToRawIntBits(5.7f), dv.longValue()); + } + + ireader.close(); + directory.close(); + } + + public void testTwoNumbers() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + String longTerm = + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. 
" + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 5)); + doc.add(new NumericDocValuesField("dv2", 17)); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int docID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(docID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv1"); + assertEquals(docID, dv.advance(docID)); + assertEquals(5, dv.longValue()); + dv = ireader.leaves().get(0).reader().getNumericDocValues("dv2"); + assertEquals(docID, dv.advance(docID)); + assertEquals(17, dv.longValue()); + } + + ireader.close(); + directory.close(); + } + + public void testTwoBinaryValues() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + String longTerm = + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. " + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", newBytesRef(longTerm))); + doc.add(new BinaryDocValuesField("dv2", newBytesRef(text))); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int hitDocID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(hitDocID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv1"); + assertEquals(hitDocID, dv.advance(hitDocID)); + BytesRef scratch = dv.binaryValue(); + assertEquals(newBytesRef(longTerm), scratch); + dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv2"); + assertEquals(hitDocID, dv.advance(hitDocID)); + scratch = dv.binaryValue(); + assertEquals(newBytesRef(text), scratch); + } + + ireader.close(); + directory.close(); + } + + public void testVariouslyCompressibleBinaryValues() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + int numDocs = 1 + random().nextInt(100); + + HashMap writtenValues = new HashMap<>(); + + // Small vocabulary ranges will be highly compressible + int vocabRange = 1 
+
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+
+      // Generate random-sized byte array with random choice of bytes in vocab range
+      byte[] value = new byte[500 + random().nextInt(1024)];
+      for (int j = 0; j < value.length; j++) {
+        value[j] = (byte) random().nextInt(vocabRange);
+      }
+      BytesRef bytesRef = newBytesRef(value);
+      writtenValues.put(i, bytesRef);
+      doc.add(newTextField("id", Integer.toString(i), Field.Store.YES));
+      doc.add(new BinaryDocValuesField("dv1", bytesRef));
+      iwriter.addDocument(doc);
+    }
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    IndexSearcher isearcher = new IndexSearcher(ireader);
+    StoredFields storedFields = isearcher.storedFields();
+
+    for (int i = 0; i < numDocs; i++) {
+      String id = Integer.toString(i);
+      Query query = new TermQuery(new Term("id", id));
+      TopDocs hits = isearcher.search(query, 1);
+      assertEquals(1, hits.totalHits.value);
+      // Iterate through the results:
+      int hitDocID = hits.scoreDocs[0].doc;
+      Document hitDoc = storedFields.document(hitDocID);
+      assertEquals(id, hitDoc.get("id"));
+      assert ireader.leaves().size() == 1;
+      BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv1");
+      assertEquals(hitDocID, dv.advance(hitDocID));
+      BytesRef scratch = dv.binaryValue();
+      assertEquals(writtenValues.get(i), scratch);
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testTwoFieldsMixed() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    String longTerm =
+        "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
+    String text = "This is the text to be indexed. " + longTerm;
" + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new NumericDocValuesField("dv1", 5)); + doc.add(new BinaryDocValuesField("dv2", newBytesRef("hello world"))); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int docID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(docID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv1"); + assertEquals(docID, dv.advance(docID)); + assertEquals(5, dv.longValue()); + BinaryDocValues dv2 = ireader.leaves().get(0).reader().getBinaryDocValues("dv2"); + assertEquals(docID, dv2.advance(docID)); + assertEquals(newBytesRef("hello world"), dv2.binaryValue()); + } + + ireader.close(); + directory.close(); + } + + public void testThreeFieldsMixed() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + String longTerm = + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. 
" + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new SortedDocValuesField("dv1", newBytesRef("hello hello"))); + doc.add(new NumericDocValuesField("dv2", 5)); + doc.add(new BinaryDocValuesField("dv3", newBytesRef("hello world"))); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + StoredFields storedFields = isearcher.storedFields(); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int docID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(docID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv1"); + assertEquals(docID, dv.advance(docID)); + int ord = dv.ordValue(); + BytesRef scratch = dv.lookupOrd(ord); + assertEquals(newBytesRef("hello hello"), scratch); + NumericDocValues dv2 = ireader.leaves().get(0).reader().getNumericDocValues("dv2"); + assertEquals(docID, dv2.advance(docID)); + assertEquals(5, dv2.longValue()); + BinaryDocValues dv3 = ireader.leaves().get(0).reader().getBinaryDocValues("dv3"); + assertEquals(docID, dv3.advance(docID)); + assertEquals(newBytesRef("hello world"), dv3.binaryValue()); + } + + ireader.close(); + directory.close(); + } + + public void testThreeFieldsMixed2() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + Document doc = new Document(); + String longTerm = + "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm"; + String text = "This is the text to be indexed. 
" + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv1", newBytesRef("hello world"))); + doc.add(new SortedDocValuesField("dv2", newBytesRef("hello hello"))); + doc.add(new NumericDocValuesField("dv3", 5)); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + BytesRef scratch; + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int docID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(docID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv2"); + assertEquals(docID, dv.advance(docID)); + int ord = dv.ordValue(); + scratch = dv.lookupOrd(ord); + assertEquals(newBytesRef("hello hello"), scratch); + NumericDocValues dv2 = ireader.leaves().get(0).reader().getNumericDocValues("dv3"); + assertEquals(docID, dv2.advance(docID)); + assertEquals(5, dv2.longValue()); + BinaryDocValues dv3 = ireader.leaves().get(0).reader().getBinaryDocValues("dv1"); + assertEquals(docID, dv3.advance(docID)); + assertEquals(newBytesRef("hello world"), dv3.binaryValue()); + } + + ireader.close(); + directory.close(); + } + + public void testTwoDocumentsNumeric() throws IOException { + Analyzer analyzer = new MockAnalyzer(random()); + + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new NumericDocValuesField("dv", 1)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new NumericDocValuesField("dv", 2)); + iwriter.addDocument(doc); + iwriter.forceMerge(1); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + assert ireader.leaves().size() == 1; + NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv"); + assertEquals(0, dv.nextDoc()); + assertEquals(1, dv.longValue()); + assertEquals(1, dv.nextDoc()); + assertEquals(2, dv.longValue()); + + ireader.close(); + directory.close(); + } + + public void testTwoDocumentsMerged() throws IOException { + Analyzer analyzer = new MockAnalyzer(random()); + + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(newField("id", "0", StringField.TYPE_STORED)); + doc.add(new NumericDocValuesField("dv", -10)); + iwriter.addDocument(doc); + iwriter.commit(); + doc = new Document(); + doc.add(newField("id", "1", StringField.TYPE_STORED)); + doc.add(new NumericDocValuesField("dv", 99)); + iwriter.addDocument(doc); + iwriter.forceMerge(1); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + 
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv");
+    StoredFields storedFields = ireader.leaves().get(0).reader().storedFields();
+    for (int i = 0; i < 2; i++) {
+      Document doc2 = storedFields.document(i);
+      long expected;
+      if (doc2.get("id").equals("0")) {
+        expected = -10;
+      } else {
+        expected = 99;
+      }
+      assertEquals(i, dv.nextDoc());
+      assertEquals(expected, dv.longValue());
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testBigNumericRange() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new NumericDocValuesField("dv", Long.MIN_VALUE));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new NumericDocValuesField("dv", Long.MAX_VALUE));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(Long.MIN_VALUE, dv.longValue());
+    assertEquals(1, dv.nextDoc());
+    assertEquals(Long.MAX_VALUE, dv.longValue());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testBigNumericRange2() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new NumericDocValuesField("dv", -8841491950446638677L));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new NumericDocValuesField("dv", 9062230939892376225L));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    NumericDocValues dv = ireader.leaves().get(0).reader().getNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(-8841491950446638677L, dv.longValue());
+    assertEquals(1, dv.nextDoc());
+    assertEquals(9062230939892376225L, dv.longValue());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    String longTerm =
+        "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
+    String text = "This is the text to be indexed. " + longTerm;
" + longTerm; + doc.add(newTextField("fieldname", text, Field.Store.YES)); + doc.add(new BinaryDocValuesField("dv", newBytesRef("hello world"))); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + IndexSearcher isearcher = new IndexSearcher(ireader); + StoredFields storedFields = isearcher.storedFields(); + + assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm)))); + Query query = new TermQuery(new Term("fieldname", "text")); + TopDocs hits = isearcher.search(query, 1); + assertEquals(1, hits.totalHits.value); + // Iterate through the results: + for (int i = 0; i < hits.scoreDocs.length; i++) { + int hitDocID = hits.scoreDocs[i].doc; + Document hitDoc = storedFields.document(hitDocID); + assertEquals(text, hitDoc.get("fieldname")); + assert ireader.leaves().size() == 1; + BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); + assertEquals(hitDocID, dv.advance(hitDocID)); + assertEquals(newBytesRef("hello world"), dv.binaryValue()); + } + + ireader.close(); + directory.close(); + } + + public void testBytesTwoDocumentsMerged() throws IOException { + Analyzer analyzer = new MockAnalyzer(random()); + + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(newField("id", "0", StringField.TYPE_STORED)); + doc.add(new BinaryDocValuesField("dv", newBytesRef("hello world 1"))); + iwriter.addDocument(doc); + iwriter.commit(); + doc = new Document(); + doc.add(newField("id", "1", StringField.TYPE_STORED)); + doc.add(new BinaryDocValuesField("dv", newBytesRef("hello 2"))); + iwriter.addDocument(doc); + iwriter.forceMerge(1); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + assert ireader.leaves().size() == 1; + BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv"); + StoredFields storedFields = ireader.leaves().get(0).reader().storedFields(); + for (int i = 0; i < 2; i++) { + Document doc2 = storedFields.document(i); + String expected; + if (doc2.get("id").equals("0")) { + expected = "hello world 1"; + } else { + expected = "hello 2"; + } + assertEquals(i, dv.nextDoc()); + assertEquals(expected, dv.binaryValue().utf8ToString()); + } + + ireader.close(); + directory.close(); + } + + public void testBytesMergeAwayAllValues() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new BinaryDocValuesField("field", newBytesRef("hi"))); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + BinaryDocValues dv = getOnlyLeafReader(ireader).getBinaryDocValues("field"); + assertEquals(NO_MORE_DOCS, 
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    String longTerm =
+        "longtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongtermlongterm";
+    String text = "This is the text to be indexed. " + longTerm;
+    doc.add(newTextField("fieldname", text, Field.Store.YES));
+    doc.add(new SortedDocValuesField("dv", newBytesRef("hello world")));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    IndexSearcher isearcher = new IndexSearcher(ireader);
+
+    assertEquals(1, isearcher.count(new TermQuery(new Term("fieldname", longTerm))));
+    Query query = new TermQuery(new Term("fieldname", "text"));
+    TopDocs hits = isearcher.search(query, 1);
+    assertEquals(1, hits.totalHits.value);
+    BytesRef scratch;
+    // Iterate through the results:
+    StoredFields storedFields = isearcher.storedFields();
+    for (int i = 0; i < hits.scoreDocs.length; i++) {
+      int docID = hits.scoreDocs[i].doc;
+      Document hitDoc = storedFields.document(docID);
+      assertEquals(text, hitDoc.get("fieldname"));
+      assert ireader.leaves().size() == 1;
+      SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv");
+      assertEquals(docID, dv.advance(docID));
+      scratch = dv.lookupOrd(dv.ordValue());
+      assertEquals(newBytesRef("hello world"), scratch);
+    }
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedBytesTwoDocuments() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1")));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv");
+    BytesRef scratch;
+    assertEquals(0, dv.nextDoc());
+    scratch = dv.lookupOrd(dv.ordValue());
+    assertEquals("hello world 1", scratch.utf8ToString());
+    assertEquals(1, dv.nextDoc());
+    scratch = dv.lookupOrd(dv.ordValue());
+    assertEquals("hello world 2", scratch.utf8ToString());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedBytesThreeDocuments() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1")));
+    iwriter.addDocument(doc);
+    doc = new Document();
doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2"))); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1"))); + iwriter.addDocument(doc); + iwriter.forceMerge(1); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + assert ireader.leaves().size() == 1; + SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); + assertEquals(2, dv.getValueCount()); + assertEquals(0, dv.nextDoc()); + assertEquals(0, dv.ordValue()); + BytesRef scratch = dv.lookupOrd(0); + assertEquals("hello world 1", scratch.utf8ToString()); + assertEquals(1, dv.nextDoc()); + assertEquals(1, dv.ordValue()); + scratch = dv.lookupOrd(1); + assertEquals("hello world 2", scratch.utf8ToString()); + assertEquals(2, dv.nextDoc()); + assertEquals(0, dv.ordValue()); + + ireader.close(); + directory.close(); + } + + public void testSortedBytesTwoDocumentsMerged() throws IOException { + Analyzer analyzer = new MockAnalyzer(random()); + + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(newField("id", "0", StringField.TYPE_STORED)); + doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 1"))); + iwriter.addDocument(doc); + iwriter.commit(); + doc = new Document(); + doc.add(newField("id", "1", StringField.TYPE_STORED)); + doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2"))); + iwriter.addDocument(doc); + iwriter.forceMerge(1); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + assert ireader.leaves().size() == 1; + SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv"); + assertEquals(2, dv.getValueCount()); // 2 ords + assertEquals(0, dv.nextDoc()); + BytesRef scratch = dv.lookupOrd(dv.ordValue()); + assertEquals(newBytesRef("hello world 1"), scratch); + scratch = dv.lookupOrd(1); + assertEquals(newBytesRef("hello world 2"), scratch); + StoredFields storedFields = ireader.leaves().get(0).reader().storedFields(); + for (int i = 0; i < 2; i++) { + Document doc2 = storedFields.document(i); + String expected; + if (doc2.get("id").equals("0")) { + expected = "hello world 1"; + } else { + expected = "hello world 2"; + } + if (dv.docID() < i) { + assertEquals(i, dv.nextDoc()); + } + scratch = dv.lookupOrd(dv.ordValue()); + assertEquals(expected, scratch.utf8ToString()); + } + + ireader.close(); + directory.close(); + } + + public void testSortedMergeAwayAllValues() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "0", Field.Store.NO)); + iwriter.addDocument(doc); + doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedDocValuesField("field", newBytesRef("hello"))); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader 
+    iwriter.close();
+
+    SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field");
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    TermsEnum termsEnum = dv.termsEnum();
+    assertFalse(termsEnum.seekExact(new BytesRef("lucene")));
+    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene")));
+    assertEquals(-1, dv.lookupTerm(new BytesRef("lucene")));
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testBytesWithNewline() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new BinaryDocValuesField("dv", newBytesRef("hello\nworld\r1")));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(newBytesRef("hello\nworld\r1"), dv.binaryValue());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testMissingSortedBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new SortedDocValuesField("dv", newBytesRef("hello world 2")));
+    iwriter.addDocument(doc);
+    // 2nd doc missing the DV field
+    iwriter.addDocument(new Document());
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    BytesRef scratch = dv.lookupOrd(dv.ordValue());
+    assertEquals(newBytesRef("hello world 2"), scratch);
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedTermsEnum() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new SortedDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new SortedDocValuesField("field", newBytesRef("world")));
+    iwriter.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new SortedDocValuesField("field", newBytesRef("beer")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field");
+    assertEquals(3, dv.getValueCount());
+
+    TermsEnum termsEnum = dv.termsEnum();
+
+    // next()
+    assertEquals("beer", termsEnum.next().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    assertEquals("hello", termsEnum.next().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertEquals("world", termsEnum.next().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+
+    // seekCeil()
+    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(newBytesRef("ha!")));
+    assertEquals("hello", termsEnum.term().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(newBytesRef("beer")));
+    assertEquals("beer", termsEnum.term().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    assertEquals(SeekStatus.END, termsEnum.seekCeil(newBytesRef("zzz")));
+    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(newBytesRef("aba")));
+    assertEquals(0, termsEnum.ord());
+
+    // seekExact()
+    assertTrue(termsEnum.seekExact(newBytesRef("beer")));
+    assertEquals("beer", termsEnum.term().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    assertTrue(termsEnum.seekExact(newBytesRef("hello")));
+    assertEquals(Codec.getDefault().toString(), "hello", termsEnum.term().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertTrue(termsEnum.seekExact(newBytesRef("world")));
+    assertEquals("world", termsEnum.term().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+    assertFalse(termsEnum.seekExact(newBytesRef("bogus")));
+
+    // seek(ord)
+    termsEnum.seekExact(0);
+    assertEquals("beer", termsEnum.term().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    termsEnum.seekExact(1);
+    assertEquals("hello", termsEnum.term().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    termsEnum.seekExact(2);
+    assertEquals("world", termsEnum.term().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+
+    // NORMAL automaton
+    termsEnum =
+        dv.intersect(
+            new CompiledAutomaton(
+                Operations.determinize(
+                    new RegExp(".*l.*").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)));
+    assertEquals("hello", termsEnum.next().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertEquals("world", termsEnum.next().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+    assertNull(termsEnum.next());
+
+    // SINGLE automaton
+    termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
+    assertEquals("hello", termsEnum.next().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertNull(termsEnum.next());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testEmptySortedBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new SortedDocValuesField("dv", newBytesRef("")));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new SortedDocValuesField("dv", newBytesRef("")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    SortedDocValues dv = ireader.leaves().get(0).reader().getSortedDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(0, dv.ordValue());
+    assertEquals(1, dv.nextDoc());
+    assertEquals(0, dv.ordValue());
+    BytesRef scratch = dv.lookupOrd(0);
+    assertEquals("", scratch.utf8ToString());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testEmptyBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new BinaryDocValuesField("dv", newBytesRef("")));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new BinaryDocValuesField("dv", newBytesRef("")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals("", dv.binaryValue().utf8ToString());
+    assertEquals(1, dv.nextDoc());
+    assertEquals("", dv.binaryValue().utf8ToString());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testVeryLargeButLegalBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    byte[] bytes = new byte[32766];
+    random().nextBytes(bytes);
+    BytesRef b = newBytesRef(bytes);
+    doc.add(new BinaryDocValuesField("dv", b));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(newBytesRef(bytes), dv.binaryValue());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testVeryLargeButLegalSortedBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    byte[] bytes = new byte[32766];
+    random().nextBytes(bytes);
+    BytesRef b = newBytesRef(bytes);
+    doc.add(new SortedDocValuesField("dv", b));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    SortedDocValues dv = DocValues.getSorted(ireader.leaves().get(0).reader(), "dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(newBytesRef(bytes), dv.lookupOrd(dv.ordValue()));
+    ireader.close();
+    directory.close();
+  }
+
+  public void testCodecUsesOwnBytes() throws IOException {
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new BinaryDocValuesField("dv", newBytesRef("boo!")));
+    iwriter.addDocument(doc);
+    iwriter.close();
+
+    // Now search the index:
+    IndexReader ireader =
+        maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true
+    assert ireader.leaves().size() == 1;
+    BinaryDocValues dv = ireader.leaves().get(0).reader().getBinaryDocValues("dv");
ireader.leaves().get(0).reader().getBinaryDocValues("dv"); + assertEquals(0, dv.nextDoc()); + assertEquals("boo!", dv.binaryValue().utf8ToString()); + + ireader.close(); + directory.close(); + } + + public void testCodecUsesOwnSortedBytes() throws IOException { + Analyzer analyzer = new MockAnalyzer(random()); + + Directory directory = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, conf); + Document doc = new Document(); + doc.add(new SortedDocValuesField("dv", newBytesRef("boo!"))); + iwriter.addDocument(doc); + iwriter.close(); + + // Now search the index: + IndexReader ireader = + maybeWrapWithMergingReader(DirectoryReader.open(directory)); // read-only=true + assert ireader.leaves().size() == 1; + SortedDocValues dv = DocValues.getSorted(ireader.leaves().get(0).reader(), "dv"); + byte[] mybytes = new byte[20]; + assertEquals(0, dv.nextDoc()); + assertEquals("boo!", dv.lookupOrd(dv.ordValue()).utf8ToString()); + assertFalse(dv.lookupOrd(dv.ordValue()).bytes == mybytes); + + ireader.close(); + directory.close(); + } + + /* + * Simple test case to show how to use the API + */ + public void testDocValuesSimple() throws IOException { + Directory dir = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + conf.setMergePolicy(newLogMergePolicy()); + IndexWriter writer = new IndexWriter(dir, conf); + for (int i = 0; i < 5; i++) { + Document doc = new Document(); + doc.add(new NumericDocValuesField("docId", i)); + doc.add(new TextField("docId", "" + i, Field.Store.NO)); + writer.addDocument(doc); + } + writer.commit(); + writer.forceMerge(1, true); + + writer.close(); + + DirectoryReader reader = maybeWrapWithMergingReader(DirectoryReader.open(dir)); + assertEquals(1, reader.leaves().size()); + + IndexSearcher searcher = new IndexSearcher(reader); + + BooleanQuery.Builder query = new BooleanQuery.Builder(); + query.add(new TermQuery(new Term("docId", "0")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("docId", "1")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("docId", "2")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("docId", "3")), BooleanClause.Occur.SHOULD); + query.add(new TermQuery(new Term("docId", "4")), BooleanClause.Occur.SHOULD); + + TopDocs search = searcher.search(query.build(), 10); + assertEquals(5, search.totalHits.value); + ScoreDoc[] scoreDocs = search.scoreDocs; + NumericDocValues docValues = getOnlyLeafReader(reader).getNumericDocValues("docId"); + for (int i = 0; i < scoreDocs.length; i++) { + assertEquals(i, scoreDocs[i].doc); + assertEquals(i, docValues.advance(i)); + assertEquals(i, docValues.longValue()); + } + reader.close(); + dir.close(); + } + + public void testRandomSortedBytes() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig cfg = newIndexWriterConfig(new MockAnalyzer(random())); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, cfg); + int numDocs = atLeast(100); + BytesRefHash hash = new BytesRefHash(); + Map docToString = new HashMap<>(); + int maxLength = TestUtil.nextInt(random(), 1, 50); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + doc.add(newTextField("id", "" + i, Field.Store.YES)); + String string = TestUtil.randomRealisticUnicodeString(random(), 1, maxLength); + BytesRef br = newBytesRef(string); + doc.add(new 
SortedDocValuesField("field", br)); + hash.add(br); + docToString.put("" + i, string); + w.addDocument(doc); + } + if (rarely()) { + w.commit(); + } + int numDocsNoValue = atLeast(10); + for (int i = 0; i < numDocsNoValue; i++) { + Document doc = new Document(); + doc.add(newTextField("id", "noValue", Field.Store.YES)); + w.addDocument(doc); + } + if (rarely()) { + w.commit(); + } + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + String id = "" + (i + numDocs); + doc.add(newTextField("id", id, Field.Store.YES)); + String string = TestUtil.randomRealisticUnicodeString(random(), 1, maxLength); + BytesRef br = newBytesRef(string); + hash.add(br); + docToString.put(id, string); + doc.add(new SortedDocValuesField("field", br)); + w.addDocument(doc); + } + w.commit(); + IndexReader reader = w.getReader(); + SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field"); + int[] sort = hash.sort(); + BytesRef expected = newBytesRef(); + assertEquals(hash.size(), docValues.getValueCount()); + for (int i = 0; i < hash.size(); i++) { + hash.get(sort[i], expected); + final BytesRef actual = docValues.lookupOrd(i); + assertEquals(expected.utf8ToString(), actual.utf8ToString()); + int ord = docValues.lookupTerm(expected); + assertEquals(i, ord); + } + Set> entrySet = docToString.entrySet(); + + for (Entry entry : entrySet) { + // pk lookup + PostingsEnum termPostingsEnum = + TestUtil.docs(random(), reader, "id", newBytesRef(entry.getKey()), null, 0); + int docId = termPostingsEnum.nextDoc(); + expected = newBytesRef(entry.getValue()); + docValues = MultiDocValues.getSortedValues(reader, "field"); + assertEquals(docId, docValues.advance(docId)); + final BytesRef actual = docValues.lookupOrd(docValues.ordValue()); + assertEquals(expected, actual); + } + + reader.close(); + w.close(); + dir.close(); + } + + private void doTestNumericsVsStoredFields(double density, LongSupplier longs) throws Exception { + doTestNumericsVsStoredFields(density, longs, 256); + } + + private void doTestNumericsVsStoredFields(double density, LongSupplier longs, int minDocs) + throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); + Document doc = new Document(); + Field idField = new StringField("id", "", Field.Store.NO); + Field storedField = newStringField("stored", "", Field.Store.YES); + Field dvField = new NumericDocValuesField("dv", 0); + doc.add(idField); + doc.add(storedField); + doc.add(dvField); + + // index some docs + int numDocs = atLeast((int) (minDocs * 1.172)); + // numDocs should be always > 256 so that in case of a codec that optimizes + // for numbers of values <= 256, all storage layouts are tested + assert numDocs > 256; + for (int i = 0; i < numDocs; i++) { + if (random().nextDouble() > density) { + writer.addDocument(new Document()); + continue; + } + idField.setStringValue(Integer.toString(i)); + long value = longs.getAsLong(); + storedField.setStringValue(Long.toString(value)); + dvField.setLongValue(value); + writer.addDocument(doc); + if (random().nextInt(31) == 0) { + writer.commit(); + } + } + + // delete some docs + int numDeletions = random().nextInt(numDocs / 10); + for (int i = 0; i < numDeletions; i++) { + int id = random().nextInt(numDocs); + writer.deleteDocuments(new Term("id", Integer.toString(id))); + } + + // merge some segments and ensure that at least one of them has more than + // max(256, minDocs) values 
+    writer.forceMerge(numDocs / Math.max(256, minDocs));
+
+    writer.close();
+    // compare
+    assertDVIterate(dir);
+    dir.close();
+  }
+
+  // Asserts equality of stored value vs. DocValue by iterating DocValues one at a time
+  protected void assertDVIterate(Directory dir) throws IOException {
+    DirectoryReader ir = maybeWrapWithMergingReader(DirectoryReader.open(dir));
+    TestUtil.checkReader(ir);
+    for (LeafReaderContext context : ir.leaves()) {
+      LeafReader r = context.reader();
+      NumericDocValues docValues = DocValues.getNumeric(r, "dv");
+      docValues.nextDoc();
+      StoredFields storedFields = r.storedFields();
+      for (int i = 0; i < r.maxDoc(); i++) {
+        String storedValue = storedFields.document(i).get("stored");
+        if (storedValue == null) {
+          assertTrue(docValues.docID() > i);
+        } else {
+          assertEquals(i, docValues.docID());
+          assertEquals(Long.parseLong(storedValue), docValues.longValue());
+          docValues.nextDoc();
+        }
+      }
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
+    }
+    ir.close();
+  }
+
+  protected void compareStoredFieldWithSortedNumericsDV(
+      DirectoryReader directoryReader, String storedField, String dvField) throws IOException {
+    for (LeafReaderContext leaf : directoryReader.leaves()) {
+      LeafReader reader = leaf.reader();
+      StoredFields storedFields = reader.storedFields();
+      SortedNumericDocValues docValues = reader.getSortedNumericDocValues(dvField);
+      if (docValues == null) {
+        // no stored values at all
+        for (int doc = 0; doc < reader.maxDoc(); doc++) {
+          assertArrayEquals(new String[0], storedFields.document(doc).getValues(storedField));
+        }
+        continue;
+      }
+      for (int doc = 0; doc < reader.maxDoc(); doc++) {
+        String[] storedValues = storedFields.document(doc).getValues(storedField);
+        if (storedValues.length == 0) {
+          assertFalse(docValues.advanceExact(doc));
+          continue;
+        }
+        switch (random().nextInt(3)) {
+          case 0 -> assertEquals(doc, docValues.nextDoc());
+          case 1 -> assertEquals(doc, docValues.advance(doc));
+          default -> assertTrue(docValues.advanceExact(doc));
+        }
+        assertEquals(doc, docValues.docID());
+        int repeats = 1 + random().nextInt(3);
+        for (int r = 0; r < repeats; r++) {
+          if (r > 0 || random().nextBoolean()) {
+            assertTrue(docValues.advanceExact(doc));
+          }
+          assertEquals(storedValues.length, docValues.docValueCount());
+          for (int v = 0; v < docValues.docValueCount(); v++) {
+            assertEquals(storedValues[v], Long.toString(docValues.nextValue()));
+          }
+        }
+      }
+      // jump with advanceExact
+      int iters = 1 + random().nextInt(3);
+      for (int i = 0; i < iters; i++) {
+        docValues = reader.getSortedNumericDocValues(dvField);
+        for (int doc = random().nextInt(leaf.reader().maxDoc()); doc < reader.maxDoc(); doc++) {
+          String[] storedValues = storedFields.document(doc).getValues(storedField);
+          if (docValues.advanceExact(doc)) {
+            assertEquals(doc, docValues.docID());
+            int repeats = 1 + random().nextInt(3);
+            for (int r = 0; r < repeats; r++) {
+              if (r > 0 || random().nextBoolean()) {
+                assertTrue(docValues.advanceExact(doc));
+              }
+              assertEquals(storedValues.length, docValues.docValueCount());
+              for (int v = 0; v < docValues.docValueCount(); v++) {
+                assertEquals(storedValues[v], Long.toString(docValues.nextValue()));
+              }
+            }
+          } else {
+            assertArrayEquals(new String[0], storedValues);
+          }
+          doc += random().nextInt(5); // skip some docs
+        }
+      }
+      // jump with advance
+      for (int i = 0; i < iters; i++) {
+        docValues = reader.getSortedNumericDocValues(dvField);
+        int doc = random().nextInt(leaf.reader().maxDoc());
+        while (doc != NO_MORE_DOCS) {
+          int nextDoc = docValues.advance(doc);
+          // no stored fields in between
+          for (int d = doc; d < (nextDoc == NO_MORE_DOCS ? reader.maxDoc() : nextDoc); d++) {
+            String[] storedValues = storedFields.document(d).getValues(storedField);
+            assertArrayEquals(new String[0], storedValues);
+          }
+          doc = nextDoc;
+          if (doc != NO_MORE_DOCS) {
+            String[] storedValues = storedFields.document(doc).getValues(storedField);
+            int repeats = 1 + random().nextInt(3);
+            for (int r = 0; r < repeats; r++) {
+              if (r > 0 || random().nextBoolean()) {
+                assertTrue(docValues.advanceExact(doc));
+              }
+              assertEquals(storedValues.length, docValues.docValueCount());
+              for (int v = 0; v < docValues.docValueCount(); v++) {
+                assertEquals(storedValues[v], Long.toString(docValues.nextValue()));
+              }
+            }
+            doc = nextDoc + 1;
+            doc += random().nextInt(5); // skip some docs
+          }
+        }
+      }
+    }
+  }
+
+  private void doTestSortedNumericsVsStoredFields(LongSupplier counts, LongSupplier values)
+      throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+
+    // index some docs
+    int numDocs = atLeast(300);
+    // numDocs should be always > 256 so that in case of a codec that optimizes
+    // for numbers of values <= 256, all storage layouts are tested
+    assert numDocs > 256;
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      doc.add(new StringField("id", Integer.toString(i), Field.Store.NO));
+
+      int valueCount = (int) counts.getAsLong();
+      long[] valueArray = new long[valueCount];
+      for (int j = 0; j < valueCount; j++) {
+        long value = values.getAsLong();
+        valueArray[j] = value;
+        doc.add(new SortedNumericDocValuesField("dv", value));
+      }
+      Arrays.sort(valueArray);
+      for (int j = 0; j < valueCount; j++) {
+        doc.add(new StoredField("stored", Long.toString(valueArray[j])));
+      }
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs / 10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+    try (DirectoryReader reader = maybeWrapWithMergingReader(DirectoryReader.open(dir))) {
+      TestUtil.checkReader(reader);
+      compareStoredFieldWithSortedNumericsDV(reader, "stored", "dv");
+    }
+    // merge some segments and ensure that at least one of them has more than
+    // 256 values
+    writer.forceMerge(numDocs / 256);
+    try (DirectoryReader reader = maybeWrapWithMergingReader(DirectoryReader.open(dir))) {
+      TestUtil.checkReader(reader);
+      compareStoredFieldWithSortedNumericsDV(reader, "stored", "dv");
+    }
+    IOUtils.close(writer, dir);
+  }
+
+  public void testBooleanNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(1, () -> random().nextInt(2));
+    }
+  }
+
+  public void testSparseBooleanNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(random().nextDouble(), () -> random().nextInt(2));
+    }
+  }
+
+  public void testByteNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(
+          1, () -> TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
+    }
+  }
+
+  public void testSparseByteNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(
+          random().nextDouble(), () -> TestUtil.nextInt(random(), Byte.MIN_VALUE, Byte.MAX_VALUE));
+    }
+  }
+
+  public void testShortNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(
+          1, () -> TestUtil.nextInt(random(), Short.MIN_VALUE, Short.MAX_VALUE));
+    }
+  }
+
+  public void testSparseShortNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(
+          random().nextDouble(),
+          () -> TestUtil.nextInt(random(), Short.MIN_VALUE, Short.MAX_VALUE));
+    }
+  }
+
+  public void testIntNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(1, random()::nextInt);
+    }
+  }
+
+  public void testSparseIntNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(random().nextDouble(), random()::nextInt);
+    }
+  }
+
+  public void testLongNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(1, random()::nextLong);
+    }
+  }
+
+  public void testSparseLongNumericsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestNumericsVsStoredFields(random().nextDouble(), random()::nextLong);
+    }
+  }
+
+  private void doTestBinaryVsStoredFields(double density, Supplier<byte[]> bytes)
+      throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    Field idField = new StringField("id", "", Field.Store.NO);
+    Field storedField = new StoredField("stored", new byte[0]);
+    Field dvField = new BinaryDocValuesField("dv", newBytesRef());
+    doc.add(idField);
+    doc.add(storedField);
+    doc.add(dvField);
+
+    // index some docs
+    int numDocs = atLeast(300);
+    for (int i = 0; i < numDocs; i++) {
+      if (random().nextDouble() > density) {
+        writer.addDocument(new Document());
+        continue;
+      }
+      idField.setStringValue(Integer.toString(i));
+      byte[] buffer = bytes.get();
+      storedField.setBytesValue(buffer);
+      dvField.setBytesValue(buffer);
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs / 10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+
+    // compare
+    DirectoryReader ir = writer.getReader();
+    TestUtil.checkReader(ir);
+    for (LeafReaderContext context : ir.leaves()) {
+      LeafReader r = context.reader();
+      StoredFields storedFields = r.storedFields();
+      BinaryDocValues docValues = DocValues.getBinary(r, "dv");
+      docValues.nextDoc();
+      for (int i = 0; i < r.maxDoc(); i++) {
+        BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored");
+        if (binaryValue == null) {
+          assertTrue(docValues.docID() > i);
+        } else {
+          assertEquals(i, docValues.docID());
+          assertEquals(binaryValue, docValues.binaryValue());
+          docValues.nextDoc();
+        }
+      }
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
+    }
+    ir.close();
+
+    // compare again
+    writer.forceMerge(1);
+    ir = writer.getReader();
+    TestUtil.checkReader(ir);
+    for (LeafReaderContext context : ir.leaves()) {
+      LeafReader r = context.reader();
+      StoredFields storedFields = r.storedFields();
+      BinaryDocValues docValues = DocValues.getBinary(r, "dv");
+      docValues.nextDoc();
+      for (int i = 0; i < r.maxDoc(); i++) {
+        BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored");
+        if (binaryValue == null) {
+          assertTrue(docValues.docID() > i);
+        } else {
+          assertEquals(i, docValues.docID());
+          assertEquals(binaryValue, docValues.binaryValue());
+          docValues.nextDoc();
+        }
+      }
+      assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID());
+    }
+    ir.close();
+    writer.close();
+    dir.close();
+  }
+
+  public void testBinaryFixedLengthVsStoredFields() throws Exception {
+    doTestBinaryFixedLengthVsStoredFields(1);
+  }
+
+  public void testSparseBinaryFixedLengthVsStoredFields() throws Exception {
+    doTestBinaryFixedLengthVsStoredFields(random().nextDouble());
+  }
+
+  private void doTestBinaryFixedLengthVsStoredFields(double density) throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      int fixedLength = TestUtil.nextInt(random(), 0, 10);
+      doTestBinaryVsStoredFields(
+          density,
+          () -> {
+            byte[] buffer = new byte[fixedLength];
+            random().nextBytes(buffer);
+            return buffer;
+          });
+    }
+  }
+
+  public void testBinaryVariableLengthVsStoredFields() throws Exception {
+    doTestBinaryVariableLengthVsStoredFields(1);
+  }
+
+  public void testSparseBinaryVariableLengthVsStoredFields() throws Exception {
+    doTestBinaryVariableLengthVsStoredFields(random().nextDouble());
+  }
+
+  public void doTestBinaryVariableLengthVsStoredFields(double density) throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestBinaryVsStoredFields(
+          density,
+          () -> {
+            final int length = random().nextInt(10);
+            byte[] buffer = new byte[length];
+            random().nextBytes(buffer);
+            return buffer;
+          });
+    }
+  }
+
+  protected void doTestSortedVsStoredFields(int numDocs, double density, Supplier<byte[]> bytes)
+      throws Exception {
+    Directory dir = newFSDirectory(createTempDir("dvduel"));
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    Field idField = new StringField("id", "", Field.Store.NO);
+    Field storedField = new StoredField("stored", new byte[0]);
+    Field dvField = new SortedDocValuesField("dv", newBytesRef());
+    doc.add(idField);
+    doc.add(storedField);
+    doc.add(dvField);
+
+    // index some docs
+    for (int i = 0; i < numDocs; i++) {
+      if (random().nextDouble() > density) {
+        writer.addDocument(new Document());
+        continue;
+      }
+      idField.setStringValue(Integer.toString(i));
+      byte[] buffer = bytes.get();
+      storedField.setBytesValue(buffer);
+      dvField.setBytesValue(buffer);
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs / 10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+
+    // compare
+    DirectoryReader ir = writer.getReader();
+    TestUtil.checkReader(ir);
+    for (LeafReaderContext context : ir.leaves()) {
+      LeafReader r = context.reader();
+      StoredFields storedFields = r.storedFields();
+      SortedDocValues docValues = DocValues.getSorted(r, "dv");
"dv"); + docValues.nextDoc(); + for (int i = 0; i < r.maxDoc(); i++) { + BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored"); + if (binaryValue == null) { + assertTrue(docValues.docID() > i); + } else { + assertEquals(i, docValues.docID()); + assertEquals(binaryValue, docValues.lookupOrd(docValues.ordValue())); + docValues.nextDoc(); + } + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID()); + } + ir.close(); + writer.forceMerge(1); + + // compare again + ir = writer.getReader(); + TestUtil.checkReader(ir); + for (LeafReaderContext context : ir.leaves()) { + LeafReader r = context.reader(); + StoredFields storedFields = r.storedFields(); + SortedDocValues docValues = DocValues.getSorted(r, "dv"); + docValues.nextDoc(); + for (int i = 0; i < r.maxDoc(); i++) { + BytesRef binaryValue = storedFields.document(i).getBinaryValue("stored"); + if (binaryValue == null) { + assertTrue(docValues.docID() > i); + } else { + assertEquals(i, docValues.docID()); + assertEquals(binaryValue, docValues.lookupOrd(docValues.ordValue())); + docValues.nextDoc(); + } + } + assertEquals(DocIdSetIterator.NO_MORE_DOCS, docValues.docID()); + } + ir.close(); + writer.close(); + dir.close(); + } + + public void testSortedFixedLengthVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + int fixedLength = TestUtil.nextInt(random(), 1, 10); + doTestSortedVsStoredFields(atLeast(300), 1, fixedLength, fixedLength); + } + } + + public void testSparseSortedFixedLengthVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + int fixedLength = TestUtil.nextInt(random(), 1, 10); + doTestSortedVsStoredFields(atLeast(300), random().nextDouble(), fixedLength, fixedLength); + } + } + + public void testSortedVariableLengthVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedVsStoredFields(atLeast(300), 1, 1, 10); + } + } + + public void testSparseSortedVariableLengthVsStoredFields() throws Exception { + int numIterations = atLeast(1); + for (int i = 0; i < numIterations; i++) { + doTestSortedVsStoredFields(atLeast(300), random().nextDouble(), 1, 10); + } + } + + protected void doTestSortedVsStoredFields( + int numDocs, double density, int minLength, int maxLength) throws Exception { + doTestSortedVsStoredFields( + numDocs, + density, + () -> { + int length = TestUtil.nextInt(random(), minLength, maxLength); + byte[] buffer = new byte[length]; + random().nextBytes(buffer); + return buffer; + }); + } + + public void testSortedSetOneValue() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + + Document doc = new Document(); + doc.add(new SortedSetDocValuesField("field", newBytesRef("hello"))); + iwriter.addDocument(doc); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field"); + assertEquals(0, dv.nextDoc()); + + assertEquals(1, dv.docValueCount()); + assertEquals(0, dv.nextOrd()); + + BytesRef bytes = dv.lookupOrd(0); + assertEquals(newBytesRef("hello"), bytes); + + ireader.close(); + directory.close(); + } + + public void testSortedSetTwoFields() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory); + + Document doc = new Document(); + doc.add(new 
+  public void testSortedSetTwoFields() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    doc.add(new SortedSetDocValuesField("field2", newBytesRef("world")));
+    iwriter.addDocument(doc);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(0, dv.nextDoc());
+
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field2");
+    assertEquals(0, dv.nextDoc());
+
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("world"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTwoDocumentsMerged() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+    iwriter.commit();
+
+    doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(2, dv.getValueCount());
+
+    assertEquals(0, dv.nextDoc());
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    assertEquals(1, dv.nextDoc());
+    assertEquals(1, dv.docValueCount());
+    assertEquals(1, dv.nextOrd());
+
+    bytes = dv.lookupOrd(1);
+    assertEquals(newBytesRef("world"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTwoValues() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
+    iwriter.addDocument(doc);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(0, dv.nextDoc());
+
+    assertEquals(2, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+    assertEquals(1, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    bytes = dv.lookupOrd(1);
+    assertEquals(newBytesRef("world"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
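+  // Same as testSortedSetTwoValues, but the values are added in reverse order; ords must still
+  // come back sorted ("hello" before "world").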
+  public void testSortedSetTwoValuesUnordered() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(0, dv.nextDoc());
+
+    assertEquals(2, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+    assertEquals(1, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    bytes = dv.lookupOrd(1);
+    assertEquals(newBytesRef("world"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
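+  // After the two segments are merged, ords are reassigned against the merged term dictionary
+  // (beer=0, hello=1, world=2), so the first document's ords shift up by one.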
+  public void testSortedSetThreeValuesTwoDocs() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
+    iwriter.addDocument(doc);
+    iwriter.commit();
+
+    doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("beer")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(3, dv.getValueCount());
+
+    assertEquals(0, dv.nextDoc());
+    assertEquals(2, dv.docValueCount());
+    assertEquals(1, dv.nextOrd());
+    assertEquals(2, dv.nextOrd());
+
+    assertEquals(1, dv.nextDoc());
+    assertEquals(2, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+    assertEquals(1, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("beer"), bytes);
+
+    bytes = dv.lookupOrd(1);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    bytes = dv.lookupOrd(2);
+    assertEquals(newBytesRef("world"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTwoDocumentsLastMissing() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+
+    doc = new Document();
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(1, dv.getValueCount());
+    assertEquals(0, dv.nextDoc());
+
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTwoDocumentsLastMissingMerge() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+    iwriter.commit();
+
+    doc = new Document();
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(1, dv.getValueCount());
+    assertEquals(0, dv.nextDoc());
+
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTwoDocumentsFirstMissing() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    iwriter.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+
+    iwriter.forceMerge(1);
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(1, dv.getValueCount());
+    assertEquals(1, dv.nextDoc());
+
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTwoDocumentsFirstMissingMerge() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    iwriter.addDocument(doc);
+    iwriter.commit();
+
+    doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(1, dv.getValueCount());
+    assertEquals(1, dv.nextDoc());
+
+    assertEquals(1, dv.docValueCount());
+    assertEquals(0, dv.nextOrd());
+
+    BytesRef bytes = dv.lookupOrd(0);
+    assertEquals(newBytesRef("hello"), bytes);
+
+    ireader.close();
+    directory.close();
+  }
+
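+  // Deleting the only document that had a value and force-merging must leave an empty
+  // SORTED_SET field: zero values, and a termsEnum/lookupTerm that find nothing.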
+  public void testSortedSetMergeAwayAllValues() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.NO));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(0, dv.getValueCount());
+
+    TermsEnum termsEnum = dv.termsEnum();
+    assertFalse(termsEnum.seekExact(new BytesRef("lucene")));
+    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene")));
+    assertEquals(-1, dv.lookupTerm(new BytesRef("lucene")));
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testSortedSetTermsEnum() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("world")));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("beer")));
+    iwriter.addDocument(doc);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(3, dv.getValueCount());
+
+    TermsEnum termsEnum = dv.termsEnum();
+
+    // next()
+    assertEquals("beer", termsEnum.next().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    assertEquals("hello", termsEnum.next().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertEquals("world", termsEnum.next().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+
+    // seekCeil()
+    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(newBytesRef("ha!")));
+    assertEquals("hello", termsEnum.term().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(newBytesRef("beer")));
+    assertEquals("beer", termsEnum.term().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    assertEquals(SeekStatus.END, termsEnum.seekCeil(newBytesRef("zzz")));
+
+    // seekExact()
+    assertTrue(termsEnum.seekExact(newBytesRef("beer")));
+    assertEquals("beer", termsEnum.term().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    assertTrue(termsEnum.seekExact(newBytesRef("hello")));
+    assertEquals("hello", termsEnum.term().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertTrue(termsEnum.seekExact(newBytesRef("world")));
+    assertEquals("world", termsEnum.term().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+    assertFalse(termsEnum.seekExact(newBytesRef("bogus")));
+
+    // seek(ord)
+    termsEnum.seekExact(0);
+    assertEquals("beer", termsEnum.term().utf8ToString());
+    assertEquals(0, termsEnum.ord());
+    termsEnum.seekExact(1);
+    assertEquals("hello", termsEnum.term().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    termsEnum.seekExact(2);
+    assertEquals("world", termsEnum.term().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+
+    // NORMAL automaton
+    termsEnum =
+        dv.intersect(
+            new CompiledAutomaton(
+                Operations.determinize(
+                    new RegExp(".*l.*").toAutomaton(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT)));
+    assertEquals("hello", termsEnum.next().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertEquals("world", termsEnum.next().utf8ToString());
+    assertEquals(2, termsEnum.ord());
+    assertNull(termsEnum.next());
+
+    // SINGLE automaton
+    termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
+    assertEquals("hello", termsEnum.next().utf8ToString());
+    assertEquals(1, termsEnum.ord());
+    assertNull(termsEnum.next());
+
+    ireader.close();
+    directory.close();
+  }
+
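+  // Checks a SORTED_SET field against a parallel stored field using three access patterns:
+  // sequential iteration, random jumps via advanceExact, and jumps via advance.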
+  protected void compareStoredFieldWithSortedSetDV(
+      DirectoryReader directoryReader, String storedField, String dvField) throws IOException {
+    for (LeafReaderContext leaf : directoryReader.leaves()) {
+      LeafReader reader = leaf.reader();
+      StoredFields storedFields = reader.storedFields();
+      SortedSetDocValues docValues = reader.getSortedSetDocValues(dvField);
+      if (docValues == null) {
+        // no stored values at all
+        for (int doc = 0; doc < reader.maxDoc(); doc++) {
+          assertArrayEquals(new String[0], storedFields.document(doc).getValues(storedField));
+        }
+        continue;
+      }
+      // sequentially
+      for (int doc = 0; doc < reader.maxDoc(); doc++) {
+        String[] storedValues = storedFields.document(doc).getValues(storedField);
+        if (storedValues.length == 0) {
+          assertFalse(docValues.advanceExact(doc));
+          continue;
+        }
+        switch (random().nextInt(3)) {
+          case 0 -> assertEquals(doc, docValues.nextDoc());
+          case 1 -> assertEquals(doc, docValues.advance(doc));
+          default -> assertTrue(docValues.advanceExact(doc));
+        }
+        assertEquals(doc, docValues.docID());
+        assertEquals(storedValues.length, docValues.docValueCount());
+        int repeats = 1 + random().nextInt(3);
+        for (int r = 0; r < repeats; r++) {
+          if (r > 0 || random().nextBoolean()) {
+            assertTrue(docValues.advanceExact(doc));
+          }
+          for (int v = 0; v < docValues.docValueCount(); v++) {
+            long ord = docValues.nextOrd();
+            assertEquals(storedValues[v], docValues.lookupOrd(ord).utf8ToString());
+          }
+        }
+      }
+      // jump with advanceExact
+      int iters = 1 + random().nextInt(3);
+      for (int i = 0; i < iters; i++) {
+        docValues = reader.getSortedSetDocValues(dvField);
+        for (int doc = random().nextInt(leaf.reader().maxDoc()); doc < reader.maxDoc(); doc++) {
+          String[] storedValues = storedFields.document(doc).getValues(storedField);
+          if (docValues.advanceExact(doc)) {
+            assertEquals(doc, docValues.docID());
+            assertEquals(storedValues.length, docValues.docValueCount());
+            int repeats = 1 + random().nextInt(3);
+            for (int r = 0; r < repeats; r++) {
+              if (r > 0 || random().nextBoolean()) {
+                assertTrue(docValues.advanceExact(doc));
+              }
+              for (int v = 0; v < docValues.docValueCount(); v++) {
+                long ord = docValues.nextOrd();
+                assertEquals(storedValues[v], docValues.lookupOrd(ord).utf8ToString());
+              }
+            }
+          } else {
+            assertArrayEquals(new String[0], storedValues);
+          }
+          doc += random().nextInt(5); // skip some docs
+        }
+      }
+      // jump with advance
+      for (int i = 0; i < iters; i++) {
+        docValues = reader.getSortedSetDocValues(dvField);
+        int doc = random().nextInt(leaf.reader().maxDoc());
+        while (doc != NO_MORE_DOCS) {
+          int nextDoc = docValues.advance(doc);
+          // no stored fields in between
+          for (int d = doc; d < (nextDoc == NO_MORE_DOCS ? reader.maxDoc() : nextDoc); d++) {
+            String[] storedValues = storedFields.document(d).getValues(storedField);
+            assertArrayEquals(new String[0], storedValues);
+          }
+          doc = nextDoc;
+          if (doc != NO_MORE_DOCS) {
+            int repeats = 1 + random().nextInt(3);
+            String[] storedValues = storedFields.document(doc).getValues(storedField);
+            for (int r = 0; r < repeats; r++) {
+              if (r > 0 || random().nextBoolean()) {
+                assertTrue(docValues.advanceExact(doc));
+              }
+              for (int v = 0; v < docValues.docValueCount(); v++) {
+                long ord = docValues.nextOrd();
+                assertEquals(storedValues[v], docValues.lookupOrd(ord).utf8ToString());
+              }
+            }
+            doc = nextDoc + 1;
+            doc += random().nextInt(5); // skip some docs
+          }
+        }
+      }
+    }
+  }
+
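+  // Duels random multi-valued string data: the stored field receives each document's values in
+  // sorted order while the dv field gets them shuffled, so the comparison exercises ord-based
+  // re-sorting; verified both before and after forceMerge.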
+  protected void doTestSortedSetVsStoredFields(
+      int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues)
+      throws Exception {
+    Directory dir = newFSDirectory(createTempDir("dvduel"));
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+
+    Set<String> valueSet = new HashSet<>();
+    for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) {
+      final int length = TestUtil.nextInt(random(), minLength, maxLength);
+      valueSet.add(TestUtil.randomSimpleString(random(), length));
+    }
+    String[] uniqueValues = valueSet.toArray(new String[0]);
+
+    // index some docs
+    if (VERBOSE) {
+      System.out.println("\nTEST: now add numDocs=" + numDocs);
+    }
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
+      doc.add(idField);
+      int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc);
+      // create a random set of strings
+      Set<String> values = new TreeSet<>();
+      for (int v = 0; v < numValues; v++) {
+        values.add(RandomPicks.randomFrom(random(), uniqueValues));
+      }
+
+      // add ordered to the stored field
+      for (String v : values) {
+        doc.add(new StoredField("stored", v));
+      }
+
+      // add in any order to the dv field
+      ArrayList<String> unordered = new ArrayList<>(values);
+      Collections.shuffle(unordered, random());
+      for (String v : unordered) {
+        doc.add(new SortedSetDocValuesField("dv", newBytesRef(v)));
+      }
+
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs / 10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+
+    try (DirectoryReader reader = writer.getReader()) {
+      TestUtil.checkReader(reader);
+      compareStoredFieldWithSortedSetDV(reader, "stored", "dv");
+    }
+    writer.forceMerge(1);
+    try (DirectoryReader reader = writer.getReader()) {
+      TestUtil.checkReader(reader);
+      compareStoredFieldWithSortedSetDV(reader, "stored", "dv");
+    }
+    IOUtils.close(writer, dir);
+  }
+
+  public void testSortedSetFixedLengthVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      int fixedLength = TestUtil.nextInt(random(), 1, 10);
+      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100);
+    }
+  }
+
+  public void testSortedNumericsSingleValuedVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedNumericsVsStoredFields(() -> 1, random()::nextLong);
+    }
+  }
+
+  public void testSortedNumericsSingleValuedMissingVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedNumericsVsStoredFields(() -> random().nextBoolean() ? 0 : 1, random()::nextLong);
+    }
+  }
+
+  public void testSortedNumericsMultipleValuesVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedNumericsVsStoredFields(
+          () -> TestUtil.nextLong(random(), 0, 50), random()::nextLong);
+    }
+  }
+
+  public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception {
+    final long[] values = new long[TestUtil.nextInt(random(), 2, 6)];
+    for (int i = 0; i < values.length; ++i) {
+      values[i] = random().nextLong();
+    }
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedNumericsVsStoredFields(
+          () -> TestUtil.nextLong(random(), 0, 6), () -> values[random().nextInt(values.length)]);
+    }
+  }
+
+  public void testSortedSetVariableLengthVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100);
+    }
+  }
+
+  public void testSortedSetFixedLengthSingleValuedVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      int fixedLength = TestUtil.nextInt(random(), 1, 10);
+      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100);
+    }
+  }
+
+  public void testSortedSetVariableLengthSingleValuedVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100);
+    }
+  }
+
+  public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6);
+    }
+  }
+
+  public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6);
+    }
+  }
+
+  public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000);
+    }
+  }
+
+  public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000);
+    }
+  }
+
+  public void testGCDCompression() throws Exception {
+    doTestGCDCompression(1);
+  }
+
+  public void testSparseGCDCompression() throws Exception {
+    doTestGCDCompression(random().nextDouble());
+  }
+
+  private void doTestGCDCompression(double density) throws Exception {
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      final long min = -(((long) random().nextInt(1 << 30)) << 32);
+      final long mul = random().nextInt() & 0xFFFFFFFFL;
+      final LongSupplier longs = () -> min + mul * random().nextInt(1 << 20);
+      doTestNumericsVsStoredFields(density, longs);
+    }
+  }
+
+  public void testZeros() throws Exception {
+    doTestNumericsVsStoredFields(1, () -> 0);
+  }
+
+  public void testSparseZeros() throws Exception {
+    doTestNumericsVsStoredFields(random().nextDouble(), () -> 0);
+  }
+
+  public void testZeroOrMin() throws Exception {
+    // try to make GCD compression fail if the format did not anticipate that
+    // the GCD of 0 and MIN_VALUE is negative
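+    // (mathematically gcd(0, x) == |x|, but Math.abs(Long.MIN_VALUE) overflows back to
+    // Long.MIN_VALUE, so a naive gcd implementation yields a negative result here)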
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      final LongSupplier longs = () -> random().nextBoolean() ? 0 : Long.MIN_VALUE;
+      doTestNumericsVsStoredFields(1, longs);
+    }
+  }
+
+  public void testTwoNumbersOneMissing() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(null);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.YES));
+    doc.add(new NumericDocValuesField("dv1", 0));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.YES));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
+    iw.close();
+
+    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assertEquals(1, ir.leaves().size());
+    LeafReader ar = ir.leaves().get(0).reader();
+    NumericDocValues dv = ar.getNumericDocValues("dv1");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(0, dv.longValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+    ir.close();
+    directory.close();
+  }
+
+  public void testTwoNumbersOneMissingWithMerging() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(null);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.YES));
+    doc.add(new NumericDocValuesField("dv1", 0));
+    iw.addDocument(doc);
+    iw.commit();
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.YES));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
+    iw.close();
+
+    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assertEquals(1, ir.leaves().size());
+    LeafReader ar = ir.leaves().get(0).reader();
+    NumericDocValues dv = ar.getNumericDocValues("dv1");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(0, dv.longValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+    ir.close();
+    directory.close();
+  }
+
+  public void testThreeNumbersOneMissingWithMerging() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(null);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.YES));
+    doc.add(new NumericDocValuesField("dv1", 0));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.YES));
+    iw.addDocument(doc);
+    iw.commit();
+    doc = new Document();
+    doc.add(new StringField("id", "2", Field.Store.YES));
+    doc.add(new NumericDocValuesField("dv1", 5));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
+    iw.close();
+
+    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assertEquals(1, ir.leaves().size());
+    LeafReader ar = ir.leaves().get(0).reader();
+    NumericDocValues dv = ar.getNumericDocValues("dv1");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(0, dv.longValue());
+    assertEquals(2, dv.nextDoc());
+    assertEquals(5, dv.longValue());
+    ir.close();
+    directory.close();
+  }
+
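+  // BINARY counterparts of the numeric missing-value tests above: the iterator must skip the
+  // documents that have no value for the field.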
+  public void testTwoBytesOneMissing() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(null);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.YES));
+    doc.add(new BinaryDocValuesField("dv1", newBytesRef()));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.YES));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
+    iw.close();
+
+    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assertEquals(1, ir.leaves().size());
+    LeafReader ar = ir.leaves().get(0).reader();
+    BinaryDocValues dv = ar.getBinaryDocValues("dv1");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(newBytesRef(), dv.binaryValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+    ir.close();
+    directory.close();
+  }
+
+  public void testTwoBytesOneMissingWithMerging() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(null);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.YES));
+    doc.add(new BinaryDocValuesField("dv1", newBytesRef()));
+    iw.addDocument(doc);
+    iw.commit();
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.YES));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
+    iw.close();
+
+    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assertEquals(1, ir.leaves().size());
+    LeafReader ar = ir.leaves().get(0).reader();
+    BinaryDocValues dv = ar.getBinaryDocValues("dv1");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(newBytesRef(), dv.binaryValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+    ir.close();
+    directory.close();
+  }
+
+  public void testThreeBytesOneMissingWithMerging() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(null);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), directory, conf);
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.YES));
+    doc.add(new BinaryDocValuesField("dv1", newBytesRef()));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.YES));
+    iw.addDocument(doc);
+    iw.commit();
+    doc = new Document();
+    doc.add(new StringField("id", "2", Field.Store.YES));
+    doc.add(new BinaryDocValuesField("dv1", newBytesRef("boo")));
+    iw.addDocument(doc);
+    iw.forceMerge(1);
+    iw.close();
+
+    IndexReader ir = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assertEquals(1, ir.leaves().size());
+    LeafReader ar = ir.leaves().get(0).reader();
+    BinaryDocValues dv = ar.getBinaryDocValues("dv1");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(newBytesRef(), dv.binaryValue());
+    assertEquals(2, dv.nextDoc());
+    assertEquals(newBytesRef("boo"), dv.binaryValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+    ir.close();
+    directory.close();
+  }
+
+  /** Tests dv against stored fields with threads (binary/numeric/sorted, no missing) */
+  public void testThreads() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    Field idField = new StringField("id", "", Field.Store.NO);
+    Field storedBinField = new StoredField("storedBin", new byte[0]);
+    Field dvBinField = new BinaryDocValuesField("dvBin", newBytesRef());
+    Field dvSortedField = new SortedDocValuesField("dvSorted", newBytesRef());
+    Field storedNumericField = new StoredField("storedNum", "");
+    Field dvNumericField = new NumericDocValuesField("dvNum", 0);
+    doc.add(idField);
+    doc.add(storedBinField);
+    doc.add(dvBinField);
+    doc.add(dvSortedField);
+    doc.add(storedNumericField);
+    doc.add(dvNumericField);
+
+    // index some docs
+    int numDocs = atLeast(300);
+    for (int i = 0; i < numDocs; i++) {
+      idField.setStringValue(Integer.toString(i));
+      int length = TestUtil.nextInt(random(), 0, 8);
+      byte[] buffer = new byte[length];
+      random().nextBytes(buffer);
+      storedBinField.setBytesValue(buffer);
+      dvBinField.setBytesValue(buffer);
+      dvSortedField.setBytesValue(buffer);
+      long numericValue = random().nextLong();
+      storedNumericField.setStringValue(Long.toString(numericValue));
+      dvNumericField.setLongValue(numericValue);
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs / 10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+    writer.close();
+
+    // compare
+    final DirectoryReader ir = maybeWrapWithMergingReader(DirectoryReader.open(dir));
+    int numThreads = TestUtil.nextInt(random(), 2, 7);
+    Thread[] threads = new Thread[numThreads];
+    final CountDownLatch startingGun = new CountDownLatch(1);
+
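+    // Doc values iterators are not thread-safe, so every thread pulls its own iterator
+    // instances from the shared reader before walking the documents.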
+    for (int i = 0; i < threads.length; i++) {
+      threads[i] =
+          new Thread() {
+            @Override
+            public void run() {
+              try {
+                startingGun.await();
+                for (LeafReaderContext context : ir.leaves()) {
+                  LeafReader r = context.reader();
+                  StoredFields storedFields = r.storedFields();
+                  BinaryDocValues binaries = r.getBinaryDocValues("dvBin");
+                  SortedDocValues sorted = r.getSortedDocValues("dvSorted");
+                  NumericDocValues numerics = r.getNumericDocValues("dvNum");
+                  for (int j = 0; j < r.maxDoc(); j++) {
+                    BytesRef binaryValue = storedFields.document(j).getBinaryValue("storedBin");
+                    assertEquals(j, binaries.nextDoc());
+                    BytesRef scratch = binaries.binaryValue();
+                    assertEquals(binaryValue, scratch);
+                    assertEquals(j, sorted.nextDoc());
+                    scratch = sorted.lookupOrd(sorted.ordValue());
+                    assertEquals(binaryValue, scratch);
+                    String expected = storedFields.document(j).get("storedNum");
+                    assertEquals(j, numerics.nextDoc());
+                    assertEquals(Long.parseLong(expected), numerics.longValue());
+                  }
+                }
+                TestUtil.checkReader(ir);
+              } catch (Exception e) {
+                throw new RuntimeException(e);
+              }
+            }
+          };
+      threads[i].start();
+    }
+    startingGun.countDown();
+    for (Thread t : threads) {
+      t.join();
+    }
+    ir.close();
+    dir.close();
+  }
+
+  /** Tests dv against stored fields with threads (all types + missing) */
+  @Nightly
+  public void testThreads2() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+    Field idField = new StringField("id", "", Field.Store.NO);
+    Field storedBinField = new StoredField("storedBin", new byte[0]);
+    Field dvBinField = new BinaryDocValuesField("dvBin", newBytesRef());
+    Field dvSortedField = new SortedDocValuesField("dvSorted", newBytesRef());
+    Field storedNumericField = new StoredField("storedNum", "");
+    Field dvNumericField = new NumericDocValuesField("dvNum", 0);
+
+    // index some docs
+    int numDocs = TestUtil.nextInt(random(), 1025, 2047);
+    for (int i = 0; i < numDocs; i++) {
+      idField.setStringValue(Integer.toString(i));
+      int length = TestUtil.nextInt(random(), 0, 8);
+      byte[] buffer = new byte[length];
+      random().nextBytes(buffer);
+      storedBinField.setBytesValue(buffer);
+      dvBinField.setBytesValue(buffer);
+      dvSortedField.setBytesValue(buffer);
+      long numericValue = random().nextLong();
+      storedNumericField.setStringValue(Long.toString(numericValue));
+      dvNumericField.setLongValue(numericValue);
+      Document doc = new Document();
+      doc.add(idField);
+      if (random().nextInt(4) > 0) {
+        doc.add(storedBinField);
+        doc.add(dvBinField);
+        doc.add(dvSortedField);
+      }
+      if (random().nextInt(4) > 0) {
+        doc.add(storedNumericField);
+        doc.add(dvNumericField);
+      }
+      int numSortedSetFields = random().nextInt(3);
+      Set<String> values = new TreeSet<>();
+      for (int j = 0; j < numSortedSetFields; j++) {
+        values.add(TestUtil.randomSimpleString(random()));
+      }
+      for (String v : values) {
+        doc.add(new SortedSetDocValuesField("dvSortedSet", newBytesRef(v)));
+        doc.add(new StoredField("storedSortedSet", v));
+      }
+      int numSortedNumericFields = random().nextInt(3);
+      Set<Long> numValues = new TreeSet<>();
+      for (int j = 0; j < numSortedNumericFields; j++) {
+        numValues.add(TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE));
+      }
+      for (Long l : numValues) {
+        doc.add(new SortedNumericDocValuesField("dvSortedNumeric", l));
+        doc.add(new StoredField("storedSortedNumeric", Long.toString(l)));
+      }
+      writer.addDocument(doc);
+      if (random().nextInt(31) == 0) {
+        writer.commit();
+      }
+    }
+
+    // delete some docs
+    int numDeletions = random().nextInt(numDocs / 10);
+    for (int i = 0; i < numDeletions; i++) {
+      int id = random().nextInt(numDocs);
+      writer.deleteDocuments(new Term("id", Integer.toString(id)));
+    }
+    writer.close();
+
+    // compare
+    final DirectoryReader ir = maybeWrapWithMergingReader(DirectoryReader.open(dir));
+    int numThreads = TestUtil.nextInt(random(), 2, 7);
+    Thread[] threads = new Thread[numThreads];
+    final CountDownLatch startingGun = new CountDownLatch(1);
+
+    for (int i = 0; i < threads.length; i++) {
+      threads[i] =
+          new Thread() {
+            @Override
+            public void run() {
+              try {
+                startingGun.await();
+                for (LeafReaderContext context : ir.leaves()) {
+                  LeafReader r = context.reader();
+                  StoredFields storedFields = r.storedFields();
+                  BinaryDocValues binaries = r.getBinaryDocValues("dvBin");
+                  SortedDocValues sorted = r.getSortedDocValues("dvSorted");
+                  NumericDocValues numerics = r.getNumericDocValues("dvNum");
+                  SortedSetDocValues sortedSet = r.getSortedSetDocValues("dvSortedSet");
+                  SortedNumericDocValues sortedNumeric =
+                      r.getSortedNumericDocValues("dvSortedNumeric");
+                  for (int j = 0; j < r.maxDoc(); j++) {
+                    BytesRef binaryValue = storedFields.document(j).getBinaryValue("storedBin");
+                    if (binaryValue != null) {
+                      if (binaries != null) {
+                        assertEquals(j, binaries.nextDoc());
+                        BytesRef scratch = binaries.binaryValue();
+                        assertEquals(binaryValue, scratch);
+                        assertEquals(j, sorted.nextDoc());
+                        scratch = sorted.lookupOrd(sorted.ordValue());
+                        assertEquals(binaryValue, scratch);
+                      }
+                    }
+
+                    String number = storedFields.document(j).get("storedNum");
+                    if (number != null) {
+                      if (numerics != null) {
+                        assertEquals(j, numerics.advance(j));
+                        assertEquals(Long.parseLong(number), numerics.longValue());
+                      }
+                    }
+
+                    String[] values = storedFields.document(j).getValues("storedSortedSet");
+                    if (values.length > 0) {
+                      assertNotNull(sortedSet);
+                      assertEquals(j, sortedSet.nextDoc());
+                      assertEquals(values.length, sortedSet.docValueCount());
+                      for (String s : values) {
+                        long ord = sortedSet.nextOrd();
+                        BytesRef value = sortedSet.lookupOrd(ord);
+                        assertEquals(s, value.utf8ToString());
+                      }
+                    }
+
+                    String[] numValues = storedFields.document(j).getValues("storedSortedNumeric");
+                    if (numValues.length > 0) {
+                      assertNotNull(sortedNumeric);
+                      assertEquals(j, sortedNumeric.nextDoc());
+                      assertEquals(numValues.length, sortedNumeric.docValueCount());
+                      for (String numValue : numValues) {
+                        long v = sortedNumeric.nextValue();
+                        assertEquals(numValue, Long.toString(v));
+                      }
+                    }
+                  }
+                }
+                TestUtil.checkReader(ir);
+              } catch (Exception e) {
+                throw new RuntimeException(e);
+              }
+            }
+          };
+      threads[i].start();
+    }
+    startingGun.countDown();
+    for (Thread t : threads) {
+      t.join();
+    }
+    ir.close();
+    dir.close();
+  }
+
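+  /** Runs CheckIndex's doc values checks over the same reader from multiple threads. */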
+  @Nightly
+  public void testThreads3() throws Exception {
+    Directory dir = newFSDirectory(createTempDir());
+    IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
+
+    int numSortedSets = random().nextInt(21);
+    int numBinaries = random().nextInt(21);
+    int numSortedNums = random().nextInt(21);
+
+    int numDocs = TestUtil.nextInt(random(), 2025, 2047);
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+
+      for (int j = 0; j < numSortedSets; j++) {
+        doc.add(
+            new SortedSetDocValuesField(
+                "ss" + j, newBytesRef(TestUtil.randomSimpleString(random()))));
+        doc.add(
+            new SortedSetDocValuesField(
+                "ss" + j, newBytesRef(TestUtil.randomSimpleString(random()))));
+      }
+
+      for (int j = 0; j < numBinaries; j++) {
+        doc.add(
+            new BinaryDocValuesField("b" + j, newBytesRef(TestUtil.randomSimpleString(random()))));
+      }
+
+      for (int j = 0; j < numSortedNums; j++) {
+        doc.add(
+            new SortedNumericDocValuesField(
+                "sn" + j, TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE)));
+        doc.add(
+            new SortedNumericDocValuesField(
+                "sn" + j, TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE)));
+      }
+      writer.addDocument(doc);
+    }
+    writer.close();
+
+    // now check with threads
+    for (int i = 0; i < 10; i++) {
+      final DirectoryReader r = maybeWrapWithMergingReader(DirectoryReader.open(dir));
+      final CountDownLatch startingGun = new CountDownLatch(1);
+      Thread[] threads = new Thread[TestUtil.nextInt(random(), 4, 10)];
+      for (int tid = 0; tid < threads.length; tid++) {
+        threads[tid] =
+            new Thread() {
+              @Override
+              public void run() {
+                try {
+                  ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
+                  PrintStream infoStream = new PrintStream(bos, false, UTF_8);
+                  startingGun.await();
+                  for (LeafReaderContext leaf : r.leaves()) {
+                    DocValuesStatus status =
+                        CheckIndex.testDocValues((CodecReader) leaf.reader(), infoStream, true);
+                    if (status.error != null) {
+                      throw status.error;
+                    }
+                  }
+                } catch (Throwable e) {
+                  throw new RuntimeException(e);
+                }
+              }
+            };
+      }
+      for (Thread thread : threads) {
+        thread.start();
+      }
+      startingGun.countDown();
+      for (Thread thread : threads) {
+        thread.join();
+      }
+      r.close();
+    }
+
+    dir.close();
+  }
+
+  // LUCENE-5218
+  public void testEmptyBinaryValueOnPageSizes() throws Exception {
+    // Test larger and larger power-of-two sized values,
+    // followed by empty string value:
+    for (int i = 0; i < 20; i++) {
+      if (i > 14 && codecAcceptsHugeBinaryValues("field") == false) {
+        break;
+      }
+      Directory dir = newDirectory();
+      RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+      BytesRef bytes = newBytesRef(new byte[1 << i], 0, 1 << i);
+      for (int j = 0; j < 4; j++) {
+        Document doc = new Document();
+        doc.add(new BinaryDocValuesField("field", bytes));
+        w.addDocument(doc);
+      }
+      Document doc = new Document();
+      doc.add(new StoredField("id", "5"));
+      doc.add(new BinaryDocValuesField("field", newBytesRef()));
+      w.addDocument(doc);
+      IndexReader r = w.getReader();
+      w.close();
+
+      BinaryDocValues values = MultiDocValues.getBinaryValues(r, "field");
+      for (int j = 0; j < 5; j++) {
+        assertEquals(j, values.nextDoc());
+        BytesRef result = values.binaryValue();
+        assertTrue(result.length == 0 || result.length == 1 << i);
+      }
+      r.close();
+      dir.close();
+    }
+  }
+
+  public void testOneSortedNumber() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", 5));
+    writer.addDocument(doc);
+    writer.close();
+
+    // Now search the index:
+    IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assert reader.leaves().size() == 1;
+    SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(1, dv.docValueCount());
+    assertEquals(5, dv.nextValue());
+
+    reader.close();
+    directory.close();
+  }
+
+  public void testOneSortedNumberOneMissing() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(null));
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", 5));
+    writer.addDocument(doc);
+    writer.addDocument(new Document());
+    writer.close();
+
+    // Now search the index:
+    IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assert reader.leaves().size() == 1;
+    SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(1, dv.docValueCount());
+    assertEquals(5, dv.nextValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    reader.close();
+    directory.close();
+  }
+
+  public void testNumberMergeAwayAllValues() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "0", Field.Store.NO));
+    iwriter.addDocument(doc);
+    doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new NumericDocValuesField("field", 5));
+    iwriter.addDocument(doc);
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    NumericDocValues dv = getOnlyLeafReader(ireader).getNumericDocValues("field");
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    ireader.close();
+    directory.close();
+  }
+
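+  // SORTED_NUMERIC returns a document's values in ascending order and keeps duplicates.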
+  public void testTwoSortedNumber() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", 11));
+    doc.add(new SortedNumericDocValuesField("dv", -5));
+    writer.addDocument(doc);
+    writer.close();
+
+    // Now search the index:
+    IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assert reader.leaves().size() == 1;
+    SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(2, dv.docValueCount());
+    assertEquals(-5, dv.nextValue());
+    assertEquals(11, dv.nextValue());
+
+    reader.close();
+    directory.close();
+  }
+
+  public void testTwoSortedNumberSameValue() throws IOException {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", 11));
+    doc.add(new SortedNumericDocValuesField("dv", 11));
+    writer.addDocument(doc);
+    writer.close();
+
+    // Now search the index:
+    IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assert reader.leaves().size() == 1;
+    SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(2, dv.docValueCount());
+    assertEquals(11, dv.nextValue());
+    assertEquals(11, dv.nextValue());
+
+    reader.close();
+    directory.close();
+  }
+
+  public void testTwoSortedNumberOneMissing() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(null));
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", 11));
+    doc.add(new SortedNumericDocValuesField("dv", -5));
+    writer.addDocument(doc);
+    writer.addDocument(new Document());
+    writer.close();
+
+    // Now search the index:
+    IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assert reader.leaves().size() == 1;
+    SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(2, dv.docValueCount());
+    assertEquals(-5, dv.nextValue());
+    assertEquals(11, dv.nextValue());
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    reader.close();
+    directory.close();
+  }
+
+  public void testSortedNumberMerge() throws IOException {
+    Directory directory = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(null);
+    iwc.setMergePolicy(newLogMergePolicy());
+    IndexWriter writer = new IndexWriter(directory, iwc);
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", 11));
+    writer.addDocument(doc);
+    writer.commit();
+    doc = new Document();
+    doc.add(new SortedNumericDocValuesField("dv", -5));
+    writer.addDocument(doc);
+    writer.forceMerge(1);
+    writer.close();
+
+    // Now search the index:
+    IndexReader reader = maybeWrapWithMergingReader(DirectoryReader.open(directory));
+    assert reader.leaves().size() == 1;
+    SortedNumericDocValues dv = reader.leaves().get(0).reader().getSortedNumericDocValues("dv");
+    assertEquals(0, dv.nextDoc());
+    assertEquals(1, dv.docValueCount());
+    assertEquals(11, dv.nextValue());
+    assertEquals(1, dv.nextDoc());
+    assertEquals(1, dv.docValueCount());
+    assertEquals(-5, dv.nextValue());
+
+    reader.close();
+    directory.close();
+  }
+
"1", Field.Store.NO)); + doc.add(new SortedNumericDocValuesField("field", 5)); + iwriter.addDocument(doc); + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedNumericDocValues dv = getOnlyLeafReader(ireader).getSortedNumericDocValues("field"); + assertEquals(NO_MORE_DOCS, dv.nextDoc()); + + ireader.close(); + directory.close(); + } + + public void testSortedEnumAdvanceIndependently() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + SortedDocValuesField field = new SortedDocValuesField("field", newBytesRef("2")); + doc.add(field); + iwriter.addDocument(doc); + field.setBytesValue(newBytesRef("1")); + iwriter.addDocument(doc); + field.setBytesValue(newBytesRef("3")); + iwriter.addDocument(doc); + + iwriter.commit(); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field"); + doTestSortedSetEnumAdvanceIndependently(DocValues.singleton(dv)); + + ireader.close(); + directory.close(); + } + + public void testSortedSetEnumAdvanceIndependently() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + SortedSetDocValuesField field1 = new SortedSetDocValuesField("field", newBytesRef("2")); + SortedSetDocValuesField field2 = new SortedSetDocValuesField("field", newBytesRef("3")); + doc.add(field1); + doc.add(field2); + iwriter.addDocument(doc); + field1.setBytesValue(newBytesRef("1")); + iwriter.addDocument(doc); + field2.setBytesValue(newBytesRef("2")); + iwriter.addDocument(doc); + + iwriter.commit(); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field"); + doTestSortedSetEnumAdvanceIndependently(dv); + + ireader.close(); + directory.close(); + } + + protected void doTestSortedSetEnumAdvanceIndependently(SortedSetDocValues dv) throws IOException { + if (dv.getValueCount() < 2) { + return; + } + List terms = new ArrayList<>(); + TermsEnum te = dv.termsEnum(); + terms.add(BytesRef.deepCopyOf(te.next())); + terms.add(BytesRef.deepCopyOf(te.next())); + + // Make sure that calls to next() does not modify the term of the other enum + TermsEnum enum1 = dv.termsEnum(); + TermsEnum enum2 = dv.termsEnum(); + BytesRefBuilder term1 = new BytesRefBuilder(); + BytesRefBuilder term2 = new BytesRefBuilder(); + + term1.copyBytes(enum1.next()); + term2.copyBytes(enum2.next()); + term1.copyBytes(enum1.next()); + + assertEquals(term1.get(), enum1.term()); + assertEquals(term2.get(), enum2.term()); + + // Same for seekCeil + enum1 = dv.termsEnum(); + enum2 = dv.termsEnum(); + term1 = new BytesRefBuilder(); + term2 = new BytesRefBuilder(); + + term2.copyBytes(enum2.next()); + BytesRefBuilder seekTerm = new BytesRefBuilder(); + seekTerm.append(terms.get(0)); + seekTerm.append((byte) 0); + 
+    enum1.seekCeil(seekTerm.get());
+    term1.copyBytes(enum1.term());
+
+    assertEquals(term1.get(), enum1.term());
+    assertEquals(term2.get(), enum2.term());
+
+    // Same for seekCeil on an exact value
+    enum1 = dv.termsEnum();
+    enum2 = dv.termsEnum();
+    term1 = new BytesRefBuilder();
+    term2 = new BytesRefBuilder();
+
+    term2.copyBytes(enum2.next());
+    enum1.seekCeil(terms.get(1));
+    term1.copyBytes(enum1.term());
+
+    assertEquals(term1.get(), enum1.term());
+    assertEquals(term2.get(), enum2.term());
+
+    // Same for seekExact
+    enum1 = dv.termsEnum();
+    enum2 = dv.termsEnum();
+    term1 = new BytesRefBuilder();
+    term2 = new BytesRefBuilder();
+
+    term2.copyBytes(enum2.next());
+    final boolean found = enum1.seekExact(terms.get(1));
+    assertTrue(found);
+    term1.copyBytes(enum1.term());
+
+    // Same for seek by ord
+    enum1 = dv.termsEnum();
+    enum2 = dv.termsEnum();
+    term1 = new BytesRefBuilder();
+    term2 = new BytesRefBuilder();
+
+    term2.copyBytes(enum2.next());
+    enum1.seekExact(1);
+    term1.copyBytes(enum1.term());
+
+    assertEquals(term1.get(), enum1.term());
+    assertEquals(term2.get(), enum2.term());
+  }
+
+  // same as testSortedMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testSortedMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedDocValues dv = getOnlyLeafReader(ireader).getSortedDocValues("field");
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    TermsEnum termsEnum = dv.termsEnum();
+    assertFalse(termsEnum.seekExact(new BytesRef("lucene")));
+    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene")));
+    assertEquals(-1, dv.lookupTerm(new BytesRef("lucene")));
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testSortedSetMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testSortedSetMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedSetDocValuesField("field", newBytesRef("hello")));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedSetDocValues dv = getOnlyLeafReader(ireader).getSortedSetDocValues("field");
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    TermsEnum termsEnum = dv.termsEnum();
+    assertFalse(termsEnum.seekExact(new BytesRef("lucene")));
+    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("lucene")));
+    assertEquals(-1, dv.lookupTerm(new BytesRef("lucene")));
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding on
+  public void testNumericMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new NumericDocValuesField("field", 42L));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    NumericDocValues dv = getOnlyLeafReader(ireader).getNumericDocValues("field");
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    ireader.close();
+    directory.close();
+  }
+
+  // same as testSortedNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding
+  // on
+  public void testSortedNumericMergeAwayAllValuesLargeSegment() throws IOException {
+    Directory directory = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random());
+    IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer);
+    iwconfig.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig);
+
+    Document doc = new Document();
+    doc.add(new StringField("id", "1", Field.Store.NO));
+    doc.add(new SortedNumericDocValuesField("field", 42L));
+    iwriter.addDocument(doc);
+    final int numEmptyDocs = atLeast(1024);
+    for (int i = 0; i < numEmptyDocs; ++i) {
+      iwriter.addDocument(new Document());
+    }
+    iwriter.commit();
+    iwriter.deleteDocuments(new Term("id", "1"));
+    iwriter.forceMerge(1);
+
+    DirectoryReader ireader = iwriter.getReader();
+    iwriter.close();
+
+    SortedNumericDocValues dv = getOnlyLeafReader(ireader).getSortedNumericDocValues("field");
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    ireader.close();
+    directory.close();
+  }
+
+    assertEquals(NO_MORE_DOCS, dv.nextDoc());
+
+    ireader.close();
+    directory.close();
+  }
+
+  public void testRandomAdvanceNumeric() throws IOException {
+    final long longRange;
+    if (random().nextBoolean()) {
+      longRange = TestUtil.nextInt(random(), 1, 1024);
+    } else {
+      longRange = TestUtil.nextLong(random(), 1, Long.MAX_VALUE);
+    }
+    doTestRandomAdvance(
+        new FieldCreator() {
+          @Override
+          public Field next() {
+            return new NumericDocValuesField("field", TestUtil.nextLong(random(), 0, longRange));
+          }
+
+          @Override
+          public DocIdSetIterator iterator(IndexReader r) throws IOException {
+            return MultiDocValues.getNumericValues(r, "field");
+          }
+        });
+  }
+
+  public void testRandomAdvanceBinary() throws IOException {
+    doTestRandomAdvance(
+        new FieldCreator() {
+          @Override
+          public Field next() {
+            byte[] bytes = new byte[random().nextInt(10)];
+            random().nextBytes(bytes);
+            return new BinaryDocValuesField("field", newBytesRef(bytes));
+          }
+
+          @Override
+          public DocIdSetIterator iterator(IndexReader r) throws IOException {
+            return MultiDocValues.getBinaryValues(r, "field");
+          }
+        });
+  }
+
+  /**
+   * Tests where a DVField uses a high number of packed bits to store its ords. See:
+   * https://issues.apache.org/jira/browse/LUCENE-10159
+   */
+  @Nightly
+  public void testHighOrdsSortedSetDV() throws Exception {
+    assumeFalse(
+        "This test with SimpleTextCodec requires a lot of memory",
+        getCodec() instanceof SimpleTextCodec);
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setRAMBufferSizeMB(8 + random().nextInt(64));
+    IndexWriter writer = new IndexWriter(dir, iwc);
+    // many docs, some of which have very high ords
+    int numDocs = 20_000 + random().nextInt(10_000);
+    for (int i = 1; i < numDocs; i++) {
+      final int numOrds;
+      if (random().nextInt(100) <= 5) {
+        numOrds = 1000 + random().nextInt(500);
+      } else {
+        numOrds = random().nextInt(10);
+      }
+      Document doc = new Document();
+      for (int ord = 0; ord < numOrds; ord++) {
+        doc.add(
+            new SortedSetDocValuesField("sorted_set_dv", TestUtil.randomBinaryTerm(random(), 2)));
+      }
+      writer.addDocument(doc);
+    }
+    writer.forceMerge(1, true);
+    try (DirectoryReader reader = DirectoryReader.open(writer)) {
+      TestUtil.checkReader(reader);
+    }
+    IOUtils.close(writer, dir);
+  }
+
+  private interface FieldCreator {
+    Field next();
+
+    DocIdSetIterator iterator(IndexReader r) throws IOException;
+  }
+
+  private void doTestRandomAdvance(FieldCreator fieldCreator) throws IOException {
+
+    Analyzer analyzer = new MockAnalyzer(random());
+
+    Directory directory = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(analyzer);
+    conf.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter w = new RandomIndexWriter(random(), directory, conf);
+    int numChunks = atLeast(10);
+    int id = 0;
+    Set<Integer> missingSet = new HashSet<>();
+    for (int i = 0; i < numChunks; i++) {
+      // change sparseness for each chunk
+      double sparseChance = random().nextDouble();
+      int docCount = atLeast(1000);
+      for (int j = 0; j < docCount; j++) {
+        Document doc = new Document();
+        doc.add(new StoredField("id", id));
+        if (random().nextDouble() > sparseChance) {
+          doc.add(fieldCreator.next());
+        } else {
+          missingSet.add(id);
+        }
+        id++;
+        w.addDocument(doc);
+      }
+    }
+
+    if (random().nextBoolean()) {
+      w.forceMerge(1);
+    }
+
+    // Now search the index:
+    IndexReader r = w.getReader();
+    StoredFields storedFields = r.storedFields();
+    BitSet missing = new FixedBitSet(r.maxDoc());
+    for (int docID = 0; docID < r.maxDoc(); docID++) {
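+      // rebuild the expected set of value-less documents from the stored ids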
+      Document doc = storedFields.document(docID);
+      if (missingSet.contains(doc.getField("id").numericValue())) {
+        missing.set(docID);
+      }
+    }
+
+    int numIters = atLeast(10);
+    for (int iter = 0; iter < numIters; iter++) {
+      DocIdSetIterator values = fieldCreator.iterator(r);
+      assertEquals(-1, values.docID());
+
+      while (true) {
+        int docID;
+        if (random().nextBoolean()) {
+          docID = values.nextDoc();
+        } else {
+          int range;
+          if (random().nextInt(10) == 7) {
+            // big jump
+            range = r.maxDoc() - values.docID();
+          } else {
+            // small jump
+            range = 25;
+          }
+          int inc = TestUtil.nextInt(random(), 1, range);
+          docID = values.advance(values.docID() + inc);
+        }
+        if (docID == NO_MORE_DOCS) {
+          break;
+        }
+        assertFalse(missing.get(docID));
+      }
+    }
+
+    IOUtils.close(r, w, directory);
+  }
+
+  protected boolean codecAcceptsHugeBinaryValues(String field) {
+    return true;
+  }
+}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MergeReaderWrapper.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MergeReaderWrapper.java
index cc3084fec07..3fee110f783 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MergeReaderWrapper.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MergeReaderWrapper.java
@@ -26,6 +26,7 @@ import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.ByteVectorValues;
 import org.apache.lucene.index.CodecReader;
+import org.apache.lucene.index.DocValuesSkipper;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
@@ -187,6 +188,17 @@ class MergeReaderWrapper extends LeafReader {
     return norms.getNorms(fi);
   }
 
+  @Override
+  public DocValuesSkipper getDocValuesSkipper(String field) throws IOException {
+    ensureOpen();
+    FieldInfo fi = getFieldInfos().fieldInfo(field);
+    if (fi == null) {
+      // Field does not exist
+      return null;
+    }
+    return docValues.getSkipper(fi);
+  }
+
   @Override
   public FieldInfos getFieldInfos() {
     return in.getFieldInfos();
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java
index e18c25ee0cf..ab907b76802 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/MismatchedLeafReader.java
@@ -103,6 +103,7 @@ public class MismatchedLeafReader extends FilterLeafReader {
         oldInfo.hasPayloads(), // storePayloads
         oldInfo.getIndexOptions(), // indexOptions
         oldInfo.getDocValuesType(), // docValuesType
+        oldInfo.hasDocValuesSkipIndex(), // hasDocValuesSkipIndex
         oldInfo.getDocValuesGen(), // dvGen
         oldInfo.attributes(), // attributes
         oldInfo.getPointDimensionCount(), // data dimension count
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java
index f8fe88b5cf2..5dea79f203a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/RandomPostingsTester.java
@@ -157,6 +157,7 @@ public class RandomPostingsTester {
             true,
             IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
             DocValuesType.NONE,
+            false,
             -1,
             new HashMap<>(),
             0,
@@ -731,6 +732,7 @@ public class RandomPostingsTester {
             doPayloads,
             indexOptions,
             DocValuesType.NONE,
+            false,
             -1,
             new HashMap<>(),
             0,
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java
index 0df805b01e4..efd13121d93 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/QueryUtils.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Random;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.DocValuesSkipper;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.IndexReader;
@@ -227,6 +228,11 @@ public class QueryUtils {
         return null;
       }
 
+      @Override
+      public DocValuesSkipper getDocValuesSkipper(String field) throws IOException {
+        return null;
+      }
+
       @Override
       public FloatVectorValues getFloatVectorValues(String field) throws IOException {
         return null;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java
index f4795771c9b..c649fd18fa5 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java
@@ -102,6 +102,7 @@ import junit.framework.AssertionFailedError;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.bitvectors.HnswBitVectorsFormat;
+import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Store;
@@ -3223,11 +3224,16 @@ public abstract class LuceneTestCase extends Assert {
     return true;
   }
 
+  private static boolean supportsVectorSearch(KnnVectorsFormat format) {
+    return (format instanceof FlatVectorsFormat) == false;
+  }
+
   protected static KnnVectorsFormat randomVectorFormat(VectorEncoding vectorEncoding) {
     List<KnnVectorsFormat> availableFormats =
         KnnVectorsFormat.availableKnnVectorsFormats().stream()
             .map(KnnVectorsFormat::forName)
             .filter(format -> supportsVectorEncoding(format, vectorEncoding))
+            .filter(format -> supportsVectorSearch(format))
             .toList();
     return RandomPicks.randomFrom(random(), availableFormats);
   }
diff --git a/settings.gradle b/settings.gradle
index acb67deb2d6..f4ee13243ca 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -20,19 +20,31 @@ pluginManagement {
     mavenCentral()
     gradlePluginPortal()
   }
+
+  includeBuild("build-tools/build-infra")
}
 
 plugins {
-  id("org.gradle.toolchains.foojay-resolver-convention") version "0.7.0"
+  id "org.gradle.toolchains.foojay-resolver-convention" version "0.8.0"
   id 'com.gradle.enterprise' version '3.15.1'
   id 'com.gradle.common-custom-user-data-gradle-plugin' version '1.11.3'
 }
 
-apply from: file('gradle/ge.gradle')
+dependencyResolutionManagement {
+  versionCatalogs {
+    deps {
+      from(files('versions.toml'))
+    }
+  }
+}
+
+if (Boolean.parseBoolean(providers.gradleProperty("gradle.ge").orElse("true").get())) {
+  apply from: file('gradle/ge.gradle')
+}
 
 rootProject.name = "lucene-root"
 
-includeBuild("dev-tools/missing-doclet")
+includeBuild("build-tools/missing-doclet")
 
 include "lucene:analysis:common"
 include "lucene:analysis:icu"
diff --git a/versions.lock b/versions.lock
index b11bf043dad..26de44f99e2 100644
--- a/versions.lock
+++ b/versions.lock
@@ -1,29 +1,935 @@
-# Run ./gradlew --write-locks to regenerate this file
-com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1 (1 constraints: 0d050e36)
-com.ibm.icu:icu4j:74.2 (1 constraints: e1041731)
-commons-codec:commons-codec:1.13 (1 constraints: d904f430)
-io.sgr:s2-geometry-library-java:1.0.0 (1 constraints: 0305f035)
-junit:junit:4.13.1 (1 constraints: 3b05453b)
-net.sf.jopt-simple:jopt-simple:5.0.4 (1 constraints: be0ad6cc)
-net.sourceforge.nekohtml:nekohtml:1.9.17 (1 constraints: 4405503b)
-org.antlr:antlr4-runtime:4.11.1 (1 constraints: 39053f3b)
-org.apache.commons:commons-compress:1.19 (1 constraints: df04fa30)
-org.apache.commons:commons-math3:3.6.1 (1 constraints: bf0adbcc)
-org.apache.opennlp:opennlp-tools:2.3.2 (1 constraints: 09050036)
-org.carrot2:morfologik-fsa:2.1.9 (1 constraints: db0d9c36)
-org.carrot2:morfologik-polish:2.1.9 (1 constraints: 0e050136)
-org.carrot2:morfologik-stemming:2.1.9 (2 constraints: 1312040d)
-org.hamcrest:hamcrest:2.2 (1 constraints: a8041f2c)
-org.locationtech.spatial4j:spatial4j:0.8 (1 constraints: ac041f2c)
-org.openjdk.jmh:jmh-core:1.37 (1 constraints: df04fc30)
-org.ow2.asm:asm:9.6 (3 constraints: 3917ef6d)
-org.ow2.asm:asm-commons:9.6 (1 constraints: b304382c)
-org.ow2.asm:asm-tree:9.6 (1 constraints: ea09e3a5)
-org.slf4j:slf4j-api:1.7.36 (1 constraints: 6f0ed053)
-ua.net.nlp:morfologik-ukrainian-search:4.9.1 (1 constraints: 10051b36)
-xerces:xercesImpl:2.12.0 (1 constraints: 3705353b)
-
-[Test dependencies]
-com.carrotsearch:procfork:1.0.6 (1 constraints: 0905f635)
-org.assertj:assertj-core:3.21.0 (1 constraints: 38053c3b)
-org.locationtech.jts:jts-core:1.17.0 (1 constraints: 3b053e3b)
+{
+  "comment" : "An inventory of resolved dependency versions.
Do not edit this file directly.", + "configurationGroups" : { + "main_dependencies" : { + "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", + "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", + "commons-codec:commons-codec:1.13" : "e9962aab,refs=4", + "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", + "junit:junit:4.13.1" : "fa9ef26b,refs=4", + "net.sf.jopt-simple:jopt-simple:5.0.4" : "85a1e4c6,refs=2", + "net.sourceforge.nekohtml:nekohtml:1.9.17" : "5ce8cdc6,refs=2", + "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", + "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", + "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", + "org.apache.opennlp:opennlp-tools:2.3.2" : "2f760bab,refs=4", + "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", + "org.carrot2:morfologik-polish:2.1.9" : "fe494320,refs=3", + "org.carrot2:morfologik-stemming:2.1.9" : "79af844b,refs=4", + "org.hamcrest:hamcrest:2.2" : "fa9ef26b,refs=4", + "org.locationtech.spatial4j:spatial4j:0.8" : "cbc357ab,refs=4", + "org.openjdk.jmh:jmh-core:1.37" : "85a1e4c6,refs=2", + "org.ow2.asm:asm:9.6" : "d9953130,refs=4", + "org.ow2.asm:asm-commons:9.6" : "d9953130,refs=4", + "org.ow2.asm:asm-tree:9.6" : "d9953130,refs=4", + "org.slf4j:slf4j-api:1.7.36" : "2f760bab,refs=4", + "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "fe494320,refs=3", + "xerces:xercesImpl:2.12.0" : "5ce8cdc6,refs=2" + }, + "test_dependencies" : { + "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "b35e5d7a,refs=74", + "com.carrotsearch:procfork:1.0.6" : "b7ba1646,refs=2", + "com.github.ben-manes.caffeine:caffeine:3.0.5" : "6897bc09,refs=38", + "com.github.kevinstern:software-and-algorithms:1.0" : "6897bc09,refs=38", + "com.google.auto.service:auto-service-annotations:1.0.1" : "6897bc09,refs=38", + "com.google.auto.value:auto-value-annotations:1.9" : "6897bc09,refs=38", + "com.google.auto:auto-common:1.2.1" : "6897bc09,refs=38", + "com.google.code.findbugs:jsr305:3.0.2" : "6897bc09,refs=38", + "com.google.errorprone:error_prone_annotation:2.18.0" : "6897bc09,refs=38", + "com.google.errorprone:error_prone_annotations:2.18.0" : "6897bc09,refs=38", + "com.google.errorprone:error_prone_check_api:2.18.0" : "6897bc09,refs=38", + "com.google.errorprone:error_prone_core:2.18.0" : "6897bc09,refs=38", + "com.google.errorprone:error_prone_type_annotations:2.18.0" : "6897bc09,refs=38", + "com.google.guava:failureaccess:1.0.1" : "6897bc09,refs=38", + "com.google.guava:guava:31.0.1-jre" : "6897bc09,refs=38", + "com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava" : "6897bc09,refs=38", + "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", + "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", + "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", + "commons-codec:commons-codec:1.13" : "733734f0,refs=6", + "io.github.java-diff-utils:java-diff-utils:4.0" : "6897bc09,refs=38", + "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", + "javax.inject:javax.inject:1" : "6897bc09,refs=38", + "junit:junit:4.13.1" : "b35e5d7a,refs=74", + "net.sf.jopt-simple:jopt-simple:5.0.4" : "152d9f78,refs=3", + "net.sourceforge.nekohtml:nekohtml:1.9.17" : "6f16ff86,refs=2", + "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", + "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", + "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", + "org.apache.opennlp:opennlp-tools:2.3.2" : "b91715f0,refs=6", + "org.assertj:assertj-core:3.21.0" 
: "b7ba1646,refs=2", + "org.carrot2:morfologik-fsa:2.1.9" : "e077a675,refs=8", + "org.carrot2:morfologik-polish:2.1.9" : "cb00cecf,refs=5", + "org.carrot2:morfologik-stemming:2.1.9" : "e077a675,refs=8", + "org.checkerframework:checker-qual:3.19.0" : "6897bc09,refs=38", + "org.checkerframework:dataflow-errorprone:3.27.0" : "6897bc09,refs=38", + "org.eclipse.jgit:org.eclipse.jgit:4.4.1.201607150455-r" : "6897bc09,refs=38", + "org.hamcrest:hamcrest:2.2" : "b35e5d7a,refs=74", + "org.locationtech.jts:jts-core:1.17.0" : "180518e6,refs=2", + "org.locationtech.spatial4j:spatial4j:0.8" : "1d5a4b2b,refs=4", + "org.openjdk.jmh:jmh-core:1.37" : "152d9f78,refs=3", + "org.openjdk.jmh:jmh-generator-annprocess:1.37" : "ecaf1d73,refs=1", + "org.ow2.asm:asm:9.6" : "6fbc4021,refs=5", + "org.ow2.asm:asm-commons:9.6" : "6fbc4021,refs=5", + "org.ow2.asm:asm-tree:9.6" : "6fbc4021,refs=5", + "org.pcollections:pcollections:3.1.4" : "6897bc09,refs=38", + "org.slf4j:slf4j-api:1.7.36" : "b91715f0,refs=6", + "ua.net.nlp:morfologik-ukrainian-search:4.9.1" : "cb00cecf,refs=5", + "xerces:xercesImpl:2.12.0" : "6f16ff86,refs=2" + } + }, + "because" : { + "152d9f78" : [ + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + } + ], + "180518e6" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:spatial-extras" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:spatial-extras" + } + ], + "1d5a4b2b" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:spatial-extras" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:spatial-extras" + } + ], + "2f760bab" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], + "47ea4550" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:icu" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:icu" + } + ], + "5ce8cdc6" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + } + ], + "6897bc09" : [ + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:backward-codecs" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + 
"configuration" : "annotationProcessor", + "projectPath" : ":lucene:classification" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:codecs" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:core" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:core.tests" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:distribution.tests" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:expressions" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:facet" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:grouping" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:join" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:misc" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:queries" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:replicator" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:spatial-extras" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:spatial-test-fixtures" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:spatial3d" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:suggest" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:test-framework" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:common" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:icu" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:kuromoji" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:morfologik.tests" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:nori" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:phonetic" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:smartcn" + }, + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:analysis:stempel" + } + ], + "6f16ff86" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + } + ], + "6fbc4021" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + 
"configuration" : "testCompileClasspath", + "projectPath" : ":lucene:expressions" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:expressions" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queries" + } + ], + "733734f0" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:phonetic" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:phonetic" + } + ], + "79af844b" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:morfologik" + } + ], + "85a1e4c6" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + } + ], + "b35e5d7a" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:backward-codecs" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:backward-codecs" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:classification" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:classification" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:codecs" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:codecs" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:core" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:core" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:core.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:core.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:distribution.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:distribution.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:expressions" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:expressions" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:facet" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:facet" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : 
":lucene:grouping" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:grouping" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:join" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:join" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:misc" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:misc" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:queries" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queries" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:replicator" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:replicator" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:spatial-extras" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:spatial-extras" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:spatial-test-fixtures" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:spatial-test-fixtures" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:spatial3d" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:spatial3d" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:suggest" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:suggest" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:test-framework" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:test-framework" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:common" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:common" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:icu" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:icu" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:kuromoji" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:kuromoji" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : 
"testRuntimeClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:morfologik.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:morfologik.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:nori" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:nori" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:phonetic" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:phonetic" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:smartcn" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:smartcn" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:stempel" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:stempel" + } + ], + "b7ba1646" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:distribution.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:distribution.tests" + } + ], + "b91715f0" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:opennlp" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:opennlp" + } + ], + "cb00cecf" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:morfologik.tests" + } + ], + "cbc357ab" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:spatial-extras" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:spatial-extras" + } + ], + "d9953130" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark-jmh" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:expressions" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:expressions" + } + ], + "e077a675" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + 
"configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:morfologik.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:morfologik.tests" + } + ], + "e9962aab" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:phonetic" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:phonetic" + } + ], + "ecaf1d73" : [ + { + "configuration" : "annotationProcessor", + "projectPath" : ":lucene:benchmark-jmh" + } + ], + "fa9ef26b" : [ + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:spatial-test-fixtures" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:spatial-test-fixtures" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:test-framework" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:test-framework" + } + ], + "fe494320" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:analysis:morfologik" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:analysis:morfologik" + } + ], + "ffa00415" : [ + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis.tests" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:analysis:icu" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:analysis:icu" + } + ] + } +} \ No newline at end of file diff --git a/versions.props b/versions.props deleted file mode 100644 index 13243bd65f6..00000000000 --- a/versions.props +++ /dev/null @@ -1,19 +0,0 @@ -com.carrotsearch.randomizedtesting:*=2.8.1 -com.carrotsearch:procfork=1.0.6 -com.google.errorprone:*=2.18.0 -com.ibm.icu:icu4j=74.2 -commons-codec:commons-codec=1.13 -io.sgr:s2-geometry-library-java=1.0.0 -junit:junit=4.13.1 -net.sourceforge.nekohtml:nekohtml=1.9.17 -org.antlr:antlr4*=4.11.1 -org.apache.commons:commons-compress=1.19 -org.apache.opennlp:opennlp-tools=2.3.2 -org.assertj:*=3.21.0 -org.carrot2:morfologik-*=2.1.9 -org.hamcrest:*=2.2 -org.locationtech.jts:jts-core=1.17.0 -org.locationtech.spatial4j:*=0.8 -org.ow2.asm:*=9.6 -ua.net.nlp:morfologik-ukrainian-search=4.9.1 -xerces:xercesImpl=2.12.0 diff --git a/versions.toml b/versions.toml new file mode 100644 index 00000000000..6a137975d3e --- /dev/null +++ b/versions.toml @@ -0,0 +1,85 @@ +[versions] +antlr = "4.11.1" +asm = "9.6" +assertj = "3.21.0" +commons-codec 
= "1.13" +commons-compress = "1.19" +ecj = "3.36.0" +errorprone = "2.18.0" +flexmark = "0.61.24" +# @keep This is GJF version for spotless/ tidy. +googleJavaFormat = "1.18.1" +groovy = "3.0.21" +hamcrest = "2.2" +icu4j = "74.2" +javacc = "7.0.12" +jflex = "1.8.2" +jgit = "5.13.1.202206130422-r" +jmh = "1.37" +jts = "1.17.0" +junit = "4.13.1" +# @keep Minimum gradle version to run the build +minGradle = "8.8" +# @keep This is the minimum required Java version. +minJava = "21" +morfologik = "2.1.9" +morfologik-ukrainian = "4.9.1" +nekohtml = "1.9.17" +opennlp = "2.3.2" +procfork = "1.0.6" +randomizedtesting = "2.8.1" +rat = "0.14" +s2-geometry = "1.0.0" +spatial4j = "0.8" +xerces = "2.12.0" +zstd = "1.5.5-11" + +[libraries] +antlr-core = { module = "org.antlr:antlr4", version.ref = "antlr" } +antlr-runtime = { module = "org.antlr:antlr4-runtime", version.ref = "antlr" } +asm-commons = { module = "org.ow2.asm:asm-commons", version.ref = "asm" } +asm-core = { module = "org.ow2.asm:asm", version.ref = "asm" } +assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" } +commons-codec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" } +commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" } +ecj = { module = "org.eclipse.jdt:ecj", version.ref = "ecj" } +errorprone = { module = "com.google.errorprone:error_prone_core", version.ref = "errorprone" } +flexmark-core = { module = "com.vladsch.flexmark:flexmark", version.ref = "flexmark" } +flexmark-ext-abbreviation = { module = "com.vladsch.flexmark:flexmark-ext-abbreviation", version.ref = "flexmark" } +flexmark-ext-attributes = { module = "com.vladsch.flexmark:flexmark-ext-attributes", version.ref = "flexmark" } +flexmark-ext-autolink = { module = "com.vladsch.flexmark:flexmark-ext-autolink", version.ref = "flexmark" } +flexmark-ext-tables = { module = "com.vladsch.flexmark:flexmark-ext-tables", version.ref = "flexmark" } +groovy = { module = "org.codehaus.groovy:groovy-all", version.ref = "groovy" } +hamcrest = { module = "org.hamcrest:hamcrest", version.ref = "hamcrest" } +icu4j = { module = "com.ibm.icu:icu4j", version.ref = "icu4j" } +javacc = { module = "net.java.dev.javacc:javacc", version.ref = "javacc" } +jflex = { module = "de.jflex:jflex", version.ref = "jflex" } +jgit = { module = "org.eclipse.jgit:org.eclipse.jgit", version.ref = "jgit" } +jmh-annprocess = { module = "org.openjdk.jmh:jmh-generator-annprocess", version.ref = "jmh" } +jmh-core = { module = "org.openjdk.jmh:jmh-core", version.ref = "jmh" } +jts = { module = "org.locationtech.jts:jts-core", version.ref = "jts" } +junit = { module = "junit:junit", version.ref = "junit" } +morfologik-polish = { module = "org.carrot2:morfologik-polish", version.ref = "morfologik" } +morfologik-stemming = { module = "org.carrot2:morfologik-stemming", version.ref = "morfologik" } +morfologik-ukrainian = { module = "ua.net.nlp:morfologik-ukrainian-search", version.ref = "morfologik-ukrainian" } +nekohtml = { module = "net.sourceforge.nekohtml:nekohtml", version.ref = "nekohtml" } +opennlp-tools = { module = "org.apache.opennlp:opennlp-tools", version.ref = "opennlp" } +procfork = { module = "com.carrotsearch:procfork", version.ref = "procfork" } +randomizedtesting-runner = { module = "com.carrotsearch.randomizedtesting:randomizedtesting-runner", version.ref = "randomizedtesting" } +rat = { module = "org.apache.rat:apache-rat", version.ref = "rat" } +s2-geometry = { module = "io.sgr:s2-geometry-library-java", 
version.ref = "s2-geometry" } +spatial4j = { module = "org.locationtech.spatial4j:spatial4j", version.ref = "spatial4j" } +xerces = { module = "xerces:xercesImpl", version.ref = "xerces" } +zstd = { module = "com.github.luben:zstd-jni", version.ref = "zstd" } + +[plugins] +benmanes-versions = "com.github.ben-manes.versions:0.51.0" +dependencychecks = "com.carrotsearch.gradle.dependencychecks:0.0.9" +errorprone = "net.ltgt.errorprone:3.1.0" +forbiddenapis = "de.thetaphi.forbiddenapis:3.7" +jacocolog = "org.barfuin.gradle.jacocolog:3.1.0" +owasp-dependencycheck = "org.owasp.dependencycheck:7.2.0" +randomizedtesting = "com.carrotsearch.gradle.randomizedtesting:0.0.6" +spotless = "com.diffplug.spotless:6.5.2" +undercouch-download = "de.undercouch.download:5.2.0" +versionCatalogUpdate = "nl.littlerobots.version-catalog-update:0.8.4"