diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java b/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java index 9ba150909d8..64abfa5aec8 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java +++ b/buildSrc/src/main/java/org/apache/lucene/gradle/ErrorReportingTestListener.java @@ -113,7 +113,7 @@ public class ErrorReportingTestListener implements TestOutputListener, TestListe if (echoOutput && !verboseMode) { synchronized (this) { - System.out.println(""); + System.out.println(); System.out.println(suite.getClassName() + " > test suite's output saved to " + outputLog + ", copied below:"); try (BufferedReader reader = Files.newBufferedReader(outputLog, StandardCharsets.UTF_8)) { char[] buf = new char[1024]; diff --git a/dev-tools/doap/lucene.rdf b/dev-tools/doap/lucene.rdf index 72a976a94fa..06a400ef883 100644 --- a/dev-tools/doap/lucene.rdf +++ b/dev-tools/doap/lucene.rdf @@ -67,6 +67,13 @@ + + + lucene-9.10.0 + 2024-02-20 + 9.10.0 + + lucene-9.9.2 diff --git a/dev-tools/scripts/addBackcompatIndexes.py b/dev-tools/scripts/addBackcompatIndexes.py index bbaf0b40630..7faacb8b8e3 100755 --- a/dev-tools/scripts/addBackcompatIndexes.py +++ b/dev-tools/scripts/addBackcompatIndexes.py @@ -45,16 +45,13 @@ def create_and_add_index(source, indextype, index_version, current_version, temp 'emptyIndex': 'empty' }[indextype] if indextype in ('cfs', 'nocfs'): - dirname = 'index.%s' % indextype filename = '%s.%s-%s.zip' % (prefix, index_version, indextype) else: - dirname = indextype filename = '%s.%s.zip' % (prefix, index_version) print(' creating %s...' 
% filename, end='', flush=True) module = 'backward-codecs' index_dir = os.path.join('lucene', module, 'src/test/org/apache/lucene/backward_index') - test_file = os.path.join(index_dir, filename) if os.path.exists(os.path.join(index_dir, filename)): print('uptodate') return @@ -76,24 +73,20 @@ def create_and_add_index(source, indextype, index_version, current_version, temp '-Dtests.codec=default' ]) base_dir = os.getcwd() - bc_index_dir = os.path.join(temp_dir, dirname) - bc_index_file = os.path.join(bc_index_dir, filename) + bc_index_file = os.path.join(temp_dir, filename) if os.path.exists(bc_index_file): print('alreadyexists') else: - if os.path.exists(bc_index_dir): - shutil.rmtree(bc_index_dir) os.chdir(source) scriptutil.run('./gradlew %s' % gradle_args) - os.chdir(bc_index_dir) - scriptutil.run('zip %s *' % filename) + if not os.path.exists(bc_index_file): + raise Exception("Expected file can't be found: %s" %bc_index_file) print('done') print(' adding %s...' % filename, end='', flush=True) scriptutil.run('cp %s %s' % (bc_index_file, os.path.join(base_dir, index_dir))) os.chdir(base_dir) - scriptutil.run('rm -rf %s' % bc_index_dir) print('done') def update_backcompat_tests(index_version, current_version): diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8eff18dfaf8..860961c11b8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -197,7 +197,10 @@ Improvements Optimizations --------------------- -(No changes) + +* GITHUB#13115: Short circuit queued flush check when flush on update is disabled (Prabhat Sharma) + +* GITHUB#13085: Remove unnecessary toString() / substring() calls to save some String allocations (Dmitry Cherniachenko) Bug Fixes --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java index f999c25133f..94b684cd3d5 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemmer.java @@ -278,7 +278,7 @@ class BrazilianStemmer { return false; } - return value.substring(value.length() - suffix.length()).equals(suffix); + return value.endsWith(suffix); } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java index 3f746491da5..7f176cccb30 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java @@ -142,7 +142,7 @@ public class PatternParser extends DefaultHandler { break; } } - token.append(chars.toString().substring(0, i)); + token.append(chars, 0, i); // chars.delete(0,i); for (int countr = i; countr < chars.length(); countr++) { chars.setCharAt(countr - i, chars.charAt(countr)); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java index 8e786aa59cc..d5122406261 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java @@ -669,7 +669,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase { builder.append((char) ch); } } catch (Exception e) { - if (gold.equals(builder.toString())) { + if (gold.contentEquals(builder)) { throw e; } throw new Exception( diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java index 5b315cf3ce9..33348e26d7a 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilter.java @@ -30,19 +30,13 @@ import org.apache.lucene.tests.analysis.Token; public class TestTrimFilter extends BaseTokenStreamTestCase { public void testTrim() throws Exception { - char[] a = " a ".toCharArray(); - char[] b = "b ".toCharArray(); - char[] ccc = "cCc".toCharArray(); - char[] whitespace = " ".toCharArray(); - char[] empty = "".toCharArray(); - TokenStream ts = new CannedTokenStream( - new Token(new String(a, 0, a.length), 1, 5), - new Token(new String(b, 0, b.length), 6, 10), - new Token(new String(ccc, 0, ccc.length), 11, 15), - new Token(new String(whitespace, 0, whitespace.length), 16, 20), - new Token(new String(empty, 0, empty.length), 21, 21)); + new Token(" a ", 1, 5), + new Token("b ", 6, 10), + new Token("cCc", 11, 15), + new Token(" ", 16, 20), + new Token("", 21, 21)); ts = new TrimFilter(ts); assertTokenStreamContents(ts, new String[] {"a", "b", "cCc", "", ""}); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java index e39bffb9c64..06553575e5f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java @@ -82,8 +82,8 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase { indexMatched.append((cs.correctOffset(i) < 0 ? 
"-" : input.charAt(cs.correctOffset(i)))); } - boolean outputGood = expectedOutput.equals(output.toString()); - boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString()); + boolean outputGood = expectedOutput.contentEquals(output); + boolean indexMatchedGood = expectedIndexMatchedOutput.contentEquals(indexMatched); if (!outputGood || !indexMatchedGood || false) { System.out.println("Pattern : " + pattern); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java index 8df28d40dbc..04baa47425e 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/BackwardsCompatibilityTestBase.java @@ -26,6 +26,7 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; import java.text.ParseException; import java.util.ArrayList; import java.util.HashSet; @@ -38,11 +39,17 @@ import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.SegmentReader; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; @@ -253,10 +260,23 @@ public abstract 
class BackwardsCompatibilityTestBase extends LuceneTestCase { protected abstract void createIndex(Directory directory) throws IOException; public final void createBWCIndex() throws IOException { - Path indexDir = getIndexDir().resolve(indexName(Version.LATEST)); - Files.deleteIfExists(indexDir); - try (Directory dir = newFSDirectory(indexDir)) { + Path zipFile = getIndexDir().resolve(indexName(Version.LATEST)); + Files.deleteIfExists(zipFile); + Path tmpDir = createTempDir(); + + try (Directory dir = FSDirectory.open(tmpDir); + ZipOutputStream zipOut = + new ZipOutputStream( + Files.newOutputStream( + zipFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW))) { createIndex(dir); + for (String file : dir.listAll()) { + try (IndexInput in = dir.openInput(file, IOContext.READONCE)) { + zipOut.putNextEntry(new ZipEntry(file)); + new OutputStreamDataOutput(zipOut).copyBytes(in, in.length()); + zipOut.closeEntry(); + } + } } } diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java index 0cd9f37d5c3..c7b1ea3fb4a 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestGenerateBwcIndices.java @@ -20,8 +20,10 @@ import static org.apache.lucene.backward_index.BackwardsCompatibilityTestBase.cr import java.io.IOException; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressFileSystems; import org.apache.lucene.util.Version; +@SuppressFileSystems("ExtrasFS") public class TestGenerateBwcIndices extends LuceneTestCase { // Backcompat index generation, described below, is mostly automated in: diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java 
b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java index c57f319213f..82de070189c 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestIndexSortBackwardsCompatibility.java @@ -55,7 +55,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT static final String INDEX_NAME = "sorted"; static final String SUFFIX = ""; - private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_10_0; + private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_11_0; private static final String PARENT_FIELD_NAME = "___parent"; public TestIndexSortBackwardsCompatibility(Version version, String pattern) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip new file mode 100644 index 00000000000..59d38f72b4f Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-cfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip new file mode 100644 index 00000000000..6da00722761 Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/index.9.10.0-nocfs.zip differ diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip new file mode 100644 index 00000000000..131d82c5acf Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/sorted.9.10.0.zip differ diff --git 
a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt index 8d3c001554d..8aa27313b67 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/versions.txt @@ -37,4 +37,5 @@ 9.8.0 9.9.0 9.9.1 -9.9.2 \ No newline at end of file +9.9.2 +9.10.0 diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java index 8caa25bd8d6..cd0daa42d4b 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java @@ -112,13 +112,13 @@ public class EnwikiContentSource extends ContentSource { String time(String original) { StringBuilder buffer = new StringBuilder(); - buffer.append(original.substring(8, 10)); + buffer.append(original, 8, 10); buffer.append('-'); buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]); buffer.append('-'); - buffer.append(original.substring(0, 4)); + buffer.append(original, 0, 4); buffer.append(' '); - buffer.append(original.substring(11, 19)); + buffer.append(original, 11, 19); buffer.append(".000"); return buffer.toString(); diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java index cdce0b479fb..f6ebae4158f 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java @@ -60,7 +60,7 @@ public class TrecFBISParser extends TrecDocParser { docData.setName(name); docData.setDate(date); 
docData.setTitle(title); - docData.setBody(stripTags(docBuf, mark).toString()); + docData.setBody(stripTags(docBuf, mark)); return docData; } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java index e2dae3ab775..06a4f0fbdd1 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java @@ -53,14 +53,14 @@ public class TrecFR94Parser extends TrecDocParser { // date... String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES); if (dateStr != null) { - dateStr = stripTags(dateStr, 0).toString(); + dateStr = stripTags(dateStr, 0); date = trecSrc.parseDate(dateStr.trim()); } } docData.clear(); docData.setName(name); docData.setDate(date); - docData.setBody(stripTags(docBuf, mark).toString()); + docData.setBody(stripTags(docBuf, mark)); return docData; } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java index 57762e884f0..a0d8e570cfa 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java @@ -52,7 +52,7 @@ public class TrecFTParser extends TrecDocParser { docData.setName(name); docData.setDate(date); docData.setTitle(title); - docData.setBody(stripTags(docBuf, mark).toString()); + docData.setBody(stripTags(docBuf, mark)); return docData; } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java index 933859e0ddb..186465a6a0d 100644 --- 
a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java @@ -49,7 +49,7 @@ public class TrecLATimesParser extends TrecDocParser { if (d2a > 0) { dateStr = dateStr.substring(0, d2a + 3); // we need the "day" part } - dateStr = stripTags(dateStr, 0).toString(); + dateStr = stripTags(dateStr, 0); date = trecSrc.parseDate(dateStr.trim()); } @@ -59,14 +59,14 @@ public class TrecLATimesParser extends TrecDocParser { title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null); } if (title != null) { - title = stripTags(title, 0).toString().trim(); + title = stripTags(title, 0).trim(); } docData.clear(); docData.setName(name); docData.setDate(date); docData.setTitle(title); - docData.setBody(stripTags(docBuf, mark).toString()); + docData.setBody(stripTags(docBuf, mark)); return docData; } } diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java index e8eb8f27042..8dc1ace3177 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java @@ -59,7 +59,7 @@ public class SearchWithSortTask extends ReadTask { String typeString; if (index != -1) { fieldName = field.substring(0, index); - typeString = field.substring(1 + index, field.length()); + typeString = field.substring(1 + index); } else { throw new RuntimeException("You must specify the sort type ie page:int,subject:string"); } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java index b0c19e8526e..d95e82f62ec 100644 --- 
a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java @@ -169,7 +169,7 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader { if (type == TYPE_STRING) { byte[] bytes = new byte[scratch.length() - VALUE.length]; System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length); - visitor.stringField(fieldInfo, new String(bytes, 0, bytes.length, StandardCharsets.UTF_8)); + visitor.stringField(fieldInfo, new String(bytes, StandardCharsets.UTF_8)); } else if (type == TYPE_BINARY) { byte[] copy = new byte[scratch.length() - VALUE.length]; System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java index 525381d60c9..bdfa78af87f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/CodecUtil.java @@ -380,7 +380,7 @@ public final class CodecUtil { int suffixLength = in.readByte() & 0xFF; byte[] suffixBytes = new byte[suffixLength]; in.readBytes(suffixBytes, 0, suffixBytes.length); - String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8); + String suffix = new String(suffixBytes, StandardCharsets.UTF_8); if (!suffix.equals(expectedSuffix)) { throw new CorruptIndexException( "file mismatch, expected suffix=" + expectedSuffix + ", got=" + suffix, in); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 97c05435b96..341e28c36f5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -18,6 +18,7 @@ 
package org.apache.lucene.codecs.lucene94; import java.io.IOException; import java.util.Collections; +import java.util.List; import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesFormat; @@ -111,6 +112,8 @@ import org.apache.lucene.store.IndexOutput; *
  • 0: EUCLIDEAN distance. ({@link VectorSimilarityFunction#EUCLIDEAN}) *
  • 1: DOT_PRODUCT similarity. ({@link VectorSimilarityFunction#DOT_PRODUCT}) *
  • 2: COSINE similarity. ({@link VectorSimilarityFunction#COSINE}) + *
  • 3: MAXIMUM_INNER_PRODUCT similarity. ({@link + * VectorSimilarityFunction#MAXIMUM_INNER_PRODUCT}) * * * @@ -284,10 +287,38 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { } private static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException { - if (b < 0 || b >= VectorSimilarityFunction.values().length) { - throw new CorruptIndexException("invalid distance function: " + b, input); + try { + return distOrdToFunc(b); + } catch (IllegalArgumentException e) { + throw new CorruptIndexException("invalid distance function: " + b, input, e); } - return VectorSimilarityFunction.values()[b]; + } + + // List of vector similarity functions. This list is defined here, in order + // to avoid an undesirable dependency on the declaration and order of values + // in VectorSimilarityFunction. The list values and order have been chosen to + // match that of VectorSimilarityFunction in, at least, Lucene 9.10. + static final List SIMILARITY_FUNCTIONS = + List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT); + + static VectorSimilarityFunction distOrdToFunc(byte i) { + if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("invalid distance function: " + i); + } + return SIMILARITY_FUNCTIONS.get(i); + } + + static byte distFuncToOrd(VectorSimilarityFunction func) { + for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) { + if (SIMILARITY_FUNCTIONS.get(i).equals(func)) { + return (byte) i; + } + } + throw new IllegalArgumentException("invalid distance function: " + func); } static { @@ -378,7 +409,7 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat { } output.writeVInt(fi.getVectorDimension()); output.writeByte((byte) fi.getVectorEncoding().ordinal()); - output.writeByte((byte) fi.getVectorSimilarityFunction().ordinal()); + 
output.writeByte(distFuncToOrd(fi.getVectorSimilarityFunction())); } CodecUtil.writeFooter(output); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java index 9ebac62ce9b..efb51c963e0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsReader.java @@ -22,6 +22,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FlatVectorsReader; @@ -171,15 +172,24 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader } } + // List of vector similarity functions. This list is defined here, in order + // to avoid an undesirable dependency on the declaration and order of values + // in VectorSimilarityFunction. The list values and order must be identical + // to that of {@link o.a.l.c.l.Lucene94FieldInfosFormat#SIMILARITY_FUNCTIONS}. 
+ public static final List SIMILARITY_FUNCTIONS = + List.of( + VectorSimilarityFunction.EUCLIDEAN, + VectorSimilarityFunction.DOT_PRODUCT, + VectorSimilarityFunction.COSINE, + VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT); + public static VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException { - int similarityFunctionId = input.readInt(); - if (similarityFunctionId < 0 - || similarityFunctionId >= VectorSimilarityFunction.values().length) { - throw new CorruptIndexException( - "Invalid similarity function id: " + similarityFunctionId, input); + int i = input.readInt(); + if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) { + throw new IllegalArgumentException("invalid distance function: " + i); } - return VectorSimilarityFunction.values()[similarityFunctionId]; + return SIMILARITY_FUNCTIONS.get(i); } public static VectorEncoding readVectorEncoding(DataInput input) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java index 174c65db9ac..a236dd7c65b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT; +import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS; import java.io.IOException; import java.util.ArrayList; @@ -33,6 +34,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.MergeState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; import 
org.apache.lucene.search.TaskExecutor; import org.apache.lucene.store.IndexOutput; @@ -436,7 +438,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { throws IOException { meta.writeInt(field.number); meta.writeInt(field.getVectorEncoding().ordinal()); - meta.writeInt(field.getVectorSimilarityFunction().ordinal()); + meta.writeInt(distFuncToOrd(field.getVectorSimilarityFunction())); meta.writeVLong(vectorIndexOffset); meta.writeVLong(vectorIndexLength); meta.writeVInt(field.getVectorDimension()); @@ -500,6 +502,15 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter { IOUtils.close(meta, vectorIndex, flatVectorWriter); } + static int distFuncToOrd(VectorSimilarityFunction func) { + for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) { + if (SIMILARITY_FUNCTIONS.get(i).equals(func)) { + return (byte) i; + } + } + throw new IllegalArgumentException("invalid distance function: " + func); + } + private static class FieldWriter extends KnnFieldVectorsWriter { private static final long SHALLOW_SIZE = diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java index ec6ced68002..d85df1cf8cd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -384,7 +384,7 @@ final class DocumentsWriter implements Closeable, Accountable { ensureOpen(); boolean hasEvents = false; while (flushControl.anyStalledThreads() - || (flushControl.numQueuedFlushes() > 0 && config.checkPendingFlushOnUpdate)) { + || (config.checkPendingFlushOnUpdate && flushControl.numQueuedFlushes() > 0)) { // Help out flushing any queued DWPTs so we can un-stall: // Try pickup pending threads here if possible // no need to loop over the next pending flushes... 
doFlush will take care of this diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexFileNames.java b/lucene/core/src/java/org/apache/lucene/index/IndexFileNames.java index 968900c7f0c..b8459e42afa 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexFileNames.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexFileNames.java @@ -191,7 +191,7 @@ public final class IndexFileNames { if (idx == -1) { return null; } else { - return filename.substring(idx + 1, filename.length()); + return filename.substring(idx + 1); } } diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java index 8fbe0359698..b51cdf86306 100644 --- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java +++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java @@ -677,16 +677,11 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase { public void testMaxTokenLengthDefault() throws Exception { StandardAnalyzer a = new StandardAnalyzer(); - StringBuilder bToken = new StringBuilder(); // exact max length: - for (int i = 0; i < StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; i++) { - bToken.append('b'); - } - - String bString = bToken.toString(); + String bString = "b".repeat(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); // first bString is exact max default length; next one is 1 too long String input = "x " + bString + " " + bString + "b"; - assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"}); + assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"}); a.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94FieldInfosFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94FieldInfosFormat.java new file mode 100644 index 00000000000..c69eeadf5e6 --- /dev/null +++ 
b/lucene/core/src/test/org/apache/lucene/codecs/lucene94/TestLucene94FieldInfosFormat.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene94; + +import java.util.Arrays; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.tests.index.BaseFieldInfoFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestLucene94FieldInfosFormat extends BaseFieldInfoFormatTestCase { + @Override + protected Codec getCodec() { + return TestUtil.getDefaultCodec(); + } + + // Ensures that all expected vector similarity functions are translatable + // in the format. + public void testVectorSimilarityFuncs() { + // This does not necessarily have to be all similarity functions, but + // differences should be considered carefully. 
+ var expectedValues = Arrays.stream(VectorSimilarityFunction.values()).toList(); + + assertEquals(Lucene94FieldInfosFormat.SIMILARITY_FUNCTIONS, expectedValues); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java index be0b01f3e0b..382389bc8f3 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene99; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FilterCodec; @@ -186,4 +187,13 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat new Lucene99HnswScalarQuantizedVectorsFormat( 20, 100, 1, null, new SameThreadExecutorService())); } + + // Ensures that all expected vector similarity functions are translatable + // in the format. + public void testVectorSimilarityFuncs() { + // This does not necessarily have to be all similarity functions, but + // differences should be considered carefully. 
+ var expectedValues = Arrays.stream(VectorSimilarityFunction.values()).toList(); + assertEquals(Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS, expectedValues); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java b/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java index e05f3ae6633..4695c5b1f5e 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPayloads.java @@ -18,7 +18,6 @@ package org.apache.lucene.index; import java.io.IOException; import java.io.StringReader; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; @@ -289,12 +288,10 @@ public class TestPayloads extends LuceneTestCase { reader.close(); } - static final Charset utf8 = StandardCharsets.UTF_8; - private void generateRandomData(byte[] data) { // this test needs the random data to be valid unicode String s = TestUtil.randomFixedByteLengthUnicodeString(random(), data.length); - byte[] b = s.getBytes(utf8); + byte[] b = s.getBytes(StandardCharsets.UTF_8); assert b.length == data.length; System.arraycopy(b, 0, data, 0, b.length); } @@ -493,7 +490,7 @@ public class TestPayloads extends LuceneTestCase { this.pool = pool; payload = pool.get(); generateRandomData(payload); - term = new String(payload, 0, payload.length, utf8); + term = new String(payload, StandardCharsets.UTF_8); first = true; payloadAtt = addAttribute(PayloadAttribute.class); termAtt = addAttribute(CharTermAttribute.class); diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPrefixRandom.java b/lucene/core/src/test/org/apache/lucene/search/TestPrefixRandom.java index 80f61e6d171..0cb8b083057 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestPrefixRandom.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestPrefixRandom.java @@ -107,7 +107,7 @@ public class TestPrefixRandom extends LuceneTestCase { @Override public 
String toString(String field) { - return field.toString() + ":" + prefix.toString(); + return field + ":" + prefix; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java index 4b5f0eb5c08..5f48b0861bb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -143,7 +143,7 @@ public class TestRegexpRandom2 extends LuceneTestCase { @Override public String toString(String field) { - return field.toString() + automaton.toString(); + return field + automaton; } @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java index 32fc2005cdd..c8adb8751b9 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java @@ -213,10 +213,10 @@ public class TestLevenshteinAutomata extends LuceneTestCase { List list = new ArrayList<>(); for (int i = 0; i < s.length() - 1; i++) { StringBuilder sb = new StringBuilder(); - sb.append(s.substring(0, i)); + sb.append(s, 0, i); sb.append(s.charAt(i + 1)); sb.append(s.charAt(i)); - sb.append(s.substring(i + 2, s.length())); + sb.append(s, i + 2, s.length()); String st = sb.toString(); if (!st.equals(s)) { list.add(Automata.makeString(st)); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java index c83f2631845..c934108115d 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java @@ -119,7 +119,7 @@ public class TestRegExp extends LuceneTestCase { // Add any head to the result, 
unchanged if (substitutionPoint > 0) { - result.append(docValue.substring(0, substitutionPoint)); + result.append(docValue, 0, substitutionPoint); } // Modify the middle... diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java index 4f2d4af6da0..dfedf7974fd 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java @@ -1398,7 +1398,7 @@ public class UnifiedHighlighter { curValueBuilder.append(curValue); } curValueBuilder.append(valueSeparator); - curValueBuilder.append(value.substring(0, Math.min(lengthBudget - 1, value.length()))); + curValueBuilder.append(value, 0, Math.min(lengthBudget - 1, value.length())); values[currentField] = curValueBuilder; } diff --git a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java index 457d72bc4a3..d9c8b29caef 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/index/BPIndexReorderer.java @@ -49,7 +49,7 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntroSorter; +import org.apache.lucene.util.IntroSelector; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.packed.PackedInts; @@ -251,17 +251,17 @@ public final class BPIndexReorderer { private class IndexReorderingTask extends BaseRecursiveAction { private final IntsRef docIDs; - private final float[] gains; + private final float[] biases; private final CloseableThreadLocal threadLocal; IndexReorderingTask( IntsRef docIDs, - float[] gains, + float[] biases, 
CloseableThreadLocal threadLocal, int depth) { super(depth); this.docIDs = docIDs; - this.gains = gains; + this.biases = biases; this.threadLocal = threadLocal; } @@ -293,14 +293,14 @@ public final class BPIndexReorderer { assert sorted(docIDs); } - int leftSize = docIDs.length / 2; - if (leftSize < minPartitionSize) { + int halfLength = docIDs.length / 2; + if (halfLength < minPartitionSize) { return; } - int rightSize = docIDs.length - leftSize; - IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, leftSize); - IntsRef right = new IntsRef(docIDs.ints, docIDs.offset + leftSize, rightSize); + IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, halfLength); + IntsRef right = + new IntsRef(docIDs.ints, docIDs.offset + halfLength, docIDs.length - halfLength); PerThreadState state = threadLocal.get(); ForwardIndex forwardIndex = state.forwardIndex; @@ -313,7 +313,9 @@ public final class BPIndexReorderer { for (int iter = 0; iter < maxIters; ++iter) { boolean moved; try { - moved = shuffle(forwardIndex, left, right, leftDocFreqs, rightDocFreqs, gains, iter); + moved = + shuffle( + forwardIndex, docIDs, right.offset, leftDocFreqs, rightDocFreqs, biases, iter); } catch (IOException e) { throw new UncheckedIOException(e); } @@ -322,10 +324,11 @@ public final class BPIndexReorderer { } } - // It is fine for all tasks to share the same docs / gains array since they all work on + // It is fine for all tasks to share the same docs / biases array since they all work on // different slices of the array at a given point in time. 
- IndexReorderingTask leftTask = new IndexReorderingTask(left, gains, threadLocal, depth + 1); - IndexReorderingTask rightTask = new IndexReorderingTask(right, gains, threadLocal, depth + 1); + IndexReorderingTask leftTask = new IndexReorderingTask(left, biases, threadLocal, depth + 1); + IndexReorderingTask rightTask = + new IndexReorderingTask(right, biases, threadLocal, depth + 1); if (shouldFork(docIDs.length, docIDs.ints.length)) { invokeAll(leftTask, rightTask); @@ -341,116 +344,94 @@ public final class BPIndexReorderer { */ private boolean shuffle( ForwardIndex forwardIndex, - IntsRef left, - IntsRef right, + IntsRef docIDs, + int midPoint, int[] leftDocFreqs, int[] rightDocFreqs, - float[] gains, + float[] biases, int iter) throws IOException { - assert left.ints == right.ints; - assert left.offset + left.length == right.offset; - // Computing gains is typically a bottleneck, because each iteration needs to iterate over all - // postings to recompute gains, and the total number of postings is usually one order of + // Computing biases is typically a bottleneck, because each iteration needs to iterate over + // all postings to recompute biases, and the total number of postings is usually one order of // magnitude or more larger than the number of docs. So we try to parallelize it. 
- ComputeGainsTask leftGainsTask = - new ComputeGainsTask( - left.ints, - gains, - left.offset, - left.offset + left.length, + new ComputeBiasTask( + docIDs.ints, + biases, + docIDs.offset, + docIDs.offset + docIDs.length, leftDocFreqs, rightDocFreqs, threadLocal, - depth); - ComputeGainsTask rightGainsTask = - new ComputeGainsTask( - right.ints, - gains, - right.offset, - right.offset + right.length, - rightDocFreqs, - leftDocFreqs, - threadLocal, - depth); - if (shouldFork(docIDs.length, docIDs.ints.length)) { - invokeAll(leftGainsTask, rightGainsTask); - } else { - leftGainsTask.compute(); - rightGainsTask.compute(); + depth) + .compute(); + + float maxLeftBias = Float.NEGATIVE_INFINITY; + for (int i = docIDs.offset; i < midPoint; ++i) { + maxLeftBias = Math.max(maxLeftBias, biases[i]); + } + float minRightBias = Float.POSITIVE_INFINITY; + for (int i = midPoint, end = docIDs.offset + docIDs.length; i < end; ++i) { + minRightBias = Math.min(minRightBias, biases[i]); + } + float gain = maxLeftBias - minRightBias; + // This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for + // Bipartite Graph Partitioning" by comparing the gain of swapping the doc from the left side + // that is most attracted to the right and the doc from the right side that is most attracted + // to the left against `iter` rather than zero. 
+ if (gain <= iter) { + return false; } - class ByDescendingGainSorter extends IntroSorter { + new IntroSelector() { int pivotDoc; - float pivotGain; + float pivotBias; @Override protected void setPivot(int i) { - pivotDoc = left.ints[i]; - pivotGain = gains[i]; + pivotDoc = docIDs.ints[i]; + pivotBias = biases[i]; } @Override protected int comparePivot(int j) { - // Compare in reverse order to get a descending sort - int cmp = Float.compare(gains[j], pivotGain); + int cmp = Float.compare(pivotBias, biases[j]); if (cmp == 0) { // Tie break on the doc ID to preserve doc ID ordering as much as possible - cmp = pivotDoc - left.ints[j]; + cmp = pivotDoc - docIDs.ints[j]; } return cmp; } @Override protected void swap(int i, int j) { - int tmpDoc = left.ints[i]; - left.ints[i] = left.ints[j]; - left.ints[j] = tmpDoc; + float tmpBias = biases[i]; + biases[i] = biases[j]; + biases[j] = tmpBias; - float tmpGain = gains[i]; - gains[i] = gains[j]; - gains[j] = tmpGain; - } - } - - Runnable leftSorter = - () -> new ByDescendingGainSorter().sort(left.offset, left.offset + left.length); - Runnable rightSorter = - () -> new ByDescendingGainSorter().sort(right.offset, right.offset + right.length); - - if (shouldFork(docIDs.length, docIDs.ints.length)) { - // TODO: run it on more than 2 threads at most - invokeAll(adapt(leftSorter), adapt(rightSorter)); - } else { - leftSorter.run(); - rightSorter.run(); - } - - for (int i = 0; i < left.length; ++i) { - // This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for - // Bipartite Graph Partitioning" by comparing the gain against `iter` rather than zero. - if (gains[left.offset + i] + gains[right.offset + i] <= iter) { - if (i == 0) { - return false; + if (i < midPoint == j < midPoint) { + int tmpDoc = docIDs.ints[i]; + docIDs.ints[i] = docIDs.ints[j]; + docIDs.ints[j] = tmpDoc; + } else { + // If we're swapping docs across the left and right sides, we need to keep doc freqs + // up-to-date. 
+ int left = Math.min(i, j); + int right = Math.max(i, j); + try { + swapDocsAndFreqs(docIDs.ints, left, right, forwardIndex, leftDocFreqs, rightDocFreqs); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } - break; } - - swap( - left.ints, - left.offset + i, - right.offset + i, - forwardIndex, - leftDocFreqs, - rightDocFreqs); - } + }.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint); return true; } - private static void swap( + private static void swapDocsAndFreqs( int[] docs, int left, int right, @@ -492,19 +473,19 @@ public final class BPIndexReorderer { } } - private class ComputeGainsTask extends BaseRecursiveAction { + private class ComputeBiasTask extends BaseRecursiveAction { private final int[] docs; - private final float[] gains; + private final float[] biases; private final int from; private final int to; private final int[] fromDocFreqs; private final int[] toDocFreqs; private final CloseableThreadLocal threadLocal; - ComputeGainsTask( + ComputeBiasTask( int[] docs, - float[] gains, + float[] biases, int from, int to, int[] fromDocFreqs, @@ -513,7 +494,7 @@ public final class BPIndexReorderer { int depth) { super(depth); this.docs = docs; - this.gains = gains; + this.biases = biases; this.from = from; this.to = to; this.fromDocFreqs = fromDocFreqs; @@ -527,15 +508,15 @@ public final class BPIndexReorderer { if (problemSize > 1 && shouldFork(problemSize, docs.length)) { final int mid = (from + to) >>> 1; invokeAll( - new ComputeGainsTask( - docs, gains, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth), - new ComputeGainsTask( - docs, gains, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth)); + new ComputeBiasTask( + docs, biases, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth), + new ComputeBiasTask( + docs, biases, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth)); } else { ForwardIndex forwardIndex = threadLocal.get().forwardIndex; try { for (int i = from; i < to; ++i) { - gains[i] = 
computeGain(docs[i], forwardIndex, fromDocFreqs, toDocFreqs); + biases[i] = computeBias(docs[i], forwardIndex, fromDocFreqs, toDocFreqs); } } catch (IOException e) { throw new UncheckedIOException(e); @@ -547,11 +528,11 @@ public final class BPIndexReorderer { * Compute a float that is negative when a document is attracted to the left and positive * otherwise. */ - private static float computeGain( + private static float computeBias( int docID, ForwardIndex forwardIndex, int[] fromDocFreqs, int[] toDocFreqs) throws IOException { forwardIndex.seek(docID); - double gain = 0; + double bias = 0; for (IntsRef terms = forwardIndex.nextTerms(); terms.length != 0; terms = forwardIndex.nextTerms()) { @@ -561,12 +542,12 @@ public final class BPIndexReorderer { final int toDocFreq = toDocFreqs[termID]; assert fromDocFreq >= 0; assert toDocFreq >= 0; - gain += + bias += (toDocFreq == 0 ? 0 : fastLog2(toDocFreq)) - (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq)); } } - return (float) gain; + return (float) bias; } } @@ -869,7 +850,7 @@ public final class BPIndexReorderer { } private static long docRAMRequirements(int maxDoc) { - // We need one int per doc for the doc map, plus one float to store the gain associated with + // We need one int per doc for the doc map, plus one float to store the bias associated with // this doc. 
return 2L * Integer.BYTES * maxDoc; } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/surround/query/BooleanQueryTestFacade.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/surround/query/BooleanQueryTestFacade.java index 83b1d78d685..d5da92c5300 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/surround/query/BooleanQueryTestFacade.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/surround/query/BooleanQueryTestFacade.java @@ -114,7 +114,7 @@ public class BooleanQueryTestFacade { public void doTest() throws Exception { if (verbose) { - System.out.println(""); + System.out.println(); System.out.println("Query: " + queryText); } diff --git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/RecursivePrefixTreeStrategy.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/RecursivePrefixTreeStrategy.java index 8fe372feec3..642f466bc03 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/RecursivePrefixTreeStrategy.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/RecursivePrefixTreeStrategy.java @@ -113,7 +113,7 @@ public class RecursivePrefixTreeStrategy extends PrefixTreeStrategy { if (pointsOnly) str.append(",pointsOnly"); if (pruneLeafyBranches) str.append(",pruneLeafyBranches"); if (prefixGridScanLevel != grid.getMaxLevels() - 4) - str.append(",prefixGridScanLevel:").append("").append(prefixGridScanLevel); + str.append(",prefixGridScanLevel:").append(prefixGridScanLevel); if (!multiOverlappingIndexedShapes) str.append(",!multiOverlappingIndexedShapes"); return str.append(')').toString(); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index e1772102098..afaf304a4ad 100644 --- 
a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -927,7 +927,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { return; } sb.append(""); - sb.append(surface.substring(0, prefixToken.length())); + sb.append(surface, 0, prefixToken.length()); sb.append(""); sb.append(surface.substring(prefixToken.length())); } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java index 2361f6b0c31..6983c7eba59 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingInfixSuggester.java @@ -892,7 +892,7 @@ public class TestAnalyzingInfixSuggester extends LuceneTestCase { b.append(""); b.append(queryTerm); b.append(""); - b.append(inputTerm.substring(queryTerm.length(), inputTerm.length())); + b.append(inputTerm.substring(queryTerm.length())); matched = true; break; } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java b/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java index e01fc3877a8..3c6bb891c92 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/search/CheckHits.java @@ -793,6 +793,10 @@ public class CheckHits { assertTrue(s2 == null || s2.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS); continue; } + if (s2 == null) { + assertTrue(s1.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS); + continue; + } TwoPhaseIterator twoPhase1 = s1.twoPhaseIterator(); TwoPhaseIterator twoPhase2 = s2.twoPhaseIterator(); DocIdSetIterator approx1 = 
twoPhase1 == null ? s1.iterator() : twoPhase1.approximation(); diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/English.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/English.java index c78fb7ac362..5b48b617e72 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/English.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/English.java @@ -166,7 +166,6 @@ public final class English { result.append("one "); break; case 0: - result.append(""); break; } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LineFileDocs.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LineFileDocs.java index e0158c4c542..91b897ffb70 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LineFileDocs.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LineFileDocs.java @@ -307,7 +307,7 @@ public class LineFileDocs implements Closeable { throw new RuntimeException("line: [" + line + "] is in an invalid format !"); } - docState.body.setStringValue(line.substring(1 + spot2, line.length())); + docState.body.setStringValue(line.substring(1 + spot2)); final String title = line.substring(0, spot); docState.title.setStringValue(title); docState.titleTokenized.setStringValue(title);