mirror of https://github.com/apache/lucene.git
Merge branch 'main' into java_21
This commit is contained in:
commit
54b6248a8d
|
@ -113,7 +113,7 @@ public class ErrorReportingTestListener implements TestOutputListener, TestListe
|
||||||
|
|
||||||
if (echoOutput && !verboseMode) {
|
if (echoOutput && !verboseMode) {
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
System.out.println("");
|
System.out.println();
|
||||||
System.out.println(suite.getClassName() + " > test suite's output saved to " + outputLog + ", copied below:");
|
System.out.println(suite.getClassName() + " > test suite's output saved to " + outputLog + ", copied below:");
|
||||||
try (BufferedReader reader = Files.newBufferedReader(outputLog, StandardCharsets.UTF_8)) {
|
try (BufferedReader reader = Files.newBufferedReader(outputLog, StandardCharsets.UTF_8)) {
|
||||||
char[] buf = new char[1024];
|
char[] buf = new char[1024];
|
||||||
|
|
|
@ -67,6 +67,13 @@
|
||||||
</maintainer>
|
</maintainer>
|
||||||
|
|
||||||
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
|
<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
|
||||||
|
<release>
|
||||||
|
<Version>
|
||||||
|
<name>lucene-9.10.0</name>
|
||||||
|
<created>2024-02-20</created>
|
||||||
|
<revision>9.10.0</revision>
|
||||||
|
</Version>
|
||||||
|
</release>
|
||||||
<release>
|
<release>
|
||||||
<Version>
|
<Version>
|
||||||
<name>lucene-9.9.2</name>
|
<name>lucene-9.9.2</name>
|
||||||
|
|
|
@ -45,16 +45,13 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
|
||||||
'emptyIndex': 'empty'
|
'emptyIndex': 'empty'
|
||||||
}[indextype]
|
}[indextype]
|
||||||
if indextype in ('cfs', 'nocfs'):
|
if indextype in ('cfs', 'nocfs'):
|
||||||
dirname = 'index.%s' % indextype
|
|
||||||
filename = '%s.%s-%s.zip' % (prefix, index_version, indextype)
|
filename = '%s.%s-%s.zip' % (prefix, index_version, indextype)
|
||||||
else:
|
else:
|
||||||
dirname = indextype
|
|
||||||
filename = '%s.%s.zip' % (prefix, index_version)
|
filename = '%s.%s.zip' % (prefix, index_version)
|
||||||
|
|
||||||
print(' creating %s...' % filename, end='', flush=True)
|
print(' creating %s...' % filename, end='', flush=True)
|
||||||
module = 'backward-codecs'
|
module = 'backward-codecs'
|
||||||
index_dir = os.path.join('lucene', module, 'src/test/org/apache/lucene/backward_index')
|
index_dir = os.path.join('lucene', module, 'src/test/org/apache/lucene/backward_index')
|
||||||
test_file = os.path.join(index_dir, filename)
|
|
||||||
if os.path.exists(os.path.join(index_dir, filename)):
|
if os.path.exists(os.path.join(index_dir, filename)):
|
||||||
print('uptodate')
|
print('uptodate')
|
||||||
return
|
return
|
||||||
|
@ -76,24 +73,20 @@ def create_and_add_index(source, indextype, index_version, current_version, temp
|
||||||
'-Dtests.codec=default'
|
'-Dtests.codec=default'
|
||||||
])
|
])
|
||||||
base_dir = os.getcwd()
|
base_dir = os.getcwd()
|
||||||
bc_index_dir = os.path.join(temp_dir, dirname)
|
bc_index_file = os.path.join(temp_dir, filename)
|
||||||
bc_index_file = os.path.join(bc_index_dir, filename)
|
|
||||||
|
|
||||||
if os.path.exists(bc_index_file):
|
if os.path.exists(bc_index_file):
|
||||||
print('alreadyexists')
|
print('alreadyexists')
|
||||||
else:
|
else:
|
||||||
if os.path.exists(bc_index_dir):
|
|
||||||
shutil.rmtree(bc_index_dir)
|
|
||||||
os.chdir(source)
|
os.chdir(source)
|
||||||
scriptutil.run('./gradlew %s' % gradle_args)
|
scriptutil.run('./gradlew %s' % gradle_args)
|
||||||
os.chdir(bc_index_dir)
|
if not os.path.exists(bc_index_file):
|
||||||
scriptutil.run('zip %s *' % filename)
|
raise Exception("Expected file can't be found: %s" %bc_index_file)
|
||||||
print('done')
|
print('done')
|
||||||
|
|
||||||
print(' adding %s...' % filename, end='', flush=True)
|
print(' adding %s...' % filename, end='', flush=True)
|
||||||
scriptutil.run('cp %s %s' % (bc_index_file, os.path.join(base_dir, index_dir)))
|
scriptutil.run('cp %s %s' % (bc_index_file, os.path.join(base_dir, index_dir)))
|
||||||
os.chdir(base_dir)
|
os.chdir(base_dir)
|
||||||
scriptutil.run('rm -rf %s' % bc_index_dir)
|
|
||||||
print('done')
|
print('done')
|
||||||
|
|
||||||
def update_backcompat_tests(index_version, current_version):
|
def update_backcompat_tests(index_version, current_version):
|
||||||
|
|
|
@ -197,7 +197,10 @@ Improvements
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
(No changes)
|
|
||||||
|
* GITHUB#13115: Short circuit queued flush check when flush on update is disabled (Prabhat Sharma)
|
||||||
|
|
||||||
|
* GITHUB#13085: Remove unnecessary toString() / substring() calls to save some String allocations (Dmitry Cherniachenko)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -278,7 +278,7 @@ class BrazilianStemmer {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return value.substring(value.length() - suffix.length()).equals(suffix);
|
return value.endsWith(suffix);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -142,7 +142,7 @@ public class PatternParser extends DefaultHandler {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
token.append(chars.toString().substring(0, i));
|
token.append(chars, 0, i);
|
||||||
// chars.delete(0,i);
|
// chars.delete(0,i);
|
||||||
for (int countr = i; countr < chars.length(); countr++) {
|
for (int countr = i; countr < chars.length(); countr++) {
|
||||||
chars.setCharAt(countr - i, chars.charAt(countr));
|
chars.setCharAt(countr - i, chars.charAt(countr));
|
||||||
|
|
|
@ -669,7 +669,7 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
|
||||||
builder.append((char) ch);
|
builder.append((char) ch);
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
if (gold.equals(builder.toString())) {
|
if (gold.contentEquals(builder)) {
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
throw new Exception(
|
throw new Exception(
|
||||||
|
|
|
@ -30,19 +30,13 @@ import org.apache.lucene.tests.analysis.Token;
|
||||||
public class TestTrimFilter extends BaseTokenStreamTestCase {
|
public class TestTrimFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testTrim() throws Exception {
|
public void testTrim() throws Exception {
|
||||||
char[] a = " a ".toCharArray();
|
|
||||||
char[] b = "b ".toCharArray();
|
|
||||||
char[] ccc = "cCc".toCharArray();
|
|
||||||
char[] whitespace = " ".toCharArray();
|
|
||||||
char[] empty = "".toCharArray();
|
|
||||||
|
|
||||||
TokenStream ts =
|
TokenStream ts =
|
||||||
new CannedTokenStream(
|
new CannedTokenStream(
|
||||||
new Token(new String(a, 0, a.length), 1, 5),
|
new Token(" a ", 1, 5),
|
||||||
new Token(new String(b, 0, b.length), 6, 10),
|
new Token("b ", 6, 10),
|
||||||
new Token(new String(ccc, 0, ccc.length), 11, 15),
|
new Token("cCc", 11, 15),
|
||||||
new Token(new String(whitespace, 0, whitespace.length), 16, 20),
|
new Token(" ", 16, 20),
|
||||||
new Token(new String(empty, 0, empty.length), 21, 21));
|
new Token("", 21, 21));
|
||||||
ts = new TrimFilter(ts);
|
ts = new TrimFilter(ts);
|
||||||
|
|
||||||
assertTokenStreamContents(ts, new String[] {"a", "b", "cCc", "", ""});
|
assertTokenStreamContents(ts, new String[] {"a", "b", "cCc", "", ""});
|
||||||
|
|
|
@ -82,8 +82,8 @@ public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
||||||
indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
|
indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean outputGood = expectedOutput.equals(output.toString());
|
boolean outputGood = expectedOutput.contentEquals(output);
|
||||||
boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());
|
boolean indexMatchedGood = expectedIndexMatchedOutput.contentEquals(indexMatched);
|
||||||
|
|
||||||
if (!outputGood || !indexMatchedGood || false) {
|
if (!outputGood || !indexMatchedGood || false) {
|
||||||
System.out.println("Pattern : " + pattern);
|
System.out.println("Pattern : " + pattern);
|
||||||
|
|
|
@ -26,6 +26,7 @@ import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -38,11 +39,17 @@ import java.util.function.Predicate;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipOutputStream;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.SegmentReader;
|
import org.apache.lucene.index.SegmentReader;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
import org.apache.lucene.tests.util.TestUtil;
|
import org.apache.lucene.tests.util.TestUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -253,10 +260,23 @@ public abstract class BackwardsCompatibilityTestBase extends LuceneTestCase {
|
||||||
protected abstract void createIndex(Directory directory) throws IOException;
|
protected abstract void createIndex(Directory directory) throws IOException;
|
||||||
|
|
||||||
public final void createBWCIndex() throws IOException {
|
public final void createBWCIndex() throws IOException {
|
||||||
Path indexDir = getIndexDir().resolve(indexName(Version.LATEST));
|
Path zipFile = getIndexDir().resolve(indexName(Version.LATEST));
|
||||||
Files.deleteIfExists(indexDir);
|
Files.deleteIfExists(zipFile);
|
||||||
try (Directory dir = newFSDirectory(indexDir)) {
|
Path tmpDir = createTempDir();
|
||||||
|
|
||||||
|
try (Directory dir = FSDirectory.open(tmpDir);
|
||||||
|
ZipOutputStream zipOut =
|
||||||
|
new ZipOutputStream(
|
||||||
|
Files.newOutputStream(
|
||||||
|
zipFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW))) {
|
||||||
createIndex(dir);
|
createIndex(dir);
|
||||||
|
for (String file : dir.listAll()) {
|
||||||
|
try (IndexInput in = dir.openInput(file, IOContext.READONCE)) {
|
||||||
|
zipOut.putNextEntry(new ZipEntry(file));
|
||||||
|
new OutputStreamDataOutput(zipOut).copyBytes(in, in.length());
|
||||||
|
zipOut.closeEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,10 @@ import static org.apache.lucene.backward_index.BackwardsCompatibilityTestBase.cr
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.tests.util.LuceneTestCase.SuppressFileSystems;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@SuppressFileSystems("ExtrasFS")
|
||||||
public class TestGenerateBwcIndices extends LuceneTestCase {
|
public class TestGenerateBwcIndices extends LuceneTestCase {
|
||||||
|
|
||||||
// Backcompat index generation, described below, is mostly automated in:
|
// Backcompat index generation, described below, is mostly automated in:
|
||||||
|
|
|
@ -55,7 +55,7 @@ public class TestIndexSortBackwardsCompatibility extends BackwardsCompatibilityT
|
||||||
|
|
||||||
static final String INDEX_NAME = "sorted";
|
static final String INDEX_NAME = "sorted";
|
||||||
static final String SUFFIX = "";
|
static final String SUFFIX = "";
|
||||||
private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_10_0;
|
private static final Version FIRST_PARENT_DOC_VERSION = Version.LUCENE_9_11_0;
|
||||||
private static final String PARENT_FIELD_NAME = "___parent";
|
private static final String PARENT_FIELD_NAME = "___parent";
|
||||||
|
|
||||||
public TestIndexSortBackwardsCompatibility(Version version, String pattern) {
|
public TestIndexSortBackwardsCompatibility(Version version, String pattern) {
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -38,3 +38,4 @@
|
||||||
9.9.0
|
9.9.0
|
||||||
9.9.1
|
9.9.1
|
||||||
9.9.2
|
9.9.2
|
||||||
|
9.10.0
|
||||||
|
|
|
@ -112,13 +112,13 @@ public class EnwikiContentSource extends ContentSource {
|
||||||
String time(String original) {
|
String time(String original) {
|
||||||
StringBuilder buffer = new StringBuilder();
|
StringBuilder buffer = new StringBuilder();
|
||||||
|
|
||||||
buffer.append(original.substring(8, 10));
|
buffer.append(original, 8, 10);
|
||||||
buffer.append('-');
|
buffer.append('-');
|
||||||
buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
|
buffer.append(months[Integer.parseInt(original.substring(5, 7)) - 1]);
|
||||||
buffer.append('-');
|
buffer.append('-');
|
||||||
buffer.append(original.substring(0, 4));
|
buffer.append(original, 0, 4);
|
||||||
buffer.append(' ');
|
buffer.append(' ');
|
||||||
buffer.append(original.substring(11, 19));
|
buffer.append(original, 11, 19);
|
||||||
buffer.append(".000");
|
buffer.append(".000");
|
||||||
|
|
||||||
return buffer.toString();
|
return buffer.toString();
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class TrecFBISParser extends TrecDocParser {
|
||||||
docData.setName(name);
|
docData.setName(name);
|
||||||
docData.setDate(date);
|
docData.setDate(date);
|
||||||
docData.setTitle(title);
|
docData.setTitle(title);
|
||||||
docData.setBody(stripTags(docBuf, mark).toString());
|
docData.setBody(stripTags(docBuf, mark));
|
||||||
return docData;
|
return docData;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,14 +53,14 @@ public class TrecFR94Parser extends TrecDocParser {
|
||||||
// date...
|
// date...
|
||||||
String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
|
String dateStr = extract(docBuf, DATE, DATE_END, h2, DATE_NOISE_PREFIXES);
|
||||||
if (dateStr != null) {
|
if (dateStr != null) {
|
||||||
dateStr = stripTags(dateStr, 0).toString();
|
dateStr = stripTags(dateStr, 0);
|
||||||
date = trecSrc.parseDate(dateStr.trim());
|
date = trecSrc.parseDate(dateStr.trim());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
docData.clear();
|
docData.clear();
|
||||||
docData.setName(name);
|
docData.setName(name);
|
||||||
docData.setDate(date);
|
docData.setDate(date);
|
||||||
docData.setBody(stripTags(docBuf, mark).toString());
|
docData.setBody(stripTags(docBuf, mark));
|
||||||
return docData;
|
return docData;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@ public class TrecFTParser extends TrecDocParser {
|
||||||
docData.setName(name);
|
docData.setName(name);
|
||||||
docData.setDate(date);
|
docData.setDate(date);
|
||||||
docData.setTitle(title);
|
docData.setTitle(title);
|
||||||
docData.setBody(stripTags(docBuf, mark).toString());
|
docData.setBody(stripTags(docBuf, mark));
|
||||||
return docData;
|
return docData;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,7 +49,7 @@ public class TrecLATimesParser extends TrecDocParser {
|
||||||
if (d2a > 0) {
|
if (d2a > 0) {
|
||||||
dateStr = dateStr.substring(0, d2a + 3); // we need the "day" part
|
dateStr = dateStr.substring(0, d2a + 3); // we need the "day" part
|
||||||
}
|
}
|
||||||
dateStr = stripTags(dateStr, 0).toString();
|
dateStr = stripTags(dateStr, 0);
|
||||||
date = trecSrc.parseDate(dateStr.trim());
|
date = trecSrc.parseDate(dateStr.trim());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,14 +59,14 @@ public class TrecLATimesParser extends TrecDocParser {
|
||||||
title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
|
title = extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
|
||||||
}
|
}
|
||||||
if (title != null) {
|
if (title != null) {
|
||||||
title = stripTags(title, 0).toString().trim();
|
title = stripTags(title, 0).trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
docData.clear();
|
docData.clear();
|
||||||
docData.setName(name);
|
docData.setName(name);
|
||||||
docData.setDate(date);
|
docData.setDate(date);
|
||||||
docData.setTitle(title);
|
docData.setTitle(title);
|
||||||
docData.setBody(stripTags(docBuf, mark).toString());
|
docData.setBody(stripTags(docBuf, mark));
|
||||||
return docData;
|
return docData;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,7 +59,7 @@ public class SearchWithSortTask extends ReadTask {
|
||||||
String typeString;
|
String typeString;
|
||||||
if (index != -1) {
|
if (index != -1) {
|
||||||
fieldName = field.substring(0, index);
|
fieldName = field.substring(0, index);
|
||||||
typeString = field.substring(1 + index, field.length());
|
typeString = field.substring(1 + index);
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
|
throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
|
||||||
}
|
}
|
||||||
|
|
|
@ -169,7 +169,7 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader {
|
||||||
if (type == TYPE_STRING) {
|
if (type == TYPE_STRING) {
|
||||||
byte[] bytes = new byte[scratch.length() - VALUE.length];
|
byte[] bytes = new byte[scratch.length() - VALUE.length];
|
||||||
System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
|
System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
|
||||||
visitor.stringField(fieldInfo, new String(bytes, 0, bytes.length, StandardCharsets.UTF_8));
|
visitor.stringField(fieldInfo, new String(bytes, StandardCharsets.UTF_8));
|
||||||
} else if (type == TYPE_BINARY) {
|
} else if (type == TYPE_BINARY) {
|
||||||
byte[] copy = new byte[scratch.length() - VALUE.length];
|
byte[] copy = new byte[scratch.length() - VALUE.length];
|
||||||
System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
|
System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
|
||||||
|
|
|
@ -380,7 +380,7 @@ public final class CodecUtil {
|
||||||
int suffixLength = in.readByte() & 0xFF;
|
int suffixLength = in.readByte() & 0xFF;
|
||||||
byte[] suffixBytes = new byte[suffixLength];
|
byte[] suffixBytes = new byte[suffixLength];
|
||||||
in.readBytes(suffixBytes, 0, suffixBytes.length);
|
in.readBytes(suffixBytes, 0, suffixBytes.length);
|
||||||
String suffix = new String(suffixBytes, 0, suffixBytes.length, StandardCharsets.UTF_8);
|
String suffix = new String(suffixBytes, StandardCharsets.UTF_8);
|
||||||
if (!suffix.equals(expectedSuffix)) {
|
if (!suffix.equals(expectedSuffix)) {
|
||||||
throw new CorruptIndexException(
|
throw new CorruptIndexException(
|
||||||
"file mismatch, expected suffix=" + expectedSuffix + ", got=" + suffix, in);
|
"file mismatch, expected suffix=" + expectedSuffix + ", got=" + suffix, in);
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene94;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.codecs.DocValuesFormat;
|
import org.apache.lucene.codecs.DocValuesFormat;
|
||||||
|
@ -111,6 +112,8 @@ import org.apache.lucene.store.IndexOutput;
|
||||||
* <li>0: EUCLIDEAN distance. ({@link VectorSimilarityFunction#EUCLIDEAN})
|
* <li>0: EUCLIDEAN distance. ({@link VectorSimilarityFunction#EUCLIDEAN})
|
||||||
* <li>1: DOT_PRODUCT similarity. ({@link VectorSimilarityFunction#DOT_PRODUCT})
|
* <li>1: DOT_PRODUCT similarity. ({@link VectorSimilarityFunction#DOT_PRODUCT})
|
||||||
* <li>2: COSINE similarity. ({@link VectorSimilarityFunction#COSINE})
|
* <li>2: COSINE similarity. ({@link VectorSimilarityFunction#COSINE})
|
||||||
|
* <li>3: MAXIMUM_INNER_PRODUCT similarity. ({@link
|
||||||
|
* VectorSimilarityFunction#MAXIMUM_INNER_PRODUCT})
|
||||||
* </ul>
|
* </ul>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
|
@ -284,10 +287,38 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException {
|
private static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException {
|
||||||
if (b < 0 || b >= VectorSimilarityFunction.values().length) {
|
try {
|
||||||
throw new CorruptIndexException("invalid distance function: " + b, input);
|
return distOrdToFunc(b);
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
throw new CorruptIndexException("invalid distance function: " + b, input, e);
|
||||||
}
|
}
|
||||||
return VectorSimilarityFunction.values()[b];
|
}
|
||||||
|
|
||||||
|
// List of vector similarity functions. This list is defined here, in order
|
||||||
|
// to avoid an undesirable dependency on the declaration and order of values
|
||||||
|
// in VectorSimilarityFunction. The list values and order have been chosen to
|
||||||
|
// match that of VectorSimilarityFunction in, at least, Lucene 9.10.
|
||||||
|
static final List<VectorSimilarityFunction> SIMILARITY_FUNCTIONS =
|
||||||
|
List.of(
|
||||||
|
VectorSimilarityFunction.EUCLIDEAN,
|
||||||
|
VectorSimilarityFunction.DOT_PRODUCT,
|
||||||
|
VectorSimilarityFunction.COSINE,
|
||||||
|
VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT);
|
||||||
|
|
||||||
|
static VectorSimilarityFunction distOrdToFunc(byte i) {
|
||||||
|
if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) {
|
||||||
|
throw new IllegalArgumentException("invalid distance function: " + i);
|
||||||
|
}
|
||||||
|
return SIMILARITY_FUNCTIONS.get(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
static byte distFuncToOrd(VectorSimilarityFunction func) {
|
||||||
|
for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) {
|
||||||
|
if (SIMILARITY_FUNCTIONS.get(i).equals(func)) {
|
||||||
|
return (byte) i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("invalid distance function: " + func);
|
||||||
}
|
}
|
||||||
|
|
||||||
static {
|
static {
|
||||||
|
@ -378,7 +409,7 @@ public final class Lucene94FieldInfosFormat extends FieldInfosFormat {
|
||||||
}
|
}
|
||||||
output.writeVInt(fi.getVectorDimension());
|
output.writeVInt(fi.getVectorDimension());
|
||||||
output.writeByte((byte) fi.getVectorEncoding().ordinal());
|
output.writeByte((byte) fi.getVectorEncoding().ordinal());
|
||||||
output.writeByte((byte) fi.getVectorSimilarityFunction().ordinal());
|
output.writeByte(distFuncToOrd(fi.getVectorSimilarityFunction()));
|
||||||
}
|
}
|
||||||
CodecUtil.writeFooter(output);
|
CodecUtil.writeFooter(output);
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@ import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import org.apache.lucene.codecs.CodecUtil;
|
import org.apache.lucene.codecs.CodecUtil;
|
||||||
import org.apache.lucene.codecs.FlatVectorsReader;
|
import org.apache.lucene.codecs.FlatVectorsReader;
|
||||||
|
@ -171,15 +172,24 @@ public final class Lucene99HnswVectorsReader extends KnnVectorsReader
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// List of vector similarity functions. This list is defined here, in order
|
||||||
|
// to avoid an undesirable dependency on the declaration and order of values
|
||||||
|
// in VectorSimilarityFunction. The list values and order must be identical
|
||||||
|
// to that of org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat#SIMILARITY_FUNCTIONS.
|
||||||
|
public static final List<VectorSimilarityFunction> SIMILARITY_FUNCTIONS =
|
||||||
|
List.of(
|
||||||
|
VectorSimilarityFunction.EUCLIDEAN,
|
||||||
|
VectorSimilarityFunction.DOT_PRODUCT,
|
||||||
|
VectorSimilarityFunction.COSINE,
|
||||||
|
VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT);
|
||||||
|
|
||||||
public static VectorSimilarityFunction readSimilarityFunction(DataInput input)
|
public static VectorSimilarityFunction readSimilarityFunction(DataInput input)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int similarityFunctionId = input.readInt();
|
int i = input.readInt();
|
||||||
if (similarityFunctionId < 0
|
if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) {
|
||||||
|| similarityFunctionId >= VectorSimilarityFunction.values().length) {
|
throw new IllegalArgumentException("invalid distance function: " + i);
|
||||||
throw new CorruptIndexException(
|
|
||||||
"Invalid similarity function id: " + similarityFunctionId, input);
|
|
||||||
}
|
}
|
||||||
return VectorSimilarityFunction.values()[similarityFunctionId];
|
return SIMILARITY_FUNCTIONS.get(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static VectorEncoding readVectorEncoding(DataInput input) throws IOException {
|
public static VectorEncoding readVectorEncoding(DataInput input) throws IOException {
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
package org.apache.lucene.codecs.lucene99;
|
package org.apache.lucene.codecs.lucene99;
|
||||||
|
|
||||||
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
|
||||||
|
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -33,6 +34,7 @@ import org.apache.lucene.index.IndexFileNames;
|
||||||
import org.apache.lucene.index.MergeState;
|
import org.apache.lucene.index.MergeState;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.Sorter;
|
import org.apache.lucene.index.Sorter;
|
||||||
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.search.TaskExecutor;
|
import org.apache.lucene.search.TaskExecutor;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
@ -436,7 +438,7 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
throws IOException {
|
throws IOException {
|
||||||
meta.writeInt(field.number);
|
meta.writeInt(field.number);
|
||||||
meta.writeInt(field.getVectorEncoding().ordinal());
|
meta.writeInt(field.getVectorEncoding().ordinal());
|
||||||
meta.writeInt(field.getVectorSimilarityFunction().ordinal());
|
meta.writeInt(distFuncToOrd(field.getVectorSimilarityFunction()));
|
||||||
meta.writeVLong(vectorIndexOffset);
|
meta.writeVLong(vectorIndexOffset);
|
||||||
meta.writeVLong(vectorIndexLength);
|
meta.writeVLong(vectorIndexLength);
|
||||||
meta.writeVInt(field.getVectorDimension());
|
meta.writeVInt(field.getVectorDimension());
|
||||||
|
@ -500,6 +502,15 @@ public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
|
||||||
IOUtils.close(meta, vectorIndex, flatVectorWriter);
|
IOUtils.close(meta, vectorIndex, flatVectorWriter);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int distFuncToOrd(VectorSimilarityFunction func) {
|
||||||
|
for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) {
|
||||||
|
if (SIMILARITY_FUNCTIONS.get(i).equals(func)) {
|
||||||
|
return (byte) i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("invalid distance function: " + func);
|
||||||
|
}
|
||||||
|
|
||||||
private static class FieldWriter<T> extends KnnFieldVectorsWriter<T> {
|
private static class FieldWriter<T> extends KnnFieldVectorsWriter<T> {
|
||||||
|
|
||||||
private static final long SHALLOW_SIZE =
|
private static final long SHALLOW_SIZE =
|
||||||
|
|
|
@ -384,7 +384,7 @@ final class DocumentsWriter implements Closeable, Accountable {
|
||||||
ensureOpen();
|
ensureOpen();
|
||||||
boolean hasEvents = false;
|
boolean hasEvents = false;
|
||||||
while (flushControl.anyStalledThreads()
|
while (flushControl.anyStalledThreads()
|
||||||
|| (flushControl.numQueuedFlushes() > 0 && config.checkPendingFlushOnUpdate)) {
|
|| (config.checkPendingFlushOnUpdate && flushControl.numQueuedFlushes() > 0)) {
|
||||||
// Help out flushing any queued DWPTs so we can un-stall:
|
// Help out flushing any queued DWPTs so we can un-stall:
|
||||||
// Try pickup pending threads here if possible
|
// Try pickup pending threads here if possible
|
||||||
// no need to loop over the next pending flushes... doFlush will take care of this
|
// no need to loop over the next pending flushes... doFlush will take care of this
|
||||||
|
|
|
@ -191,7 +191,7 @@ public final class IndexFileNames {
|
||||||
if (idx == -1) {
|
if (idx == -1) {
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
return filename.substring(idx + 1, filename.length());
|
return filename.substring(idx + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -677,16 +677,11 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
||||||
public void testMaxTokenLengthDefault() throws Exception {
|
public void testMaxTokenLengthDefault() throws Exception {
|
||||||
StandardAnalyzer a = new StandardAnalyzer();
|
StandardAnalyzer a = new StandardAnalyzer();
|
||||||
|
|
||||||
StringBuilder bToken = new StringBuilder();
|
|
||||||
// exact max length:
|
// exact max length:
|
||||||
for (int i = 0; i < StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; i++) {
|
String bString = "b".repeat(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
|
||||||
bToken.append('b');
|
|
||||||
}
|
|
||||||
|
|
||||||
String bString = bToken.toString();
|
|
||||||
// first bString is exact max default length; next one is 1 too long
|
// first bString is exact max default length; next one is 1 too long
|
||||||
String input = "x " + bString + " " + bString + "b";
|
String input = "x " + bString + " " + bString + "b";
|
||||||
assertAnalyzesTo(a, input.toString(), new String[] {"x", bString, bString, "b"});
|
assertAnalyzesTo(a, input, new String[] {"x", bString, bString, "b"});
|
||||||
a.close();
|
a.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.lucene94;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import org.apache.lucene.codecs.Codec;
|
||||||
|
import org.apache.lucene.index.VectorSimilarityFunction;
|
||||||
|
import org.apache.lucene.tests.index.BaseFieldInfoFormatTestCase;
|
||||||
|
import org.apache.lucene.tests.util.TestUtil;
|
||||||
|
|
||||||
|
public class TestLucene94FieldInfosFormat extends BaseFieldInfoFormatTestCase {
|
||||||
|
@Override
|
||||||
|
protected Codec getCodec() {
|
||||||
|
return TestUtil.getDefaultCodec();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensures that all expected vector similarity functions are translatable
|
||||||
|
// in the format.
|
||||||
|
public void testVectorSimilarityFuncs() {
|
||||||
|
// This does not necessarily have to be all similarity functions, but
|
||||||
|
// differences should be considered carefully.
|
||||||
|
var expectedValues = Arrays.stream(VectorSimilarityFunction.values()).toList();
|
||||||
|
|
||||||
|
assertEquals(Lucene94FieldInfosFormat.SIMILARITY_FUNCTIONS, expectedValues);
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.codecs.lucene99;
|
||||||
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.lucene.codecs.Codec;
|
import org.apache.lucene.codecs.Codec;
|
||||||
import org.apache.lucene.codecs.FilterCodec;
|
import org.apache.lucene.codecs.FilterCodec;
|
||||||
|
@ -186,4 +187,13 @@ public class TestLucene99HnswQuantizedVectorsFormat extends BaseKnnVectorsFormat
|
||||||
new Lucene99HnswScalarQuantizedVectorsFormat(
|
new Lucene99HnswScalarQuantizedVectorsFormat(
|
||||||
20, 100, 1, null, new SameThreadExecutorService()));
|
20, 100, 1, null, new SameThreadExecutorService()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ensures that all expected vector similarity functions are translatable
|
||||||
|
// in the format.
|
||||||
|
public void testVectorSimilarityFuncs() {
|
||||||
|
// This does not necessarily have to be all similarity functions, but
|
||||||
|
// differences should be considered carefully.
|
||||||
|
var expectedValues = Arrays.stream(VectorSimilarityFunction.values()).toList();
|
||||||
|
assertEquals(Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS, expectedValues);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.index;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
@ -289,12 +288,10 @@ public class TestPayloads extends LuceneTestCase {
|
||||||
reader.close();
|
reader.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
static final Charset utf8 = StandardCharsets.UTF_8;
|
|
||||||
|
|
||||||
private void generateRandomData(byte[] data) {
|
private void generateRandomData(byte[] data) {
|
||||||
// this test needs the random data to be valid unicode
|
// this test needs the random data to be valid unicode
|
||||||
String s = TestUtil.randomFixedByteLengthUnicodeString(random(), data.length);
|
String s = TestUtil.randomFixedByteLengthUnicodeString(random(), data.length);
|
||||||
byte[] b = s.getBytes(utf8);
|
byte[] b = s.getBytes(StandardCharsets.UTF_8);
|
||||||
assert b.length == data.length;
|
assert b.length == data.length;
|
||||||
System.arraycopy(b, 0, data, 0, b.length);
|
System.arraycopy(b, 0, data, 0, b.length);
|
||||||
}
|
}
|
||||||
|
@ -493,7 +490,7 @@ public class TestPayloads extends LuceneTestCase {
|
||||||
this.pool = pool;
|
this.pool = pool;
|
||||||
payload = pool.get();
|
payload = pool.get();
|
||||||
generateRandomData(payload);
|
generateRandomData(payload);
|
||||||
term = new String(payload, 0, payload.length, utf8);
|
term = new String(payload, StandardCharsets.UTF_8);
|
||||||
first = true;
|
first = true;
|
||||||
payloadAtt = addAttribute(PayloadAttribute.class);
|
payloadAtt = addAttribute(PayloadAttribute.class);
|
||||||
termAtt = addAttribute(CharTermAttribute.class);
|
termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
|
|
@ -107,7 +107,7 @@ public class TestPrefixRandom extends LuceneTestCase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString(String field) {
|
public String toString(String field) {
|
||||||
return field.toString() + ":" + prefix.toString();
|
return field + ":" + prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -143,7 +143,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString(String field) {
|
public String toString(String field) {
|
||||||
return field.toString() + automaton.toString();
|
return field + automaton;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -213,10 +213,10 @@ public class TestLevenshteinAutomata extends LuceneTestCase {
|
||||||
List<Automaton> list = new ArrayList<>();
|
List<Automaton> list = new ArrayList<>();
|
||||||
for (int i = 0; i < s.length() - 1; i++) {
|
for (int i = 0; i < s.length() - 1; i++) {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
sb.append(s.substring(0, i));
|
sb.append(s, 0, i);
|
||||||
sb.append(s.charAt(i + 1));
|
sb.append(s.charAt(i + 1));
|
||||||
sb.append(s.charAt(i));
|
sb.append(s.charAt(i));
|
||||||
sb.append(s.substring(i + 2, s.length()));
|
sb.append(s, i + 2, s.length());
|
||||||
String st = sb.toString();
|
String st = sb.toString();
|
||||||
if (!st.equals(s)) {
|
if (!st.equals(s)) {
|
||||||
list.add(Automata.makeString(st));
|
list.add(Automata.makeString(st));
|
||||||
|
|
|
@ -119,7 +119,7 @@ public class TestRegExp extends LuceneTestCase {
|
||||||
|
|
||||||
// Add any head to the result, unchanged
|
// Add any head to the result, unchanged
|
||||||
if (substitutionPoint > 0) {
|
if (substitutionPoint > 0) {
|
||||||
result.append(docValue.substring(0, substitutionPoint));
|
result.append(docValue, 0, substitutionPoint);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Modify the middle...
|
// Modify the middle...
|
||||||
|
|
|
@ -1398,7 +1398,7 @@ public class UnifiedHighlighter {
|
||||||
curValueBuilder.append(curValue);
|
curValueBuilder.append(curValue);
|
||||||
}
|
}
|
||||||
curValueBuilder.append(valueSeparator);
|
curValueBuilder.append(valueSeparator);
|
||||||
curValueBuilder.append(value.substring(0, Math.min(lengthBudget - 1, value.length())));
|
curValueBuilder.append(value, 0, Math.min(lengthBudget - 1, value.length()));
|
||||||
values[currentField] = curValueBuilder;
|
values[currentField] = curValueBuilder;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -49,7 +49,7 @@ import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CloseableThreadLocal;
|
import org.apache.lucene.util.CloseableThreadLocal;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntroSorter;
|
import org.apache.lucene.util.IntroSelector;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.packed.PackedInts;
|
import org.apache.lucene.util.packed.PackedInts;
|
||||||
|
|
||||||
|
@ -251,17 +251,17 @@ public final class BPIndexReorderer {
|
||||||
private class IndexReorderingTask extends BaseRecursiveAction {
|
private class IndexReorderingTask extends BaseRecursiveAction {
|
||||||
|
|
||||||
private final IntsRef docIDs;
|
private final IntsRef docIDs;
|
||||||
private final float[] gains;
|
private final float[] biases;
|
||||||
private final CloseableThreadLocal<PerThreadState> threadLocal;
|
private final CloseableThreadLocal<PerThreadState> threadLocal;
|
||||||
|
|
||||||
IndexReorderingTask(
|
IndexReorderingTask(
|
||||||
IntsRef docIDs,
|
IntsRef docIDs,
|
||||||
float[] gains,
|
float[] biases,
|
||||||
CloseableThreadLocal<PerThreadState> threadLocal,
|
CloseableThreadLocal<PerThreadState> threadLocal,
|
||||||
int depth) {
|
int depth) {
|
||||||
super(depth);
|
super(depth);
|
||||||
this.docIDs = docIDs;
|
this.docIDs = docIDs;
|
||||||
this.gains = gains;
|
this.biases = biases;
|
||||||
this.threadLocal = threadLocal;
|
this.threadLocal = threadLocal;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -293,14 +293,14 @@ public final class BPIndexReorderer {
|
||||||
assert sorted(docIDs);
|
assert sorted(docIDs);
|
||||||
}
|
}
|
||||||
|
|
||||||
int leftSize = docIDs.length / 2;
|
int halfLength = docIDs.length / 2;
|
||||||
if (leftSize < minPartitionSize) {
|
if (halfLength < minPartitionSize) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
int rightSize = docIDs.length - leftSize;
|
IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, halfLength);
|
||||||
IntsRef left = new IntsRef(docIDs.ints, docIDs.offset, leftSize);
|
IntsRef right =
|
||||||
IntsRef right = new IntsRef(docIDs.ints, docIDs.offset + leftSize, rightSize);
|
new IntsRef(docIDs.ints, docIDs.offset + halfLength, docIDs.length - halfLength);
|
||||||
|
|
||||||
PerThreadState state = threadLocal.get();
|
PerThreadState state = threadLocal.get();
|
||||||
ForwardIndex forwardIndex = state.forwardIndex;
|
ForwardIndex forwardIndex = state.forwardIndex;
|
||||||
|
@ -313,7 +313,9 @@ public final class BPIndexReorderer {
|
||||||
for (int iter = 0; iter < maxIters; ++iter) {
|
for (int iter = 0; iter < maxIters; ++iter) {
|
||||||
boolean moved;
|
boolean moved;
|
||||||
try {
|
try {
|
||||||
moved = shuffle(forwardIndex, left, right, leftDocFreqs, rightDocFreqs, gains, iter);
|
moved =
|
||||||
|
shuffle(
|
||||||
|
forwardIndex, docIDs, right.offset, leftDocFreqs, rightDocFreqs, biases, iter);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new UncheckedIOException(e);
|
throw new UncheckedIOException(e);
|
||||||
}
|
}
|
||||||
|
@ -322,10 +324,11 @@ public final class BPIndexReorderer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// It is fine for all tasks to share the same docs / gains array since they all work on
|
// It is fine for all tasks to share the same docs / biases array since they all work on
|
||||||
// different slices of the array at a given point in time.
|
// different slices of the array at a given point in time.
|
||||||
IndexReorderingTask leftTask = new IndexReorderingTask(left, gains, threadLocal, depth + 1);
|
IndexReorderingTask leftTask = new IndexReorderingTask(left, biases, threadLocal, depth + 1);
|
||||||
IndexReorderingTask rightTask = new IndexReorderingTask(right, gains, threadLocal, depth + 1);
|
IndexReorderingTask rightTask =
|
||||||
|
new IndexReorderingTask(right, biases, threadLocal, depth + 1);
|
||||||
|
|
||||||
if (shouldFork(docIDs.length, docIDs.ints.length)) {
|
if (shouldFork(docIDs.length, docIDs.ints.length)) {
|
||||||
invokeAll(leftTask, rightTask);
|
invokeAll(leftTask, rightTask);
|
||||||
|
@ -341,116 +344,94 @@ public final class BPIndexReorderer {
|
||||||
*/
|
*/
|
||||||
private boolean shuffle(
|
private boolean shuffle(
|
||||||
ForwardIndex forwardIndex,
|
ForwardIndex forwardIndex,
|
||||||
IntsRef left,
|
IntsRef docIDs,
|
||||||
IntsRef right,
|
int midPoint,
|
||||||
int[] leftDocFreqs,
|
int[] leftDocFreqs,
|
||||||
int[] rightDocFreqs,
|
int[] rightDocFreqs,
|
||||||
float[] gains,
|
float[] biases,
|
||||||
int iter)
|
int iter)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
assert left.ints == right.ints;
|
|
||||||
assert left.offset + left.length == right.offset;
|
|
||||||
|
|
||||||
// Computing gains is typically a bottleneck, because each iteration needs to iterate over all
|
// Computing biases is typically a bottleneck, because each iteration needs to iterate over
|
||||||
// postings to recompute gains, and the total number of postings is usually one order of
|
// all postings to recompute biases, and the total number of postings is usually one order of
|
||||||
// magnitude or more larger than the number of docs. So we try to parallelize it.
|
// magnitude or more larger than the number of docs. So we try to parallelize it.
|
||||||
ComputeGainsTask leftGainsTask =
|
new ComputeBiasTask(
|
||||||
new ComputeGainsTask(
|
docIDs.ints,
|
||||||
left.ints,
|
biases,
|
||||||
gains,
|
docIDs.offset,
|
||||||
left.offset,
|
docIDs.offset + docIDs.length,
|
||||||
left.offset + left.length,
|
|
||||||
leftDocFreqs,
|
leftDocFreqs,
|
||||||
rightDocFreqs,
|
rightDocFreqs,
|
||||||
threadLocal,
|
threadLocal,
|
||||||
depth);
|
depth)
|
||||||
ComputeGainsTask rightGainsTask =
|
.compute();
|
||||||
new ComputeGainsTask(
|
|
||||||
right.ints,
|
float maxLeftBias = Float.NEGATIVE_INFINITY;
|
||||||
gains,
|
for (int i = docIDs.offset; i < midPoint; ++i) {
|
||||||
right.offset,
|
maxLeftBias = Math.max(maxLeftBias, biases[i]);
|
||||||
right.offset + right.length,
|
}
|
||||||
rightDocFreqs,
|
float minRightBias = Float.POSITIVE_INFINITY;
|
||||||
leftDocFreqs,
|
for (int i = midPoint, end = docIDs.offset + docIDs.length; i < end; ++i) {
|
||||||
threadLocal,
|
minRightBias = Math.min(minRightBias, biases[i]);
|
||||||
depth);
|
}
|
||||||
if (shouldFork(docIDs.length, docIDs.ints.length)) {
|
float gain = maxLeftBias - minRightBias;
|
||||||
invokeAll(leftGainsTask, rightGainsTask);
|
// This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for
|
||||||
} else {
|
// Bipartite Graph Partitioning" by comparing the gain of swapping the doc from the left side
|
||||||
leftGainsTask.compute();
|
// that is most attracted to the right and the doc from the right side that is most attracted
|
||||||
rightGainsTask.compute();
|
// to the left against `iter` rather than zero.
|
||||||
|
if (gain <= iter) {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
class ByDescendingGainSorter extends IntroSorter {
|
new IntroSelector() {
|
||||||
|
|
||||||
int pivotDoc;
|
int pivotDoc;
|
||||||
float pivotGain;
|
float pivotBias;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void setPivot(int i) {
|
protected void setPivot(int i) {
|
||||||
pivotDoc = left.ints[i];
|
pivotDoc = docIDs.ints[i];
|
||||||
pivotGain = gains[i];
|
pivotBias = biases[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected int comparePivot(int j) {
|
protected int comparePivot(int j) {
|
||||||
// Compare in reverse order to get a descending sort
|
int cmp = Float.compare(pivotBias, biases[j]);
|
||||||
int cmp = Float.compare(gains[j], pivotGain);
|
|
||||||
if (cmp == 0) {
|
if (cmp == 0) {
|
||||||
// Tie break on the doc ID to preserve doc ID ordering as much as possible
|
// Tie break on the doc ID to preserve doc ID ordering as much as possible
|
||||||
cmp = pivotDoc - left.ints[j];
|
cmp = pivotDoc - docIDs.ints[j];
|
||||||
}
|
}
|
||||||
return cmp;
|
return cmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void swap(int i, int j) {
|
protected void swap(int i, int j) {
|
||||||
int tmpDoc = left.ints[i];
|
float tmpBias = biases[i];
|
||||||
left.ints[i] = left.ints[j];
|
biases[i] = biases[j];
|
||||||
left.ints[j] = tmpDoc;
|
biases[j] = tmpBias;
|
||||||
|
|
||||||
float tmpGain = gains[i];
|
if (i < midPoint == j < midPoint) {
|
||||||
gains[i] = gains[j];
|
int tmpDoc = docIDs.ints[i];
|
||||||
gains[j] = tmpGain;
|
docIDs.ints[i] = docIDs.ints[j];
|
||||||
}
|
docIDs.ints[j] = tmpDoc;
|
||||||
}
|
|
||||||
|
|
||||||
Runnable leftSorter =
|
|
||||||
() -> new ByDescendingGainSorter().sort(left.offset, left.offset + left.length);
|
|
||||||
Runnable rightSorter =
|
|
||||||
() -> new ByDescendingGainSorter().sort(right.offset, right.offset + right.length);
|
|
||||||
|
|
||||||
if (shouldFork(docIDs.length, docIDs.ints.length)) {
|
|
||||||
// TODO: run it on more than 2 threads at most
|
|
||||||
invokeAll(adapt(leftSorter), adapt(rightSorter));
|
|
||||||
} else {
|
} else {
|
||||||
leftSorter.run();
|
// If we're swapping docs across the left and right sides, we need to keep doc freqs
|
||||||
rightSorter.run();
|
// up-to-date.
|
||||||
|
int left = Math.min(i, j);
|
||||||
|
int right = Math.max(i, j);
|
||||||
|
try {
|
||||||
|
swapDocsAndFreqs(docIDs.ints, left, right, forwardIndex, leftDocFreqs, rightDocFreqs);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new UncheckedIOException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < left.length; ++i) {
|
|
||||||
// This uses the simulated annealing proposed by Mackenzie et al in "Tradeoff Options for
|
|
||||||
// Bipartite Graph Partitioning" by comparing the gain against `iter` rather than zero.
|
|
||||||
if (gains[left.offset + i] + gains[right.offset + i] <= iter) {
|
|
||||||
if (i == 0) {
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
swap(
|
|
||||||
left.ints,
|
|
||||||
left.offset + i,
|
|
||||||
right.offset + i,
|
|
||||||
forwardIndex,
|
|
||||||
leftDocFreqs,
|
|
||||||
rightDocFreqs);
|
|
||||||
}
|
}
|
||||||
|
}.select(docIDs.offset, docIDs.offset + docIDs.length, midPoint);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void swap(
|
private static void swapDocsAndFreqs(
|
||||||
int[] docs,
|
int[] docs,
|
||||||
int left,
|
int left,
|
||||||
int right,
|
int right,
|
||||||
|
@ -492,19 +473,19 @@ public final class BPIndexReorderer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class ComputeGainsTask extends BaseRecursiveAction {
|
private class ComputeBiasTask extends BaseRecursiveAction {
|
||||||
|
|
||||||
private final int[] docs;
|
private final int[] docs;
|
||||||
private final float[] gains;
|
private final float[] biases;
|
||||||
private final int from;
|
private final int from;
|
||||||
private final int to;
|
private final int to;
|
||||||
private final int[] fromDocFreqs;
|
private final int[] fromDocFreqs;
|
||||||
private final int[] toDocFreqs;
|
private final int[] toDocFreqs;
|
||||||
private final CloseableThreadLocal<PerThreadState> threadLocal;
|
private final CloseableThreadLocal<PerThreadState> threadLocal;
|
||||||
|
|
||||||
ComputeGainsTask(
|
ComputeBiasTask(
|
||||||
int[] docs,
|
int[] docs,
|
||||||
float[] gains,
|
float[] biases,
|
||||||
int from,
|
int from,
|
||||||
int to,
|
int to,
|
||||||
int[] fromDocFreqs,
|
int[] fromDocFreqs,
|
||||||
|
@ -513,7 +494,7 @@ public final class BPIndexReorderer {
|
||||||
int depth) {
|
int depth) {
|
||||||
super(depth);
|
super(depth);
|
||||||
this.docs = docs;
|
this.docs = docs;
|
||||||
this.gains = gains;
|
this.biases = biases;
|
||||||
this.from = from;
|
this.from = from;
|
||||||
this.to = to;
|
this.to = to;
|
||||||
this.fromDocFreqs = fromDocFreqs;
|
this.fromDocFreqs = fromDocFreqs;
|
||||||
|
@ -527,15 +508,15 @@ public final class BPIndexReorderer {
|
||||||
if (problemSize > 1 && shouldFork(problemSize, docs.length)) {
|
if (problemSize > 1 && shouldFork(problemSize, docs.length)) {
|
||||||
final int mid = (from + to) >>> 1;
|
final int mid = (from + to) >>> 1;
|
||||||
invokeAll(
|
invokeAll(
|
||||||
new ComputeGainsTask(
|
new ComputeBiasTask(
|
||||||
docs, gains, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth),
|
docs, biases, from, mid, fromDocFreqs, toDocFreqs, threadLocal, depth),
|
||||||
new ComputeGainsTask(
|
new ComputeBiasTask(
|
||||||
docs, gains, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth));
|
docs, biases, mid, to, fromDocFreqs, toDocFreqs, threadLocal, depth));
|
||||||
} else {
|
} else {
|
||||||
ForwardIndex forwardIndex = threadLocal.get().forwardIndex;
|
ForwardIndex forwardIndex = threadLocal.get().forwardIndex;
|
||||||
try {
|
try {
|
||||||
for (int i = from; i < to; ++i) {
|
for (int i = from; i < to; ++i) {
|
||||||
gains[i] = computeGain(docs[i], forwardIndex, fromDocFreqs, toDocFreqs);
|
biases[i] = computeBias(docs[i], forwardIndex, fromDocFreqs, toDocFreqs);
|
||||||
}
|
}
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new UncheckedIOException(e);
|
throw new UncheckedIOException(e);
|
||||||
|
@ -547,11 +528,11 @@ public final class BPIndexReorderer {
|
||||||
* Compute a float that is negative when a document is attracted to the left and positive
|
* Compute a float that is negative when a document is attracted to the left and positive
|
||||||
* otherwise.
|
* otherwise.
|
||||||
*/
|
*/
|
||||||
private static float computeGain(
|
private static float computeBias(
|
||||||
int docID, ForwardIndex forwardIndex, int[] fromDocFreqs, int[] toDocFreqs)
|
int docID, ForwardIndex forwardIndex, int[] fromDocFreqs, int[] toDocFreqs)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
forwardIndex.seek(docID);
|
forwardIndex.seek(docID);
|
||||||
double gain = 0;
|
double bias = 0;
|
||||||
for (IntsRef terms = forwardIndex.nextTerms();
|
for (IntsRef terms = forwardIndex.nextTerms();
|
||||||
terms.length != 0;
|
terms.length != 0;
|
||||||
terms = forwardIndex.nextTerms()) {
|
terms = forwardIndex.nextTerms()) {
|
||||||
|
@ -561,12 +542,12 @@ public final class BPIndexReorderer {
|
||||||
final int toDocFreq = toDocFreqs[termID];
|
final int toDocFreq = toDocFreqs[termID];
|
||||||
assert fromDocFreq >= 0;
|
assert fromDocFreq >= 0;
|
||||||
assert toDocFreq >= 0;
|
assert toDocFreq >= 0;
|
||||||
gain +=
|
bias +=
|
||||||
(toDocFreq == 0 ? 0 : fastLog2(toDocFreq))
|
(toDocFreq == 0 ? 0 : fastLog2(toDocFreq))
|
||||||
- (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq));
|
- (fromDocFreq == 0 ? 0 : fastLog2(fromDocFreq));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return (float) gain;
|
return (float) bias;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -869,7 +850,7 @@ public final class BPIndexReorderer {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static long docRAMRequirements(int maxDoc) {
|
private static long docRAMRequirements(int maxDoc) {
|
||||||
// We need one int per doc for the doc map, plus one float to store the gain associated with
|
// We need one int per doc for the doc map, plus one float to store the bias associated with
|
||||||
// this doc.
|
// this doc.
|
||||||
return 2L * Integer.BYTES * maxDoc;
|
return 2L * Integer.BYTES * maxDoc;
|
||||||
}
|
}
|
||||||
|
|
|
@ -114,7 +114,7 @@ public class BooleanQueryTestFacade {
|
||||||
public void doTest() throws Exception {
|
public void doTest() throws Exception {
|
||||||
|
|
||||||
if (verbose) {
|
if (verbose) {
|
||||||
System.out.println("");
|
System.out.println();
|
||||||
System.out.println("Query: " + queryText);
|
System.out.println("Query: " + queryText);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -113,7 +113,7 @@ public class RecursivePrefixTreeStrategy extends PrefixTreeStrategy {
|
||||||
if (pointsOnly) str.append(",pointsOnly");
|
if (pointsOnly) str.append(",pointsOnly");
|
||||||
if (pruneLeafyBranches) str.append(",pruneLeafyBranches");
|
if (pruneLeafyBranches) str.append(",pruneLeafyBranches");
|
||||||
if (prefixGridScanLevel != grid.getMaxLevels() - 4)
|
if (prefixGridScanLevel != grid.getMaxLevels() - 4)
|
||||||
str.append(",prefixGridScanLevel:").append("").append(prefixGridScanLevel);
|
str.append(",prefixGridScanLevel:").append(prefixGridScanLevel);
|
||||||
if (!multiOverlappingIndexedShapes) str.append(",!multiOverlappingIndexedShapes");
|
if (!multiOverlappingIndexedShapes) str.append(",!multiOverlappingIndexedShapes");
|
||||||
return str.append(')').toString();
|
return str.append(')').toString();
|
||||||
}
|
}
|
||||||
|
|
|
@ -927,7 +927,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
sb.append("<b>");
|
sb.append("<b>");
|
||||||
sb.append(surface.substring(0, prefixToken.length()));
|
sb.append(surface, 0, prefixToken.length());
|
||||||
sb.append("</b>");
|
sb.append("</b>");
|
||||||
sb.append(surface.substring(prefixToken.length()));
|
sb.append(surface.substring(prefixToken.length()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -892,7 +892,7 @@ public class TestAnalyzingInfixSuggester extends LuceneTestCase {
|
||||||
b.append("<b>");
|
b.append("<b>");
|
||||||
b.append(queryTerm);
|
b.append(queryTerm);
|
||||||
b.append("</b>");
|
b.append("</b>");
|
||||||
b.append(inputTerm.substring(queryTerm.length(), inputTerm.length()));
|
b.append(inputTerm.substring(queryTerm.length()));
|
||||||
matched = true;
|
matched = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -793,6 +793,10 @@ public class CheckHits {
|
||||||
assertTrue(s2 == null || s2.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
|
assertTrue(s2 == null || s2.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (s2 == null) {
|
||||||
|
assertTrue(s1.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
TwoPhaseIterator twoPhase1 = s1.twoPhaseIterator();
|
TwoPhaseIterator twoPhase1 = s1.twoPhaseIterator();
|
||||||
TwoPhaseIterator twoPhase2 = s2.twoPhaseIterator();
|
TwoPhaseIterator twoPhase2 = s2.twoPhaseIterator();
|
||||||
DocIdSetIterator approx1 = twoPhase1 == null ? s1.iterator() : twoPhase1.approximation();
|
DocIdSetIterator approx1 = twoPhase1 == null ? s1.iterator() : twoPhase1.approximation();
|
||||||
|
|
|
@ -166,7 +166,6 @@ public final class English {
|
||||||
result.append("one ");
|
result.append("one ");
|
||||||
break;
|
break;
|
||||||
case 0:
|
case 0:
|
||||||
result.append("");
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -307,7 +307,7 @@ public class LineFileDocs implements Closeable {
|
||||||
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
|
||||||
}
|
}
|
||||||
|
|
||||||
docState.body.setStringValue(line.substring(1 + spot2, line.length()));
|
docState.body.setStringValue(line.substring(1 + spot2));
|
||||||
final String title = line.substring(0, spot);
|
final String title = line.substring(0, spot);
|
||||||
docState.title.setStringValue(title);
|
docState.title.setStringValue(title);
|
||||||
docState.titleTokenized.setStringValue(title);
|
docState.titleTokenized.setStringValue(title);
|
||||||
|
|
Loading…
Reference in New Issue