LUCENE-9959: Add non thread local based API for term vector reader usage (#180)

This commit is contained in:
zacharymorn 2021-07-12 23:34:52 -07:00 committed by GitHub
parent 15034f6c90
commit 180cfa241b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 255 additions and 135 deletions

View File

@ -124,6 +124,9 @@ API Changes
* LUCENE-9998: Remove unused parameter fis in StoredFieldsWriter.finish() and TermVectorsWriter.finish(),
including those subclasses. (kkewwei)
* LUCENE-9959: Add non thread local based API for term vector reader usage. (Zach Chen, Adrien Grand,
David Smiley, Robert Muir, Mike Drob)
Improvements
* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)

View File

@ -18,26 +18,18 @@ package org.apache.lucene.codecs;
import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; // javadocs
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.TermVectors;
/**
* Codec API for reading term vectors:
*
* @lucene.experimental
*/
public abstract class TermVectorsReader implements Cloneable, Closeable {
public abstract class TermVectorsReader extends TermVectors implements Cloneable, Closeable {
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
protected TermVectorsReader() {}
/**
* Returns term vectors for this document, or null if term vectors were not indexed. If offsets
* are available they are in an {@link OffsetAttribute} available from the {@link
* org.apache.lucene.index.PostingsEnum}.
*/
public abstract Fields get(int doc) throws IOException;
/**
* Checks consistency of this reader.
*

View File

@ -112,10 +112,29 @@ public abstract class BaseCompositeReader<R extends IndexReader> extends Composi
}
@Override
public final Fields getTermVectors(int docID) throws IOException {
ensureOpen();
final int i = readerIndex(docID); // find subreader num
return subReaders[i].getTermVectors(docID - starts[i]); // dispatch to subreader
public final TermVectors getTermVectorsReader() {
  // Per-subreader cache: a subreader's getTermVectorsReader() may clone a new
  // instance on every call, so each subreader's TermVectors is fetched at most
  // once and reused for all subsequent docs that fall in that segment.
  TermVectors[] termVectors = new TermVectors[subReaders.length];
  return new TermVectors() {
    @Override
    public Fields get(int doc) throws IOException {
      ensureOpen();
      final int i = readerIndex(doc); // find subreader num
      if (termVectors[i] != null) {
        return termVectors[i].get(doc - starts[i]); // dispatch to subreader
      } else {
        TermVectors reader = subReaders[i].getTermVectorsReader();
        if (reader != null) {
          // getTermVectorsReader() may clone a new instance each call, hence we save
          // it into the array to avoid re-cloning on repeated calls for this subreader
          termVectors[i] = reader;
          return reader.get(doc - starts[i]);
        }
        return null;
      }
    }
  };
}
@Override

View File

@ -41,10 +41,11 @@ public abstract class CodecReader extends LeafReader {
public abstract StoredFieldsReader getFieldsReader();
/**
* Expert: retrieve thread-private TermVectorsReader
* Expert: retrieve TermVectorsReader
*
* @lucene.internal
*/
@Override
public abstract TermVectorsReader getTermVectorsReader();
/**
@ -88,16 +89,6 @@ public abstract class CodecReader extends LeafReader {
getFieldsReader().visitDocument(docID, visitor);
}
@Override
public final Fields getTermVectors(int docID) throws IOException {
TermVectorsReader termVectorsReader = getTermVectorsReader();
if (termVectorsReader == null) {
return null;
}
checkBounds(docID);
return termVectorsReader.get(docID);
}
private void checkBounds(int docID) {
Objects.checkIndex(docID, maxDoc());
}

View File

@ -69,7 +69,7 @@ abstract class DocValuesLeafReader extends LeafReader {
}
@Override
public final Fields getTermVectors(int docID) throws IOException {
/**
 * Always throws {@link UnsupportedOperationException}: this doc-values-only reader never
 * exposes term vectors.
 */
public TermVectors getTermVectorsReader() {
  // Include a message so the failure is self-explanatory to callers.
  throw new UnsupportedOperationException("term vectors are not supported by this reader");
}

View File

@ -351,9 +351,8 @@ public abstract class FilterLeafReader extends LeafReader {
}
@Override
public Fields getTermVectors(int docID) throws IOException {
ensureOpen();
return in.getTermVectors(docID);
/** Delegates term vector access straight to the wrapped reader. */
public TermVectors getTermVectorsReader() {
  final TermVectors delegated = in.getTermVectorsReader();
  return delegated;
}
@Override

View File

@ -307,8 +307,21 @@ public abstract class IndexReader implements Closeable {
/**
* Retrieve term vectors for this document, or null if term vectors were not indexed. The returned
* Fields instance acts like a single-document inverted index (the docID will be 0).
*
* @deprecated Use {@link IndexReader#getTermVectorsReader} instead.
*/
public abstract Fields getTermVectors(int docID) throws IOException;
@Deprecated
public final Fields getTermVectors(int docID) throws IOException {
  // Delegate to the non-thread-local TermVectors API; null when term vectors
  // were not indexed for this reader.
  TermVectors termVectors = getTermVectorsReader();
  if (termVectors != null) {
    return termVectors.get(docID);
  }
  return null;
}
/**
 * Get a {@link TermVectors} view of this index, or null if term vectors were not indexed.
 * Unlike the deprecated thread-local based accessor, the returned instance belongs to the
 * caller.
 */
public abstract TermVectors getTermVectorsReader();
/**
* Retrieve term vector for this document and field, or null if term vectors were not indexed. The

View File

@ -184,13 +184,18 @@ class MergeReaderWrapper extends LeafReader {
}
@Override
public Fields getTermVectors(int docID) throws IOException {
ensureOpen();
checkBounds(docID);
if (vectors == null) {
return null;
}
return vectors.get(docID);
public TermVectors getTermVectorsReader() {
  // Wrap this merge wrapper's (possibly absent) vectors reader. Bounds are checked
  // before the null check so an out-of-range docID fails fast even when term
  // vectors were not indexed.
  return new TermVectors() {
    @Override
    public Fields get(int docID) throws IOException {
      ensureOpen();
      checkBounds(docID);
      if (vectors == null) {
        // Term vectors were not indexed for this segment.
        return null;
      }
      return vectors.get(docID);
    }
  };
}
@Override

View File

@ -300,21 +300,26 @@ public class ParallelLeafReader extends LeafReader {
}
@Override
public Fields getTermVectors(int docID) throws IOException {
ensureOpen();
ParallelFields fields = null;
for (Map.Entry<String, LeafReader> ent : tvFieldToReader.entrySet()) {
String fieldName = ent.getKey();
Terms vector = ent.getValue().getTermVector(docID, fieldName);
if (vector != null) {
if (fields == null) {
fields = new ParallelFields();
public TermVectors getTermVectorsReader() {
return new TermVectors() {
@Override
public Fields get(int doc) throws IOException {
ensureOpen();
ParallelFields fields = null;
for (Map.Entry<String, LeafReader> ent : tvFieldToReader.entrySet()) {
String fieldName = ent.getKey();
Terms vector = ent.getValue().getTermVector(doc, fieldName);
if (vector != null) {
if (fields == null) {
fields = new ParallelFields();
}
fields.addField(fieldName, vector);
}
}
fields.addField(fieldName, vector);
}
}
return fields;
return fields;
}
};
}
@Override

View File

@ -57,7 +57,7 @@ final class SegmentCoreReaders {
final NormsProducer normsProducer;
final StoredFieldsReader fieldsReaderOrig;
final TermVectorsReader termVectorsReaderOrig;
final TermVectorsReader termVectorsReader;
final PointsReader pointsReader;
final VectorReader vectorReader;
final CompoundDirectory cfsReader;
@ -80,14 +80,6 @@ final class SegmentCoreReaders {
}
};
final CloseableThreadLocal<TermVectorsReader> termVectorsLocal =
new CloseableThreadLocal<TermVectorsReader>() {
@Override
protected TermVectorsReader initialValue() {
return (termVectorsReaderOrig == null) ? null : termVectorsReaderOrig.clone();
}
};
private final Set<IndexReader.ClosedListener> coreClosedListeners =
Collections.synchronizedSet(new LinkedHashSet<IndexReader.ClosedListener>());
@ -134,13 +126,13 @@ final class SegmentCoreReaders {
.fieldsReader(cfsDir, si.info, coreFieldInfos, context);
if (coreFieldInfos.hasVectors()) { // open term vector files only as needed
termVectorsReaderOrig =
termVectorsReader =
si.info
.getCodec()
.termVectorsFormat()
.vectorsReader(cfsDir, si.info, coreFieldInfos, context);
} else {
termVectorsReaderOrig = null;
termVectorsReader = null;
}
if (coreFieldInfos.hasPointValues()) {
@ -186,10 +178,9 @@ final class SegmentCoreReaders {
if (ref.decrementAndGet() == 0) {
try (Closeable finalizer = this::notifyCoreClosedListeners) {
IOUtils.close(
termVectorsLocal,
fieldsReaderLocal,
fields,
termVectorsReaderOrig,
termVectorsReader,
fieldsReaderOrig,
cfsReader,
normsProducer,

View File

@ -242,12 +242,6 @@ public final class SegmentReader extends CodecReader {
return si.info.maxDoc();
}
@Override
public TermVectorsReader getTermVectorsReader() {
ensureOpen();
return core.termVectorsLocal.get();
}
@Override
public StoredFieldsReader getFieldsReader() {
ensureOpen();
@ -310,6 +304,16 @@ public final class SegmentReader extends CodecReader {
private final Set<ClosedListener> readerClosedListeners = new CopyOnWriteArraySet<>();
@Override
/** Hands out a fresh clone of the shared reader, so each caller owns its instance. */
public TermVectorsReader getTermVectorsReader() {
  ensureOpen();
  final TermVectorsReader shared = core.termVectorsReader;
  return (shared == null) ? null : shared.clone();
}
@Override
void notifyReaderClosedListeners() throws IOException {
synchronized (readerClosedListeners) {

View File

@ -0,0 +1,33 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
 * Index API to access term vectors. An instance is obtained from {@code
 * IndexReader#getTermVectorsReader()} and, unlike the old thread-local based API, is owned by
 * the caller.
 */
public abstract class TermVectors {
  /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
  protected TermVectors() {}
  /**
   * Returns term vectors for this document, or null if term vectors were not indexed. If offsets
   * are available they are in an {@link OffsetAttribute} available from the {@link
   * org.apache.lucene.index.PostingsEnum}.
   */
  public abstract Fields get(int doc) throws IOException;
}

View File

@ -19,7 +19,14 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.ExitableDirectoryReader.ExitingReaderException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;

View File

@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.Objects;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -56,10 +57,10 @@ public class TestFilterCodecReader extends LuceneTestCase {
final Method subClassMethod =
subClass.getDeclaredMethod(
superClassMethod.getName(), superClassMethod.getParameterTypes());
assertEquals(
"getReturnType() difference",
superClassMethod.getReturnType(),
subClassMethod.getReturnType());
assertTrue(
"getReturnType() difference and not compatible",
isTypeEqualOrAssignable(
superClassMethod.getReturnType(), subClassMethod.getReturnType()));
} catch (
@SuppressWarnings("unused")
NoSuchMethodException e) {
@ -67,4 +68,8 @@ public class TestFilterCodecReader extends LuceneTestCase {
}
}
}
/** True when {@code subClass} is the same type as, or assignable to, {@code superClass}. */
private boolean isTypeEqualOrAssignable(Class<?> superClass, Class<?> subClass) {
  // Null-safe equality first, preserving the original short-circuit order.
  if (Objects.equals(subClass, superClass)) {
    return true;
  }
  return superClass.isAssignableFrom(subClass);
}
}

View File

@ -24,6 +24,7 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
@ -66,7 +67,7 @@ public class TestSegmentToThreadMapping extends LuceneTestCase {
}
@Override
public Fields getTermVectors(int doc) {
public TermVectorsReader getTermVectorsReader() {
  // Dummy leaf reader used only for segment-to-thread mapping tests; no vectors.
  return null;
}

View File

@ -18,11 +18,15 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
@ -123,11 +127,13 @@ public class TestMultiThreadTermVectors extends LuceneTestCase {
/** Walks every live doc, verifying its term vectors via the non-thread-local API. */
private void testTermVectors() throws Exception {
  // check:
  int numDocs = reader.numDocs();
  // Fetch a single TermVectors instance up front and reuse it across documents.
  // reader is StandardDirectoryReader, method impl from BaseCompositeReader
  TermVectors termVectors = reader.getTermVectorsReader();
  for (int docId = 0; docId < numDocs; docId++) {
    Fields vectors = termVectors.get(docId);
    // verify vectors result
    verifyVectors(vectors, docId);
    Terms vector = termVectors.get(docId).terms("field");
    verifyVector(vector.iterator(), docId);
  }
}

View File

@ -33,6 +33,7 @@ import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.TopDocs;
@ -168,11 +169,16 @@ public class TermVectorLeafReader extends LeafReader {
public void checkIntegrity() throws IOException {}
@Override
public Fields getTermVectors(int docID) throws IOException {
if (docID != 0) {
return null;
}
return fields;
/** This reader exposes exactly one document, so vectors exist only for docID 0. */
public TermVectors getTermVectorsReader() {
  return new TermVectors() {
    @Override
    public Fields get(int docID) {
      return (docID == 0) ? fields : null;
    }
  };
}
@Override

View File

@ -17,7 +17,9 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
/**
@ -35,11 +37,21 @@ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
@Override
public OffsetsEnum getOffsetsEnum(LeafReader leafReader, int docId, String content)
throws IOException {
Terms docTerms = leafReader.getTermVector(docId, getField());
if (docTerms == null) {
Terms tvTerms = null;
TermVectors termVectors = leafReader.getTermVectorsReader();
if (termVectors != null) {
Fields vectors = termVectors.get(docId);
if (vectors != null) {
tvTerms = vectors.terms(getField());
}
}
if (tvTerms == null) {
return OffsetsEnum.EMPTY;
}
leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms, getField());
leafReader = new TermVectorFilteredLeafReader(leafReader, tvTerms, getField());
return createOffsetsEnumFromReader(leafReader, docId);
}

View File

@ -17,7 +17,9 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.highlight.TermVectorLeafReader;
@ -40,7 +42,16 @@ public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
@Override
public OffsetsEnum getOffsetsEnum(LeafReader reader, int docId, String content)
throws IOException {
Terms tvTerms = reader.getTermVector(docId, getField());
Terms tvTerms = null;
TermVectors termVectors = reader.getTermVectorsReader();
if (termVectors != null) {
Fields vectors = termVectors.get(docId);
if (vectors != null) {
tvTerms = vectors.terms(getField());
}
}
if (tvTerms == null) {
return OffsetsEnum.EMPTY;
}

View File

@ -48,6 +48,7 @@ import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
@ -1102,11 +1103,10 @@ public class UnifiedHighlighter {
return this.values;
}
}
/**
* Wraps an IndexReader that remembers/caches the last call to {@link
* LeafReader#getTermVectors(int)} so that if the next call has the same ID, then it is reused. If
* TV's were column-stride (like doc-values), there would be no need for this.
* Wraps an IndexReader that remembers/caches the last call to {@link TermVectors#get(int)} so
* that if the next call has the same ID, then it is reused. If TV's were column-stride (like
* doc-values), there would be no need for this.
*/
private static class TermVectorReusingLeafReader extends FilterLeafReader {
@ -1136,12 +1136,21 @@ public class UnifiedHighlighter {
}
@Override
public Fields getTermVectors(int docID) throws IOException {
if (docID != lastDocId) {
lastDocId = docID;
tvFields = in.getTermVectors(docID);
/**
 * Returns a {@link TermVectors} that remembers the last fetched document's Fields so a repeat
 * request for the same docID reuses it, or null when the wrapped reader has no term vectors.
 */
public TermVectors getTermVectorsReader() {
  // Acquire the delegate exactly once: getTermVectorsReader() may create/clone a
  // new instance per call, so re-invoking it for every document would defeat this
  // wrapper's purpose of reusing the last-fetched vectors.
  TermVectors termVectors = in.getTermVectorsReader();
  if (termVectors == null) {
    return null;
  }
  return new TermVectors() {
    @Override
    public Fields get(int docID) throws IOException {
      if (docID != lastDocId) {
        lastDocId = docID;
        tvFields = termVectors.get(docID);
      }
      return tvFields;
    }
  };
}
@Override

View File

@ -40,6 +40,7 @@ import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.ParallelLeafReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
@ -133,20 +134,23 @@ public class TestUnifiedHighlighterTermVec extends LuceneTestCase {
@Override
public LeafReader wrap(LeafReader reader) {
return new FilterLeafReader(reader) {
BitSet seenDocIDs = new BitSet();
@Override
public Fields getTermVectors(int docID) throws IOException {
// if we're invoked by ParallelLeafReader then we can't do our assertion. TODO see
// LUCENE-6868
if (callStackContains(ParallelLeafReader.class) == false
&& callStackContains(CheckIndex.class) == false) {
assertFalse(
"Should not request TVs for doc more than once.", seenDocIDs.get(docID));
seenDocIDs.set(docID);
}
return super.getTermVectors(docID);
/** Asserting wrapper: fails if term vectors for the same doc are requested twice. */
public TermVectors getTermVectorsReader() {
  BitSet seenDocIDs = new BitSet();
  // Acquire the underlying TermVectors once; fetching it inside get() would
  // allocate a fresh clone for every single document lookup.
  TermVectors delegate = reader.getTermVectorsReader();
  return new TermVectors() {
    @Override
    public Fields get(int docID) throws IOException {
      // if we're invoked by ParallelLeafReader then we can't do our assertion. TODO
      // see LUCENE-6868
      if (callStackContains(ParallelLeafReader.class) == false
          && callStackContains(CheckIndex.class) == false) {
        assertFalse("Should not request TVs for doc more than once.", seenDocIDs.get(docID));
        seenDocIDs.set(docID);
      }
      return delegate.get(docID);
    }
  };
}
@Override

View File

@ -1235,6 +1235,20 @@ public class MemoryIndex {
fieldInfos = new FieldInfos(fieldInfosArr);
}
@Override
/** Single-document in-memory index: only docID 0 carries term vectors. */
public TermVectors getTermVectorsReader() {
  return new TermVectors() {
    @Override
    public Fields get(int docID) {
      return (docID == 0) ? memoryFields : null;
    }
  };
}
private Info getInfoForExpectedDocValuesType(String fieldName, DocValuesType expectedType) {
if (expectedType == DocValuesType.NONE) {
return null;
@ -1722,15 +1736,6 @@ public class MemoryIndex {
}
}
@Override
public Fields getTermVectors(int docID) {
if (docID == 0) {
return memoryFields;
} else {
return null;
}
}
@Override
public int numDocs() {
if (DEBUG) System.err.println("MemoryIndexReader.numDocs");

View File

@ -86,12 +86,6 @@ public class AssertingLeafReader extends FilterLeafReader {
return terms == null ? null : new AssertingTerms(terms);
}
@Override
public Fields getTermVectors(int docID) throws IOException {
Fields fields = super.getTermVectors(docID);
return fields == null ? null : new AssertingFields(fields);
}
/** Wraps a Fields but with additional asserts */
public static class AssertingFields extends FilterFields {
public AssertingFields(Fields in) {

View File

@ -56,15 +56,20 @@ public final class FieldFilterLeafReader extends FilterLeafReader {
}
@Override
public Fields getTermVectors(int docID) throws IOException {
Fields f = super.getTermVectors(docID);
if (f == null) {
return null;
}
f = new FieldFilterFields(f);
// we need to check for emptyness, so we can return
// null:
return f.iterator().hasNext() ? f : null;
/**
 * Returns term vectors restricted to the visible fields, or null when the wrapped reader has
 * no term vectors at all.
 */
public TermVectors getTermVectorsReader() {
  // Acquire the delegate once (it may be cloned per call) and guard against wrapped
  // readers that did not index term vectors, which return null here.
  TermVectors termVectors = in.getTermVectorsReader();
  if (termVectors == null) {
    return null;
  }
  return new TermVectors() {
    @Override
    public Fields get(int docID) throws IOException {
      Fields f = termVectors.get(docID);
      if (f == null) {
        return null;
      }
      f = new FieldFilterFields(f);
      // we need to check for emptyness, so we can return
      // null:
      return f.iterator().hasNext() ? f : null;
    }
  };
}
@Override

View File

@ -23,9 +23,9 @@ import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.List;
import java.util.Random;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafMetaData;
import org.apache.lucene.index.LeafReader;
@ -241,7 +241,7 @@ public class QueryUtils {
public void checkIntegrity() throws IOException {}
@Override
public Fields getTermVectors(int docID) throws IOException {
public TermVectorsReader getTermVectorsReader() {
  // Dummy reader used by QueryUtils for sanity checks; no term vectors available.
  return null;
}