Some small cleanups in TermVectorsService (#62292)

We removed the use of aggregated stats from term vectors back in #16452, but there is
a bunch of dead code left here which can be stripped out.
This commit is contained in:
Alan Woodward 2020-09-15 08:59:55 +01:00 committed by Alan Woodward
parent c50899dd8f
commit 8089210815
4 changed files with 31 additions and 99 deletions

View File

@ -23,12 +23,9 @@ import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.HashMap;
@ -52,20 +49,18 @@ public class TermVectorsFilter {
private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;
private Fields fields;
private Fields topLevelFields;
private final Fields fields;
private final Fields topLevelFields;
private final Set<String> selectedFields;
private AggregatedDfs dfs;
private Map<Term, ScoreTerm> scoreTerms;
private Map<String, Integer> sizes = new HashMap<>();
private TFIDFSimilarity similarity;
private final Map<Term, ScoreTerm> scoreTerms;
private final Map<String, Integer> sizes = new HashMap<>();
private final TFIDFSimilarity similarity;
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields) {
this.fields = termVectorsByField;
this.topLevelFields = topLevelFields;
this.selectedFields = selectedFields;
this.dfs = dfs;
this.scoreTerms = new HashMap<>();
this.similarity = new ClassicSimilarity();
}
@ -196,7 +191,7 @@ public class TermVectorsFilter {
topLevelTerms = terms;
}
long numDocs = getDocCount(fieldName, topLevelTerms);
long numDocs = topLevelTerms.getDocCount();
// one queue per field name
ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
@ -212,13 +207,15 @@ public class TermVectorsFilter {
Term term = new Term(fieldName, termBytesRef);
// remove noise words
int freq = getTermFreq(termsEnum, docsEnum);
docsEnum = termsEnum.postings(docsEnum);
docsEnum.nextDoc();
int freq = docsEnum.freq();
if (isNoise(term.bytes().utf8ToString(), freq)) {
continue;
}
// now call on docFreq
long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
long docFreq = topLevelTermsEnum.docFreq();
if (!isAccepted(docFreq)) {
continue;
}
@ -275,26 +272,6 @@ public class TermVectorsFilter {
return true;
}
/**
 * Resolves the document count used for scoring a field.
 *
 * @param fieldName     name of the field whose document count is wanted
 * @param topLevelTerms terms of that field, consulted only when no aggregated dfs is set
 * @return the doc count from the aggregated dfs field statistics when {@code dfs} is set,
 *         otherwise {@code topLevelTerms.getDocCount()}
 * @throws IOException if reading the doc count from {@code topLevelTerms} fails
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
    // Aggregated statistics, when supplied, take precedence over the local terms.
    return dfs == null
        ? topLevelTerms.getDocCount()
        : dfs.fieldStatistics().get(fieldName).docCount();
}
/**
 * Resolves the statistics for a term.
 *
 * @param termsEnum enum whose current term, docFreq and totalTermFreq are used
 *                  when no aggregated dfs is set
 * @param term      key used to look the statistics up in the aggregated dfs
 * @return the entry stored in {@code dfs.termStatistics()} for {@code term} when
 *         {@code dfs} is set, otherwise statistics built from {@code termsEnum}
 * @throws IOException if reading from {@code termsEnum} fails
 */
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
    if (dfs == null) {
        // No aggregated statistics: describe the term the enum currently points at.
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }
    // NOTE(review): the lookup result is returned as-is; presumably it can be null
    // when the term is missing from the aggregated statistics — confirm callers cope.
    return dfs.termStatistics().get(term);
}
/**
 * Reads the frequency of the enum's current term.
 *
 * @param termsEnum enum from which the postings are requested
 * @param docsEnum  postings instance handed to {@code termsEnum.postings(...)}
 * @return the value of {@code freq()} after advancing the postings once with {@code nextDoc()}
 * @throws IOException if obtaining or advancing the postings fails
 */
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
    final PostingsEnum postings = termsEnum.postings(docsEnum);
    // freq() is read after a single nextDoc() call.
    postings.nextDoc();
    return postings.freq();
}
/**
 * Scores a term as its frequency multiplied by the similarity's idf.
 *
 * @param docFreq document frequency passed to {@code similarity.idf}
 * @param freq    term frequency used as the multiplier
 * @param numDocs document count passed to {@code similarity.idf}
 * @return {@code freq * similarity.idf(docFreq, numDocs)}
 */
private float computeScore(long docFreq, int freq, long numDocs) {
    final float inverseDocFreq = similarity.idf(docFreq, numDocs);
    return freq * inverseDocFreq;
}

View File

@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
@ -38,7 +37,6 @@ import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.Collections;
@ -353,15 +351,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContentObj
public void setFields(Fields termVectorsByField, Set<String> selectedFields,
EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null);
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
}
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
Fields topLevelFields, @Nullable AggregatedDfs dfs, TermVectorsFilter termVectorsFilter) throws IOException {
Fields topLevelFields, TermVectorsFilter termVectorsFilter) throws IOException {
TermVectorsWriter tvw = new TermVectorsWriter(this);
if (termVectorsByField != null) {
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter);
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, termVectorsFilter);
}
}

View File

@ -23,15 +23,12 @@ import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.ArrayList;
@ -48,14 +45,14 @@ final class TermVectorsWriter {
// size here?
private static final String HEADER = "TV";
private static final int CURRENT_VERSION = -1;
TermVectorsResponse response = null;
final TermVectorsResponse response;
TermVectorsWriter(TermVectorsResponse termVectorsResponse) throws IOException {
TermVectorsWriter(TermVectorsResponse termVectorsResponse) {
response = termVectorsResponse;
}
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
@Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
@Nullable TermVectorsFilter termVectorsFilter) throws IOException {
int numFieldsWritten = 0;
PostingsEnum docsAndPosEnum = null;
PostingsEnum docsEnum = null;
@ -90,11 +87,7 @@ final class TermVectorsWriter {
startField(field, termsSize, positions, offsets, payloads);
if (flags.contains(Flag.FieldStatistics)) {
if (dfs != null) {
writeFieldStatistics(dfs.fieldStatistics().get(field));
} else {
writeFieldStatistics(topLevelTerms);
}
writeFieldStatistics(topLevelTerms);
}
TermsEnum iterator = fieldTermVector.iterator();
final boolean useDocsAndPos = positions || offsets || payloads;
@ -110,20 +103,11 @@ final class TermVectorsWriter {
startTerm(termBytesRef);
if (flags.contains(Flag.TermStatistics)) {
// get the doc frequency
if (dfs != null) {
final TermStatistics statistics = dfs.termStatistics().get(term);
if (statistics == null) {
writeMissingTermStatistics();
} else {
writeTermStatistics(statistics);
}
boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
if (foundTerm) {
writeTermStatistics(topLevelIterator);
} else {
boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
if (foundTerm) {
writeTermStatistics(topLevelIterator);
} else {
writeMissingTermStatistics();
}
writeMissingTermStatistics();
}
}
if (useDocsAndPos) {
@ -158,7 +142,7 @@ final class TermVectorsWriter {
header.writeVInt(numFieldsWritten);
for (int i = 0; i < fields.size(); i++) {
header.writeString(fields.get(i));
header.writeVLong(fieldOffset.get(i).longValue());
header.writeVLong(fieldOffset.get(i));
}
header.close();
return header.bytes();
@ -259,15 +243,6 @@ final class TermVectorsWriter {
writePotentiallyNegativeVLong(ttf);
}
/**
 * Writes a term's document frequency followed by its total term frequency.
 *
 * @param termStatistics source of the {@code docFreq()} and {@code totalTermFreq()} values
 * @throws IOException if either write fails
 */
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
    // docFreq is narrowed to int before being written as a VInt.
    final int documentFrequency = (int) termStatistics.docFreq();
    assert documentFrequency >= -1;
    writePotentiallyNegativeVInt(documentFrequency);

    final long totalTermFrequency = termStatistics.totalTermFreq();
    assert totalTermFrequency >= -1;
    writePotentiallyNegativeVLong(totalTermFrequency);
}
private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
long sttf = topLevelTerms.getSumTotalTermFreq();
assert (sttf >= -1);
@ -280,18 +255,6 @@ final class TermVectorsWriter {
writePotentiallyNegativeVInt(dc);
}
/**
 * Writes a field's statistics in this order: sum of total term frequencies,
 * sum of document frequencies, document count.
 *
 * @param fieldStats source of the three values
 * @throws IOException if any of the writes fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
    assert sumTotalTermFreq >= -1;
    writePotentiallyNegativeVLong(sumTotalTermFreq);

    final long sumDocFreq = fieldStats.sumDocFreq();
    assert sumDocFreq >= -1;
    writePotentiallyNegativeVLong(sumDocFreq);

    // docCount is narrowed to int before being written as a VInt.
    final int docCount = (int) fieldStats.docCount();
    assert docCount >= -1;
    writePotentiallyNegativeVInt(docCount);
}
/**
 * Writes a term's score to the output as a float, with zero as the lower bound.
 *
 * @param scoreTerm term whose {@code score} is written
 * @throws IOException if the write fails
 */
private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
    // Negative scores are written as zero.
    final float nonNegativeScore = Math.max(0, scoreTerm.score);
    output.writeFloat(nonNegativeScore);
}
@ -310,11 +273,11 @@ final class TermVectorsWriter {
/** Implements an empty {@link Terms}. */
private static final Terms EMPTY_TERMS = new Terms() {
@Override public TermsEnum iterator() throws IOException { return TermsEnum.EMPTY; }
@Override public long size() throws IOException { return 0; }
@Override public long getSumTotalTermFreq() throws IOException { return 0; }
@Override public long getSumDocFreq() throws IOException { return 0; }
@Override public int getDocCount() throws IOException { return 0; }
@Override public TermsEnum iterator() { return TermsEnum.EMPTY; }
@Override public long size() { return 0; }
@Override public long getSumTotalTermFreq() { return 0; }
@Override public long getSumDocFreq() { return 0; }
@Override public int getDocCount() { return 0; }
@Override public boolean hasFreqs() { return false; }
@Override public boolean hasOffsets() { return false; }
@Override public boolean hasPositions() { return false; }

View File

@ -54,7 +54,6 @@ import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.ArrayList;
@ -87,7 +86,6 @@ public class TermVectorsService {
final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id()));
Fields termVectorsByField = null;
AggregatedDfs dfs = null;
TermVectorsFilter termVectorsFilter = null;
/* handle potential wildcards in fields */
@ -104,10 +102,6 @@ public class TermVectorsService {
/* from an artificial document */
if (request.doc() != null) {
termVectorsByField = generateTermVectorsFromDoc(indexShard, request);
// if no document indexed in shard, take the queried document itself for stats
if (topLevelFields == null) {
topLevelFields = termVectorsByField;
}
termVectorsResponse.setArtificial(true);
termVectorsResponse.setExists(true);
}
@ -134,7 +128,7 @@ public class TermVectorsService {
/* if there are term vectors, optionally compute dfs and/or apply terms filtering */
if (termVectorsByField != null) {
if (request.filterSettings() != null) {
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs);
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields());
termVectorsFilter.setSettings(request.filterSettings());
try {
termVectorsFilter.selectBestTerms();
@ -143,7 +137,7 @@ public class TermVectorsService {
}
}
// write term vectors
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs,
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields,
termVectorsFilter);
}
termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime));
@ -233,7 +227,7 @@ public class TermVectorsService {
MapperService mapperService = indexShard.mapperService();
Analyzer analyzer;
if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) {
analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field).toString());
analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field));
} else {
MappedFieldType fieldType = mapperService.fieldType(field);
analyzer = fieldType.indexAnalyzer();