Some small cleanups in TermVectorsService (#62292)

We removed the use of aggregated stats from term vectors back in #16452, but there is
a bunch of dead code left here which can be stripped out.
This commit is contained in:
Alan Woodward 2020-09-15 08:59:55 +01:00 committed by Alan Woodward
parent c50899dd8f
commit 8089210815
4 changed files with 31 additions and 99 deletions

View File

@ -23,12 +23,9 @@ import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
@ -52,20 +49,18 @@ public class TermVectorsFilter {
private int minWordLength = DEFAULT_MIN_WORD_LENGTH; private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
private int maxWordLength = DEFAULT_MAX_WORD_LENGTH; private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;
private Fields fields; private final Fields fields;
private Fields topLevelFields; private final Fields topLevelFields;
private final Set<String> selectedFields; private final Set<String> selectedFields;
private AggregatedDfs dfs; private final Map<Term, ScoreTerm> scoreTerms;
private Map<Term, ScoreTerm> scoreTerms; private final Map<String, Integer> sizes = new HashMap<>();
private Map<String, Integer> sizes = new HashMap<>(); private final TFIDFSimilarity similarity;
private TFIDFSimilarity similarity;
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) { public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields) {
this.fields = termVectorsByField; this.fields = termVectorsByField;
this.topLevelFields = topLevelFields; this.topLevelFields = topLevelFields;
this.selectedFields = selectedFields; this.selectedFields = selectedFields;
this.dfs = dfs;
this.scoreTerms = new HashMap<>(); this.scoreTerms = new HashMap<>();
this.similarity = new ClassicSimilarity(); this.similarity = new ClassicSimilarity();
} }
@ -196,7 +191,7 @@ public class TermVectorsFilter {
topLevelTerms = terms; topLevelTerms = terms;
} }
long numDocs = getDocCount(fieldName, topLevelTerms); long numDocs = topLevelTerms.getDocCount();
// one queue per field name // one queue per field name
ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size())); ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
@ -212,13 +207,15 @@ public class TermVectorsFilter {
Term term = new Term(fieldName, termBytesRef); Term term = new Term(fieldName, termBytesRef);
// remove noise words // remove noise words
int freq = getTermFreq(termsEnum, docsEnum); docsEnum = termsEnum.postings(docsEnum);
docsEnum.nextDoc();
int freq = docsEnum.freq();
if (isNoise(term.bytes().utf8ToString(), freq)) { if (isNoise(term.bytes().utf8ToString(), freq)) {
continue; continue;
} }
// now call on docFreq // now call on docFreq
long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq(); long docFreq = topLevelTermsEnum.docFreq();
if (!isAccepted(docFreq)) { if (!isAccepted(docFreq)) {
continue; continue;
} }
@ -275,26 +272,6 @@ public class TermVectorsFilter {
return true; return true;
} }
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
if (dfs != null) {
return dfs.fieldStatistics().get(fieldName).docCount();
}
return topLevelTerms.getDocCount();
}
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
if (dfs != null) {
return dfs.termStatistics().get(term);
}
return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
}
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
docsEnum = termsEnum.postings(docsEnum);
docsEnum.nextDoc();
return docsEnum.freq();
}
private float computeScore(long docFreq, int freq, long numDocs) { private float computeScore(long docFreq, int freq, long numDocs) {
return freq * similarity.idf(docFreq, numDocs); return freq * similarity.idf(docFreq, numDocs);
} }

View File

@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder; import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.action.ActionResponse; import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag; import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.io.stream.BytesStreamOutput;
@ -38,7 +37,6 @@ import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException; import java.io.IOException;
import java.util.Collections; import java.util.Collections;
@ -353,15 +351,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContentObj
public void setFields(Fields termVectorsByField, Set<String> selectedFields, public void setFields(Fields termVectorsByField, Set<String> selectedFields,
EnumSet<Flag> flags, Fields topLevelFields) throws IOException { EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null); setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
} }
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
Fields topLevelFields, @Nullable AggregatedDfs dfs, TermVectorsFilter termVectorsFilter) throws IOException { Fields topLevelFields, TermVectorsFilter termVectorsFilter) throws IOException {
TermVectorsWriter tvw = new TermVectorsWriter(this); TermVectorsWriter tvw = new TermVectorsWriter(this);
if (termVectorsByField != null) { if (termVectorsByField != null) {
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter); tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, termVectorsFilter);
} }
} }

View File

@ -23,15 +23,12 @@ import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag; import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
import org.elasticsearch.common.Nullable; import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -48,14 +45,14 @@ final class TermVectorsWriter {
// size here? // size here?
private static final String HEADER = "TV"; private static final String HEADER = "TV";
private static final int CURRENT_VERSION = -1; private static final int CURRENT_VERSION = -1;
TermVectorsResponse response = null; final TermVectorsResponse response;
TermVectorsWriter(TermVectorsResponse termVectorsResponse) throws IOException { TermVectorsWriter(TermVectorsResponse termVectorsResponse) {
response = termVectorsResponse; response = termVectorsResponse;
} }
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
@Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException { @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
int numFieldsWritten = 0; int numFieldsWritten = 0;
PostingsEnum docsAndPosEnum = null; PostingsEnum docsAndPosEnum = null;
PostingsEnum docsEnum = null; PostingsEnum docsEnum = null;
@ -90,11 +87,7 @@ final class TermVectorsWriter {
startField(field, termsSize, positions, offsets, payloads); startField(field, termsSize, positions, offsets, payloads);
if (flags.contains(Flag.FieldStatistics)) { if (flags.contains(Flag.FieldStatistics)) {
if (dfs != null) { writeFieldStatistics(topLevelTerms);
writeFieldStatistics(dfs.fieldStatistics().get(field));
} else {
writeFieldStatistics(topLevelTerms);
}
} }
TermsEnum iterator = fieldTermVector.iterator(); TermsEnum iterator = fieldTermVector.iterator();
final boolean useDocsAndPos = positions || offsets || payloads; final boolean useDocsAndPos = positions || offsets || payloads;
@ -110,20 +103,11 @@ final class TermVectorsWriter {
startTerm(termBytesRef); startTerm(termBytesRef);
if (flags.contains(Flag.TermStatistics)) { if (flags.contains(Flag.TermStatistics)) {
// get the doc frequency // get the doc frequency
if (dfs != null) { boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
final TermStatistics statistics = dfs.termStatistics().get(term); if (foundTerm) {
if (statistics == null) { writeTermStatistics(topLevelIterator);
writeMissingTermStatistics();
} else {
writeTermStatistics(statistics);
}
} else { } else {
boolean foundTerm = topLevelIterator.seekExact(termBytesRef); writeMissingTermStatistics();
if (foundTerm) {
writeTermStatistics(topLevelIterator);
} else {
writeMissingTermStatistics();
}
} }
} }
if (useDocsAndPos) { if (useDocsAndPos) {
@ -158,7 +142,7 @@ final class TermVectorsWriter {
header.writeVInt(numFieldsWritten); header.writeVInt(numFieldsWritten);
for (int i = 0; i < fields.size(); i++) { for (int i = 0; i < fields.size(); i++) {
header.writeString(fields.get(i)); header.writeString(fields.get(i));
header.writeVLong(fieldOffset.get(i).longValue()); header.writeVLong(fieldOffset.get(i));
} }
header.close(); header.close();
return header.bytes(); return header.bytes();
@ -259,15 +243,6 @@ final class TermVectorsWriter {
writePotentiallyNegativeVLong(ttf); writePotentiallyNegativeVLong(ttf);
} }
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
int docFreq = (int) termStatistics.docFreq();
assert (docFreq >= -1);
writePotentiallyNegativeVInt(docFreq);
long ttf = termStatistics.totalTermFreq();
assert (ttf >= -1);
writePotentiallyNegativeVLong(ttf);
}
private void writeFieldStatistics(Terms topLevelTerms) throws IOException { private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
long sttf = topLevelTerms.getSumTotalTermFreq(); long sttf = topLevelTerms.getSumTotalTermFreq();
assert (sttf >= -1); assert (sttf >= -1);
@ -280,18 +255,6 @@ final class TermVectorsWriter {
writePotentiallyNegativeVInt(dc); writePotentiallyNegativeVInt(dc);
} }
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
long sttf = fieldStats.sumTotalTermFreq();
assert (sttf >= -1);
writePotentiallyNegativeVLong(sttf);
long sdf = fieldStats.sumDocFreq();
assert (sdf >= -1);
writePotentiallyNegativeVLong(sdf);
int dc = (int) fieldStats.docCount();
assert (dc >= -1);
writePotentiallyNegativeVInt(dc);
}
private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException { private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
output.writeFloat(Math.max(0, scoreTerm.score)); output.writeFloat(Math.max(0, scoreTerm.score));
} }
@ -310,11 +273,11 @@ final class TermVectorsWriter {
/** Implements an empty {@link Terms}. */ /** Implements an empty {@link Terms}. */
private static final Terms EMPTY_TERMS = new Terms() { private static final Terms EMPTY_TERMS = new Terms() {
@Override public TermsEnum iterator() throws IOException { return TermsEnum.EMPTY; } @Override public TermsEnum iterator() { return TermsEnum.EMPTY; }
@Override public long size() throws IOException { return 0; } @Override public long size() { return 0; }
@Override public long getSumTotalTermFreq() throws IOException { return 0; } @Override public long getSumTotalTermFreq() { return 0; }
@Override public long getSumDocFreq() throws IOException { return 0; } @Override public long getSumDocFreq() { return 0; }
@Override public int getDocCount() throws IOException { return 0; } @Override public int getDocCount() { return 0; }
@Override public boolean hasFreqs() { return false; } @Override public boolean hasFreqs() { return false; }
@Override public boolean hasOffsets() { return false; } @Override public boolean hasOffsets() { return false; }
@Override public boolean hasPositions() { return false; } @Override public boolean hasPositions() { return false; }

View File

@ -54,7 +54,6 @@ import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.Uid; import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.shard.IndexShard; import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -87,7 +86,6 @@ public class TermVectorsService {
final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id())); final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id()));
Fields termVectorsByField = null; Fields termVectorsByField = null;
AggregatedDfs dfs = null;
TermVectorsFilter termVectorsFilter = null; TermVectorsFilter termVectorsFilter = null;
/* handle potential wildcards in fields */ /* handle potential wildcards in fields */
@ -104,10 +102,6 @@ public class TermVectorsService {
/* from an artificial document */ /* from an artificial document */
if (request.doc() != null) { if (request.doc() != null) {
termVectorsByField = generateTermVectorsFromDoc(indexShard, request); termVectorsByField = generateTermVectorsFromDoc(indexShard, request);
// if no document indexed in shard, take the queried document itself for stats
if (topLevelFields == null) {
topLevelFields = termVectorsByField;
}
termVectorsResponse.setArtificial(true); termVectorsResponse.setArtificial(true);
termVectorsResponse.setExists(true); termVectorsResponse.setExists(true);
} }
@ -134,7 +128,7 @@ public class TermVectorsService {
/* if there are term vectors, optional compute dfs and/or terms filtering */ /* if there are term vectors, optional compute dfs and/or terms filtering */
if (termVectorsByField != null) { if (termVectorsByField != null) {
if (request.filterSettings() != null) { if (request.filterSettings() != null) {
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs); termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields());
termVectorsFilter.setSettings(request.filterSettings()); termVectorsFilter.setSettings(request.filterSettings());
try { try {
termVectorsFilter.selectBestTerms(); termVectorsFilter.selectBestTerms();
@ -143,7 +137,7 @@ public class TermVectorsService {
} }
} }
// write term vectors // write term vectors
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs, termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields,
termVectorsFilter); termVectorsFilter);
} }
termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime)); termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime));
@ -233,7 +227,7 @@ public class TermVectorsService {
MapperService mapperService = indexShard.mapperService(); MapperService mapperService = indexShard.mapperService();
Analyzer analyzer; Analyzer analyzer;
if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) { if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) {
analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field).toString()); analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field));
} else { } else {
MappedFieldType fieldType = mapperService.fieldType(field); MappedFieldType fieldType = mapperService.fieldType(field);
analyzer = fieldType.indexAnalyzer(); analyzer = fieldType.indexAnalyzer();