Some small cleanups in TermVectorsService (#62292)
We removed the use of aggregated stats from term vectors back in #16452, but a bunch of dead code was left behind that can now be stripped out.
This commit is contained in:
parent
c50899dd8f
commit
8089210815
|
@@ -23,12 +23,9 @@ import org.apache.lucene.index.PostingsEnum;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.search.dfs.AggregatedDfs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
|
@@ -52,20 +49,18 @@ public class TermVectorsFilter {
|
|||
private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
|
||||
private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;
|
||||
|
||||
private Fields fields;
|
||||
private Fields topLevelFields;
|
||||
private final Fields fields;
|
||||
private final Fields topLevelFields;
|
||||
private final Set<String> selectedFields;
|
||||
private AggregatedDfs dfs;
|
||||
private Map<Term, ScoreTerm> scoreTerms;
|
||||
private Map<String, Integer> sizes = new HashMap<>();
|
||||
private TFIDFSimilarity similarity;
|
||||
private final Map<Term, ScoreTerm> scoreTerms;
|
||||
private final Map<String, Integer> sizes = new HashMap<>();
|
||||
private final TFIDFSimilarity similarity;
|
||||
|
||||
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
|
||||
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields) {
|
||||
this.fields = termVectorsByField;
|
||||
this.topLevelFields = topLevelFields;
|
||||
this.selectedFields = selectedFields;
|
||||
|
||||
this.dfs = dfs;
|
||||
this.scoreTerms = new HashMap<>();
|
||||
this.similarity = new ClassicSimilarity();
|
||||
}
|
||||
|
@@ -196,7 +191,7 @@ public class TermVectorsFilter {
|
|||
topLevelTerms = terms;
|
||||
}
|
||||
|
||||
long numDocs = getDocCount(fieldName, topLevelTerms);
|
||||
long numDocs = topLevelTerms.getDocCount();
|
||||
|
||||
// one queue per field name
|
||||
ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
|
||||
|
@@ -212,13 +207,15 @@ public class TermVectorsFilter {
|
|||
Term term = new Term(fieldName, termBytesRef);
|
||||
|
||||
// remove noise words
|
||||
int freq = getTermFreq(termsEnum, docsEnum);
|
||||
docsEnum = termsEnum.postings(docsEnum);
|
||||
docsEnum.nextDoc();
|
||||
int freq = docsEnum.freq();
|
||||
if (isNoise(term.bytes().utf8ToString(), freq)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// now call on docFreq
|
||||
long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
|
||||
long docFreq = topLevelTermsEnum.docFreq();
|
||||
if (!isAccepted(docFreq)) {
|
||||
continue;
|
||||
}
|
||||
|
@@ -275,26 +272,6 @@ public class TermVectorsFilter {
|
|||
return true;
|
||||
}
|
||||
|
||||
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
|
||||
if (dfs != null) {
|
||||
return dfs.fieldStatistics().get(fieldName).docCount();
|
||||
}
|
||||
return topLevelTerms.getDocCount();
|
||||
}
|
||||
|
||||
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
|
||||
if (dfs != null) {
|
||||
return dfs.termStatistics().get(term);
|
||||
}
|
||||
return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
}
|
||||
|
||||
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
|
||||
docsEnum = termsEnum.postings(docsEnum);
|
||||
docsEnum.nextDoc();
|
||||
return docsEnum.freq();
|
||||
}
|
||||
|
||||
private float computeScore(long docFreq, int freq, long numDocs) {
|
||||
return freq * similarity.idf(docFreq, numDocs);
|
||||
}
|
||||
|
|
|
@@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.elasticsearch.action.ActionResponse;
|
||||
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.common.bytes.BytesArray;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.io.stream.BytesStreamOutput;
|
||||
|
@@ -38,7 +37,6 @@ import org.elasticsearch.common.io.stream.StreamOutput;
|
|||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.common.xcontent.ToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.search.dfs.AggregatedDfs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
@@ -353,15 +351,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContentObj
|
|||
|
||||
public void setFields(Fields termVectorsByField, Set<String> selectedFields,
|
||||
EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
|
||||
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null);
|
||||
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
|
||||
}
|
||||
|
||||
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags,
|
||||
Fields topLevelFields, @Nullable AggregatedDfs dfs, TermVectorsFilter termVectorsFilter) throws IOException {
|
||||
Fields topLevelFields, TermVectorsFilter termVectorsFilter) throws IOException {
|
||||
TermVectorsWriter tvw = new TermVectorsWriter(this);
|
||||
|
||||
if (termVectorsByField != null) {
|
||||
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter);
|
||||
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, termVectorsFilter);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -23,15 +23,12 @@ import org.apache.lucene.index.PostingsEnum;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.io.stream.BytesStreamOutput;
|
||||
import org.elasticsearch.search.dfs.AggregatedDfs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
@@ -48,14 +45,14 @@ final class TermVectorsWriter {
|
|||
// size here?
|
||||
private static final String HEADER = "TV";
|
||||
private static final int CURRENT_VERSION = -1;
|
||||
TermVectorsResponse response = null;
|
||||
final TermVectorsResponse response;
|
||||
|
||||
TermVectorsWriter(TermVectorsResponse termVectorsResponse) throws IOException {
|
||||
TermVectorsWriter(TermVectorsResponse termVectorsResponse) {
|
||||
response = termVectorsResponse;
|
||||
}
|
||||
|
||||
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
|
||||
@Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
|
||||
@Nullable TermVectorsFilter termVectorsFilter) throws IOException {
|
||||
int numFieldsWritten = 0;
|
||||
PostingsEnum docsAndPosEnum = null;
|
||||
PostingsEnum docsEnum = null;
|
||||
|
@@ -90,12 +87,8 @@ final class TermVectorsWriter {
|
|||
startField(field, termsSize, positions, offsets, payloads);
|
||||
|
||||
if (flags.contains(Flag.FieldStatistics)) {
|
||||
if (dfs != null) {
|
||||
writeFieldStatistics(dfs.fieldStatistics().get(field));
|
||||
} else {
|
||||
writeFieldStatistics(topLevelTerms);
|
||||
}
|
||||
}
|
||||
TermsEnum iterator = fieldTermVector.iterator();
|
||||
final boolean useDocsAndPos = positions || offsets || payloads;
|
||||
while (iterator.next() != null) { // iterate all terms of the current field
|
||||
|
@@ -110,14 +103,6 @@ final class TermVectorsWriter {
|
|||
startTerm(termBytesRef);
|
||||
if (flags.contains(Flag.TermStatistics)) {
|
||||
// get the doc frequency
|
||||
if (dfs != null) {
|
||||
final TermStatistics statistics = dfs.termStatistics().get(term);
|
||||
if (statistics == null) {
|
||||
writeMissingTermStatistics();
|
||||
} else {
|
||||
writeTermStatistics(statistics);
|
||||
}
|
||||
} else {
|
||||
boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
|
||||
if (foundTerm) {
|
||||
writeTermStatistics(topLevelIterator);
|
||||
|
@@ -125,7 +110,6 @@ final class TermVectorsWriter {
|
|||
writeMissingTermStatistics();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (useDocsAndPos) {
|
||||
// given we have pos or offsets
|
||||
docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
|
||||
|
@@ -158,7 +142,7 @@ final class TermVectorsWriter {
|
|||
header.writeVInt(numFieldsWritten);
|
||||
for (int i = 0; i < fields.size(); i++) {
|
||||
header.writeString(fields.get(i));
|
||||
header.writeVLong(fieldOffset.get(i).longValue());
|
||||
header.writeVLong(fieldOffset.get(i));
|
||||
}
|
||||
header.close();
|
||||
return header.bytes();
|
||||
|
@@ -259,15 +243,6 @@ final class TermVectorsWriter {
|
|||
writePotentiallyNegativeVLong(ttf);
|
||||
}
|
||||
|
||||
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
|
||||
int docFreq = (int) termStatistics.docFreq();
|
||||
assert (docFreq >= -1);
|
||||
writePotentiallyNegativeVInt(docFreq);
|
||||
long ttf = termStatistics.totalTermFreq();
|
||||
assert (ttf >= -1);
|
||||
writePotentiallyNegativeVLong(ttf);
|
||||
}
|
||||
|
||||
private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
|
||||
long sttf = topLevelTerms.getSumTotalTermFreq();
|
||||
assert (sttf >= -1);
|
||||
|
@@ -280,18 +255,6 @@ final class TermVectorsWriter {
|
|||
writePotentiallyNegativeVInt(dc);
|
||||
}
|
||||
|
||||
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
|
||||
long sttf = fieldStats.sumTotalTermFreq();
|
||||
assert (sttf >= -1);
|
||||
writePotentiallyNegativeVLong(sttf);
|
||||
long sdf = fieldStats.sumDocFreq();
|
||||
assert (sdf >= -1);
|
||||
writePotentiallyNegativeVLong(sdf);
|
||||
int dc = (int) fieldStats.docCount();
|
||||
assert (dc >= -1);
|
||||
writePotentiallyNegativeVInt(dc);
|
||||
}
|
||||
|
||||
private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
|
||||
output.writeFloat(Math.max(0, scoreTerm.score));
|
||||
}
|
||||
|
@@ -310,11 +273,11 @@ final class TermVectorsWriter {
|
|||
|
||||
/** Implements an empty {@link Terms}. */
|
||||
private static final Terms EMPTY_TERMS = new Terms() {
|
||||
@Override public TermsEnum iterator() throws IOException { return TermsEnum.EMPTY; }
|
||||
@Override public long size() throws IOException { return 0; }
|
||||
@Override public long getSumTotalTermFreq() throws IOException { return 0; }
|
||||
@Override public long getSumDocFreq() throws IOException { return 0; }
|
||||
@Override public int getDocCount() throws IOException { return 0; }
|
||||
@Override public TermsEnum iterator() { return TermsEnum.EMPTY; }
|
||||
@Override public long size() { return 0; }
|
||||
@Override public long getSumTotalTermFreq() { return 0; }
|
||||
@Override public long getSumDocFreq() { return 0; }
|
||||
@Override public int getDocCount() { return 0; }
|
||||
@Override public boolean hasFreqs() { return false; }
|
||||
@Override public boolean hasOffsets() { return false; }
|
||||
@Override public boolean hasPositions() { return false; }
|
||||
|
|
|
@@ -54,7 +54,6 @@ import org.elasticsearch.index.mapper.StringFieldType;
|
|||
import org.elasticsearch.index.mapper.TextSearchInfo;
|
||||
import org.elasticsearch.index.mapper.Uid;
|
||||
import org.elasticsearch.index.shard.IndexShard;
|
||||
import org.elasticsearch.search.dfs.AggregatedDfs;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
|
@@ -87,7 +86,6 @@ public class TermVectorsService {
|
|||
final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id()));
|
||||
|
||||
Fields termVectorsByField = null;
|
||||
AggregatedDfs dfs = null;
|
||||
TermVectorsFilter termVectorsFilter = null;
|
||||
|
||||
/* handle potential wildcards in fields */
|
||||
|
@@ -104,10 +102,6 @@ public class TermVectorsService {
|
|||
/* from an artificial document */
|
||||
if (request.doc() != null) {
|
||||
termVectorsByField = generateTermVectorsFromDoc(indexShard, request);
|
||||
// if no document indexed in shard, take the queried document itself for stats
|
||||
if (topLevelFields == null) {
|
||||
topLevelFields = termVectorsByField;
|
||||
}
|
||||
termVectorsResponse.setArtificial(true);
|
||||
termVectorsResponse.setExists(true);
|
||||
}
|
||||
|
@@ -134,7 +128,7 @@ public class TermVectorsService {
|
|||
/* if there are term vectors, optional compute dfs and/or terms filtering */
|
||||
if (termVectorsByField != null) {
|
||||
if (request.filterSettings() != null) {
|
||||
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs);
|
||||
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields());
|
||||
termVectorsFilter.setSettings(request.filterSettings());
|
||||
try {
|
||||
termVectorsFilter.selectBestTerms();
|
||||
|
@@ -143,7 +137,7 @@ public class TermVectorsService {
|
|||
}
|
||||
}
|
||||
// write term vectors
|
||||
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs,
|
||||
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields,
|
||||
termVectorsFilter);
|
||||
}
|
||||
termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime));
|
||||
|
@@ -233,7 +227,7 @@ public class TermVectorsService {
|
|||
MapperService mapperService = indexShard.mapperService();
|
||||
Analyzer analyzer;
|
||||
if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) {
|
||||
analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field).toString());
|
||||
analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field));
|
||||
} else {
|
||||
MappedFieldType fieldType = mapperService.fieldType(field);
|
||||
analyzer = fieldType.indexAnalyzer();
|
||||
|
|
Loading…
Reference in New Issue