From 8089210815b3aa35e53f72843b5982795bd5932d Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 15 Sep 2020 08:59:55 +0100 Subject: [PATCH] Some small cleanups in TermVectorsService (#62292) We removed the use of aggregated stats from term vectors back in #16452, but there is a bunch of dead code left here which can be stripped out. --- .../action/termvectors/TermVectorsFilter.java | 45 ++++--------- .../termvectors/TermVectorsResponse.java | 8 +-- .../action/termvectors/TermVectorsWriter.java | 65 ++++--------------- .../index/termvectors/TermVectorsService.java | 12 +--- 4 files changed, 31 insertions(+), 99 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsFilter.java b/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsFilter.java index cdeed093eed..03ae54bfdb1 100644 --- a/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsFilter.java +++ b/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsFilter.java @@ -23,12 +23,9 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.BytesRef; -import org.elasticsearch.common.Nullable; -import org.elasticsearch.search.dfs.AggregatedDfs; import java.io.IOException; import java.util.HashMap; @@ -52,20 +49,18 @@ public class TermVectorsFilter { private int minWordLength = DEFAULT_MIN_WORD_LENGTH; private int maxWordLength = DEFAULT_MAX_WORD_LENGTH; - private Fields fields; - private Fields topLevelFields; + private final Fields fields; + private final Fields topLevelFields; private final Set selectedFields; - private AggregatedDfs dfs; - private Map scoreTerms; - private Map sizes = new HashMap<>(); - private TFIDFSimilarity similarity; + private final Map scoreTerms; + private final Map sizes = new HashMap<>(); + private final TFIDFSimilarity similarity; - public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set selectedFields, @Nullable AggregatedDfs dfs) { + public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set selectedFields) { this.fields = termVectorsByField; this.topLevelFields = topLevelFields; this.selectedFields = selectedFields; - this.dfs = dfs; this.scoreTerms = new HashMap<>(); this.similarity = new ClassicSimilarity(); } @@ -196,7 +191,7 @@ public class TermVectorsFilter { topLevelTerms = terms; } - long numDocs = getDocCount(fieldName, topLevelTerms); + long numDocs = topLevelTerms.getDocCount(); // one queue per field name ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size())); @@ -212,13 +207,15 @@ public class TermVectorsFilter { Term term = new Term(fieldName, termBytesRef); // remove noise words - int freq = getTermFreq(termsEnum, docsEnum); + docsEnum = termsEnum.postings(docsEnum); + docsEnum.nextDoc(); + int freq = docsEnum.freq(); if (isNoise(term.bytes().utf8ToString(), freq)) { continue; } // now call on docFreq - long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq(); + long docFreq = topLevelTermsEnum.docFreq(); if (!isAccepted(docFreq)) { continue; } @@ -275,26 +272,6 @@ public class TermVectorsFilter { return true; } - private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException { - if (dfs != null) { - return dfs.fieldStatistics().get(fieldName).docCount(); - } - return topLevelTerms.getDocCount(); - } - - private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException { - if (dfs != null) { - return dfs.termStatistics().get(term); - } - return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq()); - } - - private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException { - docsEnum = termsEnum.postings(docsEnum); - docsEnum.nextDoc(); - return docsEnum.freq(); - } - private float computeScore(long docFreq, int freq, long numDocs) { return freq * similarity.idf(docFreq, numDocs); } diff --git a/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsResponse.java b/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsResponse.java index 914af03c652..3b1bcf2c31e 100644 --- a/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsResponse.java +++ b/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsResponse.java @@ -29,7 +29,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.elasticsearch.action.ActionResponse; import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag; -import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.stream.BytesStreamOutput; @@ -38,7 +37,6 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.ToXContentObject; import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.search.dfs.AggregatedDfs; import java.io.IOException; import java.util.Collections; @@ -353,15 +351,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContentObj public void setFields(Fields termVectorsByField, Set selectedFields, EnumSet flags, Fields topLevelFields) throws IOException { - setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null); + setFields(termVectorsByField, selectedFields, flags, topLevelFields, null); } public void setFields(Fields termVectorsByField, Set selectedFields, EnumSet flags, - Fields topLevelFields, @Nullable AggregatedDfs dfs, TermVectorsFilter termVectorsFilter) throws IOException { + Fields topLevelFields, TermVectorsFilter termVectorsFilter) throws IOException { TermVectorsWriter tvw = new TermVectorsWriter(this); if (termVectorsByField != null) { - tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter); + tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, termVectorsFilter); } } diff --git a/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsWriter.java b/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsWriter.java index d38a980c589..194fc8229e8 100644 --- a/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsWriter.java +++ b/server/src/main/java/org/elasticsearch/action/termvectors/TermVectorsWriter.java @@ -23,15 +23,12 @@ import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.TermStatistics; import org.apache.lucene.util.BytesRef; import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.io.stream.BytesStreamOutput; -import org.elasticsearch.search.dfs.AggregatedDfs; import java.io.IOException; import java.util.ArrayList; @@ -48,14 +45,14 @@ final class TermVectorsWriter { // size here? private static final String HEADER = "TV"; private static final int CURRENT_VERSION = -1; - TermVectorsResponse response = null; + final TermVectorsResponse response; - TermVectorsWriter(TermVectorsResponse termVectorsResponse) throws IOException { + TermVectorsWriter(TermVectorsResponse termVectorsResponse) { response = termVectorsResponse; } void setFields(Fields termVectorsByField, Set selectedFields, EnumSet flags, Fields topLevelFields, - @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException { + @Nullable TermVectorsFilter termVectorsFilter) throws IOException { int numFieldsWritten = 0; PostingsEnum docsAndPosEnum = null; PostingsEnum docsEnum = null; @@ -90,11 +87,7 @@ final class TermVectorsWriter { startField(field, termsSize, positions, offsets, payloads); if (flags.contains(Flag.FieldStatistics)) { - if (dfs != null) { - writeFieldStatistics(dfs.fieldStatistics().get(field)); - } else { - writeFieldStatistics(topLevelTerms); - } + writeFieldStatistics(topLevelTerms); } TermsEnum iterator = fieldTermVector.iterator(); final boolean useDocsAndPos = positions || offsets || payloads; @@ -110,20 +103,11 @@ final class TermVectorsWriter { startTerm(termBytesRef); if (flags.contains(Flag.TermStatistics)) { // get the doc frequency - if (dfs != null) { - final TermStatistics statistics = dfs.termStatistics().get(term); - if (statistics == null) { - writeMissingTermStatistics(); - } else { - writeTermStatistics(statistics); - } + boolean foundTerm = topLevelIterator.seekExact(termBytesRef); + if (foundTerm) { + writeTermStatistics(topLevelIterator); } else { - boolean foundTerm = topLevelIterator.seekExact(termBytesRef); - if (foundTerm) { - writeTermStatistics(topLevelIterator); - } else { - writeMissingTermStatistics(); - } + writeMissingTermStatistics(); } } if (useDocsAndPos) { @@ -158,7 +142,7 @@ final class TermVectorsWriter { header.writeVInt(numFieldsWritten); for (int i = 0; i < fields.size(); i++) { header.writeString(fields.get(i)); - header.writeVLong(fieldOffset.get(i).longValue()); + header.writeVLong(fieldOffset.get(i)); } header.close(); return header.bytes(); @@ -259,15 +243,6 @@ final class TermVectorsWriter { writePotentiallyNegativeVLong(ttf); } - private void writeTermStatistics(TermStatistics termStatistics) throws IOException { - int docFreq = (int) termStatistics.docFreq(); - assert (docFreq >= -1); - writePotentiallyNegativeVInt(docFreq); - long ttf = termStatistics.totalTermFreq(); - assert (ttf >= -1); - writePotentiallyNegativeVLong(ttf); - } - private void writeFieldStatistics(Terms topLevelTerms) throws IOException { long sttf = topLevelTerms.getSumTotalTermFreq(); assert (sttf >= -1); @@ -280,18 +255,6 @@ final class TermVectorsWriter { writePotentiallyNegativeVInt(dc); } - private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException { - long sttf = fieldStats.sumTotalTermFreq(); - assert (sttf >= -1); - writePotentiallyNegativeVLong(sttf); - long sdf = fieldStats.sumDocFreq(); - assert (sdf >= -1); - writePotentiallyNegativeVLong(sdf); - int dc = (int) fieldStats.docCount(); - assert (dc >= -1); - writePotentiallyNegativeVInt(dc); - } - private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException { output.writeFloat(Math.max(0, scoreTerm.score)); } @@ -310,11 +273,11 @@ final class TermVectorsWriter { /** Implements an empty {@link Terms}. */ private static final Terms EMPTY_TERMS = new Terms() { - @Override public TermsEnum iterator() throws IOException { return TermsEnum.EMPTY; } - @Override public long size() throws IOException { return 0; } - @Override public long getSumTotalTermFreq() throws IOException { return 0; } - @Override public long getSumDocFreq() throws IOException { return 0; } - @Override public int getDocCount() throws IOException { return 0; } + @Override public TermsEnum iterator() { return TermsEnum.EMPTY; } + @Override public long size() { return 0; } + @Override public long getSumTotalTermFreq() { return 0; } + @Override public long getSumDocFreq() { return 0; } + @Override public int getDocCount() { return 0; } @Override public boolean hasFreqs() { return false; } @Override public boolean hasOffsets() { return false; } @Override public boolean hasPositions() { return false; } diff --git a/server/src/main/java/org/elasticsearch/index/termvectors/TermVectorsService.java b/server/src/main/java/org/elasticsearch/index/termvectors/TermVectorsService.java index e229b9c9ed1..6db84d829e6 100644 --- a/server/src/main/java/org/elasticsearch/index/termvectors/TermVectorsService.java +++ b/server/src/main/java/org/elasticsearch/index/termvectors/TermVectorsService.java @@ -54,7 +54,6 @@ import org.elasticsearch.index.mapper.StringFieldType; import org.elasticsearch.index.mapper.TextSearchInfo; import org.elasticsearch.index.mapper.Uid; import org.elasticsearch.index.shard.IndexShard; -import org.elasticsearch.search.dfs.AggregatedDfs; import java.io.IOException; import java.util.ArrayList; @@ -87,7 +86,6 @@ public class TermVectorsService { final Term uidTerm = new Term(IdFieldMapper.NAME, Uid.encodeId(request.id())); Fields termVectorsByField = null; - AggregatedDfs dfs = null; TermVectorsFilter termVectorsFilter = null; /* handle potential wildcards in fields */ @@ -104,10 +102,6 @@ public class TermVectorsService { /* from an artificial document */ if (request.doc() != null) { termVectorsByField = generateTermVectorsFromDoc(indexShard, request); - // if no document indexed in shard, take the queried document itself for stats - if (topLevelFields == null) { - topLevelFields = termVectorsByField; - } termVectorsResponse.setArtificial(true); termVectorsResponse.setExists(true); } @@ -134,7 +128,7 @@ public class TermVectorsService { /* if there are term vectors, optional compute dfs and/or terms filtering */ if (termVectorsByField != null) { if (request.filterSettings() != null) { - termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs); + termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields()); termVectorsFilter.setSettings(request.filterSettings()); try { termVectorsFilter.selectBestTerms(); @@ -143,7 +137,7 @@ public class TermVectorsService { } } // write term vectors - termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs, + termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, termVectorsFilter); } termVectorsResponse.setTookInMillis(TimeUnit.NANOSECONDS.toMillis(nanoTimeSupplier.getAsLong() - startTime)); @@ -233,7 +227,7 @@ public class TermVectorsService { MapperService mapperService = indexShard.mapperService(); Analyzer analyzer; if (perFieldAnalyzer != null && perFieldAnalyzer.containsKey(field)) { - analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field).toString()); + analyzer = mapperService.getIndexAnalyzers().get(perFieldAnalyzer.get(field)); } else { MappedFieldType fieldType = mapperService.fieldType(field); analyzer = fieldType.indexAnalyzer();