Remove script access to term statistics (#19462)
In scripts (at least some of the languages), the terms dictionary and postings can be accessed with the special _index variable. This is for very advanced use cases which want to do their own scoring. The problem is segment-level statistics must be recomputed for every document. Additionally, this is not friendly to the terms index caching, as the order of looking up terms should be controlled by lucene. This change removes _index from scripts. Anyone using it can and should instead write a Similarity plugin, which is explicitly designed to allow doing the calculations needed for a relevance score. closes #19359
This commit is contained in:
parent
1cae850cf5
commit
97d2657e18
|
@ -23,7 +23,6 @@ import org.apache.lucene.search.Scorer;
|
|||
import org.elasticsearch.index.fielddata.ScriptDocValues;
|
||||
import org.elasticsearch.search.lookup.LeafDocLookup;
|
||||
import org.elasticsearch.search.lookup.LeafFieldsLookup;
|
||||
import org.elasticsearch.search.lookup.LeafIndexLookup;
|
||||
import org.elasticsearch.search.lookup.LeafSearchLookup;
|
||||
import org.elasticsearch.search.lookup.SourceLookup;
|
||||
|
||||
|
@ -87,13 +86,6 @@ public abstract class AbstractSearchScript extends AbstractExecutableScript impl
|
|||
return lookup.source();
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows to access statistics on terms and fields.
|
||||
*/
|
||||
protected final LeafIndexLookup indexLookup() {
|
||||
return lookup.indexLookup();
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows to access the *stored* fields.
|
||||
*/
|
||||
|
|
|
@ -1,132 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.IntsRefBuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
/*
|
||||
* Can iterate over the positions of a term an arbitrary number of times.
|
||||
* */
|
||||
public class CachedPositionIterator extends PositionIterator {
|
||||
|
||||
public CachedPositionIterator(IndexFieldTerm indexFieldTerm) {
|
||||
super(indexFieldTerm);
|
||||
}
|
||||
|
||||
// all payloads of the term in the current document in one bytes array.
|
||||
// payloadStarts and payloadLength mark the start and end of one payload.
|
||||
final BytesRefBuilder payloads = new BytesRefBuilder();
|
||||
|
||||
final IntsRefBuilder payloadsLengths = new IntsRefBuilder();
|
||||
|
||||
final IntsRefBuilder payloadsStarts = new IntsRefBuilder();
|
||||
|
||||
final IntsRefBuilder positions = new IntsRefBuilder();
|
||||
|
||||
final IntsRefBuilder startOffsets = new IntsRefBuilder();
|
||||
|
||||
final IntsRefBuilder endOffsets = new IntsRefBuilder();
|
||||
|
||||
final BytesRef payload = new BytesRef();
|
||||
|
||||
@Override
|
||||
public Iterator<TermPosition> reset() {
|
||||
return new Iterator<TermPosition>() {
|
||||
private int pos = 0;
|
||||
private final TermPosition termPosition = new TermPosition();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return pos < freq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermPosition next() {
|
||||
termPosition.position = positions.intAt(pos);
|
||||
termPosition.startOffset = startOffsets.intAt(pos);
|
||||
termPosition.endOffset = endOffsets.intAt(pos);
|
||||
termPosition.payload = payload;
|
||||
payload.bytes = payloads.bytes();
|
||||
payload.offset = payloadsStarts.intAt(pos);
|
||||
payload.length = payloadsLengths.intAt(pos);
|
||||
pos++;
|
||||
return termPosition;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private void record() throws IOException {
|
||||
TermPosition termPosition;
|
||||
for (int i = 0; i < freq; i++) {
|
||||
termPosition = super.next();
|
||||
positions.setIntAt(i, termPosition.position);
|
||||
addPayload(i, termPosition.payload);
|
||||
startOffsets.setIntAt(i, termPosition.startOffset);
|
||||
endOffsets.setIntAt(i, termPosition.endOffset);
|
||||
}
|
||||
}
|
||||
private void ensureSize(int freq) {
|
||||
if (freq == 0) {
|
||||
return;
|
||||
}
|
||||
startOffsets.grow(freq);
|
||||
endOffsets.grow(freq);
|
||||
positions.grow(freq);
|
||||
payloadsLengths.grow(freq);
|
||||
payloadsStarts.grow(freq);
|
||||
payloads.grow(freq * 8);// this is just a guess....
|
||||
|
||||
}
|
||||
|
||||
private void addPayload(int i, BytesRef currPayload) {
|
||||
if (currPayload != null) {
|
||||
payloadsLengths.setIntAt(i, currPayload.length);
|
||||
payloadsStarts.setIntAt(i, i == 0 ? 0 : payloadsStarts.intAt(i - 1) + payloadsLengths.intAt(i - 1));
|
||||
payloads.grow(payloadsStarts.intAt(i) + currPayload.length);
|
||||
System.arraycopy(currPayload.bytes, currPayload.offset, payloads.bytes(), payloadsStarts.intAt(i), currPayload.length);
|
||||
} else {
|
||||
payloadsLengths.setIntAt(i, 0);
|
||||
payloadsStarts.setIntAt(i, i == 0 ? 0 : payloadsStarts.intAt(i - 1) + payloadsLengths.intAt(i - 1));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void nextDoc() throws IOException {
|
||||
super.nextDoc();
|
||||
ensureSize(freq);
|
||||
record();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermPosition next() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
|
@ -1,128 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.lucene.search.CollectionStatistics;
|
||||
import org.elasticsearch.common.util.MinimalMap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Script interface to all information regarding a field.
|
||||
* */
|
||||
public class IndexField extends MinimalMap<String, IndexFieldTerm> {
|
||||
|
||||
/*
|
||||
* TermsInfo Objects that represent the Terms are stored in this map when
|
||||
* requested. Information such as frequency, doc frequency and positions
|
||||
* information can be retrieved from the TermInfo objects in this map.
|
||||
*/
|
||||
private final Map<String, IndexFieldTerm> terms = new HashMap<>();
|
||||
|
||||
// the name of this field
|
||||
private final String fieldName;
|
||||
|
||||
/*
|
||||
* The holds the current reader. We need it to populate the field
|
||||
* statistics. We just delegate all requests there
|
||||
*/
|
||||
private final LeafIndexLookup indexLookup;
|
||||
|
||||
/*
|
||||
* General field statistics such as number of documents containing the
|
||||
* field.
|
||||
*/
|
||||
private final CollectionStatistics fieldStats;
|
||||
|
||||
/*
|
||||
* Represents a field in a document. Can be used to return information on
|
||||
* statistics of this field. Information on specific terms in this field can
|
||||
* be accessed by calling get(String term).
|
||||
*/
|
||||
public IndexField(String fieldName, LeafIndexLookup indexLookup) throws IOException {
|
||||
|
||||
assert fieldName != null;
|
||||
this.fieldName = fieldName;
|
||||
|
||||
assert indexLookup != null;
|
||||
this.indexLookup = indexLookup;
|
||||
|
||||
fieldStats = this.indexLookup.getIndexSearcher().collectionStatistics(fieldName);
|
||||
}
|
||||
|
||||
/* get number of documents containing the field */
|
||||
public long docCount() throws IOException {
|
||||
return fieldStats.docCount();
|
||||
}
|
||||
|
||||
/* get sum of the number of words over all documents that were indexed */
|
||||
public long sumttf() throws IOException {
|
||||
return fieldStats.sumTotalTermFreq();
|
||||
}
|
||||
|
||||
/*
|
||||
* get the sum of doc frequencies over all words that appear in any document
|
||||
* that has the field.
|
||||
*/
|
||||
public long sumdf() throws IOException {
|
||||
return fieldStats.sumDocFreq();
|
||||
}
|
||||
|
||||
// TODO: might be good to get the field lengths here somewhere?
|
||||
|
||||
/*
|
||||
* Returns a TermInfo object that can be used to access information on
|
||||
* specific terms. flags can be set as described in TermInfo.
|
||||
*
|
||||
* TODO: here might be potential for running time improvement? If we knew in
|
||||
* advance which terms are requested, we could provide an array which the
|
||||
* user could then iterate over.
|
||||
*/
|
||||
public IndexFieldTerm get(Object key, int flags) {
|
||||
String termString = (String) key;
|
||||
IndexFieldTerm indexFieldTerm = terms.get(termString);
|
||||
// see if we initialized already...
|
||||
if (indexFieldTerm == null) {
|
||||
indexFieldTerm = new IndexFieldTerm(termString, fieldName, indexLookup, flags);
|
||||
terms.put(termString, indexFieldTerm);
|
||||
}
|
||||
indexFieldTerm.validateFlags(flags);
|
||||
return indexFieldTerm;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a TermInfo object that can be used to access information on
|
||||
* specific terms. flags can be set as described in TermInfo.
|
||||
*/
|
||||
@Override
|
||||
public IndexFieldTerm get(Object key) {
|
||||
// per default, do not initialize any positions info
|
||||
return get(key, IndexLookup.FLAG_FREQUENCIES);
|
||||
}
|
||||
|
||||
public void setDocIdInTerms(int docId) {
|
||||
for (IndexFieldTerm ti : terms.values()) {
|
||||
ti.setDocument(docId);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1,298 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FilterLeafReader.FilterPostingsEnum;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermContext;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.TermStatistics;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
|
||||
* Holds all information on a particular term in a field.
|
||||
* */
|
||||
public class IndexFieldTerm implements Iterable<TermPosition> {
|
||||
|
||||
// The posting list for this term. Is null if the term or field does not
|
||||
// exist.
|
||||
PostingsEnum postings;
|
||||
|
||||
// Stores if positions, offsets and payloads are requested.
|
||||
private final int flags;
|
||||
|
||||
private final String fieldName;
|
||||
|
||||
private final String term;
|
||||
|
||||
private final PositionIterator iterator;
|
||||
|
||||
// for lucene calls
|
||||
private final Term identifier;
|
||||
|
||||
private final TermStatistics termStats;
|
||||
|
||||
// get the document frequency of the term
|
||||
public long df() throws IOException {
|
||||
return termStats.docFreq();
|
||||
}
|
||||
|
||||
// get the total term frequency of the term, that is, how often does the
|
||||
// term appear in any document?
|
||||
public long ttf() throws IOException {
|
||||
return termStats.totalTermFreq();
|
||||
}
|
||||
|
||||
// when the reader changes, we have to get the posting list for this term
|
||||
// and reader
|
||||
private void setReader(LeafReader reader) {
|
||||
try {
|
||||
postings = getPostings(convertToLuceneFlags(flags), reader);
|
||||
|
||||
if (postings == null) {
|
||||
// no term or field for this segment, fake out the postings...
|
||||
final DocIdSetIterator empty = DocIdSetIterator.empty();
|
||||
postings = new PostingsEnum() {
|
||||
@Override
|
||||
public int docID() {
|
||||
return empty.docID();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return empty.nextDoc();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return empty.advance(target);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return empty.cost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextPosition() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int startOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int endOffset() throws IOException {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new ElasticsearchException("Unable to get postings for field " + fieldName + " and term " + term, e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private int convertToLuceneFlags(int flags) {
|
||||
int lucenePositionsFlags = PostingsEnum.NONE;
|
||||
lucenePositionsFlags |= (flags & IndexLookup.FLAG_FREQUENCIES) > 0 ? PostingsEnum.FREQS : 0x0;
|
||||
lucenePositionsFlags |= (flags & IndexLookup.FLAG_POSITIONS) > 0 ? PostingsEnum.POSITIONS : 0x0;
|
||||
lucenePositionsFlags |= (flags & IndexLookup.FLAG_PAYLOADS) > 0 ? PostingsEnum.PAYLOADS : 0x0;
|
||||
lucenePositionsFlags |= (flags & IndexLookup.FLAG_OFFSETS) > 0 ? PostingsEnum.OFFSETS : 0x0;
|
||||
return lucenePositionsFlags;
|
||||
}
|
||||
|
||||
private PostingsEnum getPostings(int luceneFlags, LeafReader reader) throws IOException {
|
||||
assert identifier.field() != null;
|
||||
assert identifier.bytes() != null;
|
||||
final Fields fields = reader.fields();
|
||||
PostingsEnum newPostings = null;
|
||||
if (fields != null) {
|
||||
final Terms terms = fields.terms(identifier.field());
|
||||
if (terms != null) {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
if (termsEnum.seekExact(identifier.bytes())) {
|
||||
newPostings = termsEnum.postings(postings, luceneFlags);
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
if (liveDocs != null) {
|
||||
newPostings = new FilterPostingsEnum(newPostings) {
|
||||
private int doNext(int d) throws IOException {
|
||||
for (; d != NO_MORE_DOCS; d = super.nextDoc()) {
|
||||
if (liveDocs.get(d)) {
|
||||
return d;
|
||||
}
|
||||
}
|
||||
return NO_MORE_DOCS;
|
||||
}
|
||||
@Override
|
||||
public int nextDoc() throws IOException {
|
||||
return doNext(super.nextDoc());
|
||||
}
|
||||
@Override
|
||||
public int advance(int target) throws IOException {
|
||||
return doNext(super.advance(target));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return newPostings;
|
||||
}
|
||||
|
||||
private int freq = 0;
|
||||
|
||||
public void setDocument(int docId) {
|
||||
assert (postings != null);
|
||||
try {
|
||||
// we try to advance to the current document.
|
||||
int currentDocPos = postings.docID();
|
||||
if (currentDocPos < docId) {
|
||||
currentDocPos = postings.advance(docId);
|
||||
}
|
||||
if (currentDocPos == docId) {
|
||||
freq = postings.freq();
|
||||
} else {
|
||||
freq = 0;
|
||||
}
|
||||
iterator.nextDoc();
|
||||
} catch (IOException e) {
|
||||
throw new ElasticsearchException("While trying to initialize term positions in IndexFieldTerm.setNextDoc() ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public IndexFieldTerm(String term, String fieldName, LeafIndexLookup indexLookup, int flags) {
|
||||
assert fieldName != null;
|
||||
this.fieldName = fieldName;
|
||||
assert term != null;
|
||||
this.term = term;
|
||||
assert indexLookup != null;
|
||||
identifier = new Term(fieldName, (String) term);
|
||||
this.flags = flags;
|
||||
boolean doRecord = ((flags & IndexLookup.FLAG_CACHE) > 0);
|
||||
if (!doRecord) {
|
||||
iterator = new PositionIterator(this);
|
||||
} else {
|
||||
iterator = new CachedPositionIterator(this);
|
||||
}
|
||||
setReader(indexLookup.getReader());
|
||||
setDocument(indexLookup.getDocId());
|
||||
try {
|
||||
termStats = indexLookup.getIndexSearcher().termStatistics(identifier,
|
||||
TermContext.build(indexLookup.getReaderContext(), identifier));
|
||||
} catch (IOException e) {
|
||||
throw new ElasticsearchException("Cannot get term statistics: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public int tf() throws IOException {
|
||||
return freq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<TermPosition> iterator() {
|
||||
return iterator.reset();
|
||||
}
|
||||
|
||||
/*
|
||||
* A user might decide inside a script to call get with _POSITIONS and then
|
||||
* a second time with _PAYLOADS. If the positions were recorded but the
|
||||
* payloads were not, the user will not have access to them. Therefore, throw
|
||||
* exception here explaining how to call get().
|
||||
*/
|
||||
public void validateFlags(int flags2) {
|
||||
if ((this.flags & flags2) < flags2) {
|
||||
throw new ElasticsearchException("You must call get with all required flags! Instead of " + getCalledStatement(flags2)
|
||||
+ "call " + getCallStatement(flags2 | this.flags) + " once");
|
||||
}
|
||||
}
|
||||
|
||||
private String getCalledStatement(int flags2) {
|
||||
String calledFlagsCall1 = getFlagsString(flags);
|
||||
String calledFlagsCall2 = getFlagsString(flags2);
|
||||
String callStatement1 = getCallStatement(calledFlagsCall1);
|
||||
String callStatement2 = getCallStatement(calledFlagsCall2);
|
||||
return " " + callStatement1 + " and " + callStatement2 + " ";
|
||||
}
|
||||
|
||||
private String getCallStatement(String calledFlags) {
|
||||
return "_index['" + this.fieldName + "'].get('" + this.term + "', " + calledFlags + ")";
|
||||
}
|
||||
|
||||
private String getFlagsString(int flags2) {
|
||||
String flagsString = null;
|
||||
if ((flags2 & IndexLookup.FLAG_FREQUENCIES) != 0) {
|
||||
flagsString = anddToFlagsString(flagsString, "_FREQUENCIES");
|
||||
}
|
||||
if ((flags2 & IndexLookup.FLAG_POSITIONS) != 0) {
|
||||
flagsString = anddToFlagsString(flagsString, "_POSITIONS");
|
||||
}
|
||||
if ((flags2 & IndexLookup.FLAG_OFFSETS) != 0) {
|
||||
flagsString = anddToFlagsString(flagsString, "_OFFSETS");
|
||||
}
|
||||
if ((flags2 & IndexLookup.FLAG_PAYLOADS) != 0) {
|
||||
flagsString = anddToFlagsString(flagsString, "_PAYLOADS");
|
||||
}
|
||||
if ((flags2 & IndexLookup.FLAG_CACHE) != 0) {
|
||||
flagsString = anddToFlagsString(flagsString, "_CACHE");
|
||||
}
|
||||
return flagsString;
|
||||
}
|
||||
|
||||
private String anddToFlagsString(String flagsString, String flag) {
|
||||
if (flagsString != null) {
|
||||
flagsString += " | ";
|
||||
} else {
|
||||
flagsString = "";
|
||||
}
|
||||
flagsString += flag;
|
||||
return flagsString;
|
||||
}
|
||||
|
||||
private String getCallStatement(int flags2) {
|
||||
String calledFlags = getFlagsString(flags2);
|
||||
String callStatement = getCallStatement(calledFlags);
|
||||
return " " + callStatement + " ";
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,74 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.util.Collections.unmodifiableMap;
|
||||
|
||||
public class IndexLookup {
|
||||
public static final Map<String, Object> NAMES;
|
||||
static {
|
||||
Map<String, Object> names = new HashMap<>();
|
||||
names.put("_FREQUENCIES", IndexLookup.FLAG_FREQUENCIES);
|
||||
names.put("_POSITIONS", IndexLookup.FLAG_POSITIONS);
|
||||
names.put("_OFFSETS", IndexLookup.FLAG_OFFSETS);
|
||||
names.put("_PAYLOADS", IndexLookup.FLAG_PAYLOADS);
|
||||
names.put("_CACHE", IndexLookup.FLAG_CACHE);
|
||||
NAMES = unmodifiableMap(names);
|
||||
}
|
||||
/**
|
||||
* Flag to pass to {@link IndexField#get(Object, int)} if you require
|
||||
* offsets in the returned {@link IndexFieldTerm}.
|
||||
*/
|
||||
public static final int FLAG_OFFSETS = 2;
|
||||
|
||||
/**
|
||||
* Flag to pass to {@link IndexField#get(Object, int)} if you require
|
||||
* payloads in the returned {@link IndexFieldTerm}.
|
||||
*/
|
||||
public static final int FLAG_PAYLOADS = 4;
|
||||
|
||||
/**
|
||||
* Flag to pass to {@link IndexField#get(Object, int)} if you require
|
||||
* frequencies in the returned {@link IndexFieldTerm}. Frequencies might be
|
||||
* returned anyway for some lucene codecs even if this flag is no set.
|
||||
*/
|
||||
public static final int FLAG_FREQUENCIES = 8;
|
||||
|
||||
/**
|
||||
* Flag to pass to {@link IndexField#get(Object, int)} if you require
|
||||
* positions in the returned {@link IndexFieldTerm}.
|
||||
*/
|
||||
public static final int FLAG_POSITIONS = 16;
|
||||
|
||||
/**
|
||||
* Flag to pass to {@link IndexField#get(Object, int)} if you require
|
||||
* positions in the returned {@link IndexFieldTerm}.
|
||||
*/
|
||||
public static final int FLAG_CACHE = 32;
|
||||
|
||||
public static LeafIndexLookup getLeafIndexLookup(LeafReaderContext context) {
|
||||
return new LeafIndexLookup(context);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,199 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReaderContext;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.ReaderUtil;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.util.MinimalMap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class LeafIndexLookup extends MinimalMap<String, IndexField> {
|
||||
|
||||
// Current reader from which we can get the term vectors. No info on term
|
||||
// and field statistics.
|
||||
private final LeafReader reader;
|
||||
|
||||
// The parent reader from which we can get proper field and term
|
||||
// statistics
|
||||
private final IndexReader parentReader;
|
||||
|
||||
// we need this later to get the field and term statistics of the shard
|
||||
private final IndexSearcher indexSearcher;
|
||||
|
||||
// current docId
|
||||
private int docId = -1;
|
||||
|
||||
// stores the objects that are used in the script. we maintain this map
|
||||
// because we do not want to re-initialize the objects each time a field is
|
||||
// accessed
|
||||
private final Map<String, IndexField> indexFields = new HashMap<>();
|
||||
|
||||
// number of documents per shard. cached here because the computation is
|
||||
// expensive
|
||||
private int numDocs = -1;
|
||||
|
||||
// the maximum doc number of the shard.
|
||||
private int maxDoc = -1;
|
||||
|
||||
// number of deleted documents per shard. cached here because the
|
||||
// computation is expensive
|
||||
private int numDeletedDocs = -1;
|
||||
|
||||
private boolean deprecationEmitted = false;
|
||||
|
||||
private void logDeprecation() {
|
||||
if (deprecationEmitted == false) {
|
||||
Logger logger = Loggers.getLogger(getClass());
|
||||
DeprecationLogger deprecationLogger = new DeprecationLogger(logger);
|
||||
deprecationLogger.deprecated("Using _index is deprecated. Create a custom ScriptEngine to access index internals.");
|
||||
deprecationEmitted = true;
|
||||
}
|
||||
}
|
||||
|
||||
public int numDocs() {
|
||||
logDeprecation();
|
||||
if (numDocs == -1) {
|
||||
numDocs = parentReader.numDocs();
|
||||
}
|
||||
return numDocs;
|
||||
}
|
||||
|
||||
public int maxDoc() {
|
||||
logDeprecation();
|
||||
if (maxDoc == -1) {
|
||||
maxDoc = parentReader.maxDoc();
|
||||
}
|
||||
return maxDoc;
|
||||
}
|
||||
|
||||
public int numDeletedDocs() {
|
||||
logDeprecation();
|
||||
if (numDeletedDocs == -1) {
|
||||
numDeletedDocs = parentReader.numDeletedDocs();
|
||||
}
|
||||
return numDeletedDocs;
|
||||
}
|
||||
|
||||
public LeafIndexLookup(LeafReaderContext ctx) {
|
||||
reader = ctx.reader();
|
||||
parentReader = ReaderUtil.getTopLevelContext(ctx).reader();
|
||||
indexSearcher = new IndexSearcher(parentReader);
|
||||
indexSearcher.setQueryCache(null);
|
||||
}
|
||||
|
||||
public void setDocument(int docId) {
|
||||
if (this.docId == docId) { // if we are called with the same docId,
|
||||
// nothing to do
|
||||
return;
|
||||
}
|
||||
// We assume that docs are processed in ascending order of id. If this
|
||||
// is not the case, we would have to re initialize all posting lists in
|
||||
// IndexFieldTerm. TODO: Instead of assert we could also call
|
||||
// setReaderInFields(); here?
|
||||
if (this.docId > docId) {
|
||||
// This might happen if the same SearchLookup is used in different
|
||||
// phases, such as score and fetch phase.
|
||||
// In this case we do not want to re initialize posting list etc.
|
||||
// because we do not even know if term and field statistics will be
|
||||
// needed in this new phase.
|
||||
// Therefore we just remove all IndexFieldTerms.
|
||||
indexFields.clear();
|
||||
}
|
||||
this.docId = docId;
|
||||
setNextDocIdInFields();
|
||||
}
|
||||
|
||||
protected void setNextDocIdInFields() {
|
||||
for (IndexField stat : indexFields.values()) {
|
||||
stat.setDocIdInTerms(this.docId);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: here might be potential for running time improvement? If we knew in
|
||||
* advance which terms are requested, we could provide an array which the
|
||||
* user could then iterate over.
|
||||
*/
|
||||
@Override
|
||||
public IndexField get(Object key) {
|
||||
logDeprecation();
|
||||
String stringField = (String) key;
|
||||
IndexField indexField = indexFields.get(key);
|
||||
if (indexField == null) {
|
||||
try {
|
||||
indexField = new IndexField(stringField, this);
|
||||
indexFields.put(stringField, indexField);
|
||||
} catch (IOException e) {
|
||||
throw new ElasticsearchException(e);
|
||||
}
|
||||
}
|
||||
return indexField;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the lucene term vectors. See
|
||||
* https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html
|
||||
* *
|
||||
*/
|
||||
public Fields termVectors() throws IOException {
|
||||
logDeprecation();
|
||||
assert reader != null;
|
||||
return reader.getTermVectors(docId);
|
||||
}
|
||||
|
||||
LeafReader getReader() {
|
||||
logDeprecation();
|
||||
return reader;
|
||||
}
|
||||
|
||||
public int getDocId() {
|
||||
logDeprecation();
|
||||
return docId;
|
||||
}
|
||||
|
||||
public IndexReader getParentReader() {
|
||||
logDeprecation();
|
||||
if (parentReader == null) {
|
||||
return reader;
|
||||
}
|
||||
return parentReader;
|
||||
}
|
||||
|
||||
public IndexSearcher getIndexSearcher() {
|
||||
logDeprecation();
|
||||
return indexSearcher;
|
||||
}
|
||||
|
||||
public IndexReaderContext getReaderContext() {
|
||||
logDeprecation();
|
||||
return getParentReader().getContext();
|
||||
}
|
||||
}
|
|
@ -35,24 +35,20 @@ public class LeafSearchLookup {
|
|||
final LeafDocLookup docMap;
|
||||
final SourceLookup sourceLookup;
|
||||
final LeafFieldsLookup fieldsLookup;
|
||||
final LeafIndexLookup indexLookup;
|
||||
final Map<String, Object> asMap;
|
||||
|
||||
public LeafSearchLookup(LeafReaderContext ctx, LeafDocLookup docMap, SourceLookup sourceLookup,
|
||||
LeafFieldsLookup fieldsLookup, LeafIndexLookup indexLookup, Map<String, Object> topLevelMap) {
|
||||
LeafFieldsLookup fieldsLookup) {
|
||||
this.ctx = ctx;
|
||||
this.docMap = docMap;
|
||||
this.sourceLookup = sourceLookup;
|
||||
this.fieldsLookup = fieldsLookup;
|
||||
this.indexLookup = indexLookup;
|
||||
|
||||
Map<String, Object> asMap = new HashMap<>(topLevelMap.size() + 5);
|
||||
asMap.putAll(topLevelMap);
|
||||
Map<String, Object> asMap = new HashMap<>(4);
|
||||
asMap.put("doc", docMap);
|
||||
asMap.put("_doc", docMap);
|
||||
asMap.put("_source", sourceLookup);
|
||||
asMap.put("_fields", fieldsLookup);
|
||||
asMap.put("_index", indexLookup);
|
||||
this.asMap = unmodifiableMap(asMap);
|
||||
}
|
||||
|
||||
|
@ -64,10 +60,6 @@ public class LeafSearchLookup {
|
|||
return this.sourceLookup;
|
||||
}
|
||||
|
||||
public LeafIndexLookup indexLookup() {
|
||||
return this.indexLookup;
|
||||
}
|
||||
|
||||
public LeafFieldsLookup fields() {
|
||||
return this.fieldsLookup;
|
||||
}
|
||||
|
@ -80,6 +72,5 @@ public class LeafSearchLookup {
|
|||
docMap.setDocument(docId);
|
||||
sourceLookup.setSegmentAndDocument(ctx, docId);
|
||||
fieldsLookup.setDocument(docId);
|
||||
indexLookup.setDocument(docId);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,87 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.lucene.index.PostingsEnum;
|
||||
import org.elasticsearch.ElasticsearchException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class PositionIterator implements Iterator<TermPosition> {
|
||||
|
||||
private boolean resetted = false;
|
||||
|
||||
protected IndexFieldTerm indexFieldTerm;
|
||||
|
||||
protected int freq = -1;
|
||||
|
||||
// current position of iterator
|
||||
private int currentPos;
|
||||
|
||||
protected final TermPosition termPosition = new TermPosition();
|
||||
|
||||
private PostingsEnum postings;
|
||||
|
||||
public PositionIterator(IndexFieldTerm indexFieldTerm) {
|
||||
this.indexFieldTerm = indexFieldTerm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove anything from TermPosition iterator.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return currentPos < freq;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TermPosition next() {
|
||||
try {
|
||||
termPosition.position = postings.nextPosition();
|
||||
termPosition.startOffset = postings.startOffset();
|
||||
termPosition.endOffset = postings.endOffset();
|
||||
termPosition.payload = postings.getPayload();
|
||||
} catch (IOException ex) {
|
||||
throw new ElasticsearchException("can not advance iterator", ex);
|
||||
}
|
||||
currentPos++;
|
||||
return termPosition;
|
||||
}
|
||||
|
||||
public void nextDoc() throws IOException {
|
||||
resetted = false;
|
||||
currentPos = 0;
|
||||
freq = indexFieldTerm.tf();
|
||||
postings = indexFieldTerm.postings;
|
||||
}
|
||||
|
||||
public Iterator<TermPosition> reset() {
|
||||
if (resetted) {
|
||||
throw new ElasticsearchException(
|
||||
"Cannot iterate twice! If you want to iterate more that once, add _CACHE explicitly.");
|
||||
}
|
||||
resetted = true;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -42,9 +42,7 @@ public class SearchLookup {
|
|||
return new LeafSearchLookup(context,
|
||||
docMap.getLeafDocLookup(context),
|
||||
sourceLookup,
|
||||
fieldsLookup.getLeafFieldsLookup(context),
|
||||
IndexLookup.getLeafIndexLookup(context),
|
||||
IndexLookup.NAMES);
|
||||
fieldsLookup.getLeafFieldsLookup(context));
|
||||
}
|
||||
|
||||
public DocLookup doc() {
|
||||
|
|
|
@ -1,58 +0,0 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.search.lookup;
|
||||
|
||||
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
public class TermPosition {
|
||||
|
||||
public int position = -1;
|
||||
public int startOffset = -1;
|
||||
public int endOffset = -1;
|
||||
public BytesRef payload;
|
||||
private CharsRefBuilder spare = new CharsRefBuilder();
|
||||
|
||||
public String payloadAsString() {
|
||||
if (payload != null && payload.length != 0) {
|
||||
spare.copyUTF8Bytes(payload);
|
||||
return spare.toString();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public float payloadAsFloat(float defaultMissing) {
|
||||
if (payload != null && payload.length != 0) {
|
||||
return PayloadHelper.decodeFloat(payload.bytes, payload.offset);
|
||||
} else {
|
||||
return defaultMissing;
|
||||
}
|
||||
}
|
||||
|
||||
public int payloadAsInt(int defaultMissing) {
|
||||
if (payload != null && payload.length != 0) {
|
||||
return PayloadHelper.decodeInt(payload.bytes, payload.offset);
|
||||
} else {
|
||||
return defaultMissing;
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -13,7 +13,11 @@ milliseconds since epoch as a `long`. The same is true for
|
|||
`doc.some_date_field[some_number]`. Use `doc.some_date_field.value.millis` to
|
||||
fetch the milliseconds since epoch if you need it.
|
||||
|
||||
==== Removed access to index internal via the _index variable
|
||||
|
||||
The `_index` variable has been removed. If you used it for advanced scoring, consider writing a `Similarity` plugin.
|
||||
|
||||
==== Script Settings
|
||||
|
||||
All of the existing scripting security settings have been deprecated. Instead
|
||||
they are replaced with `script.allowed_types` and `script.allowed_contexts`.
|
||||
they are replaced with `script.allowed_types` and `script.allowed_contexts`.
|
||||
|
|
|
@ -29,10 +29,6 @@ Field values can be accessed from a script using
|
|||
<<modules-scripting-doc-vals,doc-values>>, or
|
||||
<<modules-scripting-stored,stored fields or `_source` field>>, which are explained below.
|
||||
|
||||
Scripts may also have access to the document's relevance
|
||||
<<scripting-score,`_score`>> and, via the experimental `_index` variable,
|
||||
to term statistics for <<modules-advanced-scripting,advanced text scoring>>.
|
||||
|
||||
[[scripting-score]]
|
||||
[float]
|
||||
=== Accessing the score of a document within a script
|
||||
|
|
Loading…
Reference in New Issue