Karl Wright 2017-08-24 13:41:34 -04:00
commit 7f52920352
23 changed files with 133 additions and 25 deletions

View File

@ -36,6 +36,12 @@ Optimizations
* LUCENE-7925: Collapse duplicate SHOULD or MUST clauses by summing up their
boosts. (Adrien Grand)
+ * LUCENE-7939: MinShouldMatchSumScorer now leverages two-phase iteration in
+   order to be faster when used in conjunctions. (Adrien Grand)
+
+ * LUCENE-7827: AnalyzingInfixSuggester doesn't create "textgrams"
+   when minPrefixChars=0 (Mikhail Khludnev)
Bug Fixes
* LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used

View File

@ -128,7 +128,12 @@ final class MinShouldMatchSumScorer extends Scorer {
@Override
public DocIdSetIterator iterator() {
-     return new DocIdSetIterator() {
+     return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+   }
+
+   @Override
+   public TwoPhaseIterator twoPhaseIterator() {
+     DocIdSetIterator approximation = new DocIdSetIterator() {
@Override
public int docID() {
@ -154,6 +159,12 @@ final class MinShouldMatchSumScorer extends Scorer {
}
setDocAndFreq();
+       // It would be correct to return doNextCandidate() at this point but if you
+       // call nextDoc as opposed to advance, it probably means that you really
+       // need the next match. Returning 'doc' here would lead to a similar
+       // iteration over sub postings overall except that the decision making would
+       // happen at a higher level where more abstractions are involved and
+       // benchmarks suggested it causes a significant performance hit.
return doNext();
}
@ -181,7 +192,7 @@ final class MinShouldMatchSumScorer extends Scorer {
}
setDocAndFreq();
-       return doNext();
+       return doNextCandidate();
}
@Override
@ -189,6 +200,30 @@ final class MinShouldMatchSumScorer extends Scorer {
return cost;
}
};
+   return new TwoPhaseIterator(approximation) {
+     @Override
+     public boolean matches() throws IOException {
+       while (freq < minShouldMatch) {
+         assert freq > 0;
+         if (freq + tailSize >= minShouldMatch) {
+           // a match on doc is still possible, try to
+           // advance scorers from the tail
+           advanceTail();
+         } else {
+           return false;
+         }
+       }
+       return true;
+     }
+     @Override
+     public float matchCost() {
+       // maximum number of scorers that matches() might advance
+       return tail.length;
+     }
+   };
}
private void addLead(DisiWrapper lead) {
@ -250,6 +285,18 @@ final class MinShouldMatchSumScorer extends Scorer {
return doc;
}
+   /** Move iterators to the tail until the cumulated size of lead+tail is
+    * greater than or equal to minShouldMatch */
+   private int doNextCandidate() throws IOException {
+     while (freq + tailSize < minShouldMatch) {
+       // no match on doc is possible, move to the next potential match
+       pushBackLeads();
+       setDocAndFreq();
+     }
+     return doc;
+   }
/** Advance all entries from the tail to know about all matches on the
* current doc. */
private void updateFreq() throws IOException {
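Editor's note: to make the approximation/matches() split above concrete, here is a minimal sketch of the consuming side, using only Lucene's public TwoPhaseIterator and DocIdSetIterator APIs. The driver class and its consume method are invented for illustration; any Scorer, including MinShouldMatchSumScorer after this change, can be driven this way.

    import java.io.IOException;

    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.search.TwoPhaseIterator;

    public final class TwoPhaseDriver {
      // Walks all matches of a scorer, preferring the two-phase form:
      // the cheap approximation proposes candidate docs, matches() verifies them.
      public static void consume(Scorer scorer) throws IOException {
        TwoPhaseIterator tpi = scorer.twoPhaseIterator();
        if (tpi == null) {
          // No two-phase support: iterator() only returns verified matches.
          DocIdSetIterator it = scorer.iterator();
          for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
            // handle doc
          }
          return;
        }
        DocIdSetIterator approximation = tpi.approximation();
        for (int doc = approximation.nextDoc();
             doc != DocIdSetIterator.NO_MORE_DOCS;
             doc = approximation.nextDoc()) {
          if (tpi.matches()) { // may advance tail scorers, as in the hunk above
            // handle doc
          }
        }
      }
    }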

View File

@ -107,6 +107,10 @@ import org.apache.lucene.util.RamUsageEstimator;
public class AnalyzingInfixSuggester extends Lookup implements Closeable {
+   /** Edge-grams for searching short prefixes without a PrefixQuery;
+    * controlled by {@linkplain #minPrefixChars} */
+   protected final static String TEXTGRAMS_FIELD_NAME = "textgrams";
/** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text";
@ -353,7 +357,9 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
if (fieldName.equals("textgrams") && minPrefixChars > 0) {
assert !(fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars == 0)
: "no need \"textgrams\" when minPrefixChars="+minPrefixChars;
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
// TODO: should use an EdgeNGramTokenFilterFactory here
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars);
return new TokenStreamComponents(components.getTokenizer(), filter);
@ -410,7 +416,9 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
Document doc = new Document();
FieldType ft = getTextFieldType();
doc.add(new Field(TEXT_FIELD_NAME, textString, ft));
doc.add(new Field("textgrams", textString, ft));
if (minPrefixChars>0) {
doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft));
}
doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO));
doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
doc.add(new NumericDocValuesField("weight", weight));
@ -474,7 +482,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable {
protected Query getLastTokenQuery(String token) throws IOException {
if (token.length() < minPrefixChars) {
// The leading ngram was directly indexed:
return new TermQuery(new Term("textgrams", token));
return new TermQuery(new Term(TEXTGRAMS_FIELD_NAME, token));
}
return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
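Editor's note: seen from the caller's side, a hedged usage sketch (the index path, suggestion text, and the add/refresh flow are illustrative; the five-argument constructor is assumed to be the existing Lucene API). With minPrefixChars=0 the suggester now writes no "textgrams" field at index time, and getLastTokenQuery always takes the PrefixQuery branch.

    import java.nio.file.Paths;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.search.suggest.Lookup;
    import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.BytesRef;

    public class SuggesterSketch {
      public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/tmp/suggest"));
        StandardAnalyzer analyzer = new StandardAnalyzer();
        // minPrefixChars = 0: after LUCENE-7827, no "textgrams" field is indexed,
        // since every prefix lookup can be served by a PrefixQuery on "text".
        AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(
            dir, analyzer, analyzer, /*minPrefixChars=*/0, /*commitOnBuild=*/true);
        suggester.add(new BytesRef("lucene in action"), null, 5, null);
        suggester.refresh(); // make the added suggestion visible to lookups
        for (Lookup.LookupResult r : suggester.lookup("lu", false, 10)) {
          System.out.println(r.key + " " + r.weight);
        }
        suggester.close();
      }
    }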

View File

@ -57,6 +57,8 @@ Upgrade Notes
If a reporter configures the group="cluster" attribute then please also configure the
class="org.apache.solr.metrics.reporters.solr.SolrClusterReporter" attribute.
+ * SOLR-11254: the abstract DocTransformer class now has an abstract score-less transform method variant.
New Features
----------------------
@ -158,6 +160,8 @@ Other Changes
* SOLR-11240: Raise UnInvertedField internal limit. (Toke Eskildsen)
+ * SOLR-11254: Add score-less (abstract) DocTransformer.transform method. (Christine Poerschke)
================== 7.0.0 ==================
Versions of Major Components

View File

@ -255,13 +255,24 @@ public class LTRFeatureLoggerTransformerFactory extends TransformerFactory {
@Override
public void transform(SolrDocument doc, int docid, float score)
throws IOException {
+     implTransform(doc, docid, new Float(score));
+   }
+
+   @Override
+   public void transform(SolrDocument doc, int docid)
+       throws IOException {
+     implTransform(doc, docid, 0.0f);
+   }
+
+   private void implTransform(SolrDocument doc, int docid, Float score)
+       throws IOException {
Object fv = featureLogger.getFeatureVector(docid, scoringQuery, searcher);
if (fv == null) { // FV for this document was not in the cache
fv = featureLogger.makeFeatureVector(
LTRRescorer.extractFeaturesInfo(
modelWeight,
docid,
-             (docsWereNotReranked ? new Float(score) : null),
+             (docsWereNotReranked ? score : null),
leafContexts));
}

View File

@ -265,7 +265,7 @@ public class RealTimeGetComponent extends SearchComponent
throw new SolrException(ErrorCode.INVALID_STATE, "Expected ADD or UPDATE_INPLACE. Got: " + oper);
}
if (transformer!=null) {
-       transformer.transform(doc, -1, 0); // unknown docID
+       transformer.transform(doc, -1); // unknown docID
}
docList.add(doc);
break;
@ -314,7 +314,7 @@ public class RealTimeGetComponent extends SearchComponent
resultContext = new RTGResultContext(rsp.getReturnFields(), searcherInfo.getSearcher(), req);
transformer.setContext(resultContext);
}
-         transformer.transform(doc, docid, 0);
+         transformer.transform(doc, docid);
}
docList.add(doc);
}

View File

@ -613,7 +613,7 @@ public class TermsComponent extends SearchComponent {
continue;
}
-       // Since splitTerms is already sorted, this array will also be sorted
+       // Since splitTerms is already sorted, this array will also be sorted. NOTE: this may not be true, it depends on readableToIndexed.
Term[] terms = new Term[splitTerms.length];
for (int i = 0; i < splitTerms.length; i++) {
terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
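Editor's note: the caveat in the new comment can be shown with a tiny self-contained illustration (plain Java; a numeric encoding stands in for readableToIndexed, as a hypothetical example): a lexicographically sorted readable array need not stay sorted after the mapping.

    import java.util.Arrays;

    public class SortOrderCaveat {
      public static void main(String[] args) {
        String[] readable = {"10", "9"}; // lexicographically sorted: "10" < "9"
        int[] indexed = {10, 9};         // the same terms as a numeric encoding orders them
        Arrays.sort(indexed);
        System.out.println(Arrays.toString(readable)); // [10, 9]
        System.out.println(Arrays.toString(indexed));  // [9, 10] -- the mapped order differs
      }
    }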

View File

@ -167,7 +167,11 @@ public class DocsStreamer implements Iterator<SolrDocument> {
if (transformer != null) {
boolean doScore = rctx.wantsScores();
try {
-         transformer.transform(sdoc, id, doScore ? docIterator.score() : 0);
+         if (doScore) {
+           transformer.transform(sdoc, id, docIterator.score());
+         } else {
+           transformer.transform(sdoc, id);
+         }
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error applying transformer", e);
}

View File

@ -45,7 +45,7 @@ public abstract class BaseEditorialTransformer extends DocTransformer {
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
//this only gets added if QueryElevationParams.MARK_EXCLUDED is true
Set<String> ids = getIdSet();
if (ids != null && ids.isEmpty() == false) {

View File

@ -123,7 +123,7 @@ class ChildDocTransformer extends DocTransformer {
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
FieldType idFt = idField.getType();
Object parentIdField = doc.getFirstValue(idField.getName());

View File

@ -51,7 +51,7 @@ class DocIdAugmenter extends DocTransformer
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
assert -1 <= docid;
doc.setField( name, docid );
}

View File

@ -73,13 +73,29 @@ public abstract class DocTransformer {
* If implementations require a valid docId and index access, the {@link #needsSolrIndexSearcher}
* method must return true
*
+  * Default implementation calls {@link #transform(SolrDocument, int)}.
+  *
* @param doc The document to alter
* @param docid The Lucene internal doc id, or -1 in cases where the <code>doc</code> did not come from the index
* @param score the score for this document
* @throws IOException If there is a low-level I/O error.
* @see #needsSolrIndexSearcher
*/
- public abstract void transform(SolrDocument doc, int docid, float score) throws IOException;
+ public void transform(SolrDocument doc, int docid, float score) throws IOException {
+   transform(doc, docid);
+ }
+
+ /**
+  * This is where implementations do the actual work.
+  * If implementations require a valid docId and index access, the {@link #needsSolrIndexSearcher}
+  * method must return true
+  *
+  * @param doc The document to alter
+  * @param docid The Lucene internal doc id, or -1 in cases where the <code>doc</code> did not come from the index
+  * @throws IOException If there is a low-level I/O error.
+  * @see #needsSolrIndexSearcher
+  */
+ public abstract void transform(SolrDocument doc, int docid) throws IOException;
/**
* When a transformer needs access to fields that are not automatically derived from the
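Editor's note: for implementors affected by this change, a hypothetical transformer (class name and field invented for illustration): one that needs no score implements only the score-less method, while the scored transform(SolrDocument, int, float) it inherits now delegates there by default.

    import java.util.Locale;

    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.response.transform.DocTransformer;

    // Upper-cases one stored field; no score needed, so only the new
    // score-less transform(SolrDocument, int) is implemented.
    public class UpperCaseTransformer extends DocTransformer {
      private final String field;

      public UpperCaseTransformer(String field) {
        this.field = field;
      }

      @Override
      public String getName() {
        return "upper";
      }

      @Override
      public void transform(SolrDocument doc, int docid) {
        Object v = doc.getFirstValue(field);
        if (v != null) {
          doc.setField(field, v.toString().toUpperCase(Locale.ROOT));
        }
      }
    }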

View File

@ -77,6 +77,13 @@ public class DocTransformers extends DocTransformer
}
}
+ @Override
+ public void transform(SolrDocument doc, int docid) throws IOException {
+   for( DocTransformer a : children ) {
+     a.transform( doc, docid);
+   }
+ }
/** Returns true if and only if at least 1 child transformer returns true */
@Override
public boolean needsSolrIndexSearcher() {

View File

@ -107,7 +107,7 @@ public class ExplainAugmenterFactory extends TransformerFactory
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
if( context != null && context.getQuery() != null ) {
try {
Explanation exp = context.getSearcher().explain(context.getQuery(), docid);

View File

@ -131,7 +131,7 @@ public class GeoTransformerFactory extends TransformerFactory
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) throws IOException {
+   public void transform(SolrDocument doc, int docid) throws IOException {
int leafOrd = ReaderUtil.subIndex(docid, context.getSearcher().getTopReaderContext().leaves());
LeafReaderContext ctx = context.getSearcher().getTopReaderContext().leaves().get(leafOrd);
ShapeValues values = shapes.getValues(ctx);
@ -148,7 +148,7 @@ public class GeoTransformerFactory extends TransformerFactory
return new DocTransformer() {
@Override
-       public void transform(SolrDocument doc, int docid, float score) throws IOException {
+       public void transform(SolrDocument doc, int docid) throws IOException {
Object val = doc.remove(updater.field);
if(val!=null) {
updater.setValue(doc, val);

View File

@ -107,7 +107,7 @@ public class RawValueTransformerFactory extends TransformerFactory
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
Object val = doc.remove(field);
if(val==null) {
return;

View File

@ -44,7 +44,7 @@ public class RenameFieldTransformer extends DocTransformer
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
Object v = (copy)?doc.get(from) : doc.remove( from );
if( v != null ) {
doc.setField(to, v);

View File

@ -44,4 +44,9 @@ public class ScoreAugmenter extends DocTransformer {
doc.setField( name, score );
}
}
+  @Override
+  public void transform(SolrDocument doc, int docid) {
+    transform(doc, docid, 0.0f);
+  }
}

View File

@ -321,7 +321,7 @@ class SubQueryAugmenter extends DocTransformer {
public boolean needsSolrIndexSearcher() { return false; }
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
final SolrParams docWithDeprefixed = SolrParams.wrapDefaults(
new DocRowParams(doc, prefix, separator), baseSubParams);

View File

@ -96,7 +96,7 @@ class ValueAugmenter extends DocTransformer
}
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
doc.setField( name, value );
}
}

View File

@ -77,7 +77,7 @@ public class ValueSourceAugmenter extends DocTransformer
List<LeafReaderContext> readerContexts;
@Override
-   public void transform(SolrDocument doc, int docid, float score) {
+   public void transform(SolrDocument doc, int docid) {
// This is only good for random-access functions
try {

View File

@ -105,7 +105,7 @@ public class TestCustomDocTransformer extends SolrTestCaseJ4 {
* This transformer simply concatenates the values of multiple fields
*/
@Override
-   public void transform(SolrDocument doc, int docid, float score) throws IOException {
+   public void transform(SolrDocument doc, int docid) throws IOException {
str.setLength(0);
for(String s : extra) {
String v = getAsString(s, doc);

View File

@ -59,14 +59,14 @@ If you do not specify the type of replica when it is created, it will be NRT typ
There are three combinations of replica types that are recommended:
* All NRT replicas
- * All PULL replicas
+ * All TLOG replicas
* TLOG replicas with PULL replicas
==== All NRT Replicas
Use this for small to medium clusters, or even big clusters where the update (index) throughput is not too high. NRT is the only type of replica that supports soft-commits, so also use this combination when NearRealTime is needed.
- ==== All PULL Replicas
+ ==== All TLOG Replicas
Use this combination if NearRealTime is not needed and the number of replicas per shard is high, but you still want all replicas to be able to handle update requests.
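As a concrete starting point, a hedged SolrJ sketch of the recommended TLOG-plus-PULL combination (the createCollection overload with per-replica-type counts arrived alongside replica types in 7.0, to the best of our knowledge; the ZooKeeper host, collection, and configset names are placeholders):

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.impl.CloudSolrClient;
    import org.apache.solr.client.solrj.request.CollectionAdminRequest;

    public class CreateTlogPullCollection {
      public static void main(String[] args) throws Exception {
        try (SolrClient client =
                 new CloudSolrClient.Builder().withZkHost("localhost:9983").build()) {
          // 2 shards, each with 2 TLOG replicas and 2 PULL replicas, no NRT replicas.
          CollectionAdminRequest
              .createCollection("mycoll", "_default", /*numShards=*/2,
                  /*numNrtReplicas=*/0, /*numTlogReplicas=*/2, /*numPullReplicas=*/2)
              .process(client);
        }
      }
    }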