mirror of https://github.com/apache/lucene.git
LUCENE-5339: cutover DrillSideways
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1542713 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
95a9cd2854
commit
46e67d8f04
1
TODO
1
TODO
|
@ -2,6 +2,7 @@ nocommit this!
|
|||
|
||||
TODO
|
||||
- associations
|
||||
- simplify ddq api
|
||||
- SSDVValueSourceFacets?
|
||||
- we could put more stuff into the "schema", e.g. this field is
|
||||
sorted-set-DV and that one is taxo?
|
||||
|
|
|
@ -180,7 +180,7 @@ public class DrillSideways {
|
|||
// Just do ordinary search when there are no drill-downs:
|
||||
FacetsCollector c = FacetsCollector.create(getDrillDownAccumulator(fsp));
|
||||
searcher.search(query, MultiCollector.wrap(hitCollector, c));
|
||||
return new DrillSidewaysResult(c.getFacetResults(), null);
|
||||
return new DrillSidewaysResult(c.getFacetResults(), null);
|
||||
}
|
||||
|
||||
List<FacetRequest> ddRequests = new ArrayList<FacetRequest>();
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/** Maps specified dims to provided Facets impls; else, uses
|
||||
* the default Facets impl. */
|
||||
public class MultiFacets extends Facets {
|
||||
private final Map<String,Facets> dimToFacets;
|
||||
private final Facets defaultFacets;
|
||||
|
||||
public MultiFacets(Map<String,Facets> dimToFacets, Facets defaultFacets) {
|
||||
this.dimToFacets = dimToFacets;
|
||||
this.defaultFacets = defaultFacets;
|
||||
}
|
||||
|
||||
public SimpleFacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
|
||||
Facets facets = dimToFacets.get(dim);
|
||||
if (facets == null) {
|
||||
facets = defaultFacets;
|
||||
}
|
||||
return facets.getTopChildren(topN, dim, path);
|
||||
}
|
||||
|
||||
public Number getSpecificValue(String dim, String... path) throws IOException {
|
||||
Facets facets = dimToFacets.get(dim);
|
||||
if (facets == null) {
|
||||
facets = defaultFacets;
|
||||
}
|
||||
return facets.getSpecificValue(dim, path);
|
||||
}
|
||||
|
||||
public List<SimpleFacetResult> getAllDims(int topN) throws IOException {
|
||||
// nocommit can/should we impl this? ie, sparse
|
||||
// faceting after drill sideways
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
|
@ -118,17 +118,17 @@ public final class SimpleDrillDownQuery extends Query {
|
|||
* Adds one dimension of drill downs; if you pass multiple values they are
|
||||
* OR'd, and then the entire dimension is AND'd against the base query.
|
||||
*/
|
||||
// nocommit can we remove CatPath here?
|
||||
// nocommit can we remove FacetLabel here?
|
||||
public void add(FacetLabel... paths) {
|
||||
add(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, Constants.DEFAULT_DELIM_CHAR, paths);
|
||||
}
|
||||
|
||||
// nocommit can we remove CatPath here?
|
||||
// nocommit can we remove FacetLabel here?
|
||||
public void add(String field, FacetLabel... paths) {
|
||||
add(field, Constants.DEFAULT_DELIM_CHAR, paths);
|
||||
}
|
||||
|
||||
// nocommit can we remove CatPath here?
|
||||
// nocommit can we remove FacetLabel here?
|
||||
public void add(String field, char delimChar, FacetLabel... paths) {
|
||||
Query q;
|
||||
if (paths[0].length == 0) {
|
||||
|
|
|
@ -0,0 +1,429 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.facet.index.FacetFields;
|
||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||
import org.apache.lucene.facet.search.DrillDownQuery;
|
||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetFields;
|
||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.FieldDoc;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.MultiCollector;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Sort;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.TopFieldCollector;
|
||||
import org.apache.lucene.search.TopScoreDocCollector;
|
||||
import org.apache.lucene.search.Weight;
|
||||
|
||||
/**
|
||||
* Computes drill down and sideways counts for the provided
|
||||
* {@link DrillDownQuery}. Drill sideways counts include
|
||||
* alternative values/aggregates for the drill-down
|
||||
* dimensions so that a dimension does not disappear after
|
||||
* the user drills down into it.
|
||||
*
|
||||
* <p> Use one of the static search
|
||||
* methods to do the search, and then get the hits and facet
|
||||
* results from the returned {@link DrillSidewaysResult}.
|
||||
*
|
||||
* <p><b>NOTE</b>: this allocates one {@link
|
||||
* FacetsCollector} for each drill-down, plus one. If your
|
||||
* index has a high number of facet labels then this will
|
||||
* multiply your memory usage.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
public class SimpleDrillSideways {
|
||||
|
||||
protected final IndexSearcher searcher;
|
||||
protected final TaxonomyReader taxoReader;
|
||||
protected final SortedSetDocValuesReaderState state;
|
||||
protected final FacetsConfig facetsConfig;
|
||||
|
||||
/**
|
||||
* Create a new {@code DrillSideways} instance, assuming the categories were
|
||||
* indexed with {@link FacetFields}.
|
||||
*/
|
||||
public SimpleDrillSideways(IndexSearcher searcher, FacetsConfig facetsConfig, TaxonomyReader taxoReader) {
|
||||
this(searcher, facetsConfig, taxoReader, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new {@code DrillSideways} instance, assuming the categories were
|
||||
* indexed with {@link SortedSetDocValuesFacetFields}.
|
||||
*/
|
||||
public SimpleDrillSideways(IndexSearcher searcher, FacetsConfig facetsConfig, SortedSetDocValuesReaderState state) {
|
||||
this(searcher, facetsConfig, null, state);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new {@code DrillSideways} instance, where some
|
||||
* dimensions are sorted set facets and others are
|
||||
* taxonomy facets.
|
||||
*/
|
||||
public SimpleDrillSideways(IndexSearcher searcher, FacetsConfig facetsConfig, TaxonomyReader taxoReader, SortedSetDocValuesReaderState state) {
|
||||
this.searcher = searcher;
|
||||
this.facetsConfig = facetsConfig;
|
||||
this.taxoReader = taxoReader;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
/** Subclass can override to customize per-dim Facets
|
||||
* impl. */
|
||||
protected Facets buildFacetsResult(SimpleFacetsCollector drillDowns, SimpleFacetsCollector[] drillSideways, String[] drillSidewaysDims) throws IOException {
|
||||
|
||||
Facets drillDownFacets = new TaxonomyFacetCounts(taxoReader, facetsConfig, drillDowns);
|
||||
|
||||
if (drillSideways == null) {
|
||||
return drillDownFacets;
|
||||
} else {
|
||||
Map<String,Facets> drillSidewaysFacets = new HashMap<String,Facets>();
|
||||
for(int i=0;i<drillSideways.length;i++) {
|
||||
drillSidewaysFacets.put(drillSidewaysDims[i],
|
||||
new TaxonomyFacetCounts(taxoReader, facetsConfig, drillSideways[i]));
|
||||
}
|
||||
return new MultiFacets(drillSidewaysFacets, drillDownFacets);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search, collecting hits with a {@link Collector}, and
|
||||
* computing drill down and sideways counts.
|
||||
*/
|
||||
@SuppressWarnings({"rawtypes","unchecked"})
|
||||
public SimpleDrillSidewaysResult search(SimpleDrillDownQuery query, Collector hitCollector) throws IOException {
|
||||
|
||||
Map<String,Integer> drillDownDims = query.getDims();
|
||||
|
||||
SimpleFacetsCollector drillDownCollector = new SimpleFacetsCollector();
|
||||
|
||||
if (drillDownDims.isEmpty()) {
|
||||
// There are no drill-down dims, so there is no
|
||||
// drill-sideways to compute:
|
||||
searcher.search(query, MultiCollector.wrap(hitCollector, drillDownCollector));
|
||||
return new SimpleDrillSidewaysResult(buildFacetsResult(drillDownCollector, null, null), null);
|
||||
}
|
||||
|
||||
BooleanQuery ddq = query.getBooleanQuery();
|
||||
BooleanClause[] clauses = ddq.getClauses();
|
||||
|
||||
Query baseQuery;
|
||||
int startClause;
|
||||
if (clauses.length == drillDownDims.size()) {
|
||||
// TODO: we could optimize this pure-browse case by
|
||||
// making a custom scorer instead:
|
||||
baseQuery = new MatchAllDocsQuery();
|
||||
startClause = 0;
|
||||
} else {
|
||||
assert clauses.length == 1+drillDownDims.size();
|
||||
baseQuery = clauses[0].getQuery();
|
||||
startClause = 1;
|
||||
}
|
||||
|
||||
SimpleFacetsCollector[] drillSidewaysCollectors = new SimpleFacetsCollector[drillDownDims.size()];
|
||||
|
||||
int idx = 0;
|
||||
for(String dim : drillDownDims.keySet()) {
|
||||
drillSidewaysCollectors[idx++] = new SimpleFacetsCollector();
|
||||
}
|
||||
|
||||
boolean useCollectorMethod = scoreSubDocsAtOnce();
|
||||
|
||||
Term[][] drillDownTerms = null;
|
||||
|
||||
if (!useCollectorMethod) {
|
||||
// Optimistic: assume subQueries of the DDQ are either
|
||||
// TermQuery or BQ OR of TermQuery; if this is wrong
|
||||
// then we detect it and fallback to the mome general
|
||||
// but slower DrillSidewaysCollector:
|
||||
drillDownTerms = new Term[clauses.length-startClause][];
|
||||
for(int i=startClause;i<clauses.length;i++) {
|
||||
Query q = clauses[i].getQuery();
|
||||
|
||||
// DrillDownQuery always wraps each subQuery in
|
||||
// ConstantScoreQuery:
|
||||
assert q instanceof ConstantScoreQuery;
|
||||
|
||||
q = ((ConstantScoreQuery) q).getQuery();
|
||||
|
||||
if (q instanceof TermQuery) {
|
||||
drillDownTerms[i-startClause] = new Term[] {((TermQuery) q).getTerm()};
|
||||
} else if (q instanceof BooleanQuery) {
|
||||
BooleanQuery q2 = (BooleanQuery) q;
|
||||
BooleanClause[] clauses2 = q2.getClauses();
|
||||
drillDownTerms[i-startClause] = new Term[clauses2.length];
|
||||
for(int j=0;j<clauses2.length;j++) {
|
||||
if (clauses2[j].getQuery() instanceof TermQuery) {
|
||||
drillDownTerms[i-startClause][j] = ((TermQuery) clauses2[j].getQuery()).getTerm();
|
||||
} else {
|
||||
useCollectorMethod = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
useCollectorMethod = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (useCollectorMethod) {
|
||||
// TODO: maybe we could push the "collector method"
|
||||
// down into the optimized scorer to have a tighter
|
||||
// integration ... and so TermQuery clauses could
|
||||
// continue to run "optimized"
|
||||
collectorMethod(query, baseQuery, startClause, hitCollector, drillDownCollector, drillSidewaysCollectors);
|
||||
} else {
|
||||
SimpleDrillSidewaysQuery dsq = new SimpleDrillSidewaysQuery(baseQuery, drillDownCollector, drillSidewaysCollectors, drillDownTerms);
|
||||
searcher.search(dsq, hitCollector);
|
||||
}
|
||||
|
||||
return new SimpleDrillSidewaysResult(buildFacetsResult(drillDownCollector, drillSidewaysCollectors, drillDownDims.keySet().toArray(new String[drillDownDims.size()])), null);
|
||||
}
|
||||
|
||||
/** Uses the more general but slower method of sideways
|
||||
* counting. This method allows an arbitrary subQuery to
|
||||
* implement the drill down for a given dimension. */
|
||||
private void collectorMethod(SimpleDrillDownQuery ddq, Query baseQuery, int startClause, Collector hitCollector, Collector drillDownCollector, Collector[] drillSidewaysCollectors) throws IOException {
|
||||
|
||||
BooleanClause[] clauses = ddq.getBooleanQuery().getClauses();
|
||||
|
||||
Map<String,Integer> drillDownDims = ddq.getDims();
|
||||
|
||||
BooleanQuery topQuery = new BooleanQuery(true);
|
||||
final SimpleDrillSidewaysCollector collector = new SimpleDrillSidewaysCollector(hitCollector, drillDownCollector, drillSidewaysCollectors,
|
||||
drillDownDims);
|
||||
|
||||
// TODO: if query is already a BQ we could copy that and
|
||||
// add clauses to it, instead of doing BQ inside BQ
|
||||
// (should be more efficient)? Problem is this can
|
||||
// affect scoring (coord) ... too bad we can't disable
|
||||
// coord on a clause by clause basis:
|
||||
topQuery.add(baseQuery, BooleanClause.Occur.MUST);
|
||||
|
||||
// NOTE: in theory we could just make a single BQ, with
|
||||
// +query a b c minShouldMatch=2, but in this case,
|
||||
// annoyingly, BS2 wraps a sub-scorer that always
|
||||
// returns 2 as the .freq(), not how many of the
|
||||
// SHOULD clauses matched:
|
||||
BooleanQuery subQuery = new BooleanQuery(true);
|
||||
|
||||
Query wrappedSubQuery = new QueryWrapper(subQuery,
|
||||
new SetWeight() {
|
||||
@Override
|
||||
public void set(Weight w) {
|
||||
collector.setWeight(w, -1);
|
||||
}
|
||||
});
|
||||
Query constantScoreSubQuery = new ConstantScoreQuery(wrappedSubQuery);
|
||||
|
||||
// Don't impact score of original query:
|
||||
constantScoreSubQuery.setBoost(0.0f);
|
||||
|
||||
topQuery.add(constantScoreSubQuery, BooleanClause.Occur.MUST);
|
||||
|
||||
// Unfortunately this sub-BooleanQuery
|
||||
// will never get BS1 because today BS1 only works
|
||||
// if topScorer=true... and actually we cannot use BS1
|
||||
// anyways because we need subDocsScoredAtOnce:
|
||||
int dimIndex = 0;
|
||||
for(int i=startClause;i<clauses.length;i++) {
|
||||
Query q = clauses[i].getQuery();
|
||||
// DrillDownQuery always wraps each subQuery in
|
||||
// ConstantScoreQuery:
|
||||
assert q instanceof ConstantScoreQuery;
|
||||
q = ((ConstantScoreQuery) q).getQuery();
|
||||
|
||||
final int finalDimIndex = dimIndex;
|
||||
subQuery.add(new QueryWrapper(q,
|
||||
new SetWeight() {
|
||||
@Override
|
||||
public void set(Weight w) {
|
||||
collector.setWeight(w, finalDimIndex);
|
||||
}
|
||||
}),
|
||||
BooleanClause.Occur.SHOULD);
|
||||
dimIndex++;
|
||||
}
|
||||
|
||||
// TODO: we could better optimize the "just one drill
|
||||
// down" case w/ a separate [specialized]
|
||||
// collector...
|
||||
int minShouldMatch = drillDownDims.size()-1;
|
||||
if (minShouldMatch == 0) {
|
||||
// Must add another "fake" clause so BQ doesn't erase
|
||||
// itself by rewriting to the single clause:
|
||||
Query end = new MatchAllDocsQuery();
|
||||
end.setBoost(0.0f);
|
||||
subQuery.add(end, BooleanClause.Occur.SHOULD);
|
||||
minShouldMatch++;
|
||||
}
|
||||
|
||||
subQuery.setMinimumNumberShouldMatch(minShouldMatch);
|
||||
|
||||
// System.out.println("EXE " + topQuery);
|
||||
|
||||
// Collects against the passed-in
|
||||
// drillDown/SidewaysCollectors as a side effect:
|
||||
searcher.search(topQuery, collector);
|
||||
}
|
||||
|
||||
/**
|
||||
* Search, sorting by {@link Sort}, and computing
|
||||
* drill down and sideways counts.
|
||||
*/
|
||||
public SimpleDrillSidewaysResult search(SimpleDrillDownQuery query,
|
||||
Filter filter, FieldDoc after, int topN, Sort sort, boolean doDocScores,
|
||||
boolean doMaxScore) throws IOException {
|
||||
if (filter != null) {
|
||||
query = new SimpleDrillDownQuery(filter, query);
|
||||
}
|
||||
if (sort != null) {
|
||||
int limit = searcher.getIndexReader().maxDoc();
|
||||
if (limit == 0) {
|
||||
limit = 1; // the collector does not alow numHits = 0
|
||||
}
|
||||
topN = Math.min(topN, limit);
|
||||
final TopFieldCollector hitCollector = TopFieldCollector.create(sort,
|
||||
topN,
|
||||
after,
|
||||
true,
|
||||
doDocScores,
|
||||
doMaxScore,
|
||||
true);
|
||||
SimpleDrillSidewaysResult r = search(query, hitCollector);
|
||||
return new SimpleDrillSidewaysResult(r.facets, hitCollector.topDocs());
|
||||
} else {
|
||||
return search(after, query, topN);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Search, sorting by score, and computing
|
||||
* drill down and sideways counts.
|
||||
*/
|
||||
public SimpleDrillSidewaysResult search(ScoreDoc after,
|
||||
SimpleDrillDownQuery query, int topN) throws IOException {
|
||||
int limit = searcher.getIndexReader().maxDoc();
|
||||
if (limit == 0) {
|
||||
limit = 1; // the collector does not alow numHits = 0
|
||||
}
|
||||
topN = Math.min(topN, limit);
|
||||
TopScoreDocCollector hitCollector = TopScoreDocCollector.create(topN, after, true);
|
||||
SimpleDrillSidewaysResult r = search(query, hitCollector);
|
||||
return new SimpleDrillSidewaysResult(r.facets, hitCollector.topDocs());
|
||||
}
|
||||
|
||||
/** Override this and return true if your collector
|
||||
* (e.g., ToParentBlockJoinCollector) expects all
|
||||
* sub-scorers to be positioned on the document being
|
||||
* collected. This will cause some performance loss;
|
||||
* default is false. Note that if you return true from
|
||||
* this method (in a subclass) be sure your collector
|
||||
* also returns false from {@link
|
||||
* Collector#acceptsDocsOutOfOrder}: this will trick
|
||||
* BooleanQuery into also scoring all subDocs at once. */
|
||||
protected boolean scoreSubDocsAtOnce() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public static class SimpleDrillSidewaysResult {
|
||||
/** Combined drill down & sideways results. */
|
||||
public final Facets facets;
|
||||
|
||||
/** Hits. */
|
||||
public final TopDocs hits;
|
||||
|
||||
public SimpleDrillSidewaysResult(Facets facets, TopDocs hits) {
|
||||
this.facets = facets;
|
||||
this.hits = hits;
|
||||
}
|
||||
}
|
||||
private interface SetWeight {
|
||||
public void set(Weight w);
|
||||
}
|
||||
|
||||
/** Just records which Weight was given out for the
|
||||
* (possibly rewritten) Query. */
|
||||
private static class QueryWrapper extends Query {
|
||||
private final Query originalQuery;
|
||||
private final SetWeight setter;
|
||||
|
||||
public QueryWrapper(Query originalQuery, SetWeight setter) {
|
||||
this.originalQuery = originalQuery;
|
||||
this.setter = setter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(final IndexSearcher searcher) throws IOException {
|
||||
Weight w = originalQuery.createWeight(searcher);
|
||||
setter.set(w);
|
||||
return w;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
Query rewritten = originalQuery.rewrite(reader);
|
||||
if (rewritten != originalQuery) {
|
||||
return new QueryWrapper(rewritten, setter);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String s) {
|
||||
return originalQuery.toString(s);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof QueryWrapper)) return false;
|
||||
final QueryWrapper other = (QueryWrapper) o;
|
||||
return super.equals(o) && originalQuery.equals(other.originalQuery);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return super.hashCode() * 31 + originalQuery.hashCode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,188 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.Scorer.ChildScorer;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Weight;
|
||||
|
||||
/** Collector that scrutinizes each hit to determine if it
|
||||
* passed all constraints (a true hit) or if it missed
|
||||
* exactly one dimension (a near-miss, to count for
|
||||
* drill-sideways counts on that dimension). */
|
||||
class SimpleDrillSidewaysCollector extends Collector {
|
||||
|
||||
private final Collector hitCollector;
|
||||
private final Collector drillDownCollector;
|
||||
private final Collector[] drillSidewaysCollectors;
|
||||
private final Scorer[] subScorers;
|
||||
private final int exactCount;
|
||||
|
||||
// Maps Weight to either -1 (mainQuery) or to integer
|
||||
// index of the dims drillDown.  We need this when
|
||||
// visiting the child scorers to correlate back to the
|
||||
// right scorers:
|
||||
private final Map<Weight,Integer> weightToIndex = new IdentityHashMap<Weight,Integer>();
|
||||
|
||||
private Scorer mainScorer;
|
||||
|
||||
public SimpleDrillSidewaysCollector(Collector hitCollector, Collector drillDownCollector, Collector[] drillSidewaysCollectors,
|
||||
Map<String,Integer> dims) {
|
||||
this.hitCollector = hitCollector;
|
||||
this.drillDownCollector = drillDownCollector;
|
||||
this.drillSidewaysCollectors = drillSidewaysCollectors;
|
||||
subScorers = new Scorer[dims.size()];
|
||||
|
||||
if (dims.size() == 1) {
|
||||
// When we have only one dim, we insert the
|
||||
// MatchAllDocsQuery, bringing the clause count to
|
||||
// 2:
|
||||
exactCount = 2;
|
||||
} else {
|
||||
exactCount = dims.size();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void collect(int doc) throws IOException {
|
||||
//System.out.println("collect doc=" + doc + " main.freq=" + mainScorer.freq() + " main.doc=" + mainScorer.docID() + " exactCount=" + exactCount);
|
||||
|
||||
if (mainScorer == null) {
|
||||
// This segment did not have any docs with any
|
||||
// drill-down field & value:
|
||||
return;
|
||||
}
|
||||
|
||||
if (mainScorer.freq() == exactCount) {
|
||||
// All sub-clauses from the drill-down filters
|
||||
// matched, so this is a "real" hit, so we first
|
||||
// collect in both the hitCollector and the
|
||||
// drillDown collector:
|
||||
//System.out.println(" hit " + drillDownCollector);
|
||||
hitCollector.collect(doc);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownCollector.collect(doc);
|
||||
}
|
||||
|
||||
// Also collect across all drill-sideways counts so
|
||||
// we "merge in" drill-down counts for this
|
||||
// dimension.
|
||||
for(int i=0;i<subScorers.length;i++) {
|
||||
// This cannot be null, because it was a hit,
|
||||
// meaning all drill-down dims matched, so all
|
||||
// dims must have non-null scorers:
|
||||
assert subScorers[i] != null;
|
||||
int subDoc = subScorers[i].docID();
|
||||
assert subDoc == doc;
|
||||
drillSidewaysCollectors[i].collect(doc);
|
||||
}
|
||||
|
||||
} else {
|
||||
boolean found = false;
|
||||
for(int i=0;i<subScorers.length;i++) {
|
||||
if (subScorers[i] == null) {
|
||||
// This segment did not have any docs with this
|
||||
// drill-down field & value:
|
||||
drillSidewaysCollectors[i].collect(doc);
|
||||
assert allMatchesFrom(i+1, doc);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
int subDoc = subScorers[i].docID();
|
||||
//System.out.println(" i=" + i + " sub: " + subDoc);
|
||||
if (subDoc != doc) {
|
||||
//System.out.println(" +ds[" + i + "]");
|
||||
assert subDoc > doc: "subDoc=" + subDoc + " doc=" + doc;
|
||||
drillSidewaysCollectors[i].collect(doc);
|
||||
assert allMatchesFrom(i+1, doc);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert found;
|
||||
}
|
||||
}
|
||||
|
||||
// Only used by assert:
|
||||
private boolean allMatchesFrom(int startFrom, int doc) {
|
||||
for(int i=startFrom;i<subScorers.length;i++) {
|
||||
assert subScorers[i].docID() == doc;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean acceptsDocsOutOfOrder() {
|
||||
// We actually could accept docs out of order, but, we
|
||||
// need to force BooleanScorer2 so that the
|
||||
// sub-scorers are "on" each docID we are collecting:
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setNextReader(AtomicReaderContext leaf) throws IOException {
|
||||
//System.out.println("DS.setNextReader reader=" + leaf.reader());
|
||||
hitCollector.setNextReader(leaf);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownCollector.setNextReader(leaf);
|
||||
}
|
||||
for(Collector dsc : drillSidewaysCollectors) {
|
||||
dsc.setNextReader(leaf);
|
||||
}
|
||||
}
|
||||
|
||||
void setWeight(Weight weight, int index) {
|
||||
assert !weightToIndex.containsKey(weight);
|
||||
weightToIndex.put(weight, index);
|
||||
}
|
||||
|
||||
private void findScorers(Scorer scorer) {
|
||||
Integer index = weightToIndex.get(scorer.getWeight());
|
||||
if (index != null) {
|
||||
if (index.intValue() == -1) {
|
||||
mainScorer = scorer;
|
||||
} else {
|
||||
subScorers[index] = scorer;
|
||||
}
|
||||
}
|
||||
for(ChildScorer child : scorer.getChildren()) {
|
||||
findScorers(child.child);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setScorer(Scorer scorer) throws IOException {
|
||||
mainScorer = null;
|
||||
Arrays.fill(subScorers, null);
|
||||
findScorers(scorer);
|
||||
hitCollector.setScorer(scorer);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownCollector.setScorer(scorer);
|
||||
}
|
||||
for(Collector dsc : drillSidewaysCollectors) {
|
||||
dsc.setScorer(scorer);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,198 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
/** Only purpose is to punch through and return a
|
||||
* SimpleDrillSidewaysScorer */
|
||||
|
||||
class SimpleDrillSidewaysQuery extends Query {
|
||||
final Query baseQuery;
|
||||
final Collector drillDownCollector;
|
||||
final Collector[] drillSidewaysCollectors;
|
||||
final Term[][] drillDownTerms;
|
||||
|
||||
SimpleDrillSidewaysQuery(Query baseQuery, Collector drillDownCollector, Collector[] drillSidewaysCollectors, Term[][] drillDownTerms) {
|
||||
this.baseQuery = baseQuery;
|
||||
this.drillDownCollector = drillDownCollector;
|
||||
this.drillSidewaysCollectors = drillSidewaysCollectors;
|
||||
this.drillDownTerms = drillDownTerms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return "DrillSidewaysQuery";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
Query newQuery = baseQuery;
|
||||
while(true) {
|
||||
Query rewrittenQuery = newQuery.rewrite(reader);
|
||||
if (rewrittenQuery == newQuery) {
|
||||
break;
|
||||
}
|
||||
newQuery = rewrittenQuery;
|
||||
}
|
||||
if (newQuery == baseQuery) {
|
||||
return this;
|
||||
} else {
|
||||
return new SimpleDrillSidewaysQuery(newQuery, drillDownCollector, drillSidewaysCollectors, drillDownTerms);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
final Weight baseWeight = baseQuery.createWeight(searcher);
|
||||
|
||||
return new Weight() {
|
||||
@Override
|
||||
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
|
||||
return baseWeight.explain(context, doc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query getQuery() {
|
||||
return baseQuery;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getValueForNormalization() throws IOException {
|
||||
return baseWeight.getValueForNormalization();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void normalize(float norm, float topLevelBoost) {
|
||||
baseWeight.normalize(norm, topLevelBoost);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean scoresDocsOutOfOrder() {
|
||||
// TODO: would be nice if AssertingIndexSearcher
|
||||
// confirmed this for us
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder,
|
||||
boolean topScorer, Bits acceptDocs) throws IOException {
|
||||
|
||||
SimpleDrillSidewaysScorer.DocsEnumsAndFreq[] dims = new SimpleDrillSidewaysScorer.DocsEnumsAndFreq[drillDownTerms.length];
|
||||
TermsEnum termsEnum = null;
|
||||
String lastField = null;
|
||||
int nullCount = 0;
|
||||
for(int dim=0;dim<dims.length;dim++) {
|
||||
dims[dim] = new SimpleDrillSidewaysScorer.DocsEnumsAndFreq();
|
||||
dims[dim].sidewaysCollector = drillSidewaysCollectors[dim];
|
||||
String field = drillDownTerms[dim][0].field();
|
||||
dims[dim].dim = drillDownTerms[dim][0].text();
|
||||
if (lastField == null || !lastField.equals(field)) {
|
||||
AtomicReader reader = context.reader();
|
||||
Terms terms = reader.terms(field);
|
||||
if (terms != null) {
|
||||
termsEnum = terms.iterator(null);
|
||||
} else {
|
||||
termsEnum = null;
|
||||
}
|
||||
lastField = field;
|
||||
}
|
||||
dims[dim].docsEnums = new DocsEnum[drillDownTerms[dim].length];
|
||||
if (termsEnum == null) {
|
||||
nullCount++;
|
||||
continue;
|
||||
}
|
||||
for(int i=0;i<drillDownTerms[dim].length;i++) {
|
||||
if (termsEnum.seekExact(drillDownTerms[dim][i].bytes())) {
|
||||
DocsEnum docsEnum = termsEnum.docs(null, null, 0);
|
||||
if (docsEnum != null) {
|
||||
dims[dim].docsEnums[i] = docsEnum;
|
||||
dims[dim].maxCost = Math.max(dims[dim].maxCost, docsEnum.cost());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nullCount > 1 || (nullCount == 1 && dims.length == 1)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Sort drill-downs by most restrictive first:
|
||||
Arrays.sort(dims);
|
||||
|
||||
// TODO: it could be better if we take acceptDocs
|
||||
// into account instead of baseScorer?
|
||||
Scorer baseScorer = baseWeight.scorer(context, scoreDocsInOrder, false, acceptDocs);
|
||||
|
||||
if (baseScorer == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new SimpleDrillSidewaysScorer(this, context,
|
||||
baseScorer,
|
||||
drillDownCollector, dims);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// TODO: these should do "deeper" equals/hash on the 2-D drillDownTerms array
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = super.hashCode();
|
||||
result = prime * result + ((baseQuery == null) ? 0 : baseQuery.hashCode());
|
||||
result = prime * result
|
||||
+ ((drillDownCollector == null) ? 0 : drillDownCollector.hashCode());
|
||||
result = prime * result + Arrays.hashCode(drillDownTerms);
|
||||
result = prime * result + Arrays.hashCode(drillSidewaysCollectors);
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (!super.equals(obj)) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
SimpleDrillSidewaysQuery other = (SimpleDrillSidewaysQuery) obj;
|
||||
if (baseQuery == null) {
|
||||
if (other.baseQuery != null) return false;
|
||||
} else if (!baseQuery.equals(other.baseQuery)) return false;
|
||||
if (drillDownCollector == null) {
|
||||
if (other.drillDownCollector != null) return false;
|
||||
} else if (!drillDownCollector.equals(other.drillDownCollector)) return false;
|
||||
if (!Arrays.equals(drillDownTerms, other.drillDownTerms)) return false;
|
||||
if (!Arrays.equals(drillSidewaysCollectors, other.drillSidewaysCollectors)) return false;
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,654 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.search.Collector;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
class SimpleDrillSidewaysScorer extends Scorer {
|
||||
|
||||
//private static boolean DEBUG = false;

/** Receives every document that matches the base scorer and all dimensions; may be null. */
private final Collector drillDownCollector;

/** Per-dimension postings, cost and sideways collector. */
private final DocsEnumsAndFreq[] dims;

/** Scorer of the base query; every collected document comes from it. */
private final Scorer baseScorer;

/** Segment being scored; passed to the collectors and used to read maxDoc. */
private final AtomicReaderContext context;

/** Number of slots processed per window by the chunked scoring methods. */
private static final int CHUNK = 2048;

/** Maps a docID to its slot within the current window. */
private static final int MASK = CHUNK-1;

/** DocID reported by {@link #docID()} while a document is being collected. */
private int collectDocID = -1;

/** Score reported by {@link #score()} while a document is being collected. */
private float collectScore;

/** Stores the arguments; {@code w} is handed to the super-class constructor. */
SimpleDrillSidewaysScorer(Weight w,
                          AtomicReaderContext context,
                          Scorer baseScorer,
                          Collector drillDownCollector,
                          DocsEnumsAndFreq[] dims) {
  super(w);
  this.drillDownCollector = drillDownCollector;
  this.baseScorer = baseScorer;
  this.context = context;
  this.dims = dims;
}
|
||||
|
||||
@Override
|
||||
public void score(Collector collector) throws IOException {
|
||||
//if (DEBUG) {
|
||||
// System.out.println("\nscore: reader=" + context.reader());
|
||||
//}
|
||||
//System.out.println("score r=" + context.reader());
|
||||
collector.setScorer(this);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownCollector.setScorer(this);
|
||||
drillDownCollector.setNextReader(context);
|
||||
}
|
||||
for(DocsEnumsAndFreq dim : dims) {
|
||||
dim.sidewaysCollector.setScorer(this);
|
||||
dim.sidewaysCollector.setNextReader(context);
|
||||
}
|
||||
|
||||
// TODO: if we ever allow null baseScorer ... it will
|
||||
// mean we DO score docs out of order ... hmm, or if we
|
||||
// change up the order of the conjuntions below
|
||||
assert baseScorer != null;
|
||||
|
||||
// Position all scorers to their first matching doc:
|
||||
baseScorer.nextDoc();
|
||||
for(DocsEnumsAndFreq dim : dims) {
|
||||
for (DocsEnum docsEnum : dim.docsEnums) {
|
||||
if (docsEnum != null) {
|
||||
docsEnum.nextDoc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final int numDims = dims.length;
|
||||
|
||||
DocsEnum[][] docsEnums = new DocsEnum[numDims][];
|
||||
Collector[] sidewaysCollectors = new Collector[numDims];
|
||||
long drillDownCost = 0;
|
||||
for(int dim=0;dim<numDims;dim++) {
|
||||
docsEnums[dim] = dims[dim].docsEnums;
|
||||
sidewaysCollectors[dim] = dims[dim].sidewaysCollector;
|
||||
for (DocsEnum de : dims[dim].docsEnums) {
|
||||
if (de != null) {
|
||||
drillDownCost += de.cost();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
long baseQueryCost = baseScorer.cost();
|
||||
|
||||
/*
|
||||
System.out.println("\nbaseDocID=" + baseScorer.docID() + " est=" + estBaseHitCount);
|
||||
System.out.println(" maxDoc=" + context.reader().maxDoc());
|
||||
System.out.println(" maxCost=" + maxCost);
|
||||
System.out.println(" dims[0].freq=" + dims[0].freq);
|
||||
if (numDims > 1) {
|
||||
System.out.println(" dims[1].freq=" + dims[1].freq);
|
||||
}
|
||||
*/
|
||||
|
||||
if (baseQueryCost < drillDownCost/10) {
|
||||
//System.out.println("baseAdvance");
|
||||
doBaseAdvanceScoring(collector, docsEnums, sidewaysCollectors);
|
||||
} else if (numDims > 1 && (dims[1].maxCost < baseQueryCost/10)) {
|
||||
//System.out.println("drillDownAdvance");
|
||||
doDrillDownAdvanceScoring(collector, docsEnums, sidewaysCollectors);
|
||||
} else {
|
||||
//System.out.println("union");
|
||||
doUnionScoring(collector, docsEnums, sidewaysCollectors);
|
||||
}
|
||||
}
|
||||
|
||||
/** Used when drill downs are highly constraining vs
|
||||
* baseQuery. */
|
||||
private void doDrillDownAdvanceScoring(Collector collector, DocsEnum[][] docsEnums, Collector[] sidewaysCollectors) throws IOException {
|
||||
final int maxDoc = context.reader().maxDoc();
|
||||
final int numDims = dims.length;
|
||||
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" doDrillDownAdvanceScoring");
|
||||
//}
|
||||
|
||||
// TODO: maybe a class like BS, instead of parallel arrays
|
||||
int[] filledSlots = new int[CHUNK];
|
||||
int[] docIDs = new int[CHUNK];
|
||||
float[] scores = new float[CHUNK];
|
||||
int[] missingDims = new int[CHUNK];
|
||||
int[] counts = new int[CHUNK];
|
||||
|
||||
docIDs[0] = -1;
|
||||
int nextChunkStart = CHUNK;
|
||||
|
||||
final FixedBitSet seen = new FixedBitSet(CHUNK);
|
||||
|
||||
while (true) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]);
|
||||
//}
|
||||
|
||||
// First dim:
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" dim0");
|
||||
//}
|
||||
for(DocsEnum docsEnum : docsEnums[0]) {
|
||||
if (docsEnum == null) {
|
||||
continue;
|
||||
}
|
||||
int docID = docsEnum.docID();
|
||||
while (docID < nextChunkStart) {
|
||||
int slot = docID & MASK;
|
||||
|
||||
if (docIDs[slot] != docID) {
|
||||
seen.set(slot);
|
||||
// Mark slot as valid:
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " id=" + context.reader().document(docID).get("id"));
|
||||
//}
|
||||
docIDs[slot] = docID;
|
||||
missingDims[slot] = 1;
|
||||
counts[slot] = 1;
|
||||
}
|
||||
|
||||
docID = docsEnum.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
// Second dim:
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" dim1");
|
||||
//}
|
||||
for(DocsEnum docsEnum : docsEnums[1]) {
|
||||
if (docsEnum == null) {
|
||||
continue;
|
||||
}
|
||||
int docID = docsEnum.docID();
|
||||
while (docID < nextChunkStart) {
|
||||
int slot = docID & MASK;
|
||||
|
||||
if (docIDs[slot] != docID) {
|
||||
// Mark slot as valid:
|
||||
seen.set(slot);
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " missingDim=0 id=" + context.reader().document(docID).get("id"));
|
||||
//}
|
||||
docIDs[slot] = docID;
|
||||
missingDims[slot] = 0;
|
||||
counts[slot] = 1;
|
||||
} else {
|
||||
// TODO: single-valued dims will always be true
|
||||
// below; we could somehow specialize
|
||||
if (missingDims[slot] >= 1) {
|
||||
missingDims[slot] = 2;
|
||||
counts[slot] = 2;
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " missingDim=2 id=" + context.reader().document(docID).get("id"));
|
||||
//}
|
||||
} else {
|
||||
counts[slot] = 1;
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " missingDim=" + missingDims[slot] + " id=" + context.reader().document(docID).get("id"));
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
docID = docsEnum.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
// After this we can "upgrade" to conjunction, because
|
||||
// any doc not seen by either dim 0 or dim 1 cannot be
|
||||
// a hit or a near miss:
|
||||
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" baseScorer");
|
||||
//}
|
||||
|
||||
// Fold in baseScorer, using advance:
|
||||
int filledCount = 0;
|
||||
int slot0 = 0;
|
||||
while (slot0 < CHUNK && (slot0 = seen.nextSetBit(slot0)) != -1) {
|
||||
int ddDocID = docIDs[slot0];
|
||||
assert ddDocID != -1;
|
||||
|
||||
int baseDocID = baseScorer.docID();
|
||||
if (baseDocID < ddDocID) {
|
||||
baseDocID = baseScorer.advance(ddDocID);
|
||||
}
|
||||
if (baseDocID == ddDocID) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" keep docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
|
||||
//}
|
||||
scores[slot0] = baseScorer.score();
|
||||
filledSlots[filledCount++] = slot0;
|
||||
counts[slot0]++;
|
||||
} else {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" no docID=" + ddDocID + " id=" + context.reader().document(ddDocID).get("id"));
|
||||
//}
|
||||
docIDs[slot0] = -1;
|
||||
|
||||
// TODO: we could jump slot0 forward to the
|
||||
// baseDocID ... but we'd need to set docIDs for
|
||||
// intervening slots to -1
|
||||
}
|
||||
slot0++;
|
||||
}
|
||||
seen.clear(0, CHUNK);
|
||||
|
||||
if (filledCount == 0) {
|
||||
if (nextChunkStart >= maxDoc) {
|
||||
break;
|
||||
}
|
||||
nextChunkStart += CHUNK;
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: factor this out & share w/ union scorer,
|
||||
// except we start from dim=2 instead:
|
||||
for(int dim=2;dim<numDims;dim++) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" dim=" + dim + " [" + dims[dim].dim + "]");
|
||||
//}
|
||||
for(DocsEnum docsEnum : docsEnums[dim]) {
|
||||
if (docsEnum == null) {
|
||||
continue;
|
||||
}
|
||||
int docID = docsEnum.docID();
|
||||
while (docID < nextChunkStart) {
|
||||
int slot = docID & MASK;
|
||||
if (docIDs[slot] == docID && counts[slot] >= dim) {
|
||||
// TODO: single-valued dims will always be true
|
||||
// below; we could somehow specialize
|
||||
if (missingDims[slot] >= dim) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " count=" + (dim+2));
|
||||
//}
|
||||
missingDims[slot] = dim+1;
|
||||
counts[slot] = dim+2;
|
||||
} else {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " missing count=" + (dim+1));
|
||||
//}
|
||||
counts[slot] = dim+1;
|
||||
}
|
||||
}
|
||||
// TODO: sometimes use advance?
|
||||
docID = docsEnum.nextDoc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect:
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" now collect: " + filledCount + " hits");
|
||||
//}
|
||||
for(int i=0;i<filledCount;i++) {
|
||||
int slot = filledSlots[i];
|
||||
collectDocID = docIDs[slot];
|
||||
collectScore = scores[slot];
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" docID=" + docIDs[slot] + " count=" + counts[slot]);
|
||||
//}
|
||||
if (counts[slot] == 1+numDims) {
|
||||
collectHit(collector, sidewaysCollectors);
|
||||
} else if (counts[slot] == numDims) {
|
||||
collectNearMiss(sidewaysCollectors, missingDims[slot]);
|
||||
}
|
||||
}
|
||||
|
||||
if (nextChunkStart >= maxDoc) {
|
||||
break;
|
||||
}
|
||||
|
||||
nextChunkStart += CHUNK;
|
||||
}
|
||||
}
|
||||
|
||||
/** Used when base query is highly constraining vs the
|
||||
* drilldowns; in this case we just .next() on base and
|
||||
* .advance() on the dims. */
|
||||
private void doBaseAdvanceScoring(Collector collector, DocsEnum[][] docsEnums, Collector[] sidewaysCollectors) throws IOException {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" doBaseAdvanceScoring");
|
||||
//}
|
||||
int docID = baseScorer.docID();
|
||||
|
||||
final int numDims = dims.length;
|
||||
|
||||
nextDoc: while (docID != NO_MORE_DOCS) {
|
||||
int failedDim = -1;
|
||||
for(int dim=0;dim<numDims;dim++) {
|
||||
// TODO: should we sort this 2nd dimension of
|
||||
// docsEnums from most frequent to least?
|
||||
boolean found = false;
|
||||
for(DocsEnum docsEnum : docsEnums[dim]) {
|
||||
if (docsEnum == null) {
|
||||
continue;
|
||||
}
|
||||
if (docsEnum.docID() < docID) {
|
||||
docsEnum.advance(docID);
|
||||
}
|
||||
if (docsEnum.docID() == docID) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
if (failedDim != -1) {
|
||||
// More than one dim fails on this document, so
|
||||
// it's neither a hit nor a near-miss; move to
|
||||
// next doc:
|
||||
docID = baseScorer.nextDoc();
|
||||
continue nextDoc;
|
||||
} else {
|
||||
failedDim = dim;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
collectDocID = docID;
|
||||
|
||||
// TODO: we could score on demand instead since we are
|
||||
// daat here:
|
||||
collectScore = baseScorer.score();
|
||||
|
||||
if (failedDim == -1) {
|
||||
collectHit(collector, sidewaysCollectors);
|
||||
} else {
|
||||
collectNearMiss(sidewaysCollectors, failedDim);
|
||||
}
|
||||
|
||||
docID = baseScorer.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
private void collectHit(Collector collector, Collector[] sidewaysCollectors) throws IOException {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" hit");
|
||||
//}
|
||||
|
||||
collector.collect(collectDocID);
|
||||
if (drillDownCollector != null) {
|
||||
drillDownCollector.collect(collectDocID);
|
||||
}
|
||||
|
||||
// TODO: we could "fix" faceting of the sideways counts
|
||||
// to do this "union" (of the drill down hits) in the
|
||||
// end instead:
|
||||
|
||||
// Tally sideways counts:
|
||||
for(int dim=0;dim<sidewaysCollectors.length;dim++) {
|
||||
sidewaysCollectors[dim].collect(collectDocID);
|
||||
}
|
||||
}
|
||||
|
||||
private void collectNearMiss(Collector[] sidewaysCollectors, int dim) throws IOException {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" missingDim=" + dim);
|
||||
//}
|
||||
sidewaysCollectors[dim].collect(collectDocID);
|
||||
}
|
||||
|
||||
private void doUnionScoring(Collector collector, DocsEnum[][] docsEnums, Collector[] sidewaysCollectors) throws IOException {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" doUnionScoring");
|
||||
//}
|
||||
|
||||
final int maxDoc = context.reader().maxDoc();
|
||||
final int numDims = dims.length;
|
||||
|
||||
// TODO: maybe a class like BS, instead of parallel arrays
|
||||
int[] filledSlots = new int[CHUNK];
|
||||
int[] docIDs = new int[CHUNK];
|
||||
float[] scores = new float[CHUNK];
|
||||
int[] missingDims = new int[CHUNK];
|
||||
int[] counts = new int[CHUNK];
|
||||
|
||||
docIDs[0] = -1;
|
||||
|
||||
// NOTE: this is basically a specialized version of
|
||||
// BooleanScorer, to the minShouldMatch=N-1 case, but
|
||||
// carefully tracking which dimension failed to match
|
||||
|
||||
int nextChunkStart = CHUNK;
|
||||
|
||||
while (true) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println("\ncycle nextChunkStart=" + nextChunkStart + " docIds[0]=" + docIDs[0]);
|
||||
//}
|
||||
int filledCount = 0;
|
||||
int docID = baseScorer.docID();
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" base docID=" + docID);
|
||||
//}
|
||||
while (docID < nextChunkStart) {
|
||||
int slot = docID & MASK;
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" docIDs[slot=" + slot + "]=" + docID + " id=" + context.reader().document(docID).get("id"));
|
||||
//}
|
||||
|
||||
// Mark slot as valid:
|
||||
assert docIDs[slot] != docID: "slot=" + slot + " docID=" + docID;
|
||||
docIDs[slot] = docID;
|
||||
scores[slot] = baseScorer.score();
|
||||
filledSlots[filledCount++] = slot;
|
||||
missingDims[slot] = 0;
|
||||
counts[slot] = 1;
|
||||
|
||||
docID = baseScorer.nextDoc();
|
||||
}
|
||||
|
||||
if (filledCount == 0) {
|
||||
if (nextChunkStart >= maxDoc) {
|
||||
break;
|
||||
}
|
||||
nextChunkStart += CHUNK;
|
||||
continue;
|
||||
}
|
||||
|
||||
// First drill-down dim, basically adds SHOULD onto
|
||||
// the baseQuery:
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" dim=0 [" + dims[0].dim + "]");
|
||||
//}
|
||||
for(DocsEnum docsEnum : docsEnums[0]) {
|
||||
if (docsEnum == null) {
|
||||
continue;
|
||||
}
|
||||
docID = docsEnum.docID();
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" start docID=" + docID);
|
||||
//}
|
||||
while (docID < nextChunkStart) {
|
||||
int slot = docID & MASK;
|
||||
if (docIDs[slot] == docID) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " count=2");
|
||||
//}
|
||||
missingDims[slot] = 1;
|
||||
counts[slot] = 2;
|
||||
}
|
||||
docID = docsEnum.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
for(int dim=1;dim<numDims;dim++) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" dim=" + dim + " [" + dims[dim].dim + "]");
|
||||
//}
|
||||
for(DocsEnum docsEnum : docsEnums[dim]) {
|
||||
if (docsEnum == null) {
|
||||
continue;
|
||||
}
|
||||
docID = docsEnum.docID();
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" start docID=" + docID);
|
||||
//}
|
||||
while (docID < nextChunkStart) {
|
||||
int slot = docID & MASK;
|
||||
if (docIDs[slot] == docID && counts[slot] >= dim) {
|
||||
// This doc is still in the running...
|
||||
// TODO: single-valued dims will always be true
|
||||
// below; we could somehow specialize
|
||||
if (missingDims[slot] >= dim) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " count=" + (dim+2));
|
||||
//}
|
||||
missingDims[slot] = dim+1;
|
||||
counts[slot] = dim+2;
|
||||
} else {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " missing count=" + (dim+1));
|
||||
//}
|
||||
counts[slot] = dim+1;
|
||||
}
|
||||
}
|
||||
docID = docsEnum.nextDoc();
|
||||
}
|
||||
|
||||
// TODO: sometimes use advance?
|
||||
|
||||
/*
|
||||
int docBase = nextChunkStart - CHUNK;
|
||||
for(int i=0;i<filledCount;i++) {
|
||||
int slot = filledSlots[i];
|
||||
docID = docBase + filledSlots[i];
|
||||
if (docIDs[slot] == docID && counts[slot] >= dim) {
|
||||
// This doc is still in the running...
|
||||
int ddDocID = docsEnum.docID();
|
||||
if (ddDocID < docID) {
|
||||
ddDocID = docsEnum.advance(docID);
|
||||
}
|
||||
if (ddDocID == docID) {
|
||||
if (missingDims[slot] >= dim && counts[slot] == allMatchCount) {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " count=" + (dim+2));
|
||||
// }
|
||||
missingDims[slot] = dim+1;
|
||||
counts[slot] = dim+2;
|
||||
} else {
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" set docID=" + docID + " missing count=" + (dim+1));
|
||||
// }
|
||||
counts[slot] = dim+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
// Collect:
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" now collect: " + filledCount + " hits");
|
||||
//}
|
||||
for(int i=0;i<filledCount;i++) {
|
||||
// NOTE: This is actually in-order collection,
|
||||
// because we only accept docs originally returned by
|
||||
// the baseScorer (ie that Scorer is AND'd)
|
||||
int slot = filledSlots[i];
|
||||
collectDocID = docIDs[slot];
|
||||
collectScore = scores[slot];
|
||||
//if (DEBUG) {
|
||||
// System.out.println(" docID=" + docIDs[slot] + " count=" + counts[slot]);
|
||||
//}
|
||||
//System.out.println(" collect doc=" + collectDocID + " main.freq=" + (counts[slot]-1) + " main.doc=" + collectDocID + " exactCount=" + numDims);
|
||||
if (counts[slot] == 1+numDims) {
|
||||
//System.out.println(" hit");
|
||||
collectHit(collector, sidewaysCollectors);
|
||||
} else if (counts[slot] == numDims) {
|
||||
//System.out.println(" sw");
|
||||
collectNearMiss(sidewaysCollectors, missingDims[slot]);
|
||||
}
|
||||
}
|
||||
|
||||
if (nextChunkStart >= maxDoc) {
|
||||
break;
|
||||
}
|
||||
|
||||
nextChunkStart += CHUNK;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return collectDocID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score() {
|
||||
return collectScore;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int freq() {
|
||||
return 1+dims.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int nextDoc() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advance(int target) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long cost() {
|
||||
return baseScorer.cost();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<ChildScorer> getChildren() {
|
||||
return Collections.singletonList(new ChildScorer(baseScorer, "MUST"));
|
||||
}
|
||||
|
||||
static class DocsEnumsAndFreq implements Comparable<DocsEnumsAndFreq> {
|
||||
DocsEnum[] docsEnums;
|
||||
// Max cost for all docsEnums for this dim:
|
||||
long maxCost;
|
||||
Collector sidewaysCollector;
|
||||
String dim;
|
||||
|
||||
@Override
|
||||
public int compareTo(DocsEnumsAndFreq other) {
|
||||
if (maxCost < other.maxCost) {
|
||||
return -1;
|
||||
} else if (maxCost > other.maxCost) {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -31,6 +31,8 @@ public final class SimpleFacetResult {
|
|||
|
||||
/** Child counts. */
|
||||
public final LabelAndValue[] labelValues;
|
||||
|
||||
// nocommit also return number of children?
|
||||
|
||||
public SimpleFacetResult(FacetLabel path, Number value, LabelAndValue[] labelValues) {
|
||||
this.path = path;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -65,8 +65,6 @@ public class TestTaxonomyFacets extends FacetTestCase {
|
|||
|
||||
IndexWriter writer = new FacetIndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())), taxoWriter, fts);
|
||||
|
||||
// Reused across documents, to add the necessary facet
|
||||
// fields:
|
||||
Document doc = new Document();
|
||||
doc.add(new FacetField("Author", "Bob"));
|
||||
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
|
||||
|
|
Loading…
Reference in New Issue