mirror of https://github.com/apache/lucene.git
LUCENE-4840: fix SortedSetDocValuesFacetFields to index drill-down postings
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457323 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3f9dcad7ac
commit
6e44af52e1
|
@ -79,7 +79,7 @@ New Features
|
||||||
* LUCENE-4607: Add DocIDSetIterator.cost() and Spans.cost() for optimizing
|
* LUCENE-4607: Add DocIDSetIterator.cost() and Spans.cost() for optimizing
|
||||||
scoring. (Simon Willnauer, Robert Muir)
|
scoring. (Simon Willnauer, Robert Muir)
|
||||||
|
|
||||||
* LUCENE-4795: Add SortedSetDocValuesFacetField and
|
* LUCENE-4795: Add SortedSetDocValuesFacetFields and
|
||||||
SortedSetDocValuesAccumulator, to compute topK facet counts from a
|
SortedSetDocValuesAccumulator, to compute topK facet counts from a
|
||||||
field's SortedSetDocValues. This method only supports flat
|
field's SortedSetDocValues. This method only supports flat
|
||||||
(dim/label) facets, is a bit (~25%) slower, has added cost
|
(dim/label) facets, is a bit (~25%) slower, has added cost
|
||||||
|
|
|
@ -48,18 +48,6 @@ import org.apache.lucene.util.IntsRef;
|
||||||
*/
|
*/
|
||||||
public class FacetFields {
|
public class FacetFields {
|
||||||
|
|
||||||
// The counting list is written in a payload, but we don't store it
|
|
||||||
// nor need norms.
|
|
||||||
private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType();
|
|
||||||
static {
|
|
||||||
COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true);
|
|
||||||
COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true);
|
|
||||||
COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
|
||||||
COUNTING_LIST_PAYLOAD_TYPE.setStored(false);
|
|
||||||
COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true);
|
|
||||||
COUNTING_LIST_PAYLOAD_TYPE.freeze();
|
|
||||||
}
|
|
||||||
|
|
||||||
// The drill-down field is added with a TokenStream, hence why it's based on
|
// The drill-down field is added with a TokenStream, hence why it's based on
|
||||||
// TextField type. However in practice, it is added just like StringField.
|
// TextField type. However in practice, it is added just like StringField.
|
||||||
// Therefore we set its IndexOptions to DOCS_ONLY.
|
// Therefore we set its IndexOptions to DOCS_ONLY.
|
||||||
|
|
|
@ -43,7 +43,7 @@ import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
/** A {@link FacetsAccumulator} that uses previously
|
/** A {@link FacetsAccumulator} that uses previously
|
||||||
* indexed {@link SortedSetDocValuesFacetField} to perform faceting,
|
* indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
|
||||||
* without requiring a separate taxonomy index. Faceting is
|
* without requiring a separate taxonomy index. Faceting is
|
||||||
* a bit slower (~25%), and there is added cost on every
|
* a bit slower (~25%), and there is added cost on every
|
||||||
* {@link IndexReader} open to create a new {@link
|
* {@link IndexReader} open to create a new {@link
|
||||||
|
|
|
@ -17,7 +17,16 @@ package org.apache.lucene.facet.sortedset;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||||
|
import org.apache.lucene.facet.index.DrillDownStream;
|
||||||
|
import org.apache.lucene.facet.index.FacetFields;
|
||||||
|
import org.apache.lucene.facet.params.CategoryListParams;
|
||||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
@ -29,39 +38,49 @@ import org.apache.lucene.util.BytesRef;
|
||||||
* this to your document, one per dimension + label, and
|
* this to your document, one per dimension + label, and
|
||||||
* it's fine if a given dimension is multi-valued. */
|
* it's fine if a given dimension is multi-valued. */
|
||||||
|
|
||||||
public class SortedSetDocValuesFacetField extends SortedSetDocValuesField {
|
public class SortedSetDocValuesFacetFields extends FacetFields {
|
||||||
|
|
||||||
/** Create a {@code SortedSetDocValuesFacetField} with the
|
/** Create a {@code SortedSetDocValuesFacetFields} with the
|
||||||
* provided {@link CategoryPath}. */
|
* default {@link FacetIndexingParams}. */
|
||||||
public SortedSetDocValuesFacetField(CategoryPath cp) {
|
public SortedSetDocValuesFacetFields() {
|
||||||
this(FacetIndexingParams.DEFAULT, cp);
|
this(FacetIndexingParams.DEFAULT);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create a {@code SortedSetDocValuesFacetField} with the
|
/** Create a {@code SortedSetDocValuesFacetFields} with the
|
||||||
* provided {@link CategoryPath}, and custom {@link
|
* provided custom {@link
|
||||||
* FacetIndexingParams}. */
|
* FacetIndexingParams}. */
|
||||||
public SortedSetDocValuesFacetField(FacetIndexingParams fip, CategoryPath cp) {
|
public SortedSetDocValuesFacetFields(FacetIndexingParams fip) {
|
||||||
super(fip.getCategoryListParams(cp).field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION, toBytesRef(fip, cp));
|
super(null, fip);
|
||||||
}
|
|
||||||
|
|
||||||
private static BytesRef toBytesRef(FacetIndexingParams fip, CategoryPath cp) {
|
|
||||||
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
|
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
|
||||||
throw new IllegalArgumentException("partitions are not supported");
|
throw new IllegalArgumentException("partitions are not supported");
|
||||||
}
|
}
|
||||||
if (cp.length != 2) {
|
}
|
||||||
throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported");
|
|
||||||
}
|
@Override
|
||||||
String dimension = cp.components[0];
|
public void addFields(Document doc, Iterable<CategoryPath> categories) throws IOException {
|
||||||
char delim = fip.getFacetDelimChar();
|
if (categories == null) {
|
||||||
if (dimension.indexOf(delim) != -1) {
|
throw new IllegalArgumentException("categories should not be null");
|
||||||
throw new IllegalArgumentException("facet dimension cannot contain FacetIndexingParams.getFacetDelimChar()=" + delim + " (U+" + Integer.toHexString(delim) + "); got dimension=\"" + dimension + "\"");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// We can't use cp.toString(delim) because that fails if
|
final Map<CategoryListParams,Iterable<CategoryPath>> categoryLists = createCategoryListMapping(categories);
|
||||||
// cp.components[1] has the delim char, when in fact
|
for (Entry<CategoryListParams, Iterable<CategoryPath>> e : categoryLists.entrySet()) {
|
||||||
// that is allowed here (but not when using taxonomy
|
|
||||||
// index):
|
CategoryListParams clp = e.getKey();
|
||||||
return new BytesRef(dimension + delim + cp.components[1]);
|
String dvField = clp.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION;
|
||||||
|
|
||||||
|
// Add sorted-set DV fields, one per value:
|
||||||
|
for(CategoryPath cp : e.getValue()) {
|
||||||
|
if (cp.length != 2) {
|
||||||
|
throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported; got " + cp);
|
||||||
|
}
|
||||||
|
doc.add(new SortedSetDocValuesField(dvField, new BytesRef(cp.toString(indexingParams.getFacetDelimChar()))));
|
||||||
|
}
|
||||||
|
|
||||||
|
// add the drill-down field
|
||||||
|
DrillDownStream drillDownStream = getDrillDownStream(e.getValue());
|
||||||
|
Field drillDown = new Field(clp.field, drillDownStream, drillDownFieldType());
|
||||||
|
doc.add(drillDown);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||||
import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult;
|
import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult;
|
||||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator;
|
import org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator;
|
||||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
|
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetFields;
|
||||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
|
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
|
||||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||||
|
@ -500,6 +500,7 @@ public class TestDrillSideways extends FacetTestCase {
|
||||||
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
|
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
|
||||||
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
|
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
|
||||||
facetFields = new FacetFields(tw);
|
facetFields = new FacetFields(tw);
|
||||||
|
SortedSetDocValuesFacetFields dvFacetFields = new SortedSetDocValuesFacetFields();
|
||||||
|
|
||||||
for(Doc rawDoc : docs) {
|
for(Doc rawDoc : docs) {
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
|
@ -519,9 +520,6 @@ public class TestDrillSideways extends FacetTestCase {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
|
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
|
||||||
}
|
}
|
||||||
if (canUseDV) {
|
|
||||||
doc.add(new SortedSetDocValuesFacetField(cp));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
int dimValue2 = rawDoc.dims2[dim];
|
int dimValue2 = rawDoc.dims2[dim];
|
||||||
if (dimValue2 != -1) {
|
if (dimValue2 != -1) {
|
||||||
|
@ -531,13 +529,13 @@ public class TestDrillSideways extends FacetTestCase {
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
|
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
|
||||||
}
|
}
|
||||||
if (canUseDV) {
|
|
||||||
doc.add(new SortedSetDocValuesFacetField(cp));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!paths.isEmpty()) {
|
if (!paths.isEmpty()) {
|
||||||
facetFields.addFields(doc, paths);
|
facetFields.addFields(doc, paths);
|
||||||
|
if (canUseDV) {
|
||||||
|
dvFacetFields.addFields(doc, paths);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
w.addDocument(doc);
|
w.addDocument(doc);
|
||||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.facet.params.CategoryListParams;
|
||||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||||
import org.apache.lucene.facet.search.CountFacetRequest;
|
import org.apache.lucene.facet.search.CountFacetRequest;
|
||||||
|
import org.apache.lucene.facet.search.DrillDownQuery;
|
||||||
import org.apache.lucene.facet.search.FacetRequest;
|
import org.apache.lucene.facet.search.FacetRequest;
|
||||||
import org.apache.lucene.facet.search.FacetResult;
|
import org.apache.lucene.facet.search.FacetResult;
|
||||||
import org.apache.lucene.facet.search.FacetsCollector;
|
import org.apache.lucene.facet.search.FacetsCollector;
|
||||||
|
@ -35,6 +36,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||||
|
import org.apache.lucene.search.TopDocs;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
|
||||||
public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
||||||
|
@ -57,6 +59,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
SortedSetDocValuesFacetFields dvFields = new SortedSetDocValuesFacetFields(fip);
|
||||||
|
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
// Mixup order we add these paths, to verify tie-break
|
// Mixup order we add these paths, to verify tie-break
|
||||||
// order is by label (unicode sort) and has nothing to
|
// order is by label (unicode sort) and has nothing to
|
||||||
|
@ -67,22 +71,18 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
||||||
paths.add(new CategoryPath("a", "zoo"));
|
paths.add(new CategoryPath("a", "zoo"));
|
||||||
Collections.shuffle(paths, random());
|
Collections.shuffle(paths, random());
|
||||||
|
|
||||||
for(CategoryPath cp : paths) {
|
paths.add(new CategoryPath("b", "baz"));
|
||||||
doc.add(new SortedSetDocValuesFacetField(fip, cp));
|
paths.add(new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo"));
|
||||||
}
|
|
||||||
|
dvFields.addFields(doc, paths);
|
||||||
|
|
||||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz")));
|
|
||||||
// Make sure it's fine to use delim in the label (it's
|
|
||||||
// just not allowed in the dim):
|
|
||||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz" + delim + "foo")));
|
|
||||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo")));
|
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
if (random().nextBoolean()) {
|
if (random().nextBoolean()) {
|
||||||
writer.commit();
|
writer.commit();
|
||||||
}
|
}
|
||||||
|
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("a", "foo")));
|
dvFields.addFields(doc, Collections.singletonList(new CategoryPath("a", "foo")));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
|
|
||||||
// NRT open
|
// NRT open
|
||||||
|
@ -123,12 +123,25 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
||||||
int dimCount = doDimCount ? 4 : 0;
|
int dimCount = doDimCount ? 4 : 0;
|
||||||
assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0)));
|
assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0)));
|
||||||
|
|
||||||
dimCount = doDimCount ? 2 : 0;
|
dimCount = doDimCount ? 1 : 0;
|
||||||
assertEquals("b (" + dimCount + ")\n baz (1)\n baz" + delim + "foo (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
|
assertEquals("b (" + dimCount + ")\n baz (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
|
||||||
|
|
||||||
dimCount = doDimCount ? 1 : 0;
|
dimCount = doDimCount ? 1 : 0;
|
||||||
assertEquals("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + " (" + dimCount + ")\n bazfoo (1)\n", FacetTestUtils.toSimpleString(results.get(2)));
|
assertEquals("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + " (" + dimCount + ")\n bazfoo (1)\n", FacetTestUtils.toSimpleString(results.get(2)));
|
||||||
|
|
||||||
|
// DrillDown:
|
||||||
|
|
||||||
|
DrillDownQuery q = new DrillDownQuery(fip);
|
||||||
|
q.add(new CategoryPath("a", "foo"));
|
||||||
|
q.add(new CategoryPath("b", "baz"));
|
||||||
|
TopDocs hits = searcher.search(q, 1);
|
||||||
|
assertEquals(1, hits.totalHits);
|
||||||
|
|
||||||
|
q = new DrillDownQuery(fip);
|
||||||
|
q.add(new CategoryPath("a"));
|
||||||
|
hits = searcher.search(q, 1);
|
||||||
|
assertEquals(2, hits.totalHits);
|
||||||
|
|
||||||
searcher.getIndexReader().close();
|
searcher.getIndexReader().close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue