mirror of https://github.com/apache/lucene.git
LUCENE-3261: Faceting module userguide
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1175341 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d48960ae05
commit
151c26e7d9
|
@ -29,6 +29,14 @@
|
|||
<property name="build.dir" location="build/" />
|
||||
<property name="dist.dir" location="dist/" />
|
||||
|
||||
<property name="examples.dir" location="src/examples"/>
|
||||
|
||||
<path id="classpath">
|
||||
<path refid="base.classpath" />
|
||||
<pathelement location="${build.dir}/classes/java" />
|
||||
<pathelement location="${build.dir}/classes/examples" />
|
||||
</path>
|
||||
|
||||
<path id="examples.classpath">
|
||||
<path refid="classpath" />
|
||||
<pathelement location="${build.dir}/classes/java" />
|
||||
|
@ -49,7 +57,7 @@
|
|||
</path>
|
||||
|
||||
<target name="compile-examples" description="Compiles Facets examples">
|
||||
<compile srcdir="src/examples" destdir="${build.dir}/classes/examples">
|
||||
<compile srcdir="${examples.dir}" destdir="${build.dir}/classes/examples">
|
||||
<classpath refid="examples.classpath" />
|
||||
</compile>
|
||||
</target>
|
||||
|
@ -64,4 +72,22 @@
|
|||
|
||||
<target name="jar-core" depends="common.jar-core,jar-examples" />
|
||||
|
||||
<target name="javadocs" depends="compile-core">
|
||||
<sequential>
|
||||
<mkdir dir="${javadoc.dir}/contrib-${name}"/>
|
||||
<copy todir="${javadoc.dir}/contrib-${name}" file="docs/userguide.html" />
|
||||
<!-- javadoc core classes -->
|
||||
<invoke-javadoc
|
||||
destdir="${javadoc.dir}/contrib-${name}"
|
||||
title="${Name} ${version} contrib-${name} API">
|
||||
<sources>
|
||||
<link href=""/>
|
||||
<packageset dir="${src.dir}"/>
|
||||
<packageset dir="${examples.dir}"/>
|
||||
</sources>
|
||||
</invoke-javadoc>
|
||||
<jarify basedir="${javadoc.dir}/contrib-${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
|
||||
</sequential>
|
||||
</target>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,784 @@
|
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<title>Facet Userguide</title>
|
||||
|
||||
<!-- load stylesheet + javascript in development mode -->
|
||||
<link rel="stylesheet" type="text/css" href="../../../src/tools/prettify/prettify.css">
|
||||
<script src="../../../src/tools/prettify/prettify.js" type="text/javascript"></script>
|
||||
|
||||
<!-- load stylesheet + javascript in distribution mode -->
|
||||
<link rel="stylesheet" type="text/css" href="prettify.css">
|
||||
<script src="prettify.js" type="text/javascript"></script>
|
||||
|
||||
<script language="javascript">
|
||||
window.onload=function() {
|
||||
prettyPrint();
|
||||
}
|
||||
</script>
|
||||
|
||||
<style>
|
||||
body {
|
||||
margin-left: 20%;
|
||||
width: 60%;
|
||||
counter-reset: section;
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
h1.title {
|
||||
text-align: center;
|
||||
margin-top: 30px;
|
||||
font-size: 5em;
|
||||
line-height: 150%;
|
||||
}
|
||||
|
||||
h1.section {
|
||||
margin-top: 50px;
|
||||
font-size: 2.5em;
|
||||
counter-reset: subsection;
|
||||
border: 1px solid black;
|
||||
background-color: #D8D8D8;
|
||||
padding-left: 5px;
|
||||
}
|
||||
|
||||
h2.subsection {
|
||||
font-size: 2em;
|
||||
border: 1px solid black;
|
||||
background-color: #D8D8D8;
|
||||
padding-left: 5px;
|
||||
}
|
||||
|
||||
/* auto-generated heading numbers */
|
||||
h1.section:before {
|
||||
counter-increment: section;
|
||||
content: counter(section) ". ";
|
||||
}
|
||||
|
||||
h2.subsection:before {
|
||||
counter-increment: subsection;
|
||||
content: counter(section) "." counter(subsection) " ";
|
||||
}
|
||||
|
||||
/* override from prettify.css - add shadow, padding etc. */
|
||||
pre.prettyprint {
|
||||
margin-left: 2%;
|
||||
width: 80%;
|
||||
padding: 5px 3px 5px 3px;
|
||||
/* shadow */
|
||||
-moz-box-shadow: 5px 5px 2px #888;
|
||||
-webkit-box-shadow: 5px 5px 2px #888;
|
||||
box-shadow: 5px 5px 2px #888;
|
||||
}
|
||||
|
||||
/* override from prettify.css - make keywords appear in bold */
|
||||
span.kwd {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
ol.toc a {
|
||||
text-decoration: none;
|
||||
color: blue;
|
||||
}
|
||||
|
||||
li.toc_first {
|
||||
/* margin-top: 10px; */
|
||||
font-size: 16px;
|
||||
color: blue;
|
||||
}
|
||||
|
||||
li.toc_second {
|
||||
font-size: 14px;
|
||||
margin-left: 15px;
|
||||
color: blue;
|
||||
}
|
||||
|
||||
/* reset style from prettify.css, so that line numbers appear in each line */
|
||||
li.L0,li.L1,li.L2,li.L3,li.L5,li.L6,li.L7,li.L8 {
|
||||
list-style-type:decimal
|
||||
}
|
||||
|
||||
table.code_description td {
|
||||
vertical-align: top;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
<body>
|
||||
<h1 class="title">
|
||||
Apache Lucene<br>
|
||||
Faceted Search<br>
|
||||
User's Guide</h1>
|
||||
|
||||
<div class="toc">
|
||||
<h1 class="toc">Table of Contents</h1>
|
||||
<ol class="toc">
|
||||
<li class="toc_first"><a href="#intro">Introduction</a></li>
|
||||
<li class="toc_first"><a href="#facet_features">Facet Features</a></li>
|
||||
<li class="toc_first"><a href="#facet_indexing">Indexing Categories Illustrated</a></li>
|
||||
<li class="toc_first"><a href="#facet_accumulation">Accumulating Facets Illustrated</a></li>
|
||||
<li class="toc_first"><a href="#indexed_facet_info">Indexed Facet Information</a></li>
|
||||
<li class="toc_first"><a href="#taxonomy_index">Taxonomy Index</a></li>
|
||||
<li class="toc_first"><a href="#facet_params">Facet Parameters</a></li>
|
||||
<li class="toc_first"><a href="#advanced">Advanced Faceted Examples</a></li>
|
||||
<li class="toc_first"><a href="#optimizations">Optimizations</a></li>
|
||||
<li class="toc_first"><a href="#concurrent_indexing_search">Concurrent Indexing and Search</a></li>
|
||||
</ol>
|
||||
|
||||
<h1 class="section"><a name="intro">Introduction</a></h1>
|
||||
<p>
|
||||
A category is an aspect of indexed documents which can be used to classify the
|
||||
documents. For example, in a collection of books at an online bookstore, categories of
|
||||
a book can be its price, author, publication date, binding type, and so on.
|
||||
<p>
|
||||
In faceted search, in addition to the standard set of search results, we also get facet
|
||||
results, which are lists of subcategories for certain categories. For example, for the
|
||||
price facet, we get a list of relevant price ranges; for the author facet, we get a list of
|
||||
relevant authors; and so on. In most UIs, when users click one of these subcategories,
|
||||
the search is narrowed, or drilled down, and a new search limited to this subcategory
|
||||
(e.g., to a specific price range or author) is performed.
|
||||
<p>
|
||||
Note that faceted search is more than just the ordinary fielded search. In fielded
|
||||
search, users can add search keywords like price:10 or author:"Mark
|
||||
Twain" to the query to narrow the search, but this requires knowledge of which
|
||||
fields are available, and which values are worth trying. This is where faceted search
|
||||
comes in: it provides a list of useful subcategories, which ensures that the user only
|
||||
drills down into useful subcategories and never into a category for which there are no
|
||||
results. In essence, faceted search makes it easy to navigate through the search results.
|
||||
The list of subcategories provided for each facet is also useful to the user in itself,
|
||||
even when the user never drills down. This list allows the user to see at one glance
|
||||
some statistics on the search results, e.g., what price ranges and which authors are
|
||||
most relevant to the given query.
|
||||
<p>
|
||||
In recent years, faceted search has become a very common UI feature in search
|
||||
engines, especially in e-commerce websites. Faceted search makes it easy for
|
||||
untrained users to find the specific item they are interested in, whereas manually
|
||||
adding search keywords (as in the examples above) proved too cumbersome for
|
||||
ordinary users, and required too much guesswork, trial-and-error, or the reading of
|
||||
lengthy help pages.
|
||||
<p>
|
||||
See <a href="http://en.wikipedia.org/wiki/Faceted_search">http://en.wikipedia.org/wiki/Faceted_search</a> for more information on faceted
|
||||
search.
|
||||
|
||||
<h1 class="section"><a name="facet_features">Facet Features</a></h1>
|
||||
The first and main faceted search capability that comes to mind is counting, but in fact
|
||||
faceted search is more than facet counting. We now briefly discuss the available
|
||||
faceted search features.
|
||||
|
||||
<h2 class="subsection">Facet Counting</h2>
|
||||
<p>
|
||||
Which of the available subcategories of a facet should a UI display? A query in a
|
||||
book store might yield books by a hundred different authors, but normally we'd want
|
||||
to display only, say, ten of those.
|
||||
<p>
|
||||
Most available faceted search implementations use counts to determine the
|
||||
importance of each subcategory. These implementations go over all search results for
|
||||
the given query, and count how many results are in each subcategory. Finally, the
|
||||
subcategories with the most results can be displayed. So the user sees the price ranges,
|
||||
authors, and so on, for which there are most results. Often, the count is displayed next
|
||||
to the subcategory name, in parentheses, telling the user how many results he can
|
||||
expect to see if he drills down into this subcategory.
|
||||
<p>
|
||||
The main API for obtaining facet counting is <code>CountFacetRequest</code>, as in the
|
||||
following code snippet:
|
||||
<pre class="prettyprint lang-java">
|
||||
new CountFacetRequest(new CategoryPath("author"), 10);
|
||||
</pre>
|
||||
A detailed code example using count facet requests is shown below - see
|
||||
<a href="#facet_accumulation">Accumulating Facets</a>.
|
||||
|
||||
<h2 class="subsection"><a name="facet_association">Facet Associations</a></h2>
|
||||
<p>
|
||||
So far we've discussed categories as binary features, where a document either belongs
|
||||
to a category, or not.
|
||||
<p>
|
||||
While counts are useful in most situations, they are sometimes not sufficiently
|
||||
informative for the user, with respect to deciding which subcategory is more
|
||||
important to display.
|
||||
<p>
|
||||
For this, the facets package allows associating a value with a category. The search
|
||||
time interpretation of the associated value is application dependent. For example, a
|
||||
possible interpretation is as a <i>match level</i> (e.g., confidence level). This value can
|
||||
then be used so that a document that is very weakly associated with a certain category
|
||||
will only contribute little to this category's aggregated weight.
|
||||
|
||||
<h2 class="subsection"><a name="multiple_requests">Multiple Facet Requests</a></h2>
|
||||
<p>
|
||||
A single faceted accumulation is capable of servicing multiple facet requests.
|
||||
Programmatically, this is quite simple - wrap all the facet requests of interest into the
|
||||
facet-search-parameters which are passed to a facets accumulator/collector (more on
|
||||
these objects below). The results would be comprised of as many facet results as there
|
||||
were facet requests.
|
||||
<p>
|
||||
However there is a delicate <b>limitation</b>: all facets maintained in the same location in
|
||||
the index are required to be treated the same. See the section on <a href="#indexing_params">Indexing Parameters</a>
|
||||
for an explanation on maintaining certain facets at certain locations.
|
||||
|
||||
<h2 class="subsection"><a name="facet_labels">Facet Labels at Search Time</a></h2>
|
||||
<p>
|
||||
Facets results always contain the facet (internal) ID and (accumulated) value. Some of
|
||||
the results also contain the facet label, AKA the category name. We mention this here
|
||||
since computing the label is a time consuming task, and hence applications can
|
||||
specify with a facet request to return top 1000 facets but to compute the label only for
|
||||
the top 10 facets. In order to compute labels for more of the facet results it is not
|
||||
required to perform accumulation again.
|
||||
<p>
|
||||
See <code>FacetRequest.getNumResults()</code>, <code>FacetRequest.getNumLabel()</code> and
|
||||
<code>FacetResultNode.getLabel(TaxonomyReader)</code>.
|
||||
|
||||
<h1 class="section"><a name="facet_indexing">Indexing Categories Illustrated</a></h1>
|
||||
<p>
|
||||
In order to find facets at search time they must first be added to the index at indexing
|
||||
time. Recall that Lucene documents are made of fields for textual search. The addition
|
||||
of categories is performed by an appropriate <code>DocumentBuilder</code> - or
|
||||
<code>CategoryDocumentBuilder</code> in our case.
|
||||
<p>
|
||||
Indexing therefore usually goes like this:
|
||||
<ul>
|
||||
<li>For each input document:
|
||||
<ul>
|
||||
<li>Create a fresh (empty) Lucene Document</li>
|
||||
<li>Parse input text and add appropriate text search fields</li>
|
||||
<li><b>Gather all input categories associated with the document and create
|
||||
a CategoryDocumentBuilder with the list of categories</b></li>
|
||||
<li><b>Build the document - this actually adds the categories to the
|
||||
Lucene document.</b></li>
|
||||
<li>Add the document to the index</li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
Following is a code snippet for indexing categories. The complete example can be
|
||||
found in package <code>org.apache.lucene.facet.example.simple.SimpleIndexer</code>.
|
||||
<pre class="prettyprint lang-java linenums">
|
||||
IndexWriter writer = ...
|
||||
TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);
|
||||
...
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("title", titleText, Store.YES, Index.ANALYZED));
|
||||
...
|
||||
List<CategoryPath> categories = new ArrayList<CategoryPath>();
|
||||
categories.add(new CategoryPath("author", "Mark Twain"));
|
||||
categories.add(new CategoryPath("year", "2010"));
|
||||
...
|
||||
DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo);
|
||||
categoryDocBuilder.setCategoryPaths(categories);
|
||||
categoryDocBuilder.build(doc);
|
||||
writer.addDocument(doc);
|
||||
</pre>
|
||||
<p>
|
||||
We now explain the steps above, following the code line numbers:
|
||||
<table class="code_description">
|
||||
<tr>
|
||||
<td>(4)</td>
|
||||
<td>Document contains not only text search fields but also facet search
|
||||
information.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(7)</td>
|
||||
<td>Prepare a container for document categories.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(8)</td>
|
||||
<td>Categories that should be added to the document are accumulated in the
|
||||
categories list.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(11)</td>
|
||||
<td>A <code>CategoryDocumentBuilder</code> is created, set with the appropriate list
|
||||
of categories, and invoked to build - that is, to populate the document
|
||||
with categories. It is in this step that the taxonomy is updated to contain the
|
||||
newly added categories (if not already there) - see more on this in the
|
||||
section about the <a href="#taxonomy_index">Taxonomy Index</a> below. This line could be made more
|
||||
compact: one can create a single <code>CategoryDocumentBuilder cBuilder</code> and reuse it like this:
|
||||
<pre class="prettyprint lang-java linenums">
|
||||
DocumentBuilder cBuilder = new CategoryDocumentBuilder(taxo);
|
||||
cBuilder.setCategoryPaths(categories).build(doc);
|
||||
</pre>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(14)</td>
|
||||
<td>Add the document to the index. As a result, category info is saved also in
|
||||
the regular search index, for supporting facet aggregation at search time
|
||||
(e.g. facet counting) as well as facet drill-down. For more information on
|
||||
indexed facet information see below the section <a href="#indexed_facet_info">Indexed Facet Information</a>.</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h1 class="section"><a name="facet_accumulation">Accumulating Facets Illustrated</a></h1>
|
||||
<p>
|
||||
Facets accumulation reflects a set of documents over some facet requests:
|
||||
<ul>
|
||||
<li><code>Document set</code> - a subset of the index documents, usually documents
|
||||
matching a user query.</li>
|
||||
<li><code>Facet requests</code> - facet accumulation specification, e.g. count a certain facet
|
||||
<i>dimension</i>.</li>
|
||||
</ul>
|
||||
<p>
|
||||
<code>FacetRequest</code> is a basic component in faceted search - it describes the facet
|
||||
information need. Every facet <b>request</b> is made of at least two fields:
|
||||
<ul>
|
||||
<li><code>CategoryPath</code> - root category of the facet request. The categories that
|
||||
are returned as a result of the request will all be descendants of this root</li>
|
||||
<li><code>Number of Results</code> - number of sub-categories to return (at most).</li>
|
||||
</ul>
|
||||
<p>
|
||||
There are other parameters to a facet request, such as -how many facet results to
|
||||
label-, -how <b>deep</b> to go from the request root when serving the facet request- and
|
||||
more - see the API Javadocs for <code>FacetRequest</code> and its subclasses for more
|
||||
information on these parameters. For labels in particular, see the section <a href="#facet_labels">Facet Labels
|
||||
at Search Time</a>.
|
||||
<p>
|
||||
<code>FacetRequest</code> is an abstract class, open for extensions, and users may add their
|
||||
own requests. The most often used request is <code>CountFacetRequest</code> - used for
|
||||
counting facets.
|
||||
<p>
|
||||
Facets accumulation is - not surprisingly - driven by a <code>FacetsAccumulator</code>. The
|
||||
most used one is <code>StandardFacetsAccumulator</code>, however there are also accumulators
|
||||
that support sampling - to be used in huge collections, and there's an adaptive facets
|
||||
accumulator which applies sampling conditionally on the statistics of the data. While
|
||||
facets accumulators are very extendible and powerful, they might be too
|
||||
overwhelming for beginners. For this reason, the code offers a higher level interface
|
||||
for facets accumulating: the <code>FacetsCollector</code>. It extends <code>Collector</code>, and as such
|
||||
can be passed to the search() method of Lucene's <code>IndexSearcher</code>. In case the
|
||||
application also needs to collect documents (in addition to accumulating/collecting
|
||||
facets), it can wrap multiple collectors with <code>MultiCollector</code>. Most code samples
|
||||
below use <code>FacetsCollector</code> due to its simple interface. It is quite likely that
|
||||
<code>FacetsCollector</code> should suffice for the needs of most applications, therefore we
|
||||
recommend to start with it, and only when needing more flexibility turn to directly
|
||||
use facets accumulators.
|
||||
<p>
|
||||
Following is a code snippet from the example code - the complete example can be
|
||||
found under <code>org.apache.lucene.facet.example.simple.Searcher</code>:
|
||||
<pre class="prettyprint lang-java linenums">
|
||||
IndexReader indexReader = IndexReader.open(indexDir);
|
||||
Searcher searcher = new IndexSearcher(indexReader);
|
||||
TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir);
|
||||
...
|
||||
Query q = new TermQuery(new Term(SimpleUtils.TEXT, "white"));
|
||||
TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true);
|
||||
...
|
||||
FacetSearchParams facetSearchParams = new FacetSearchParams();
|
||||
facetSearchParams.addFacetRequest(new CountFacetRequest(
|
||||
new CategoryPath("author"), 10));
|
||||
...
|
||||
FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, indexReader, taxo);
|
||||
searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector));
|
||||
List<FacetResult> res = facetsCollector.getFacetResults();
|
||||
</pre>
|
||||
<p>
|
||||
We now explain the steps above, following the code line numbers:
|
||||
<table class="code_description">
|
||||
<tr>
|
||||
<td>(1)</td>
|
||||
<td>Index reader and Searcher are initialized as usual.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(3)</td>
|
||||
<td>A taxonomy reader is opened - it provides access to the facet information
|
||||
which was stored by the Taxonomy Writer at indexing time.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(5)</td>
|
||||
<td>Regular text query is created to find the documents matching user need, and
|
||||
a collector for collecting the top matching documents is created.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(8)</td>
|
||||
<td>Facet-search-params is a container for facet requests.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(10)</td>
|
||||
<td>A single facet-request - namely a count facet request - is created and added
|
||||
to the facet search params. The request should return top 10 Author
|
||||
subcategory counts.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(12)</td>
|
||||
<td>Facets-Collector is the simplest interface for facets accumulation (counting
|
||||
in this example).</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(13)</td>
|
||||
<td>Lucene search takes both collectors - facets-collector and top-docs-collector,
|
||||
both wrapped by a multi-collector. This way, a single search
|
||||
operation finds both top documents and top facets. Note however that facets
|
||||
aggregation takes place not only over the top documents, but rather over all
|
||||
documents matching the query.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>(14)</td>
|
||||
<td>Once search completes, facet-results can be obtained from the facets-collector.</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
Returned facet results are organized in a list, conveniently ordered the same as the
|
||||
facet-requests in the facet-search-params. Each result however contains the request
|
||||
for which it was created.
|
||||
<p>
|
||||
Here is the (recursive) structure of the facet result:
|
||||
<ul>
|
||||
<li><b>Facet Result</b>
|
||||
<ul>
|
||||
<li><b>Facet Request</b> - the request for which this result was obtained.</li>
|
||||
<li><b>Valid Descendants</b> - how many valid descendants were encountered
|
||||
over the set of matching documents (some of which might have been
|
||||
filtered out because e.g. only top 10 results were requested).</li>
|
||||
<li><b>Root Result Node</b> - root facet result for the request
|
||||
<ul>
|
||||
<li><b>Ordinal</b> - unique internal ID of the facet</li>
|
||||
<li><b>Label</b> - full label of the facet (possibly null)</li>
|
||||
<li><b>Value</b> - facet value, e.g. count</li>
|
||||
<li><b>Sub-results-nodes</b> - child result nodes (possibly null)</li>
|
||||
</ul></li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
<p>
|
||||
Note that there would not always be sub result nodes - this depends on the
|
||||
requested result mode:
|
||||
<ul>
|
||||
<li><b>PER_NODE_IN_TREE</b> - a tree, and so there may be sub results.</li>
|
||||
<li><b>GLOBAL_FLAT</b> - here the results tree would be rather flat, with only (at
|
||||
most) leaves below the root result node.</li>
|
||||
</ul>
|
||||
|
||||
<h1 class="section"><a name="indexed_facet_info">Indexed Facet Information</a></h1>
|
||||
<p>
|
||||
When indexing a document to which categories were added, information on these
|
||||
categories is added to the search index, in two locations:
|
||||
<ul>
|
||||
<li><i>Category Tokens</i> are added to the document for each category attached to
|
||||
that document. These categories can be used at search time for drill-down.</li>
|
||||
<li>A special <i>Category List Token</i> is added to each document containing
|
||||
information on all the categories that were added to this document. This can
|
||||
be used at search time for facet accumulation, e.g. facet counting.</li>
|
||||
</ul>
|
||||
<p>
|
||||
When a category is added to the index (that is, when a document containing a
|
||||
category is indexed), all its parent categories are added as well. For example, indexing
|
||||
a document with the category <code><<span style="color: blue">"author"</span>,
|
||||
<span style="color: blue">"American"</span>, <span style="color: blue">"Mark Twain"</span>></code> results in
|
||||
creating three tokens: <code>"/author"</code>, <code>"/author/American"</code>, and
|
||||
<code>"/author/American/Mark Twain"</code> (the character <code>'/'</code> here is just a human
|
||||
readable separator - there's no such element in the actual index). This allows drilling down
|
||||
and counting any category in the taxonomy, and not just leaf nodes, enabling a
|
||||
UI application to show either how many books have authors, or how many books
|
||||
have American authors, or how many books have Mark Twain as their (American)
|
||||
author.
|
||||
<p>
|
||||
Similarly, drill-down capabilities are in this way also possible for node categories.
|
||||
<p>
|
||||
In order to keep the counting list compact, it is built using category ordinal - an
|
||||
ordinal is an integer number attached to a category when it is added for the first time
|
||||
into the taxonomy.
|
||||
<p>
|
||||
For ways to further alter facet index see the section below on <a href="#indexing_params">Facet Indexing
|
||||
Parameters</a>.
|
||||
|
||||
<h1 class="section"><a name="taxonomy_index">Taxonomy Index</a></h1>
|
||||
<p>
|
||||
The taxonomy is an auxiliary data-structure maintained side-by-side with the regular
|
||||
index to support faceted search operations. It contains information about all the
|
||||
categories that ever existed in any document in the index. Its API is open and allows
|
||||
simple usage, or more advanced usage for interested users.
|
||||
<p>
|
||||
When a category is added to a document, a corresponding node is added to the
|
||||
taxonomy (unless already there). In fact, sometimes more than one node is added -
|
||||
each parent category is added as well, so that the taxonomy is maintained as a Tree,
|
||||
with a virtual root.
|
||||
<p>
|
||||
So, for the above example, adding the category <code><<span style="color: blue">"author"</span>,
|
||||
<span style="color: blue">"American"</span>, <span style="color: blue">"Mark Twain"</span>></code>
|
||||
actually added three nodes: one for <code>"/author"</code>, one for <code>"/author/American"</code> and one for
|
||||
<code>"/author/American/Mark Twain"</code>.
|
||||
<p>
|
||||
An integer number - called an ordinal - is attached to each category the first time the
|
||||
category is added to the taxonomy. This allows for a compact representation of
|
||||
category list tokens in the index, for facets accumulation.
|
||||
<p>
|
||||
One interesting fact about the taxonomy index is worth knowing: once a category
|
||||
is added to the taxonomy, it is never removed, even if all related documents are
|
||||
removed. This differs from a regular index, where if all documents containing a
|
||||
certain term are removed, and their segments are merged, the term will also be
|
||||
removed. This might cause a performance issue: large taxonomy means large ordinal
|
||||
numbers for categories, and hence large categories values arrays would be maintained
|
||||
during accumulation. It is probably not a real problem for most applications, but be
|
||||
aware of this. If, for example, an application at a certain point in time removes an
|
||||
index entirely in order to recreate it, or, if it removed all the documents from the index
|
||||
in order to re-populate it, it also makes sense in this opportunity to remove the
|
||||
taxonomy index and create a new, fresh one, without the unused categories.
|
||||
|
||||
<h1 class="section"><a name="facet_params">Facet Parameters</a></h1>
|
||||
<p>
|
||||
Facet parameters control how categories and facets are indexed and searched. Apart
|
||||
from specifying facet requests within facet search parameters, under default settings it
|
||||
is not required to provide any parameters, as there are ready to use working defaults
|
||||
for everything.
|
||||
<p>
|
||||
However many aspects are configurable and can be modified by providing altered
|
||||
facet parameters for either search or indexing.
|
||||
|
||||
<h2 class="subsection"><a name="indexing_params">Facet Indexing Parameters</a></h2>
|
||||
<p>
|
||||
Facet Indexing Parameters are consulted during indexing. Among several
|
||||
parameters it defines, the following two are likely to interest many applications:
|
||||
<ul>
|
||||
<li><b>Category list definitions</b> - in the index, facets are maintained in two
|
||||
forms: category-tokens (for drill-down) and category-list-tokens (for
|
||||
accumulation). This parameter allows to specify, for each category, the
|
||||
Lucene term used for maintaining the category-list-tokens for that category.
|
||||
The default implementation in <code>DefaultFacetIndexingParams</code> maintains
|
||||
this information for all categories under the same special dedicated term.
|
||||
One case where it is needed to maintain two categories in separate category
|
||||
lists, is when it is known that at search time it would be required to use
|
||||
different types of accumulation logic for each, but at the same accumulation
|
||||
call.</li>
|
||||
<li><b>Partition size</b> - category lists can be maintained in a partitioned way. If,
|
||||
for example, the partition size is set to 1000, a distinct sub-term is used for
|
||||
maintaining each 1000 categories, e.g. term1 for categories 0 to 999, term2
|
||||
for categories 1000 to 1999, etc. The default implementation in
|
||||
<code>DefaultFacetIndexingParams</code> maintains category lists in a single
|
||||
partition, hence it defines the partition size as <code>Integer.MAX_VALUE</code>. The
|
||||
importance of this parameter is on allowing to handle very large
|
||||
taxonomies without exhausting RAM resources. This is because at facet
|
||||
accumulation time, facet values arrays are maintained in the size of the
|
||||
partition. With a single partition, the size of these arrays equals the size of the
|
||||
taxonomy, which might be OK for most applications. Limited partition
|
||||
sizes allow to perform the accumulation with less RAM, but with some
|
||||
runtime overhead, as the matching documents are processed for each of the
|
||||
partitions.</li>
|
||||
</ul>
|
||||
<p>
|
||||
See the API Javadocs of <code>FacetIndexingParams</code> for additional configuration
|
||||
capabilities which were not discussed here.
|
||||
|
||||
<h2 class="subsection"><a name="search_params">Facet Search Parameters</a></h2>
|
||||
<p>
|
||||
Facet Search Parameters, consulted at search time (during facets accumulation) are
|
||||
rather plain, providing the following:
|
||||
<ul>
|
||||
<li><b>Facet indexing parameters</b> - which were in effect at indexing time -
|
||||
allowing facets accumulation to understand how facets are maintained in
|
||||
the index.</li>
|
||||
<li><b>Container of facet requests</b> - the requests which should be accumulated.</li>
|
||||
</ul>
|
||||
|
||||
<h2 class="subsection"><a name="category_lists_multiple_dimensions">Category Lists, Multiple Dimensions</a></h2>
|
||||
<p>
|
||||
Category list parameters which are accessible through the facet indexing parameters
|
||||
provide the information about:
|
||||
<ul>
|
||||
<li>Lucene Term under which category information is maintained in the index.</li>
|
||||
<li>Encoding (and decoding) used for writing and reading the categories
|
||||
information in the index.</li>
|
||||
</ul>
|
||||
<p>
|
||||
For cases when certain categories should be maintained in a different location than
|
||||
others, use <code>PerDimensionIndexingParams</code>, which returns a different
|
||||
<code>CategoryListParams</code> object for each <i>dimension</i>. This is a good opportunity to
|
||||
explain about dimensions. This is just a notion: the top element - or first element - in
|
||||
a category path is denoted as the dimension of that category. Indeed, the dimension
|
||||
stands out as a top important part of the category path, such as <code>"Location"</code> for the
|
||||
category <code>"Location/Europe/France/Paris"</code>.
|
||||
|
||||
<h1 class="section"><a name="advanced">Advanced Faceted Examples</a></h1>
|
||||
<p>
|
||||
We now provide examples for more advanced facet indexing and search, such as
|
||||
drilling-down on facet values and multiple category lists.
|
||||
|
||||
<h2 class="subsection"><a name="drill_down">Drill-Down with Regular Facets</a></h2>
|
||||
<p>
|
||||
Drill-down allows users to focus on part of the results. Assume a commercial sport
|
||||
equipment site where a user is searching for a tennis racquet. The user issues the
|
||||
query <i>tennis racquet</i> and as a result is shown a page with 10 tennis racquets, by
|
||||
various providers, of various types and prices. In addition, the site UI shows to the
|
||||
user a break down of all available racquets by price and make. The user now decides
|
||||
to focus on racquets made by <i>Head</i>, and will now be shown a new page, with 10
|
||||
Head racquets, and a new breakdown of the results into racquet types and prices.
|
||||
Additionally, the application can choose to display a new breakdown, by racquet
|
||||
weights. This step of moving from results (and facet statistics) of the entire (or larger)
|
||||
data set into a portion of it by specifying a certain category, is what we call <i>Drilldown</i>.
|
||||
We now show the required code lines for implementing such a drill-down.
|
||||
<pre class="prettyprint lang-java linenums">
|
||||
Query baseQuery = queryParser.parse("tennis racquet");
|
||||
Query q2 = DrillDown.query(baseQuery, new CategoryPath("make", "head"));
|
||||
</pre>
|
||||
<p>
|
||||
In line 1 the original user query is created and then used to obtain information on
|
||||
all tennis racquets.
|
||||
<p>
|
||||
In line 2, a specific category from within the facet results was selected by the user,
|
||||
and is hence used for creating the drill-down query.
|
||||
<p>
|
||||
Please refer to <code>SimpleSearcher.searchWithDrillDown()</code> for a more detailed
|
||||
code example performing drill-down.
|
||||
|
||||
<h2 class="subsection"><a name="multi-category_list">Multiple Category Lists</a></h2>
|
||||
<p>
|
||||
The default is to maintain all categories information in a single list. While this will
|
||||
suit most applications, in some situations an application may wish to use multiple
|
||||
category lists, for example, when the distribution of some category values is different
|
||||
than that of other categories and calls for using a different encoding, more efficient
|
||||
for the specific distribution. Another example is when most facets are rarely used
|
||||
while some facets are used very heavily, so an application may opt to maintain the
|
||||
latter in memory - and in order to keep memory footprint lower it is useful to
|
||||
maintain only those heavily used facets in a separate category list.
|
||||
<p>
|
||||
First we define indexing parameters with multiple category lists:
|
||||
<pre class="prettyprint lang-java linenums">
|
||||
PerDimensionIndexingParams iParams = new PerDimensionIndexingParams();
|
||||
iParams.addCategoryListParams(new CategoryPath("Author"),
|
||||
new CategoryListParams(new Term("$RarelyUsed", "Facets")));
|
||||
iParams.addCategoryListParams(new CategoryPath("Language"),
|
||||
new CategoryListParams(new Term("$HeavilyUsed", "Ones")));
|
||||
</pre>
|
||||
<p>
|
||||
This will cause the Language categories to be maintained in one category list, and
|
||||
Author facets to be maintained in another category list. Note that any other category,
|
||||
if encountered, will still be maintained in the default category list.
|
||||
<p>
|
||||
These non-default indexing parameters should now be used both at indexing and
|
||||
search time. As depicted below, at indexing time this is done when creating the
|
||||
category document builder, while at search time this is done when creating the search
|
||||
parameters. Other than that the faceted search code is unmodified.
|
||||
<pre class="prettyprint lang-java linenums">
|
||||
DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo, iParams);
|
||||
...
|
||||
FacetSearchParams facetSearchParams = new FacetSearchParams(iParams);
|
||||
</pre>
|
||||
<p>
|
||||
A complete simple example can be found in package <code>org.apache.lucene.facet.example.multiCL</code>
|
||||
under the example code.
|
||||
|
||||
<h1 class="section"><a name="optimizations">Optimizations</a></h1>
|
||||
<p>
|
||||
Faceted search through a large collection of documents with large numbers of facets
|
||||
altogether and/or large numbers of facets per document is challenging performance
|
||||
wise, either in CPU, RAM, or both. A few ready to use optimizations exist to tackle
|
||||
these challenges.
|
||||
|
||||
<h2 class="subsection"><a name="sampling">Sampling</a></h2>
|
||||
<p>
|
||||
Facet sampling allows to accumulate facets over a sample of the matching
|
||||
documents set. In many cases, once top facets are found over the sample set, exact
|
||||
accumulations are computed for those facets only, this time over the entire matching
|
||||
document set.
|
||||
<p>
|
||||
Two kinds of sampling exist: complete support and wrapping support. The
|
||||
complete support is through <code>SamplingAccumulator</code> and is tied to an extension of the
|
||||
<code>StandardFacetsAccumulator</code> and has the benefit of automatically applying other
|
||||
optimizations, such as <a href="#complements">Complements</a>. The wrapping support is through
|
||||
<code>SamplingWrapper</code> and can wrap any accumulator, and as such, provides more
|
||||
freedom for applications.
|
||||
|
||||
<h2 class="subsection"><a name="complements">Complements</a></h2>
|
||||
<p>
|
||||
When accumulating facets over a very large matching documents set, possibly
|
||||
almost as large as the entire collection, it is possible to speed up accumulation by
|
||||
looking at the complement set of documents, and then obtaining the actual results by
|
||||
subtracting from the total results. It should be noted that this is available only for
|
||||
count requests, and that the first invocation that involves this optimization might take
|
||||
longer because the total counts have to be computed.
|
||||
<p>
|
||||
This optimization is applied automatically by <code>StandardFacetsAccumulator</code>.
|
||||
|
||||
<h2 class="subsection"><a name="partitions">Partitions</a></h2>
|
||||
<p>
|
||||
Partitions are also discussed in the section about <a href="#indexing_params">Facet Indexing parameters.</a>
|
||||
<p>
|
||||
Facets are internally accumulated by first accumulating all facets and later on
|
||||
extracting the results for the requested facets. During this process, accumulation
|
||||
arrays are maintained in the size of the taxonomy. For a very large taxonomy, with
|
||||
multiple simultaneous faceted search operations, this might lead to excessive memory
|
||||
footprint. Partitioning the faceted information allows to relax the memory usage, by
|
||||
maintaining the category lists in several partitions, and by processing one partition at
|
||||
a time. This is automatically done by <code>StandardFacetsAccumulator</code>. However the
|
||||
default partition size is <code>Integer.MAX_VALUE</code>, practically setting to a single partition,
|
||||
i.e. no partitions at all.
|
||||
<p>
|
||||
The decision to override this behavior and use multiple partitions must be taken at
|
||||
indexing time. Once the index is created and already contains category lists it is too
|
||||
late to modify this.
|
||||
<p>
|
||||
See <code>FacetIndexingParams.getPartitionSize()</code> for API to alter this default
|
||||
behavior.
|
||||
|
||||
<h1 class="section"><a name="concurrent_indexing_search">Concurrent Indexing and Search</a></h1>
|
||||
<p>
|
||||
Sometimes, indexing is done once, and when the index is fully prepared, searching
|
||||
starts. However, in most real applications indexing is <i>incremental</i> (new data comes in
|
||||
once in a while, and needs to be indexed), and indexing often needs to happen while
|
||||
searching is continuing at full steam.
|
||||
<p>
|
||||
Luckily, Lucene supports multiprocessing - one process writing to an index while
|
||||
another is reading from it. One of the key insights behind how Lucene allows multiprocessing
|
||||
is <i>Point In Time</i> semantics. The idea is that when an <code>IndexReader</code> is opened,
|
||||
it gets a view of the index at the <i>point in time</i> it was opened. If an <code>IndexWriter</code>
|
||||
in a different process or thread modifies the index, the reader does not know about it until a new
|
||||
<code>IndexReader</code> is opened (or the reopen() method of an existing <code>IndexReader</code> is called).
|
||||
<p>
|
||||
In faceted search, we complicate things somewhat by adding a second index - the
|
||||
taxonomy index. The taxonomy API also follows point-in-time semantics, but this is
|
||||
not quite enough. Some attention must be paid by the user to keep those two indexes
|
||||
consistently in sync:
|
||||
<p>
|
||||
The main index refers to category numbers defined in the taxonomy index.
|
||||
Therefore, it is important that we open the <code>TaxonomyReader</code> <i>after</i> opening the
|
||||
IndexReader. Moreover, every time an IndexReader is reopen()ed, the
|
||||
TaxonomyReader needs to be refresh()'ed as well.
|
||||
<p>
|
||||
But there is one extra caution: whenever the application deems it has written
|
||||
enough information worthy of a commit, it must <b>first</b> call commit() for the
|
||||
<code>TaxonomyWriter</code> and only <b>after</b> that call commit() for the <code>IndexWriter</code>.
|
||||
Closing the indices should also be done in this order - <b>first</b> close the taxonomy, and only <b>after</b>
|
||||
that close the index.
|
||||
<p>
|
||||
To summarize, if you're writing a faceted search application where searching and
|
||||
indexing happen concurrently, please follow these guidelines (in addition to the usual
|
||||
guidelines on how to use Lucene correctly in the concurrent case):
|
||||
<ul>
|
||||
<li>In the indexing process:
|
||||
<ol>
|
||||
<li>Before a writer commit()s the IndexWriter, it must commit() the
|
||||
TaxonomyWriter. Nothing should be added to the index between these
|
||||
two commit()s.</li>
|
||||
<li>Similarly, before a writer close()s the IndexWriter, it must close() the
|
||||
TaxonomyWriter.</li>
|
||||
</ol></li>
|
||||
<li>In the searching process:
|
||||
<ol>
|
||||
<li>Open the IndexReader first, and then the TaxonomyReader.</li>
|
||||
<li>After a reopen() on the IndexReader, refresh() the TaxonomyReader.
|
||||
No search should be performed on the new IndexReader until refresh()
|
||||
has finished.</li>
|
||||
</ol></li>
|
||||
</ul>
|
||||
<p>
|
||||
Note that the above discussion assumes that the underlying file-system on which
|
||||
the index and the taxonomy are stored respects ordering: if index A is written before
|
||||
index B, then any reader finding a modified index B will also see a modified index A.
|
||||
<p>
|
||||
<b>Note:</b> <code>TaxonomyReader</code>'s refresh() is simpler than <code>IndexReader</code>'s reopen().
|
||||
While the latter keeps both the old and new reader open, the former keeps only the new reader. The reason
|
||||
is that a new <code>IndexReader</code> might have modified old information (old documents deleted, for
|
||||
example) so a thread which is in the middle of a search needs to continue using the old information. With
|
||||
<code>TaxonomyReader</code>, however, we are guaranteed that existing categories are never deleted or modified -
|
||||
the only thing that can happen is that new categories are added. Since search threads do not care if new categories
|
||||
are added in the middle of a search, there is no reason to keep around the old object, and the new one suffices.
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -62,10 +62,10 @@ public class TaxonomyMergeUtils {
|
|||
* Merges the given taxonomy and index directories and commits the changes to
|
||||
* the given writers. This method uses {@link MemoryOrdinalMap} to store the
|
||||
* mapped ordinals. If you cannot afford the memory, you can use
|
||||
* {@link #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)}
|
||||
* {@link #merge(Directory, Directory, LuceneTaxonomyWriter.OrdinalMap, IndexWriter, LuceneTaxonomyWriter)}
|
||||
* by passing {@link DiskOrdinalMap}.
|
||||
*
|
||||
* @see #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)
|
||||
* @see #merge(Directory, Directory, LuceneTaxonomyWriter.OrdinalMap, IndexWriter, LuceneTaxonomyWriter)
|
||||
*/
|
||||
public static void merge(Directory srcIndexDir, Directory srcTaxDir,
|
||||
IndexWriter destIndexWriter,
|
||||
|
|
|
@ -3,6 +3,6 @@
|
|||
<title>Faceted Indexing and Search</title>
|
||||
</head>
|
||||
<body>
|
||||
Provides faceted indexing and search capabilities.
|
||||
Provides faceted indexing and search capabilities. The <a href="../../../../userguide.html">userguide</a> is recommended for a start.
|
||||
</body>
|
||||
</html>
|
|
@ -21,6 +21,6 @@
|
|||
</title>
|
||||
</head>
|
||||
<body>
|
||||
facet
|
||||
Provides faceted indexing and search capabilities (check out the <a href="userguide.html">userguide</a>).
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue