LUCENE-3261: Faceting module userguide

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1175341 13f79535-47bb-0310-9956-ffa450edef68
2011-09-25 09:20:59 +00:00 · 2011-09-25 09:20:59 +00:00 · 151c26e7d9
parent d48960ae05
commit 151c26e7d9
5 changed files with 815 additions and 5 deletions
--- a/modules/facet/build.xml
+++ b/modules/facet/build.xml
@ -29,6 +29,14 @@
  <property name="build.dir" location="build/" />
  <property name="dist.dir" location="dist/" />

+  <property name="examples.dir" location="src/examples"/>
+
+  <path id="classpath">
+    <path refid="base.classpath" />
+    <pathelement location="${build.dir}/classes/java" />
+    <pathelement location="${build.dir}/classes/examples" />
+  </path>
+    
  <path id="examples.classpath">
    <path refid="classpath" />
    <pathelement location="${build.dir}/classes/java" />
@ -49,7 +57,7 @@
  </path>

  <target name="compile-examples" description="Compiles Facets examples">
-    <compile srcdir="src/examples" destdir="${build.dir}/classes/examples">
+    <compile srcdir="${examples.dir}" destdir="${build.dir}/classes/examples">
      <classpath refid="examples.classpath" />
    </compile>
  </target>
@ -64,4 +72,22 @@

  <target name="jar-core" depends="common.jar-core,jar-examples" />

+  <target name="javadocs" depends="compile-core">
+	<sequential>
+      <mkdir dir="${javadoc.dir}/contrib-${name}"/>
+      <copy todir="${javadoc.dir}/contrib-${name}" file="docs/userguide.html" />
+      <!-- javadoc core classes -->
+      <invoke-javadoc
+        destdir="${javadoc.dir}/contrib-${name}"
+        title="${Name} ${version} contrib-${name} API">
+        <sources>
+          <link href=""/>
+          <packageset dir="${src.dir}"/>
+          <packageset dir="${examples.dir}"/>
+        </sources>
+      </invoke-javadoc>
+      <jarify basedir="${javadoc.dir}/contrib-${name}" destfile="${build.dir}/${final.name}-javadoc.jar"/>
+    </sequential>
+  </target>
+    
 </project>
--- a/modules/facet/docs/userguide.html
+++ b/modules/facet/docs/userguide.html
@ -0,0 +1,784 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<title>Facet Userguide</title>
+
+<!-- load stylesheet + javascript in development mode -->
+<link rel="stylesheet" type="text/css" href="../../../src/tools/prettify/prettify.css">
+<script src="../../../src/tools/prettify/prettify.js" type="text/javascript"></script>
+
+<!-- load stylesheet + javascript in distribution mode -->
+<link rel="stylesheet" type="text/css" href="prettify.css">
+<script src="prettify.js" type="text/javascript"></script>
+
+<script language="javascript">
+	window.onload=function() {
+		prettyPrint();
+	}
+</script>
+
+<style>
+body {
+  margin-left: 20%;
+  width: 60%;
+  counter-reset: section;
+  text-align: left;
+}
+
+h1.title {
+  text-align: center;
+  margin-top: 30px;
+  font-size: 5em;
+  line-height: 150%;
+}
+
+h1.section {
+  margin-top: 50px;
+  font-size: 2.5em;
+  counter-reset: subsection;
+  border: 1px solid black;
+  background-color: #D8D8D8;
+  padding-left: 5px;
+}
+
+h2.subsection {
+  font-size: 2em;
+  border: 1px solid black;
+  background-color: #D8D8D8;
+  padding-left: 5px;
+}
+
+/* auto-generated heading numbers */
+h1.section:before {
+counter-increment: section;
+content: counter(section) ". ";
+}
+
+h2.subsection:before  {
+counter-increment: subsection;
+content: counter(section) "." counter(subsection) " ";
+}
+
+/* override from prettify.css - add shadow, padding etc. */
+pre.prettyprint {
+  margin-left: 2%;
+  width: 80%;
+  padding: 5px 3px 5px 3px;
+  /* shadow */
+  -moz-box-shadow: 5px 5px 2px #888;
+  -webkit-box-shadow: 5px 5px 2px #888;
+  box-shadow: 5px 5px 2px #888;
+}
+
+/* override from prettify.css - make keywords appear in bold */
+span.kwd {
+  font-weight: bold;
+}
+
+ol.toc a {
+  text-decoration: none;
+  color: blue;
+}
+
+li.toc_first {
+  /* margin-top: 10px; */
+  font-size: 16px;
+  color: blue;
+}
+
+li.toc_second {
+  font-size: 14px;
+  margin-left: 15px;
+  color: blue;
+}
+
+/* reset style from prettify.css, so that line numbers appear in each line */
+li.L0,li.L1,li.L2,li.L3,li.L5,li.L6,li.L7,li.L8 {
+  list-style-type:decimal
+}
+
+table.code_description td {
+  vertical-align: top;
+}
+
+</style>
+
+<body>
+<h1 class="title">
+	Apache Lucene<br>
+	Faceted Search<br>
+	User's Guide</h1>
+
+<div class="toc">
+<h1 class="toc">Table of Contents</h1>
+<ol class="toc">
+<li class="toc_first"><a href="#intro">Introduction</a></li>
+<li class="toc_first"><a href="#facet_features">Facet Features</a></li>
+<li class="toc_first"><a href="#facet_indexing">Indexing Categories Illustrated</a></li>
+<li class="toc_first"><a href="#facet_accumulation">Accumulating Facets Illustrated</a></li>
+<li class="toc_first"><a href="#indexed_facet_info">Indexed Facet Information</a></li>
+<li class="toc_first"><a href="#taxonomy_index">Taxonomy Index</a></li>
+<li class="toc_first"><a href="#facet_params">Facet Parameters</a></li>
+<li class="toc_first"><a href="#advanced">Advanced Faceted Examples</a></li>
+<li class="toc_first"><a href="#optimizations">Optimizations</a></li>
+<li class="toc_first"><a href="#concurrent_indexing_search">Concurrent Indexing and Search</a></li>
+</ol>
+
+<h1 class="section"><a name="intro">Introduction</a></h1>
+<p>
+A category is an aspect of indexed documents which can be used to classify the
+documents. For example, in a collection of books at an online bookstore, categories of
+a book can be its price, author, publication date, binding type, and so on.
+<p>
+In faceted search, in addition to the standard set of search results, we also get facet
+results, which are lists of subcategories for certain categories. For example, for the
+price facet, we get a list of relevant price ranges; for the author facet, we get a list of
+relevant authors; and so on. In most UIs, when users click one of these subcategories,
+the search is narrowed, or drilled down, and a new search limited to this subcategory
+(e.g., to a specific price range or author) is performed.
+<p>
+Note that faceted search is more than just the ordinary fielded search. In fielded
+search, users can add search keywords like price:10 or author:"Mark
+Twain" to the query to narrow the search, but this requires knowledge of which
+fields are available, and which values are worth trying. This is where faceted search
+comes in: it provides a list of useful subcategories, which ensures that the user only
+drills down into useful subcategories and never into a category for which there are no
+results. In essence, faceted search makes it easy to navigate through the search results.
+The list of subcategories provided for each facet is also useful to the user in itself,
+even when the user never drills down. This list allows the user to see at one glance
+some statistics on the search results, e.g., what price ranges and which authors are
+most relevant to the given query.
+<p>
+In recent years, faceted search has become a very common UI feature in search
+engines, especially in e-commerce websites. Faceted search makes it easy for
+untrained users to find the specific item they are interested in, whereas manually
+adding search keywords (as in the examples above) proved too cumbersome for
+ordinary users, and required too much guesswork, trial-and-error, or the reading of
+lengthy help pages.
+<p>
+See <a href="http://en.wikipedia.org/wiki/Faceted_search">http://en.wikipedia.org/wiki/Faceted_search</a> for more information on faceted
+search.
+
+<h1 class="section"><a name="facet_features">Facet Features</a></h1>
+The first and main faceted search capability that comes to mind is counting, but in fact
+faceted search is more than facet counting. We now briefly discuss the available
+faceted search features.
+
+<h2 class="subsection">Facet Counting</h2>
+<p>
+Which of the available subcategories of a facet should a UI display? A query in a
+book store might yield books by a hundred different authors, but normally we'd want
+to display only, say, ten of those.
+<p>
+Most available faceted search implementations use counts to determine the
+importance of each subcategory. These implementations go over all search results for
+the given query, and count how many results are in each subcategory. Finally, the
+subcategories with the most results can be displayed. So the user sees the price ranges,
+authors, and so on, for which there are most results. Often, the count is displayed next
+to the subcategory name, in parentheses, telling the user how many results he can
+expect to see if he drills down into this subcategory.
+<p>
+The main API for obtaining facet counting is <code>CountFacetRequest</code>, as in the
+following code snippet:
+<pre class="prettyprint lang-java">
+new CountFacetRequest(new CategoryPath("author"), 10);
+</pre>
+A detailed code example using count facet requests is shown below - see
+<a href="#facet_accumulation">Accumulating Facets</a>.
+
+<h2 class="subsection"><a name="facet_association">Facet Associations</a></h2>
+<p>
+So far we've discussed categories as binary features, where a document either belongs
+to a category, or not.
+<p>
+While counts are useful in most situations, they are sometimes not sufficiently
+informative for the user, with respect to deciding which subcategory is more
+important to display.
+<p>
+For this, the facets package allows to associate a value with a category. The search
+time interpretation of the associated value is application dependent. For example, a
+possible interpretation is as a <i>match level</i> (e.g., confidence level). This value can
+then be used so that a document that is very weakly associated with a certain category
+will only contribute little to this category's aggregated weight.
+
+<h2 class="subsection"><a name="multiple_requests">Multiple Facet Requests</a></h2>
+<p>
+A single faceted accumulation is capable of servicing multiple facet requests.
+Programmatically, this is quite simple - wrap all the facet requests of interest into the
+facet-search-parameters which are passed to a facets accumulator/collector (more on
+these objects below). The results would be comprised of as many facet results as there
+were facet requests.
+<p>
+However there is a delicate <b>limitation</b>: all facets maintained in the same location in
+the index are required to be treated the same. See the section on <a href="#indexing_params">Indexing Parameters</a>
+for an explanation on maintaining certain facets at certain locations.
+
+<h2 class="subsection"><a name="facet_labels">Facet Labels at Search Time</a></h2>
+<p>
+Facets results always contain the facet (internal) ID and (accumulated) value. Some of
+the results also contain the facet label, AKA the category name. We mention this here
+since computing the label is a time consuming task, and hence applications can
+specify with a facet request to return top 1000 facets but to compute the label only for
+the top 10 facets. In order to compute labels for more of the facet results it is not
+required to perform accumulation again.
+<p>
+See <code>FacetRequest.getNumResults()</code>, <code>FacetRequest.getNumLabel()</code> and
+<code>FacetResultNode.getLabel(TaxonomyReader)</code>.
+
+<h1 class="section"><a name="facet_indexing">Indexing Categories Illustrated</a></h1>
+<p>
+In order to find facets at search time they must first be added to the index at indexing
+time. Recall that Lucene documents are made of fields for textual search. The addition
+of categories is performed by an appropriate <code>DocumentBuilder</code> - or
+<code>CategoryDocumentBuilder</code> in our case.
+<p>
+Indexing therefore usually goes like this:
+<ul>
+<li>For each input document:
+<ul>
+<li>Create a fresh (empty) Lucene Document</li>
+<li>Parse input text and add appropriate text search fields</li>
+<li><b>Gather all input categories associated with the document and create
+a CategoryDocumentBuilder with the list of categories</b></li>
+<li><b>Build the document - this actually adds the categories to the
+Lucene document.</b></li>
+<li>Add the document to the index</li>
+</ul></li>
+</ul>
+Following is a code snippet for indexing categories. The complete example can be
+found in class <code>org.apache.lucene.facet.example.simple.SimpleIndexer</code>.
+<pre class="prettyprint lang-java linenums">
+IndexWriter writer = ...
+TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);
+...
+Document doc = new Document();
+doc.add(new Field("title", titleText, Store.YES, Index.ANALYZED));
+...
+List&lt;CategoryPath&gt; categories = new ArrayList&lt;CategoryPath&gt;();
+categories.add(new CategoryPath("author", "Mark Twain"));
+categories.add(new CategoryPath("year", "2010"));
+...
+DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo);
+categoryDocBuilder.setCategoryPaths(categories);
+categoryDocBuilder.build(doc);
+writer.addDocument(doc);
+</pre>
+<p>
+We now explain the steps above, following the code line numbers:
+<table class="code_description">
+<tr>
+	<td>(4)</td>
+	<td>Document contains not only text search fields but also facet search
+information.</td>
+</tr>
+<tr>
+	<td>(7)</td>
+	<td>Prepare a container for document categories.</td>
+</tr>
+<tr>
+	<td>(8)</td>
+	<td>Categories that should be added to the document are accumulated in the
+categories list.</td>
+</tr>
+<tr>
+	<td>(11)</td>
+	<td>A <code>CategoryDocumentBuilder</code> is created, set with the appropriate list
+of categories, and invoked to build - that is, to populate the document
+with categories. It is in this step that the taxonomy is updated to contain the
+newly added categories (if not already there) - see more on this in the
+section about the <a href="#taxonomy_index">Taxonomy Index</a> below. This line could be made more
+compact: one can create a single <code>CategoryDocumentBuilder cBuilder</code> and reuse it like this:
+<pre class="prettyprint lang-java linenums">
+DocumentBuilder cBuilder = new CategoryDocumentBuilder(taxo);
+cBuilder.setCategoryPaths(categories).build(doc);
+</pre>
+	</td>
+</tr>
+<tr>
+	<td>(14)</td>
+	<td>Add the document to the index. As a result, category info is saved also in
+the regular search index, for supporting facet aggregation at search time
+(e.g. facet counting) as well as facet drill-down. For more information on
+indexed facet information see below the section <a href="#indexed_facet_info">Indexed Facet Information</a>.</td>
+</tr>
+</table>
+
+<h1 class="section"><a name="facet_accumulation">Accumulating Facets Illustrated</a></h1>
+<p>
+Facets accumulation reflects a set of documents over some facet requests:
+<ul>
+<li><code>Document set</code> - a subset of the index documents, usually documents
+matching a user query.</li>
+<li><code>Facet requests</code> - facet accumulation specification, e.g. count a certain facet
+<i>dimension</i>.</li>
+</ul>
+<p>
+<code>FacetRequest</code> is a basic component in faceted search - it describes the facet
+information need. Every facet <b>request</b> is made of at least two fields:
+<ul>
+<li><code>CategoryPath</code> - root category of the facet request. The categories that
+are returned as a result of the request will all be descendants of this root</li>
+<li><code>Number of Results</code> - number of sub-categories to return (at most).</li>
+</ul>
+<p>
+There are other parameters to a facet request, such as -how many facet results to
+label-, -how <b>deep</b> to go from the request root when serving the facet request- and
+more - see the API Javadocs for <code>FacetRequest</code> and its subclasses for more
+information on these parameters. For labels in particular, see the section <a href="#facet_labels">Facet Labels
+at Search Time</a>.
+<p>
+<code>FacetRequest</code> is an abstract class, open for extensions, and users may add their
+own requests. The most often used request is <code>CountFacetRequest</code> - used for
+counting facets.
+<p>
+Facets accumulation is - not surprisingly - driven by a <code>FacetsAccumulator</code>. The
+most used one is <code>StandardFacetsAccumulator</code>, however there are also accumulators
+that support sampling - to be used in huge collections, and there's an adaptive facets
+accumulator which applies sampling conditionally on the statistics of the data. While
+facets accumulators are very extendible and powerful, they might be too
+overwhelming for beginners. For this reason, the code offers a higher level interface
+for facets accumulating: the <code>FacetsCollector</code>. It extends <code>Collector</code>, and as such
+can be passed to the search() method of Lucene's <code>IndexSearcher</code>. In case the
+application also needs to collect documents (in addition to accumulating/collecting
+facets), it can wrap multiple collectors with <code>MultiCollector</code>. Most code samples
+below use <code>FacetsCollector</code> due to its simple interface. It is quite likely that
+<code>FacetsCollector</code> should suffice the needs of most applications, therefore we
+recommend to start with it, and only when needing more flexibility turn to directly
+use facets accumulators.
+<p>
+Following is a code snippet from the example code - the complete example can be
+found under <code>org.apache.lucene.facet.example.simple.SimpleSearcher</code>:
+<pre class="prettyprint lang-java linenums">
+IndexReader indexReader = IndexReader.open(indexDir);
+Searcher searcher = new IndexSearcher(indexReader);
+TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir);
+...
+Query q = new TermQuery(new Term(SimpleUtils.TEXT, "white"));
+TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true);
+...
+FacetSearchParams facetSearchParams = new FacetSearchParams();
+facetSearchParams.addFacetRequest(new CountFacetRequest(
+    new CategoryPath("author"), 10));
+...
+FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, indexReader, taxo);
+searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector));
+List&lt;FacetResult&gt; res = facetsCollector.getFacetResults();
+</pre>
+<p>
+We now explain the steps above, following the code line numbers:
+<table class="code_description">
+<tr>
+	<td>(1)</td>
+	<td>Index reader and Searcher are initialized as usual.</td>
+</tr>
+<tr>
+	<td>(3)</td>
+	<td>A taxonomy reader is opened - it provides access to the facet information
+which was stored by the Taxonomy Writer at indexing time.</td>
+</tr>
+<tr>
+	<td>(5)</td>
+	<td>Regular text query is created to find the documents matching user need, and
+a collector for collecting the top matching documents is created.</td>
+</tr>
+<tr>
+	<td>(8)</td>
+	<td>Facet-search-params is a container for facet requests.</td>
+</tr>
+<tr>
+	<td>(10)</td>
+	<td>A single facet-request - namely a count facet request - is created and added
+to the facet search params. The request should return top 10 Author
+subcategory counts.</td>
+</tr>
+<tr>
+	<td>(12)</td>
+	<td>Facets-Collector is the simplest interface for facets accumulation (counting
+in this example).</td>
+</tr>
+<tr>
+	<td>(13)</td>
+	<td>Lucene search takes both collectors - facets-collector and top-docs-collector,
+both wrapped by a multi-collector. This way, a single search
+operation finds both top documents and top facets. Note however that facets
+aggregation takes place not only over the top documents, but rather over all
+documents matching the query.</td>
+</tr>
+<tr>
+	<td>(14)</td>
+	<td>Once search completes, facet-results can be obtained from the facets-collector.</td>
+</tr>
+</table>
+
+<p>
+Returned facet results are organized in a list, conveniently ordered the same as the
+facet-requests in the facet-search-params. Each result however contains the request
+for which it was created.
+<p>
+Here is the (recursive) structure of the facet result:
+<ul>
+<li><b>Facet Result</b>
+<ul>
+<li><b>Facet Request</b> - the request for which this result was obtained.</li>
+<li><b>Valid Descendants</b> - how many valid descendants were encountered
+over the set of matching documents (some of which might have been
+filtered out because e.g. only top 10 results were requested).</li>
+<li><b>Root Result Node</b> - root facet result for the request
+<ul>
+<li><b>Ordinal</b> - unique internal ID of the facet</li>
+<li><b>Label</b> - full label of the facet (possibly null)</li>
+<li><b>Value</b> - facet value, e.g. count</li>
+<li><b>Sub-results-nodes</b> - child result nodes (possibly null)</li>
+</ul></li>
+</ul></li>
+</ul>
+<p>
+Note that sub result nodes are not always present - this depends on the
+requested result mode:
+<ul>
+<li><b>PER_NODE_IN_TREE</b> - a tree, and so there may be sub results.</li>
+<li><b>GLOBAL_FLAT</b> - here the results tree would be rather flat, with only (at
+most) leaves below the root result node.</li>
+</ul>
+
+<h1 class="section"><a name="indexed_facet_info">Indexed Facet Information</a></h1>
+<p>
+When indexing a document to which categories were added, information on these
+categories is added to the search index, in two locations:
+<ul>
+<li><i>Category Tokens</i> are added to the document for each category attached to
+that document. These categories can be used at search time for drill-down.</li>
+<li>A special <i>Category List Token</i> is added to each document containing
+information on all the categories that were added to this document. This can
+be used at search time for facet accumulation, e.g. facet counting.</li>
+</ul>
+<p>
+When a category is added to the index (that is, when a document containing a
+category is indexed), all its parent categories are added as well. For example, indexing
+a document with the category <code>&lt;<span style="color: blue">"author"</span>, 
+<span style="color: blue">"American"</span>, <span style="color: blue">"Mark Twain"</span>&gt;</code> results in
+creating three tokens: <code>"/author"</code>, <code>"/author/American"</code>, and
+<code>"/author/American/Mark Twain"</code> (the character <code>'/'</code> here is just a human
+readable separator - there's no such element in the actual index). This allows drilling down
+and counting any category in the taxonomy, and not just leaf nodes, enabling a
+UI application to show either how many books have authors, or how many books
+have American authors, or how many books have Mark Twain as their (American)
+author.
+<p>
+Similarly, Drill-down capabilities are this way possible also for node categories.
+<p>
+In order to keep the counting list compact, it is built using category ordinal - an
+ordinal is an integer number attached to a category when it is added for the first time
+into the taxonomy.
+<p>
+For ways to further alter facet index see the section below on <a href="#indexing_params">Facet Indexing
+Parameters</a>.
+
+<h1 class="section"><a name="taxonomy_index">Taxonomy Index</a></h1>
+<p>
+The taxonomy is an auxiliary data-structure maintained side-by-side with the regular
+index to support faceted search operations. It contains information about all the
+categories that ever existed in any document in the index. Its API is open and allows
+simple usage, or more advanced for the interested users.
+<p>
+When a category is added to a document, a corresponding node is added to the
+taxonomy (unless already there). In fact, sometimes more than one node is added -
+each parent category is added as well, so that the taxonomy is maintained as a Tree,
+with a virtual root.
+<p>
+So, for the above example, adding the category <code>&lt;<span style="color: blue">"author"</span>, 
+<span style="color: blue">"American"</span>, <span style="color: blue">"Mark Twain"</span>&gt;</code> 
+actually added three nodes: one for <code>"/author"</code>, one for <code>"/author/American"</code> and one for 
+<code>"/author/American/Mark Twain"</code>.
+<p>
+An integer number - called ordinal - is attached to each category the first time the
+category is added to the taxonomy. This allows for a compact representation of
+category list tokens in the index, for facets accumulation.
+<p>
+One interesting fact about the taxonomy index is worth knowing: once a category
+is added to the taxonomy, it is never removed, even if all related documents are
+removed. This differs from a regular index, where if all documents containing a
+certain term are removed, and their segments are merged, the term will also be
+removed. This might cause a performance issue: large taxonomy means large ordinal
+numbers for categories, and hence large categories values arrays would be maintained
+during accumulation. It is probably not a real problem for most applications, but be
+aware of this. If, for example, an application at a certain point in time removes an
+index entirely in order to recreate it, or, if it removed all the documents from the index
+in order to re-populate it, it also makes sense in this opportunity to remove the
+taxonomy index and create a new, fresh one, without the unused categories.
+
+<h1 class="section"><a name="facet_params">Facet Parameters</a></h1>
+<p>
+Facet parameters control how categories and facets are indexed and searched. Apart
+from specifying facet requests within facet search parameters, under default settings it
+is not required to provide any parameters, as there are ready to use working defaults
+for everything.
+<p>
+However many aspects are configurable and can be modified by providing altered
+facet parameters for either search or indexing.
+
+<h2 class="subsection"><a name="indexing_params">Facet Indexing Parameters</a></h2>
+<p>
+Facet Indexing Parameters are consulted during indexing. Among the several
+parameters they define, the following two are likely to interest many applications:
+<ul>
+<li><b>Category list definitions</b> - in the index, facets are maintained in two
+forms: category-tokens (for drill-down) and category-list-tokens (for
+accumulation). This parameter allows to specify, for each category, the
+Lucene term used for maintaining the category-list-tokens for that category.
+The default implementation in <code>DefaultFacetIndexingParams</code> maintains
+this information for all categories under the same special dedicated term.
+One case where it is needed to maintain two categories in separate category
+lists, is when it is known that at search time it would be required to use
+different types of accumulation logic for each, but at the same accumulation
+call.</li>
+<li><b>Partition size</b> - category lists can be maintained in a partitioned way. If,
+for example, the partition size is set to 1000, a distinct sub-term is used for
+maintaining each 1000 categories, e.g. term1 for categories 0 to 999, term2
+for categories 1000 to 1999, etc. The default implementation in
+<code>DefaultFacetIndexingParams</code> maintains category lists in a single
+partition, hence it defines the partition size as <code>Integer.MAX_VALUE</code>. The
+importance of this parameter is on allowing to handle very large
+taxonomies without exhausting RAM resources. This is because at facet
+accumulation time, facet values arrays are maintained in the size of the
+partition. With a single partition, the size of these arrays is as the size of the
+taxonomy, which might be OK for most applications. Limited partition
+sizes allow to perform the accumulation with less RAM, but with some
+runtime overhead, as the matching documents are processed for each of the
+partitions.</li>
+</ul>
+<p>
+See the API Javadocs of <code>FacetIndexingParams</code> for additional configuration
+capabilities which were not discussed here.
+
+<h2 class="subsection"><a name="search_params">Facet Search Parameters</a></h2>
+<p>
+Facet Search Parameters, consulted at search time (during facets accumulation) are
+rather plain, providing the following:
+<ul>
+<li><b>Facet indexing parameters</b> - which were in effect at indexing time -
+allowing facets accumulation to understand how facets are maintained in
+the index.</li>
+<li><b>Container of facet requests</b> - the requests which should be accumulated.</li>
+</ul>
+
+<h2 class="subsection"><a name="category_lists_multiple_dimensions">Category Lists, Multiple Dimensions</a></h2>
+<p>
+Category list parameters which are accessible through the facet indexing parameters
+provide the information about:
+<ul>
+<li>Lucene Term under which category information is maintained in the index.</li>
+<li>Encoding (and decoding) used for writing and reading the categories
+information in the index.</li>
+</ul>
+<p>
+For cases when certain categories should be maintained in a different location than
+others, use <code>PerDimensionIndexingParams</code>, which returns a different
+<code>CategoryListParams</code> object for each <i>dimension</i>. This is a good opportunity to
+explain about dimensions. This is just a notion: the top element - or first element - in
+a category path is denoted as the dimension of that category. Indeed, the dimension
+stands out as a top important part of the category path, such as <code>"Location"</code> for the
+category <code>"Location/Europe/France/Paris"</code>.
+
+<h1 class="section"><a name="advanced">Advanced Faceted Examples</a></h1>
+<p>
+We now provide examples for more advanced facet indexing and search, such as
+drilling-down on facet values and multiple category lists.
+
+<h2 class="subsection"><a name="drill_down">Drill-Down with Regular Facets</a></h2>
+<p>
+Drill-down allows users to focus on part of the results. Assume a commercial sport
+equipment site where a user is searching for a tennis racquet. The user issues the
+query <i>tennis racquet</i> and as result is shown a page with 10 tennis racquets, by
+various providers, of various types and prices. In addition, the site UI shows to the
+user a break down of all available racquets by price and make. The user now decides
+to focus on racquets made by <i>Head</i>, and will now be shown a new page, with 10
+Head racquets, and new break down of the results into racquet types and prices.
+Additionally, the application can choose to display a new breakdown, by racquet
+weights. This step of moving from results (and facet statistics) of the entire (or larger)
+data set into a portion of it by specifying a certain category, is what we call <i>Drill-down</i>.
+We now show the required code lines for implementing such a drill-down.
+<pre class="prettyprint lang-java linenums">
+Query baseQuery = queryParser.parse("tennis racquet");
+Query q2 = DrillDown.query(baseQuery, new CategoryPath("make", "head"));
+</pre>
+<p>
+In line 1 the original user query is created and then used to obtain information on
+all tennis racquets.
+<p>
+In line 2, a specific category from within the facet results was selected by the user,
+and is hence used for creating the drill-down query.
+<p>
+Please refer to <code>SimpleSearcher.searchWithDrillDown()</code> for a more detailed
+code example performing drill-down.
+
+<h2 class="subsection"><a name="multi-category_list">Multiple Category Lists</a></h2>
+<p>
+The default is to maintain all categories information in a single list. While this will
+suit most applications, in some situations an application may wish to use multiple
+category lists, for example, when the distribution of some category values is different
+than that of other categories and calls for using a different encoding, more efficient
+for the specific distribution. Another example is when most facets are rarely used
+while some facets are used very heavily, so an application may opt to maintain the
+latter in memory - and in order to keep memory footprint lower it is useful to
+maintain only those heavily used facets in a separate category list.
+<p>
+First we define indexing parameters with multiple category lists:
+<pre class="prettyprint lang-java linenums">
+PerDimensionIndexingParams iParams = new PerDimensionIndexingParams();
+iParams.addCategoryListParams(new CategoryPath("Author"), 
+    new CategoryListParams(new Term("$RarelyUsed", "Facets")));
+iParams.addCategoryListParams(new CategoryPath("Language"),
+    new CategoryListParams(new Term("$HeavilyUsed", "Ones")));
+</pre>
+<p>
+This will cause the Language categories to be maintained in one category list, and
+Author facets to be maintained in another category list. Note that any other category,
+if encountered, will still be maintained in the default category list.
+<p>
+These non-default indexing parameters should now be used both at indexing and
+search time. As depicted below, at indexing time this is done when creating the
+category document builder, while at search time this is done when creating the search
+parameters. Other than that the faceted search code is unmodified.
+<pre class="prettyprint lang-java linenums">
+DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo, iParams);
+...
+FacetSearchParams facetSearchParams = new FacetSearchParams(iParams);
+</pre>
+<p>
+A complete simple example can be found in package <code>org.apache.lucene.facet.example.multiCL</code> 
+under the example code.
+
+<h1 class="section"><a name="optimizations">Optimizations</a></h1>
+<p>
+Faceted search through a large collection of documents with large numbers of facets
+altogether and/or large numbers of facets per document is challenging
+performance-wise, either in CPU, RAM, or both. A few ready to use optimizations exist to tackle
+these challenges.
+
+<h2 class="subsection"><a name="sampling">Sampling</a></h2>
+<p>
+Facet sampling allows facets to be accumulated over a sample of the matching
+documents set. In many cases, once top facets are found over the sample set, exact
+accumulations are computed for those facets only, this time over the entire matching
+document set.
+<p>
+Two kinds of sampling exist: complete support and wrapping support. The
+complete support is through <code>SamplingAccumulator</code> and is tied to an extension of the
+<code>StandardFacetsAccumulator</code> and has the benefit of automatically applying other
+optimizations, such as <a href="#complements">Complements</a>. The wrapping support is through
+<code>SamplingWrapper</code> and can wrap any accumulator, and as such, provides more
+freedom for applications.
+
+<h2 class="subsection"><a name="complements">Complements</a></h2>
+<p>
+When accumulating facets over a very large matching documents set, possibly
+almost as large as the entire collection, it is possible to speed up accumulation by
+looking at the complement set of documents, and then obtaining the actual results by
+subtracting from the total results. It should be noted that this is available only for
+count requests, and that the first invocation that involves this optimization might take
+longer because the total counts have to be computed.
+<p>
+This optimization is applied automatically by <code>StandardFacetsAccumulator</code>.
+
+<h2 class="subsection"><a name="partitions">Partitions</a></h2>
+<p>
+Partitions are also discussed in the section about <a href="#indexing_params">Facet Indexing parameters</a>.
+<p>
+Facets are internally accumulated by first accumulating all facets and later on
+extracting the results for the requested facets. During this process, accumulation
+arrays are maintained in the size of the taxonomy. For a very large taxonomy, with
+multiple simultaneous faceted search operations, this might lead to excessive memory
+footprint. Partitioning the faceted information reduces memory usage, by
+maintaining the category lists in several partitions, and by processing one partition at
+a time. This is automatically done by <code>StandardFacetsAccumulator</code>. However the
+default partition size is <code>Integer.MAX_VALUE</code>, which in practice means a single partition,
+i.e. no partitions at all.
+<p>
+The decision to override this behavior and use multiple partitions must be taken at
+indexing time. Once the index is created and already contains category lists it is too
+late to modify this.
+<p>
+See <code>FacetIndexingParams.getPartitionSize()</code> for API to alter this default
+behavior.
+
+<h1 class="section"><a name="concurrent_indexing_search">Concurrent Indexing and Search</a></h1>
+<p>
+Sometimes, indexing is done once, and when the index is fully prepared, searching
+starts. However, in most real applications indexing is <i>incremental</i> (new data comes in
+once in a while, and needs to be indexed), and indexing often needs to happen while
+searching is continuing at full steam.
+<p>
+Luckily, Lucene supports multiprocessing - one process writing to an index while
+another is reading from it. One of the key insights behind how Lucene allows multiprocessing 
+is <i>Point In Time</i> semantics. The idea is that when an <code>IndexReader</code> is opened, 
+it gets a view of the index at the <i>point in time</i> it was opened. If an <code>IndexWriter</code> 
+in a different process or thread modifies the index, the reader does not know about it until a new 
+<code>IndexReader</code> is opened (or the reopen() method of an existing <code>IndexReader</code> is called).
+<p>
+In faceted search, we complicate things somewhat by adding a second index - the
+taxonomy index. The taxonomy API also follows point-in-time semantics, but this is
+not quite enough. Some attention must be paid by the user to keep those two indexes
+consistently in sync:
+<p>
+The main index refers to category numbers defined in the taxonomy index.
+Therefore, it is important that we open the <code>TaxonomyReader</code> <i>after</i> opening the
+IndexReader. Moreover, every time an IndexReader is reopen()ed, the
+TaxonomyReader needs to be refresh()'ed as well.
+<p>
+But there is one extra caution: whenever the application deems it has written
+enough information worthy of a commit, it must <b>first</b> call commit() for the
+<code>TaxonomyWriter</code> and only <b>after</b> that call commit() for the <code>IndexWriter</code>. 
+Closing the indices should also be done in this order - <b>first</b> close the taxonomy, and only <b>after</b>
+that close the index.
+<p>
+To summarize, if you're writing a faceted search application where searching and
+indexing happen concurrently, please follow these guidelines (in addition to the usual
+guidelines on how to use Lucene correctly in the concurrent case):
+<ul>
+<li>In the indexing process:
+<ol>
+<li>Before a writer commit()s the IndexWriter, it must commit() the
+TaxonomyWriter. Nothing should be added to the index between these
+two commit()s.</li>
+<li>Similarly, before a writer close()s the IndexWriter, it must close() the
+TaxonomyWriter.</li>
+</ol></li>
+<li>In the searching process:
+<ol>
+<li>Open the IndexReader first, and then the TaxonomyReader.</li>
+<li>After a reopen() on the IndexReader, refresh() the TaxonomyReader.
+No search should be performed on the new IndexReader until refresh()
+has finished.</li>
+</ol></li>
+</ul>
+<p>
+Note that the above discussion assumes that the underlying file-system on which
+the index and the taxonomy are stored respects ordering: if index A is written before
+index B, then any reader finding a modified index B will also see a modified index A.
+<p>
+<b>Note:</b> <code>TaxonomyReader</code>'s refresh() is simpler than <code>IndexReader</code>'s reopen(). 
+While the latter keeps both the old and new reader open, the former keeps only the new reader. The reason 
+is that a new <code>IndexReader</code> might have modified old information (old documents deleted, for 
+example) so a thread which is in the middle of a search needs to continue using the old information. With 
+<code>TaxonomyReader</code>, however, we are guaranteed that existing categories are never deleted or modified - 
+the only thing that can happen is that new categories are added. Since search threads do not care if new categories 
+are added in the middle of a search, there is no reason to keep around the old object, and the new one suffices.
+
+</body>
+</html>
--- a/modules/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java
+++ b/modules/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java
@ -62,10 +62,10 @@ public class TaxonomyMergeUtils {
   * Merges the given taxonomy and index directories and commits the changes to
   * the given writers. This method uses {@link MemoryOrdinalMap} to store the
   * mapped ordinals. If you cannot afford the memory, you can use
-   * {@link #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)}
+   * {@link #merge(Directory, Directory, LuceneTaxonomyWriter.OrdinalMap, IndexWriter, LuceneTaxonomyWriter)}
   * by passing {@link DiskOrdinalMap}.
   * 
-   * @see #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)
+   * @see #merge(Directory, Directory, LuceneTaxonomyWriter.OrdinalMap, IndexWriter, LuceneTaxonomyWriter)
   */
  public static void merge(Directory srcIndexDir, Directory srcTaxDir,
                            IndexWriter destIndexWriter, 
--- a/modules/facet/src/java/org/apache/lucene/facet/package.html
+++ b/modules/facet/src/java/org/apache/lucene/facet/package.html
@ -3,6 +3,6 @@
    <title>Faceted Indexing and Search</title>
  </head>
  <body>
-    Provides faceted indexing and search capabilities.  
+    Provides faceted indexing and search capabilities. The <a href="../../../../userguide.html">userguide</a> is recommended for a start.  
  </body>
 </html>
--- a/modules/facet/src/java/overview.html
+++ b/modules/facet/src/java/overview.html
@ -21,6 +21,6 @@
    </title>
  </head>
  <body>
-  facet
+  Provides faceted indexing and search capabilities (check out the <a href="userguide.html">userguide</a>).
  </body>
 </html>