From e2c0b5eded6fca566e3d10e2bc1b3673485d708d Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Fri, 19 Jul 2013 21:58:26 +0000 Subject: [PATCH] LUCENE-4894: remove facet userguide git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1505041 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 9 +- lucene/facet/build.xml | 3 - .../org/apache/lucene/facet/FacetPackage.java | 6 +- .../lucene/facet/codecs/facet42/package.html | 3 - .../lucene/facet/doc-files/prettify.css | 1 - .../apache/lucene/facet/doc-files/prettify.js | 28 - .../lucene/facet/doc-files/userguide.html | 788 ------------------ .../apache/lucene/facet/index/package.html | 1 - .../lucene/facet/range/RangeFacetRequest.java | 1 + .../apache/lucene/facet/taxonomy/package.html | 41 +- lucene/facet/src/java/overview.html | 14 +- 11 files changed, 37 insertions(+), 858 deletions(-) delete mode 100755 lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.css delete mode 100755 lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.js delete mode 100755 lucene/facet/src/java/org/apache/lucene/facet/doc-files/userguide.html diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7fa84a3f5ee..b147e00533b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -69,6 +69,9 @@ Bug Fixes * LUCENE-5116: IndexWriter.addIndexes(IndexReader...) should drop empty (or all deleted) segments. (Robert Muir, Shai Erera) +* LUCENE-4734: Add FastVectorHighlighter support for proximity queries and + phrase queries with gaps or overlapping terms. (Ryan Lauck, Adrien Grand) + API Changes * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. @@ -85,10 +88,10 @@ Optimizations * LUCENE-5119: DiskDV keeps the document-to-ordinal mapping on disk for SortedDocValues. (Robert Muir) -Bug fixes +Documentation -* LUCENE-4734: Add FastVectorHighlighter support for proximity queries and - phrase queries with gaps or overlapping terms. 
(Ryan Lauck, Adrien Grand) +* LUCENE-4894: remove facet userguide as it was outdated. Partially absorbed into + package's documentation and classes javadocs. (Shai Erera) ======================= Lucene 4.4.0 ======================= diff --git a/lucene/facet/build.xml b/lucene/facet/build.xml index 324125bfd9d..f4349e0381a 100644 --- a/lucene/facet/build.xml +++ b/lucene/facet/build.xml @@ -23,9 +23,6 @@ Faceted indexing and search capabilities - - - diff --git a/lucene/facet/src/java/org/apache/lucene/facet/FacetPackage.java b/lucene/facet/src/java/org/apache/lucene/facet/FacetPackage.java index efb9e33fc8d..861b577e290 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/FacetPackage.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/FacetPackage.java @@ -17,11 +17,7 @@ package org.apache.lucene.facet; * limitations under the License. */ -/** - * Required so that userguide files are copied as part of javadocs generation. - * Otherwise, if the root facet package contains no classes, doc-files aren't - * copied. - */ +/** Required for javadocs generation. */ public final class FacetPackage { private FacetPackage() {} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/package.html b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/package.html index 56f7f3f9618..c752b963484 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/package.html +++ b/lucene/facet/src/java/org/apache/lucene/facet/codecs/facet42/package.html @@ -16,9 +16,6 @@ limitations under the License. --> - - - Codec + DocValuesFormat that are optimized for facets. 
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.css b/lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.css deleted file mode 100755 index d44b3a2282a..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.css +++ /dev/null @@ -1 +0,0 @@ -.pln{color:#000}@media screen{.str{color:#080}.kwd{color:#008}.com{color:#800}.typ{color:#606}.lit{color:#066}.pun,.opn,.clo{color:#660}.tag{color:#008}.atn{color:#606}.atv{color:#080}.dec,.var{color:#606}.fun{color:red}}@media print,projection{.str{color:#060}.kwd{color:#006;font-weight:bold}.com{color:#600;font-style:italic}.typ{color:#404;font-weight:bold}.lit{color:#044}.pun,.opn,.clo{color:#440}.tag{color:#006;font-weight:bold}.atn{color:#404}.atv{color:#060}}pre.prettyprint{padding:2px;border:1px solid #888}ol.linenums{margin-top:0;margin-bottom:0}li.L0,li.L1,li.L2,li.L3,li.L5,li.L6,li.L7,li.L8{list-style-type:none}li.L1,li.L3,li.L5,li.L7,li.L9{background:#eee} \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.js b/lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.js deleted file mode 100755 index eef5ad7e6a0..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/doc-files/prettify.js +++ /dev/null @@ -1,28 +0,0 @@ -var q=null;window.PR_SHOULD_USE_CONTINUATION=!0; -(function(){function L(a){function m(a){var f=a.charCodeAt(0);if(f!==92)return f;var b=a.charAt(1);return(f=r[b])?f:"0"<=b&&b<="7"?parseInt(a.substring(1),8):b==="u"||b==="x"?parseInt(a.substring(2),16):a.charCodeAt(1)}function e(a){if(a<32)return(a<16?"\\x0":"\\x")+a.toString(16);a=String.fromCharCode(a);if(a==="\\"||a==="-"||a==="["||a==="]")a="\\"+a;return a}function h(a){for(var f=a.substring(1,a.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),a= 
-[],b=[],o=f[0]==="^",c=o?1:0,i=f.length;c122||(d<65||j>90||b.push([Math.max(65,j)|32,Math.min(d,90)|32]),d<97||j>122||b.push([Math.max(97,j)&-33,Math.min(d,122)&-33]))}}b.sort(function(a,f){return a[0]-f[0]||f[1]-a[1]});f=[];j=[NaN,NaN];for(c=0;ci[0]&&(i[1]+1>i[0]&&b.push("-"),b.push(e(i[1])));b.push("]");return b.join("")}function y(a){for(var f=a.source.match(/\[(?:[^\\\]]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),b=f.length,d=[],c=0,i=0;c=2&&a==="["?f[c]=h(j):a!=="\\"&&(f[c]=j.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return f.join("")}for(var t=0,s=!1,l=!1,p=0,d=a.length;p=5&&"lang-"===b.substring(0,5))&&!(o&&typeof o[1]==="string"))c=!1,b="src";c||(r[f]=b)}i=d;d+=f.length;if(c){c=o[1];var j=f.indexOf(c),k=j+c.length;o[2]&&(k=f.length-o[2].length,j=k-c.length);b=b.substring(5);B(l+i,f.substring(0,j),e,p);B(l+i+j,c,C(b,c),p);B(l+i+k,f.substring(k),e,p)}else p.push(l+i,b)}a.e=p}var h={},y;(function(){for(var e=a.concat(m), -l=[],p={},d=0,g=e.length;d=0;)h[n.charAt(k)]=r;r=r[1];n=""+r;p.hasOwnProperty(n)||(l.push(r),p[n]=q)}l.push(/[\S\s]/);y=L(l)})();var t=m.length;return e}function u(a){var m=[],e=[];a.tripleQuotedStrings?m.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?m.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/, -q,"'\"`"]):m.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&e.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var 
h=a.hashComments;h&&(a.cStyleComments?(h>1?m.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):m.push(["com",/^#(?:(?:define|elif|else|endif|error|ifdef|include|ifndef|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),e.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h|[a-z]\w*)>/,q])):m.push(["com",/^#[^\n\r]*/, -q,"#"]));a.cStyleComments&&(e.push(["com",/^\/\/[^\n\r]*/,q]),e.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));a.regexLiterals&&e.push(["lang-regex",/^(?:^^\.?|[!+-]|!=|!==|#|%|%=|&|&&|&&=|&=|\(|\*|\*=|\+=|,|-=|->|\/|\/=|:|::|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|[?@[^]|\^=|\^\^|\^\^=|{|\||\|=|\|\||\|\|=|~|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\s*(\/(?=[^*/])(?:[^/[\\]|\\[\S\s]|\[(?:[^\\\]]|\\[\S\s])*(?:]|$))+\/)/]);(h=a.types)&&e.push(["typ",h]);a=(""+a.keywords).replace(/^ | $/g, -"");a.length&&e.push(["kwd",RegExp("^(?:"+a.replace(/[\s,]+/g,"|")+")\\b"),q]);m.push(["pln",/^\s+/,q," \r\n\t\xa0"]);e.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/,q],["pun",/^.[^\s\w"-$'./@\\`]*/,q]);return x(m,e)}function D(a,m){function e(a){switch(a.nodeType){case 1:if(k.test(a.className))break;if("BR"===a.nodeName)h(a), -a.parentNode&&a.parentNode.removeChild(a);else for(a=a.firstChild;a;a=a.nextSibling)e(a);break;case 3:case 4:if(p){var b=a.nodeValue,d=b.match(t);if(d){var c=b.substring(0,d.index);a.nodeValue=c;(b=b.substring(d.index+d[0].length))&&a.parentNode.insertBefore(s.createTextNode(b),a.nextSibling);h(a);c||a.parentNode.removeChild(a)}}}}function h(a){function b(a,d){var e=d?a.cloneNode(!1):a,f=a.parentNode;if(f){var f=b(f,1),g=a.nextSibling;f.appendChild(e);for(var h=g;h;h=g)g=h.nextSibling,f.appendChild(h)}return e} -for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var 
a=b(a.nextSibling,0),e;(e=a.parentNode)&&e.nodeType===1;)a=e;d.push(a)}var k=/(?:^|\s)nocode(?:\s|$)/,t=/\r\n?|\n/,s=a.ownerDocument,l;a.currentStyle?l=a.currentStyle.whiteSpace:window.getComputedStyle&&(l=s.defaultView.getComputedStyle(a,q).getPropertyValue("white-space"));var p=l&&"pre"===l.substring(0,3);for(l=s.createElement("LI");a.firstChild;)l.appendChild(a.firstChild);for(var d=[l],g=0;g=0;){var h=m[e];A.hasOwnProperty(h)?window.console&&console.warn("cannot override language handler %s",h):A[h]=a}}function C(a,m){if(!a||!A.hasOwnProperty(a))a=/^\s*=o&&(h+=2);e>=c&&(a+=2)}}catch(w){"console"in window&&console.log(w&&w.stack?w.stack:w)}}var v=["break,continue,do,else,for,if,return,while"],w=[[v,"auto,case,char,const,default,double,enum,extern,float,goto,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"], -"catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],F=[w,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,dynamic_cast,explicit,export,friend,inline,late_check,mutable,namespace,nullptr,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],G=[w,"abstract,boolean,byte,extends,final,finally,implements,import,instanceof,null,native,package,strictfp,super,synchronized,throws,transient"], -H=[G,"as,base,by,checked,decimal,delegate,descending,dynamic,event,fixed,foreach,from,group,implicit,in,interface,internal,into,is,lock,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var"],w=[w,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],I=[v,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"], 
-J=[v,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],v=[v,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],K=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)/,N=/\S/,O=u({keywords:[F,H,w,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END"+ -I,J,v],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),A={};k(O,["default-code"]);k(x([],[["pln",/^[^]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-",/^]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i]]), -["default-markup","htm","html","mxml","xhtml","xml","xsl"]);k(x([["pln",/^\s+/,q," \t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"]],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/],["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css", 
-/^style\s*=\s*([^\s"'>]+)/i]]),["in.tag"]);k(x([],[["atv",/^[\S\s]+/]]),["uq.val"]);k(u({keywords:F,hashComments:!0,cStyleComments:!0,types:K}),["c","cc","cpp","cxx","cyc","m"]);k(u({keywords:"null,true,false"}),["json"]);k(u({keywords:H,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:K}),["cs"]);k(u({keywords:G,cStyleComments:!0}),["java"]);k(u({keywords:v,hashComments:!0,multiLineStrings:!0}),["bsh","csh","sh"]);k(u({keywords:I,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}), -["cv","py"]);k(u({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["perl","pl","pm"]);k(u({keywords:J,hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb"]);k(u({keywords:w,cStyleComments:!0,regexLiterals:!0}),["js"]);k(u({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,true,try,unless,until,when,while,yes", -hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);k(x([],[["str",/^[\S\s]+/]]),["regex"]);window.prettyPrintOne=function(a,m,e){var h=document.createElement("PRE");h.innerHTML=a;e&&D(h,e);E({g:m,i:e,h:h});return h.innerHTML};window.prettyPrint=function(a){function m(){for(var e=window.PR_SHOULD_USE_CONTINUATION?l.now()+250:Infinity;p=0){var k=k.match(g),f,b;if(b= -!k){b=n;for(var o=void 0,c=b.firstChild;c;c=c.nextSibling)var i=c.nodeType,o=i===1?o?b:c:i===3?N.test(c.nodeValue)?b:o:o;b=(f=o===b?void 
0:o)&&"CODE"===f.tagName}b&&(k=f.className.match(g));k&&(k=k[1]);b=!1;for(o=n.parentNode;o;o=o.parentNode)if((o.tagName==="pre"||o.tagName==="code"||o.tagName==="xmp")&&o.className&&o.className.indexOf("prettyprint")>=0){b=!0;break}b||((b=(b=n.className.match(/\blinenums\b(?::(\d+))?/))?b[1]&&b[1].length?+b[1]:!0:!1)&&D(n,b),d={g:k,h:n,i:b},E(d))}}p - - -Facet Userguide - - - - - - - - - - -

- Apache Lucene
- Faceted Search
- User's Guide

- - - -

Introduction

-

-A category is an aspect of indexed documents which can be used to classify the -documents. For example, in a collection of books at an online bookstore, categories of -a book can be its price, author, publication date, binding type, and so on. -

-In faceted search, in addition to the standard set of search results, we also get facet -results, which are lists of subcategories for certain categories. For example, for the -price facet, we get a list of relevant price ranges; for the author facet, we get a list of -relevant authors; and so on. In most UIs, when users click one of these subcategories, -the search is narrowed, or drilled down, and a new search limited to this subcategory -(e.g., to a specific price range or author) is performed. -

-Note that faceted search is more than just the ordinary fielded search. In fielded -search, users can add search keywords like price:10 or author:"Mark -Twain" to the query to narrow the search, but this requires knowledge of which -fields are available, and which values are worth trying. This is where faceted search -comes in: it provides a list of useful subcategories, which ensures that the user only -drills down into useful subcategories and never into a category for which there are no -results. In essence, faceted search makes it easy to navigate through the search results. -The list of subcategories provided for each facet is also useful to the user in itself, -even when the user never drills down. This list allows the user to see at one glance -some statistics on the search results, e.g., what price ranges and which authors are -most relevant to the given query. -

-In recent years, faceted search has become a very common UI feature in search -engines, especially in e-commerce websites. Faceted search makes it easy for -untrained users to find the specific item they are interested in, whereas manually -adding search keywords (as in the examples above) proved too cumbersome for -ordinary users, and required too much guesswork, trial-and-error, or the reading of -lengthy help pages. -

-See http://en.wikipedia.org/wiki/Faceted_search for more information on faceted -search. - -

Facet Features

-The first and main faceted search capability that comes to mind is counting, but in fact -faceted search is more than facet counting. We now briefly discuss the available -faceted search features. - -

Facet Counting

-

-Which of the available subcategories of a facet should a UI display? A query in a -book store might yield books by a hundred different authors, but normally we'd want -to display only, say, ten of those. -

-Most available faceted search implementations use counts to determine the -importance of each subcategory. These implementations go over all search results for -the given query, and count how many results are in each subcategory. Finally, the -subcategories with the most results can be displayed. So the user sees the price ranges, -authors, and so on, for which there are most results. Often, the count is displayed next -to the subcategory name, in parentheses, telling the user how many results he can -expect to see if he drills down into this subcategory. -

-The main API for obtaining facet counting is CountFacetRequest, as in the -following code snippet: -

-new CountFacetRequest(new CategoryPath("author"), 10);
-
-A detailed code example using count facet requests is shown below - see -Accumulating Facets. - -

Facet Associations

-

-So far we've discussed categories as binary features, where a document either belongs -to a category, or not. -

-While counts are useful in most situations, they are sometimes not sufficiently -informative for the user, with respect to deciding which subcategory is more -important to display. -

-For this, the facets package allows associating a value with a category. The search -time interpretation of the associated value is application dependent. For example, a -possible interpretation is as a match level (e.g., confidence level). This value can -then be used so that a document that is very weakly associated with a certain category -will only contribute little to this category's aggregated weight. - -

Multiple Facet Requests

-

-A single faceted accumulation is capable of servicing multiple facet requests. -Programmatically, this is quite simple - wrap all the facet requests of interest into the -facet-search-parameters which are passed to a facets accumulator/collector (more on -these objects below). The results would be comprised of as many facet results as there -were facet requests. -

-However there is a delicate limitation: all facets maintained in the same location in -the index are required to be treated the same. See the section on Indexing Parameters -for an explanation on maintaining certain facets at certain locations. - -

Facet Labels at Search Time

-

-Facet results always contain the facet (internal) ID and (accumulated) value. Some of -the results also contain the facet label, AKA the category name. We mention this here -since computing the label is a time-consuming task, and hence applications can -specify with a facet request to return top 1000 facets but to compute the label only for -the top 10 facets. In order to compute labels for more of the facet results it is not -required to perform accumulation again. -

-See FacetRequest.getNumResults(), FacetRequest.getNumLabel() and -FacetResultNode.getLabel(TaxonomyReader). - -

Indexing Categories Illustrated

-

-In order to find facets at search time they must first be added to the index at indexing -time. Recall that Lucene documents are made of fields for textual search. The addition -of categories is performed by an appropriate DocumentBuilder - or -CategoryDocumentBuilder in our case. -

-Indexing therefore usually goes like this: -

    -
  • For each input document: -
      -
    • Create a fresh (empty) Lucene Document
    • -
    • Parse input text and add appropriate text search fields
    • -
    • Gather all input categories associated with the document and create -a CategoryDocumentBuilder with the list of categories
    • -
    • Build the document - this actually adds the categories to the -Lucene document.
    • -
    • Add the document to the index
    • -
  • -
-Following is a code snippet for indexing categories. The complete example can be -found in package org.apache.lucene.facet.example.simple.SimpleIndexer. -
-IndexWriter writer = ...
-TaxonomyWriter taxo = new DirectoryTaxonomyWriter(taxoDir);
-...
-Document doc = new Document();
-doc.add(new Field("title", titleText, Store.YES, Index.ANALYZED));
-...
-List<CategoryPath> categories = new ArrayList<CategoryPath>();
-categories.add(new CategoryPath("author", "Mark Twain"));
-categories.add(new CategoryPath("year", "2010"));
-...
-DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo);
-categoryDocBuilder.setCategoryPaths(categories);
-categoryDocBuilder.build(doc);
-writer.addDocument(doc);
-
-

-We now explain the steps above, following the code line numbers: - - - - - - - - - - - - - - - - - - - - - -
(4)Document contains not only text search fields but also facet search -information.
(7)Prepare a container for document categories.
(8)Categories that should be added to the document are accumulated in the -categories list.
(11)A CategoryDocumentBuilder is created, set with the appropriate list -of categories, and invoked to build - that is, to populate the document -with categories. It is in this step that the taxonomy is updated to contain the -newly added categories (if not already there) - see more on this in the -section about the Taxonomy Index below. This line could be made more -compact: one can create a single CategoryDocumentBuilder cBuilder and reuse it like this: -
-DocumentBuilder cBuilder = new CategoryDocumentBuilder(taxo);
-cBuilder.setCategoryPaths(categories).build(doc);
-
-
(14)Add the document to the index. As a result, category info is saved also in -the regular search index, for supporting facet aggregation at search time -(e.g. facet counting) as well as facet drill-down. For more information on -indexed facet information see below the section Indexed Facet Information.
- -

Accumulating Facets Illustrated

-

-Facets accumulation reflects a set of documents over some facet requests: -

    -
  • Document set - a subset of the index documents, usually documents -matching a user query.
  • -
  • Facet requests - facet accumulation specification, e.g. count a certain facet -dimension.
  • -
-

-FacetRequest is a basic component in faceted search - it describes the facet -information need. Every facet request is made of at least two fields: -

    -
  • CategoryPath - root category of the facet request. The categories that -are returned as a result of the request will all be descendants of this root
  • -
  • Number of Results - number of sub-categories to return (at most).
  • -
-

-There are other parameters to a facet request, such as -how many facet results to -label-, -how deep to go from the request root when serving the facet request- and -more - see the API Javadocs for FacetRequest and its subclasses for more -information on these parameters. For labels in particular, see the section Facet Labels -at Search Time. -

-FacetRequest is an abstract class, open for extensions, and users may add their -own requests. The most often used request is CountFacetRequest - used for -counting facets. -

-Facets accumulation is - not surprisingly - driven by a FacetsAccumulator. The -most used one is StandardFacetsAccumulator, however there are also accumulators -that support sampling - to be used in huge collections, and there's an adaptive facets -accumulator which applies sampling conditionally on the statistics of the data. While -facets accumulators are very extendible and powerful, they might be too -overwhelming for beginners. For this reason, the code offers a higher level interface -for facets accumulating: the FacetsCollector. It extends Collector, and as such -can be passed to the search() method of Lucene's IndexSearcher. In case the -application also needs to collect documents (in addition to accumulating/collecting -facets), it can wrap multiple collectors with MultiCollector. Most code samples -below use FacetsCollector due to its simple interface. It is quite likely that -FacetsCollector should suffice for the needs of most applications, therefore we -recommend starting with it, and only when needing more flexibility turning to direct -use of facets accumulators. -

-Following is a code snippet from the example code - the complete example can be -found under org.apache.lucene.facet.example.simple.Searcher: -

-IndexReader indexReader = DirectoryReader.open(indexDir);
-IndexSearcher searcher = new IndexSearcher(indexReader);
-TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);
-...
-Query q = new TermQuery(new Term(SimpleUtils.TEXT, "white"));
-TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true);
-...
-FacetSearchParams facetSearchParams = new FacetSearchParams();
-facetSearchParams.addFacetRequest(new CountFacetRequest(
-    new CategoryPath("author"), 10));
-...
-FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, indexReader, taxo);
-searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector));
-List<FacetResult> res = facetsCollector.getFacetResults();
-
-

-We now explain the steps above, following the code line numbers: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
(1)Index reader and Searcher are initialized as usual.
(3)A taxonomy reader is opened - it provides access to the facet information -which was stored by the Taxonomy Writer at indexing time.
(5)Regular text query is created to find the documents matching user need, and -a collector for collecting the top matching documents is created.
(8)Facet-search-params is a container for facet requests.
(10)A single facet-request - namely a count facet request - is created and added -to the facet search params. The request should return top 10 Author -subcategory counts.
(12)Facets-Collector is the simplest interface for facets accumulation (counting -in this example).
(13)Lucene search takes both collectors - facets-collector and top-doc-collector, -both wrapped by a multi-collector. This way, a single search -operation finds both top documents and top facets. Note however that facets -aggregation takes place not only over the top documents, but rather over all -documents matching the query.
(14)Once search completes, facet-results can be obtained from the facets-collector.
- -

-Returned facet results are organized in a list, conveniently ordered the same as the -facet-requests in the facet-search-params. Each result however contains the request -for which it was created. -

-Here is the (recursive) structure of the facet result: -

    -
  • Facet Result -
      -
    • Facet Request - the request for which this result was obtained.
    • -
    • Valid Descendants - how many valid descendants were encountered -over the set of matching documents (some of which might have been -filtered out because e.g. only top 10 results were requested).
    • -
    • Root Result Node - root facet result for the request -
        -
      • Ordinal - unique internal ID of the facet
      • -
      • Label - full label of the facet (possibly null)
      • -
      • Value - facet value, e.g. count
      • -
      • Sub-results-nodes - child result nodes (possibly null)
      • -
    • -
  • -
-

-Note that there will not always be sub result nodes - this depends on the -requested result mode: -

    -
  • PER_NODE_IN_TREE - a tree, and so there may be sub results.
  • -
  • GLOBAL_FLAT - here the results tree would be rather flat, with only (at -most) leaves below the root result node.
  • -
- -

Indexed Facet Information

-

-When indexing a document to which categories were added, information on these -categories is added to the search index, in two locations: -

    -
  • Category Tokens are added to the document for each category attached to -that document. These categories can be used at search time for drill-down.
  • -
  • A special Category List Token is added to each document containing -information on all the categories that were added to this document. This can -be used at search time for facet accumulation, e.g. facet counting.
  • -
-

-When a category is added to the index (that is, when a document containing a -category is indexed), all its parent categories are added as well. For example, indexing -a document with the category <"author", -"American", "Mark Twain"> results in -creating three tokens: "/author", "/author/American", and -"/author/American/Mark Twain" (the character '/' here is just a human -readable separator - there's no such element in the actual index). This allows drilling down -and counting any category in the taxonomy, and not just leaf nodes, enabling a -UI application to show either how many books have authors, or how many books -have American authors, or how many books have Mark Twain as their (American) -author. -

-Similarly, Drill-down capabilities are this way possible also for node categories. -

-In order to keep the counting list compact, it is built using category ordinal - an -ordinal is an integer number attached to a category when it is added for the first time -into the taxonomy. -

-For ways to further alter facet index see the section below on Facet Indexing -Parameters. - -

Taxonomy Index

-

-The taxonomy is an auxiliary data-structure maintained side-by-side with the regular -index to support faceted search operations. It contains information about all the -categories that ever existed in any document in the index. Its API is open and allows -simple usage, or more advanced for the interested users. -

-When a category is added to a document, a corresponding node is added to the -taxonomy (unless already there). In fact, sometimes more than one node is added - -each parent category is added as well, so that the taxonomy is maintained as a Tree, -with a virtual root. -

-So, for the above example, adding the category <"author", -"American", "Mark Twain"> -actually added three nodes: one for "/author", one for "/author/American" and one for -"/author/American/Mark Twain". -

-An integer number - called ordinal - is attached to each category the first time the -category is added to the taxonomy. This allows for a compact representation of -category list tokens in the index, for facets accumulation. -

-One interesting fact about the taxonomy index is worth knowing: once a category -is added to the taxonomy, it is never removed, even if all related documents are -removed. This differs from a regular index, where if all documents containing a -certain term are removed, and their segments are merged, the term will also be -removed. This might cause a performance issue: large taxonomy means large ordinal -numbers for categories, and hence large categories values arrays would be maintained -during accumulation. It is probably not a real problem for most applications, but be -aware of this. If, for example, an application at a certain point in time removes an -index entirely in order to recreate it, or, if it removed all the documents from the index -in order to re-populate it, it also makes sense in this opportunity to remove the -taxonomy index and create a new, fresh one, without the unused categories. - -

Facet Parameters

-

-Facet parameters control how categories and facets are indexed and searched. Apart -from specifying facet requests within facet search parameters, under default settings it -is not required to provide any parameters, as there are ready to use working defaults -for everything. -

-However many aspects are configurable and can be modified by providing altered -facet parameters for either search or indexing. - -

Facet Indexing Parameters

-

-Facet Indexing Parameters are consulted during indexing. Among the several -parameters they define, the following two are likely to interest many applications: -

    -
  • Category list definitions - in the index, facets are maintained in two -forms: category-tokens (for drill-down) and category-list-tokens (for -accumulation). This parameter allows to specify, for each category, the -Lucene term used for maintaining the category-list-tokens for that category. -The default implementation in FacetIndexingParams maintains -this information for all categories under the same special dedicated term. -One case where it is needed to maintain two categories in separate category -lists, is when it is known that at search time it would be required to use -different types of accumulation logic for each, but at the same accumulation -call.
  • -
  • Partition size - category lists can be maintained in a partitioned way. If, -for example, the partition size is set to 1000, a distinct sub-term is used for -maintaining each 1000 categories, e.g. term1 for categories 0 to 999, term2 -for categories 1000 to 1999, etc. The default implementation in -FacetIndexingParams maintains category lists in a single -partition, hence it defines the partition size as Integer.MAX_VALUE. The -importance of this parameter is on allowing to handle very large -taxonomies without exhausting RAM resources. This is because at facet -accumulation time, facet values arrays are maintained in the size of the -partition. With a single partition, the size of these arrays is as the size of the -taxonomy, which might be OK for most applications. Limited partition -sizes allow to perform the accumulation with less RAM, but with some -runtime overhead, as the matching documents are processed for each of the -partitions.
  • -
-

-See the API Javadocs of FacetIndexingParams for additional configuration -capabilities which were not discussed here. - -

Facet Search Parameters

-

-Facet Search Parameters, consulted at search time (during facets accumulation) are -rather plain, providing the following: -

    -
  • Facet indexing parameters - which were in effect at indexing time - -allowing facets accumulation to understand how facets are maintained in -the index.
  • -
  • Container of facet requests - the requests which should be accumulated.
  • -
- -

Category Lists, Multiple Dimensions

-

-Category list parameters which are accessible through the facet indexing parameters -provide the information about: -

    -
  • Lucene Term under which category information is maintained in the index.
  • -
  • Encoding (and decoding) used for writing and reading the categories -information in the index.
  • -
-

-For cases when certain categories should be maintained in different location than -others, use PerDimensionIndexingParams, which returns a different -CategoryListParams object for each dimension. This is a good opportunity to -explain about dimensions. This is just a notion: the top element - or first element - in -a category path is denoted as the dimension of that category. Indeed, the dimension -stands out as a top important part of the category path, such as "Location" for the -category "Location/Europe/France/Paris". - -

Advanced Faceted Examples

-

-We now provide examples for more advanced facet indexing and search, such as -drilling-down on facet values and multiple category lists. - -

Drill-Down with Regular Facets

-

-Drill-down allows users to focus on part of the results. Assume a commercial sport -equipment site where a user is searching for a tennis racquet. The user issues the -query tennis racquet and as result is shown a page with 10 tennis racquets, by -various providers, of various types and prices. In addition, the site UI shows to the -user a break down of all available racquets by price and make. The user now decides -to focus on racquets made by Head, and will now be shown a new page, with 10 -Head racquets, and new break down of the results into racquet types and prices. -Additionally, the application can choose to display a new breakdown, by racquet -weights. This step of moving from results (and facet statistics) of the entire (or larger) -data set into a portion of it by specifying a certain category, is what we call Drilldown. -We now show the required code lines for implementing such a drill-down. -

-Query baseQuery = queryParser.parse("tennis racquet");
-DrillDownQuery q2 = new DrillDownQuery(indexingParams, baseQuery);
-q2.add(new CategoryPath("make", "head"), 10));
-
-

-In line 1 the original user query is created and then used to obtain information on -all tennis racquets. -

-In line 2, a specific category from within the facet results was selected by the user, -and is hence used for creating the drill-down query. -

-Please refer to SimpleSearcher.searchWithDrillDown() for a more detailed -code example performing drill-down. - -

Multiple Category Lists

-

-The default is to maintain all categories information in a single list. While this will -suit most applications, in some situations an application may wish to use multiple -category lists, for example, when the distribution of some category values is different -than that of other categories and calls for using a different encoding, more efficient -for the specific distribution. Another example is when most facets are rarely used -while some facets are used very heavily, so an application may opt to maintain the -latter in memory - and in order to keep memory footprint lower it is useful to -maintain only those heavily used facets in a separate category list. -

-First we define indexing parameters with multiple category lists: -

-PerDimensionIndexingParams iParams = new PerDimensionIndexingParams();
-iParams.addCategoryListParams(new CategoryPath("Author"), 
-    new CategoryListParams(new Term("$RarelyUsed", "Facets")));
-iParams.addCategoryListParams(new CategoryPath("Language"),
-    new CategoryListParams(new Term("$HeavilyUsed", "Ones")));
-
-

-This will cause the Language categories to be maintained in one category list, and -Author facets to be maintained in another category list. Note that any other category, -if encountered, will still be maintained in the default category list. -

-These non-default indexing parameters should now be used both at indexing and -search time. As depicted below, at indexing time this is done when creating the -category document builder, while at search time this is done when creating the search -parameters. Other than that the faceted search code is unmodified. -

-DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo, iParams);
-...
-FacetSearchParams facetSearchParams = new FacetSearchParams(iParams);
-
-

-A complete simple example can be found in package org.apache.lucene.facet.example.multiCL -under the example code. - -

Optimizations

-

-Faceted search through a large collection of documents with large numbers of facets -altogether and/or large numbers of facets per document is challenging performance -wise, either in CPU, RAM, or both. A few ready to use optimizations exist to tackle -these challenges. - -

Sampling

-

-Facet sampling allows to accumulate facets over a sample of the matching -documents set. In many cases, once top facets are found over the sample set, exact -accumulations are computed for those facets only, this time over the entire matching -document set. -

-Two kinds of sampling exist: complete support and wrapping support. The -complete support is through SamplingAccumulator and is tied to an extension of the -StandardFacetsAccumulator and has the benefit of automatically applying other -optimizations, such as Complements. The wrapping support is through -SamplingWrapper and can wrap any accumulator, and as such, provides more -freedom for applications. - -

Complements

-

-When accumulating facets over a very large matching documents set, possibly -almost as large as the entire collection, it is possible to speed up accumulation by -looking at the complement set of documents, and then obtaining the actual results by -subtracting from the total results. It should be noted that this is available only for -count requests, and that the first invocation that involves this optimization might take -longer because the total counts have to be computed. -

-This optimization is applied automatically by StandardFacetsAccumulator. - -

Partitions

-

-Partitions are also discussed in the section about Facet Indexing parameters. -

-Facets are internally accumulated by first accumulating all facets and later on -extracting the results for the requested facets. During this process, accumulation -arrays are maintained in the size of the taxonomy. For a very large taxonomy, with -multiple simultaneous faceted search operations, this might lead to excessive memory -footprint. Partitioning the faceted information allows to relax the memory usage, by -maintaining the category lists in several partitions, and by processing one partition at -a time. This is automatically done by StandardFacetsAccumulator. However the -default partition size is Integer.MAX_VALUE, practically setting to a single partition, -i.e. no partitions at all. -

-Decision to override this behavior and use multiple partitions must be taken at -indexing time. Once the index is created and already contains category lists it is too -late to modify this. -

-See FacetIndexingParams.getPartitionSize() for API to alter this default -behavior. - -

Concurrent Indexing and Search

-

-Sometimes, indexing is done once, and when the index is fully prepared, searching -starts. However, in most real applications indexing is incremental (new data comes in -once in a while, and needs to be indexed), and indexing often needs to happen while -searching is continuing at full steam. -

-Luckily, Lucene supports multiprocessing - one process writing to an index while -another is reading from it. One of the key insights behind how Lucene allows multiprocessing -is Point In Time semantics. The idea is that when an IndexReader is opened, -it gets a view of the index at the point in time it was opened. If an IndexWriter -in a different process or thread modifies the index, the reader does not know about it until a new -IndexReader is opened (or the reopen() method of an existing IndexReader is called). -

-In faceted search, we complicate things somewhat by adding a second index - the -taxonomy index. The taxonomy API also follows point-in-time semantics, but this is -not quite enough. Some attention must be paid by the user to keep those two indexes -consistently in sync: -

-The main index refers to category numbers defined in the taxonomy index. -Therefore, it is important that we open the TaxonomyReader after opening the -IndexReader. Moreover, every time an IndexReader is reopen()ed, the -TaxonomyReader needs to be refresh()'ed as well. -

-But there is one extra caution: whenever the application deems it has written -enough information worthy of a commit, it must first call commit() for the -TaxonomyWriter and only after that call commit() for the IndexWriter. -Closing the indices should also be done in this order - first close the taxonomy, and only after -that close the index. -

-To summarize, if you're writing a faceted search application where searching and -indexing happens concurrently, please follow these guidelines (in addition to the usual -guidelines on how to use Lucene correctly in the concurrent case): -

    -
  • In the indexing process: -
      -
    1. Before a writer commit()s the IndexWriter, it must commit() the -TaxonomyWriter. Nothing should be added to the index between these -two commit()s.
    2. -
    3. Similarly, before a writer close()s the IndexWriter, it must close() the -TaxonomyWriter.
    4. -
  • -
  • In the searching process: -
      -
    1. Open the IndexReader first, and then the TaxonomyReader.
    2. -
    3. After a reopen() on the IndexReader, refresh() the TaxonomyReader. -No search should be performed on the new IndexReader until refresh() -has finished.
    4. -
  • -
-

-Note that the above discussion assumes that the underlying file-system on which -the index and the taxonomy are stored respects ordering: if index A is written before -index B, then any reader finding a modified index B will also see a modified index A. -

-Note: TaxonomyReader's refresh() is simpler than IndexReader's reopen(). -While the latter keeps both the old and new reader open, the former keeps only the new reader. The reason -is that a new IndexReader might have modified old information (old documents deleted, for -example) so a thread which is in the middle of a search needs to continue using the old information. With -TaxonomyReader, however, we are guaranteed that existing categories are never deleted or modified - -the only thing that can happen is that new categories are added. Since search threads do not care if new categories -are added in the middle of a search, there is no reason to keep around the old object, and the new one suffices. -
However, if the taxonomy index was recreated since the TaxonomyReader was opened or -refreshed, this assumption (that categories are forever) no longer holds, and refresh() will -throw an InconsistentTaxonomyException, guiding the application to open -a new TaxonomyReader for up-to-date taxonomy data. (Old one can -be closed as soon as it is no longer used.) - - - - diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/package.html b/lucene/facet/src/java/org/apache/lucene/facet/index/package.html index 1b94556b8aa..bc9b2ee6c70 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/package.html +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/package.html @@ -20,6 +20,5 @@ Facets indexing code. - \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetRequest.java index b548624a446..d3c3afc81be 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/range/RangeFacetRequest.java @@ -36,6 +36,7 @@ public class RangeFacetRequest extends FacetRequest { public final Range[] ranges; + @SuppressWarnings("unchecked") public RangeFacetRequest(String field, T...ranges) { super(new CategoryPath(field), 1); this.ranges = ranges; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/package.html b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/package.html index 36f13e84244..8713c672ca3 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/package.html +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/package.html @@ -21,28 +21,33 @@

Taxonomy of Categories

- Facets are defined using a hierarchy of categories, known as a - Taxonomy. - -
- For example, in a book store application, a Taxonomy could have the - following hierarchy: -

+ Facets are defined using a hierarchy of categories, known as a Taxonomy. + For example, the taxonomy of a book store application might have the following structure:

    -
  • Author
  • -
      -
    • Mark Twain
    • -
    • J. K. Rowling
    • -
    +
  • Author +
      +
    • Mark Twain
    • +
    • J. K. Rowling
    • +
    +
    -
  • Date
  • -
      -
    • 2010
    • -
    • 2009
    • -
    +
  • Date +
      +
    • 2010 +
        +
      • March
      • +
      • April
      • +
      +
    • +
    • 2009
    • +
    +
- The Taxonomy translates category-paths into category-ordinal and vice versa. + The Taxonomy translates category-paths into integer identifiers (often termed ordinals) and vice versa. + The category Author/Mark Twain adds two nodes to the taxonomy: Author and + Author/Mark Twain, each of which is assigned a different ordinal. The taxonomy maintains the invariant that a + node always has an ordinal that is < all its children. \ No newline at end of file diff --git a/lucene/facet/src/java/overview.html b/lucene/facet/src/java/overview.html index 93ce4b66050..0d7378f6f9f 100644 --- a/lucene/facet/src/java/overview.html +++ b/lucene/facet/src/java/overview.html @@ -15,12 +15,10 @@ limitations under the License. --> - - - facet - - - - Provides faceted indexing and search capabilities (checkout the userguide). - +facet + +Provides faceted indexing and search capabilities. Check out this +and this blog posts for an overview of the facets module +as well as source code examples here. +