From 4ba69db56230650fc70d56f30a4572459b6c2167 Mon Sep 17 00:00:00 2001
From: Grant Ingersoll
Date: Tue, 22 Aug 2006 13:38:16 +0000
Subject: [PATCH] Initial check in of scoring.xml documentation. I have also
added lucene.css stylesheet and included it in the Anakia Site VSL, although
I am open to other ways of including style information on a per document
basis (I just don't know Velocity to make the changes).
I have not linked in scoring.xml to the main documentation yet, as I wanted others to proofread/edit before making it official. Once it is official, I will hook it in via the projects.xml
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@433627 13f79535-47bb-0310-9956-ffa450edef68
---
CHANGES.txt | 30 +-
docs/benchmarks.html | 1 +
docs/contributions.html | 1 +
docs/demo.html | 1 +
docs/demo2.html | 1 +
docs/demo3.html | 1 +
docs/demo4.html | 1 +
docs/features.html | 1 +
docs/fileformats.html | 18 +-
docs/gettingstarted.html | 1 +
docs/index.html | 1 +
docs/lucene-sandbox/index.html | 1 +
docs/mailinglists.html | 1 +
docs/queryparsersyntax.html | 1 +
docs/resources.html | 1 +
docs/systemproperties.html | 1 +
docs/whoweare.html | 3 +-
site/build.xml | 5 +
xdocs/scoring.xml | 548 +++++++++++++++++++++++++++++++++
xdocs/styles/lucene.css | 34 ++
xdocs/stylesheets/site.vsl | 1 +
21 files changed, 638 insertions(+), 15 deletions(-)
create mode 100644 xdocs/scoring.xml
create mode 100644 xdocs/styles/lucene.css
diff --git a/CHANGES.txt b/CHANGES.txt
index 70044e31662..2488c68abaa 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -39,12 +39,12 @@ API Changes
2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated
and is supposed to be replaced with the WordlistLoader class in
package org.apache.lucene.analysis (Daniel Naber)
-
+
3. LUCENE-609: Revert return type of Document.getField(s) to Field
for backward compatibility, added new Document.getFieldable(s)
for access to new lazy loaded fields. (Yonik Seeley)
-
- 4. LUCENE-608: Document.fields() has been deprecated and a new method
+
+ 4. LUCENE-608: Document.fields() has been deprecated and a new method
Document.getFields() has been added that returns a List instead of
an Enumeration (Daniel Naber)
@@ -60,12 +60,12 @@ API Changes
ie: IndexReader).
(Michael McCandless via Chris Hostetter)
- 7. LUCENE-638: FSDirectory.list() now only returns the directory's
+ 7. LUCENE-638: FSDirectory.list() now only returns the directory's
Lucene-related files. Thanks to this change one can now construct
a RAMDirectory from a file system directory that contains files
not related to Lucene.
(Simon Willnauer via Daniel Naber)
-
+
Bug fixes
1. Fixed the web application demo (built with "ant war-demo") which
@@ -93,10 +93,10 @@ Bug fixes
8. LUCENE-607: ParallelReader's TermEnum fails to advance properly to
new fields (Chuck Williams, Christian Kohlschuetter via Yonik Seeley)
- 9. LUCENE-610,LUCENE-611: Simple syntax changes to allow compilation with ecj:
+ 9. LUCENE-610,LUCENE-611: Simple syntax changes to allow compilation with ecj:
disambiguate inner class scorer's use of doc() in BooleanScorer2,
other test code changes. (DM Smith via Yonik Seeley)
-
+
10. LUCENE-451: All core query types now use ComplexExplanations so that
boosts of zero don't confuse the BooleanWeight explain method.
(Chris Hostetter)
@@ -132,6 +132,18 @@ Optimizations
keeping a count of buffered documents rather than counting after each
document addition. (Doron Cohen, Paul Smith, Yonik Seeley)
+ 5. Modified TermScorer.explain to use TermDocs.skipTo() instead of looping through docs. (Grant Ingersoll)
+
+Test Cases
+ 1. Added TestTermScorer.java (Grant Ingersoll)
+
+Documentation
+
+ 1. Added style sheet to xdocs named lucene.css and included in the Anakia VSL descriptor. (Grant Ingersoll)
+
+ 2. Added draft scoring.xml document into xdocs. Intent is to be the equivalent of fileformats.xml for scoring. It is not linked into project.xml, so it will not show up on the
+ website yet. (Grant Ingersoll and Steve Rowe)
+
Release 2.0.0 2006-05-26
API Changes
@@ -143,8 +155,8 @@ API Changes
2. DisjunctionSumScorer is no longer public.
(Paul Elschot via Otis Gospodnetic)
-
- 3. Creating a Field with both an empty name and an empty value
+
+ 3. Creating a Field with both an empty name and an empty value
now throws an IllegalArgumentException
(Daniel Naber)
diff --git a/docs/benchmarks.html b/docs/benchmarks.html
index a7d85ceb4df..a51e2eeac04 100644
--- a/docs/benchmarks.html
+++ b/docs/benchmarks.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Resources - Performance Benchmarks
+
diff --git a/docs/contributions.html b/docs/contributions.html
index bb7ddf40ae9..4986488add8 100644
--- a/docs/contributions.html
+++ b/docs/contributions.html
@@ -39,6 +39,7 @@ limitations under the License.
Apache Lucene -
Contributions - Apache Lucene
+
diff --git a/docs/demo.html b/docs/demo.html
index 0c4aecaf01e..c087cc3759d 100644
--- a/docs/demo.html
+++ b/docs/demo.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - Building and Installing the Basic Demo
+
diff --git a/docs/demo2.html b/docs/demo2.html
index 0983f595d41..530a63080b3 100644
--- a/docs/demo2.html
+++ b/docs/demo2.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - Basic Demo Sources Walk-through
+
diff --git a/docs/demo3.html b/docs/demo3.html
index e0426bde4f4..9d044a73d56 100644
--- a/docs/demo3.html
+++ b/docs/demo3.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - Building and Installing the Basic Demo
+
diff --git a/docs/demo4.html b/docs/demo4.html
index 56fb075e6dd..04ce3d5480e 100644
--- a/docs/demo4.html
+++ b/docs/demo4.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - Basic Demo Sources Walkthrough
+
diff --git a/docs/features.html b/docs/features.html
index 0bcccb5e871..73dbd7aca2a 100644
--- a/docs/features.html
+++ b/docs/features.html
@@ -33,6 +33,7 @@ limitations under the License.
Apache Lucene - Features
+
diff --git a/docs/fileformats.html b/docs/fileformats.html
index d868cbcd9e6..599281128a9 100644
--- a/docs/fileformats.html
+++ b/docs/fileformats.html
@@ -33,6 +33,7 @@ limitations under the License.
Apache Lucene - Index File Formats
+
@@ -113,7 +114,7 @@ limitations under the License.
This document defines the index file formats used
- in Lucene version 1.9. If you are using a different
+ in Lucene version 2.0. If you are using a different
version of Lucene, please consult the copy of
docs/fileformats.html that was distributed
with the version you are using.
@@ -220,6 +221,7 @@ limitations under the License.
tokenized, but sometimes it is useful for certain identifier fields
to be indexed literally.
+
See the Field java docs for more information on Fields.
@@ -362,8 +364,9 @@ limitations under the License.
Term Vectors. For each field in each document, the term vector
- (sometimes called document vector) is stored. A term vector consists
- of term text and term frequency.
+ (sometimes called document vector) may be stored. A term vector consists
+ of term text and term frequency. To add Term Vectors to your index see the
+ Field constructors
Deleted documents.
@@ -389,7 +392,8 @@ limitations under the License.
All files belonging to a segment have the same name with varying
extensions. The extensions correspond to the different file formats
- described below.
+ described below. When using the Compound File format (default in 1.4 and greater) these files are
+ collapsed into a single .cfs file (see below for details)
Typically, all segments
@@ -1197,6 +1201,7 @@ limitations under the License.
DataOffset --> Long
FileName --> String
FileData --> raw file data
+
The raw file data is the data from the individual files named above.
@@ -1495,7 +1500,10 @@ limitations under the License.
particular, it is the difference between the position of this term's
entry in that file and the position of the previous term's entry.
-
TODO: document skipInterval information
+
SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate TermDocs.skipTo(int).
+ Larger values result in smaller indexes, greater acceleration, but fewer accelerable cases, while
+ smaller values result in bigger indexes, less acceleration and more
+ accelerable cases.
diff --git a/docs/gettingstarted.html b/docs/gettingstarted.html
index 51228e175b1..03f61435bd5 100644
--- a/docs/gettingstarted.html
+++ b/docs/gettingstarted.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - Getting Started Guide
+
diff --git a/docs/index.html b/docs/index.html
index 5820463dfba..523209b0ac1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -41,6 +41,7 @@ limitations under the License.
Apache Lucene - Overview - Apache Lucene
+
diff --git a/docs/lucene-sandbox/index.html b/docs/lucene-sandbox/index.html
index e0a849a510e..d61517ceb85 100644
--- a/docs/lucene-sandbox/index.html
+++ b/docs/lucene-sandbox/index.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Lucene Sandbox
+
diff --git a/docs/mailinglists.html b/docs/mailinglists.html
index 76d7ca36660..3ad0bbe0a0b 100644
--- a/docs/mailinglists.html
+++ b/docs/mailinglists.html
@@ -33,6 +33,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - Mailing Lists
+
diff --git a/docs/queryparsersyntax.html b/docs/queryparsersyntax.html
index 9163161ac15..5e0d8ab7748 100644
--- a/docs/queryparsersyntax.html
+++ b/docs/queryparsersyntax.html
@@ -37,6 +37,7 @@ limitations under the License.
Apache Lucene -
Query Parser Syntax - Apache Lucene
+
diff --git a/docs/resources.html b/docs/resources.html
index f2e75fceba8..eadc1e3d738 100644
--- a/docs/resources.html
+++ b/docs/resources.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Resources - Apache Lucene
+
diff --git a/docs/systemproperties.html b/docs/systemproperties.html
index 5953ea348f4..c5dc7835f02 100644
--- a/docs/systemproperties.html
+++ b/docs/systemproperties.html
@@ -35,6 +35,7 @@ limitations under the License.
Apache Lucene - Apache Lucene - System Properties
+
diff --git a/docs/whoweare.html b/docs/whoweare.html
index 626aee0ca6d..2c4a6fdff08 100644
--- a/docs/whoweare.html
+++ b/docs/whoweare.html
@@ -37,6 +37,7 @@ limitations under the License.
Apache Lucene - Who We Are - Apache Lucene
+
@@ -157,7 +158,7 @@ patents.
Daniel Naber (dnaber@...)
Bernhard Messer (bmesser@...)
Yonik Seeley (yonik@...)
-
Grant Ingersoll (gsingers@...)
+
Grant Ingersoll (gsingers@...)
Note that the email addresses above end with @apache.org.
Lucene scoring is the heart of why we all love Lucene. It is blazingly fast and it hides almost all of the complexity from the user.
+ In a nutshell, it works. At least, that is, until it doesn't work, or doesn't work as one would expect it to
+ work. Then we are left digging into Lucene internals or asking for help on java-user@lucene.apache.org to figure out why a document with five of our query terms
+ scores lower than a different document with only one of the query terms.
+
While this document won't answer your specific scoring issues, it will, hopefully, point you to the places that can
+ help you figure out the what and why of Lucene scoring.
+
Lucene scoring uses a combination of the
+ Vector Space Model (VSM) of Information
+ Retrieval and the Boolean model
+ to determine
+ how relevant a given Document is to a User's query. In general, the idea behind the VSM is the more
+ times a query term appears in a document relative to
+ the number of times the term appears in all the documents in the collection, the more relevant that
+ document is to the query. It uses the Boolean model to first narrow down the documents that need to
+ be scored based on the use of boolean logic in the Query specification. Lucene also adds some
+ capabilities and refinements onto this model to support boolean and fuzzy searching, but it
+ essentially remains a VSM based system at the heart.
+ For some valuable references on VSM and IR in general refer to the
+ Lucene Wiki IR references.
+
+
The rest of this document will cover Scoring basics and how to change your
+ Similarity. Next it will cover ways you can
+ customize the Lucene internals in Changing your Scoring
+ -- Expert Level which gives details on implementing your own
+ Query class and related functionality. Finally, we
+ will finish up with some reference material in the Appendix.
+
+
+
+
Scoring is very much dependent on the way documents are indexed,
+ so it is important to understand indexing (see
+ Apache Lucene - Getting Started Guide
+ and the Lucene
+ file formats
+ before continuing on with this section.) It is also assumed that readers know how to use the
+ Searcher.explain(Query query, int doc) functionality,
+ which can go a long way in informing why a score is returned.
+
+
+
In Lucene, the objects we are scoring are
+ Documents. A Document is a collection
+ of
+ Fields. Each Field has semantics about how
+ it is created and stored (i.e. tokenized, untokenized, raw data, compressed, etc.) It is important to
+ note that Lucene scoring works on Fields and then combines the results to return Documents. This is
+ important because two Documents with the exact same content, but one having the content in two Fields
+ and the other in one Field will return different scores for the same query due to length normalization
+ (assuming the
+ DefaultSimilarity)
+ on the Fields.
+
+
+
+
+ Lucene's scoring formula, taken from
+ Similarity
+ is
+
This scoring formula is mostly incorporated into the
+ TermScorer class, where it makes calls to the
+ Similarity class to retrieve values for the following:
+
+
tf - Term Frequency - The number of times the term t appears in the current document being scored.
+
idf - Inverse Document Frequency - One divided by the number of documents in which the term t appears.
+
getBoost(t in q) - The boost, specified in the query by the user, that should be applied to this term.
+
lengthNorm(t.field in q) - The factor to apply to account for differing lengths in the fields that are being searched. Usually longer fields return a smaller value.
+
coord(q, d) - Score factor based on how many terms the specified document has in common with the query.
+
queryNorm(sumOfSquaredWeights) - Factor used to make scores between queries comparable
+ GSI: might be interesting to have a note on why this formula was chosen. I have always understood (but not 100% sure)
+ that it is not a good idea to compare scores across queries or indexes, so any use of normalization may lead to false assumptions. However, I also seem
+ to remember some research on using sum of squares as being somewhat suitable for score comparison. Anyone have any thoughts here?
+
+ Note, the above definitions are summaries of the javadocs which can be accessed by clicking the links in the formula and are merely provided
+ for context and are not authoritative.
+
+
+
+
OK, so the tf-idf formula and the
+ Similarity
+ is great for understanding the basics of Lucene scoring, but what really drives Lucene scoring are
+ the use and interactions between the
+ Query classes, as created by each application in
+ response to a user's information need.
+
+
In this regard, Lucene offers a wide variety of Query implementations, most of which are in the
+ org.apache.lucene.search package.
+ These implementations can be combined in a wide variety of ways to provide complex querying
+ capabilities along with
+ information about where matches took place in the document collection. The Query
+ section below will
+ highlight some of the more important Query classes. For information on the other ones, see the
+ package summary. For details on implementing
+ your own Query class, see Changing your Scoring --
+ Expert Level below.
+
+
Once a Query has been created and submitted to the
+ IndexSearcher, the scoring process
+ begins. (See the Appendix Algorithm section for more notes on the process.) After some infrastructure setup,
+ control finally passes to the Weight implementation and its
+ Scorer instance. In the case of any type of
+ BooleanQuery, scoring is handled by the
+ BooleanWeight2 (link goes to ViewVC BooleanQuery java code which contains the BooleanWeight2 inner class),
+ unless the static
+
+ BooleanQuery#setUseScorer14(boolean) method is set to true,
+ in which case the
+ BooleanWeight
+ (link goes to ViewVC BooleanQuery java code, which contains the BooleanWeight inner class) from the 1.4 version of Lucene is used by default.
+ See CHANGES.txt under release 1.9 RC1 for more information on choosing which Scorer to use.
+
+
+ Assuming the use of the BooleanWeight2, a
+ BooleanScorer2 is created by bringing together
+ all of the
+ Scorers from the sub-clauses of the BooleanQuery.
+ When the BooleanScorer2 is asked to score it delegates its work to an internal Scorer based on the type
+ of clauses in the Query. This internal Scorer essentially loops over the sub scorers and sums the scores
+ provided by each scorer while factoring in the coord() score.
+
+
Of the various implementations of
+ Query, the
+ TermQuery
+ is the easiest to understand and the most often used in most applications. A TermQuery is a Query
+ that matches all the documents that contain the specified
+ Term
+ . A Term is a word that occurs in a specific
+ Field
+ . Thus, a TermQuery identifies and scores all
+ Document
+ s that have a Field with the specified string in it.
+ Constructing a TermQuery is as simple as:
+ TermQuery tq = new TermQuery(new Term("fieldName", "term"));
+ In this example, the Query would identify all Documents that have the Field named "fieldName" that
+ contain the word "term".
+
Things start to get interesting when one starts to combine TermQuerys, which is handled by the
+ BooleanQuery
+ class. The BooleanQuery is a collection
+ of other
+ Query
+ classes along with semantics about how to combine the different subqueries.
+ It currently supports three different operators for specifying the logic of the query (see
+ BooleanClause
+ )
+
+
SHOULD -- Use this operator when a clause can occur in the result set, but is not required.
+ If a query is made up of all SHOULD clauses, then a non-empty result
+ set will have matched at least one of the clauses in the query.
+
MUST -- Use this operator when a clause is required to occur in the result set.
+
MUST NOT -- Use this operator when a clause must not occur in the result set.
+
+ Boolean queries are constructed by adding two or more
+ BooleanClause
+ instances to the BooleanQuery instance. In some cases,
+ too many clauses may be added to the BooleanQuery, which will cause a TooManyClauses exception to be
+ thrown. This
+ most often occurs when using a Query that is rewritten into many TermQuery instances, such as the
+ WildCardQuery
+ . The default
+ setting for too many clauses is currently set to 1024, but it can be overridden via the
+ BooleanQuery#setMaxClauseCount(int) static method on BooleanQuery.
+
+
Phrases
+
Another common task in search is to identify phrases, which can be handled in two different ways.
+
+
+ PhraseQuery
+ -- Matches a sequence of
+ Terms
+ . The PhraseQuery can specify a slop factor which determines
+ how many positions may occur between any two terms and still be considered a match.
+
+
+ SpanNearQuery
+ -- Matches a sequence of other
+ SpanQuery
+ instances. The SpanNearQuery allows for much more
+ complicated phrasal queries to be built since it is constructed out of other SpanQuery
+ objects, not just Terms.
+
The
+ RangeQuery
+ matches all documents that occur in the
+ exclusive range of a lower
+ Term
+ and an upper
+ Term
+ . For instance, one could find all documents
+ that have terms beginning with the letters a through c. This type of Query is most often used to
+ find
+ documents that occur in a specific date range.
+
While the
+ PrefixQuery
+ has a different implementation, it is essentially a special case of the
+ WildcardQuery
+ . The PrefixQuery allows an application
+ to identify all documents with terms that begin with a certain string. The WildcardQuery generalizes
+ this by allowing
+ for the use of * and ? wildcards. Note that the WildcardQuery can be quite slow. Also note that
+ WildcardQuerys should
+ not start with * and ?, as these are extremely slow. For tricks on how to search using a wildcard at
+ the beginning of a term, see
+
+ Starts With x and Ends With x Queries
+ from the Lucene archives.
+
A
+ FuzzyQuery
+ matches documents that contain similar terms to the specified term. Similarity is
+ determined using the
+ Levenshtein (edit distance) algorithm
+ . This type of query can be useful when accounting for spelling variations in the collection.
+
+
+
+
Chances are, the
+ DefaultSimilarity is sufficient for all your searching needs.
+ However, in some applications it may be necessary to alter your Similarity. For instance, some applications do not need to
+ distinguish between shorter documents and longer documents (for example,
+ see a "fair" similarity)
+ To change the Similarity, one must do so for both indexing and searching and the changes must take place before
+ any of these actions are undertaken (although in theory there is nothing stopping you from changing mid-stream, it just isn't well-defined what is going to happen).
+ To make this change, implement your Similarity (you probably want to override
+ DefaultSimilarity) and then set the new
+ class on
+ IndexWriter.setSimilarity(org.apache.lucene.search.Similarity) for indexing and on
+ Searcher.setSimilarity(org.apache.lucene.search.Similarity).
+
+
+ If you are interested in use cases for changing your similarity, see the mailing list at Overriding Similarity.
+ In summary, here are a few use cases:
+
+
SweetSpotSimilarity -- SweetSpotSimilarity gives small increases as the frequency increases a small amount
+ and then greater increases when you hit the "sweet spot", i.e. where you think the frequency of terms is more significant.
+
Overriding tf -- In some applications, it doesn't matter what the score of a document is as long as a matching term occurs. In these
+ cases people have overridden Similarity to return 1 from the tf() method.
+
Changing Length Normalization -- By overriding lengthNorm, it is possible to discount how the length of a field contributes
+ to a score. In the DefaultSimilarity, lengthNorm = 1/ (numTerms in field)^0.5, but if one changes this to be
+ 1 / (numTerms in field), all fields will be treated
+ "fairly".
+
+ In general, Chris Hostetter sums it up best in saying (from the mailing list):
+
[One would override the Similarity in] ... any situation where you know more about your data then just that
+ it's "text" is a situation where it *might* make sense to to override your
+ Similarity method.
+
+
+
+
+
+
Changing scoring is an expert level task, so tread carefully and be prepared to share your code if
+ you want help.
+
+
With the warning out of the way, it is possible to change a lot more than just the Similarity
+ when it comes to scoring in Lucene. Lucene's scoring is a complex mechanism that is grounded by
+ three main classes:
+
+
+ Query -- The abstract object representation of the user's information need.
+
+ Weight -- The internal interface representation of the user's Query, so that Query objects may be reused.
+
+ Scorer -- An abstract class containing common functionality for scoring. Provides both scoring and explanation capabilities.
+
+ Details on each of these classes, and their children can be found in the subsections below.
+
+
+
In some sense, the
+ Query
+ class is where it all begins. Without a Query, there would be
+ nothing to score. Furthermore, the Query class is the catalyst for the other scoring classes as it
+ is often responsible
+ for creating them or coordinating the functionality between them. The
+ Query class has several methods that are important for
+ derived classes:
+
+
createWeight(Searcher searcher) -- A
+ Weight is the internal representation of the Query, so each Query implementation must
+ provide an implementation of Weight. See the subsection on The Weight Interface below for details on implementing the Weight interface.
The
+ Weight
+ interface provides an internal representation of the Query so that it can be reused. Any
+ Searcher
+ dependent state should be stored in the Weight implementation,
+ not in the Query class. The interface defines 6 methods that must be implemented:
+
+
+ Weight#getQuery() -- Pointer to the Query that this Weight represents.
+
+ Weight#getValue() -- The weight for this Query. For example, the TermQuery.TermWeight value is
+ equal to the idf^2 * boost * queryNorm
+
+ Weight#normalize(float) -- Determine the query normalization factor. The query normalization may
+ allow for comparing scores between queries.
+
+
+ Weight#scorer(IndexReader) -- Construct a new
+ Scorer
+ for this Weight. See
+ The Scorer Class
+ below for help defining a Scorer. As the name implies, the
+ Scorer is responsible for doing the actual scoring of documents given the Query.
+
The
+ Scorer
+ abstract class provides common scoring functionality for all Scorer implementations and
+ is the heart of the Lucene scoring process. The Scorer defines the following abstract methods which
+ must be implemented:
+
+
+ Scorer#next() -- Advances to the next document that matches this Query, returning true if and only
+ if there is another document that matches.
+
+ Scorer#doc() -- Returns the id of the
+ Document
+ that contains the match. Is not valid until next() has been called at least once.
+
+
+ Scorer#score() -- Return the score of the current document. This value can be determined in any
+ appropriate way for an application. For instance, the
+ TermScorer
+ returns the tf * Weight.getValue() * fieldNorm.
+
+
+ Scorer#skipTo(int) -- Skip ahead in the document matches to the document whose id is greater than
+ or equal to the passed in value. In many instances, skipTo can be
+ implemented more efficiently than simply looping through all the matching documents until
+ the target document is identified.
In a nutshell, you want to add your own custom Query implementation when you think that Lucene's
+ aren't appropriate for the
+ task that you want to do. You might be doing some cutting edge research or you need more information
+ back
+ out of Lucene (similar to Doug adding SpanQuery functionality).
GSI Note: This section is mostly my notes on stepping through the Scoring process and serves as
+ fertilizer for the earlier sections.
+
In the typical search application, a
+ Query
+ is passed to the
+ Searcher
+ , beginning the scoring process.
+
+
Once inside the Searcher, a
+ Hits
+ object is constructed, which handles the scoring and caching of the search results.
+ The Hits constructor stores references to three or four important objects:
+
+
The
+ Weight
+ object of the Query. The Weight object is an internal representation of the Query that
+ allows the Query to be reused by the Searcher.
+
+
The Searcher that initiated the call.
+
A
+ Filter
+ for limiting the result set. Note, the Filter may be null.
+
+
A
+ Sort
+ object for specifying how to sort the results if the standard score based sort method is not
+ desired.
+
+
+
+
Now that the Hits object has been initialized, it begins the process of identifying documents that
+ match the query by calling getMoreDocs method. Assuming we are not sorting (since sorting doesn't
+ affect the raw Lucene score),
+ we call on the "expert" search method of the Searcher, passing in our
+ Weight
+ object,
+ Filter
+ and the number of results we want. This method
+ returns a
+ TopDocs
+ object, which is an internal collection of search results.
+ The Searcher creates a
+ TopDocCollector
+ and passes it along with the Weight, Filter to another expert search method (for more on the
+ HitCollector
+ mechanism, see
+ Searcher
+ .) The TopDocCollector uses a
+ PriorityQueue
+ to collect the top results for the search.
+
+
If a Filter is being used, some initial setup is done to determine which docs to include. Otherwise,
+ we ask the Weight for
+ a
+ Scorer
+ for the
+ IndexReader
+ of the current searcher and we proceed by
+ calling the score method on the
+ Scorer
+ .
+
+
At last, we are actually going to score some documents. The score method takes in the HitCollector
+ (most likely the TopDocCollector) and does its business.
+ Of course, here is where things get involved. The
+ Scorer
+ that is returned by the
+ Weight
+ object depends on what type of Query was submitted. In most real world applications with multiple
+ query terms,
+ the
+ Scorer
+ is going to be a
+ BooleanScorer2
+ (see the section on customizing your scoring for info on changing this.)
+
+
+
Assuming a BooleanScorer2 scorer, we first initialize the Coordinator, which is used to apply the
+ coord() factor. We then
+ get an internal Scorer based on the required, optional and prohibited parts of the query.
+ Using this internal Scorer, the BooleanScorer2 then proceeds
+ into a while loop based on the Scorer#next() method. The next() method advances to the next document
+ matching the query. This is an
+ abstract method in the Scorer class and is thus overridden by all derived
+ implementations. If you have a simple OR query
+ your internal Scorer is most likely a DisjunctionSumScorer, which essentially combines the scorers
+ from the sub scorers of the OR'd terms.
+
+
+
+
\ No newline at end of file
diff --git a/xdocs/styles/lucene.css b/xdocs/styles/lucene.css
new file mode 100644
index 00000000000..5c1fa7acc16
--- /dev/null
+++ b/xdocs/styles/lucene.css
@@ -0,0 +1,34 @@
+/*
+ Place for sharing style information across the XDocs
+
+*/
+
+
+.big{
+ font-size: 1.5em;
+}
+
+.formula{
+ font-size: 0.9em;
+ display: block;
+ position: relative;
+ left: -25px;
+}
+
+#summation{
+
+}
+
+.summation-range{
+ position: relative;
+ top: 5px;
+ font-size: 0.85em;
+}
+
+/*
+Useful for highlighting pieces of documentation that others should pay special attention to
+when proof reading
+*/
+.highlight-for-editing{
+ background-color: yellow;
+}
\ No newline at end of file
diff --git a/xdocs/stylesheets/site.vsl b/xdocs/stylesheets/site.vsl
index 7e6d9986cc6..6e1279d071a 100644
--- a/xdocs/stylesheets/site.vsl
+++ b/xdocs/stylesheets/site.vsl
@@ -266,6 +266,7 @@ limitations under the License.
#end
$project.getChild("title").getText() - $root.getChild("properties").getChild("title").getText()
+