Added "xml-query-parser" contrib module, a new extensible query parser that handles queries expressed as XML. It depends on the new "queries" contrib module.

Added "queries" contrib module for various new query/filter classes. This area is also intended to consolidate existing query classes so have moved a copy of MoreLikeThis into here. Probably need to remove "similarity" module as a result, if no one objects.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@380874 13f79535-47bb-0310-9956-ffa450edef68
Mark Harwood 2006-02-25 00:39:18 +00:00
parent c73de87a8f
commit 87768c51c6
50 changed files with 3864 additions and 0 deletions

contrib/queries/build.xml

@@ -0,0 +1,10 @@
<?xml version="1.0"?>
<project name="queries" default="default">
<description>
Queries - various query object exotica not in core
</description>
<import file="../contrib-build.xml"/>
</project>


@@ -0,0 +1,27 @@
This module contains a number of filter and query objects that add to core Lucene.
The "MoreLikeThis" class from the "similarity" module has been copied into here.
If people are generally happy with this move, the similarity module can be deleted, or at least a
"Moved to queries module..." note left in its place.
==== FuzzyLikeThis - mixes the behaviour of FuzzyQuery and MoreLikeThis, with special consideration
of fuzzy scoring factors. This generally produces good results for queries where users provide details across a number of
fields, have no knowledge of Boolean query syntax, and want a degree of fuzzy matching. The query is fast because, like
MoreLikeThis, it optimizes the query to use only the most distinguishing terms.
==== BoostingQuery - effectively demotes search results that match a given query.
Unlike the "NOT" clause, this still selects documents that contain undesirable terms,
but reduces the overall score of docs containing these terms.
==== TermsFilter - Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
a sequence. An example might be a collection of primary keys from a database query result or perhaps
a choice of "category" labels picked by the end user.
Mark Harwood
25/02/2006


@@ -0,0 +1,71 @@
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
/**
* The BoostingQuery class can be used to effectively demote results that match a given query.
* Unlike the "NOT" clause, this still selects documents that contain undesirable terms,
* but reduces their overall score:
*
* Query balancedQuery = new BoostingQuery(positiveQuery, negativeQuery, 0.01f);
* In this scenario the positiveQuery contains the mandatory, desirable criteria used to
* select all matching documents, and the negativeQuery contains the undesirable elements which
* are simply used to lessen the scores. Documents that match the negativeQuery have their score
* multiplied by the supplied "boost" parameter, so this should be less than 1 to achieve a
* demoting effect.
*
* This code was originally made available here: http://marc.theaimsgroup.com/?l=lucene-user&m=108058407130459&w=2
* and is documented here: http://wiki.apache.org/jakarta-lucene/CommunityContributions
*/
public class BoostingQuery extends Query {
private float boost; // the amount to boost by
private Query match; // query to match
private Query context; // boost when matches too
public BoostingQuery(Query match, Query context, float boost) {
this.match = match;
this.context = (Query)context.clone(); // clone before boost
this.boost = boost;
this.context.setBoost(0.0f); // ignore context-only matches (must be set on the clone, not the caller's query)
}
public Query rewrite(IndexReader reader) throws IOException {
BooleanQuery result = new BooleanQuery() {
public Similarity getSimilarity(Searcher searcher) {
return new DefaultSimilarity() {
public float coord(int overlap, int max) {
switch (overlap) {
case 1: // matched only one clause
return 1.0f; // use the score as-is
case 2: // matched both clauses
return boost; // multiply by boost
default:
return 0.0f;
}
}
};
}
};
result.add(match, BooleanClause.Occur.MUST);
result.add(context, BooleanClause.Occur.SHOULD);
return result;
}
public String toString(String field) {
return match.toString(field) + "/" + context.toString(field);
}
}
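
A minimal usage sketch (not part of this commit - the index path, field name and terms are placeholders):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;

public class BoostingQueryExample {
    public static void main(String[] args) throws Exception {
        // promote docs matching "apache" but demote those that also mention "deprecated"
        Query positive = new TermQuery(new Term("contents", "apache"));
        Query negative = new TermQuery(new Term("contents", "deprecated"));
        Query balanced = new BoostingQuery(positive, negative, 0.01f);
        IndexSearcher searcher = new IndexSearcher("/path/to/index"); // placeholder index location
        Hits hits = searcher.search(balanced);
        // docs containing "deprecated" are still returned, just with a fraction of their normal score
        System.out.println(hits.length() + " matching docs");
    }
}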


@@ -0,0 +1,302 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.PriorityQueue;
/**
* Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
* In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
* of fuzzy scoring factors.
* This generally produces good results for queries where users may provide details in a number of
* fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
* a fast query.
*
* For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
* we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
* TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
terms, e.g. misspellings. Instead, all variants use the same IDF ranking (the one for the source query
* term) and this is factored into the variant's boost. If the source query term does not exist in the
* index the average IDF of the variants is used.
* @author maharwood
*/
public class FuzzyLikeThisQuery extends Query
{
static Similarity sim=new DefaultSimilarity();
Query rewrittenQuery=null;
ArrayList fieldVals=new ArrayList();
Analyzer analyzer;
ScoreTermQueue q;
int MAX_VARIANTS_PER_TERM=50;
boolean ignoreTF=false;
/**
*
* @param maxNumTerms The total number of term clauses that will appear once rewritten as a BooleanQuery
* @param analyzer the analyzer used to tokenize the query strings added via addTerms
*/
public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
{
q=new ScoreTermQueue(maxNumTerms);
this.analyzer=analyzer;
}
class FieldVals
{
String queryString;
String fieldName;
float minSimilarity;
int prefixLength;
public FieldVals(String name, float similarity, int length, String queryString)
{
fieldName = name;
minSimilarity = similarity;
prefixLength = length;
this.queryString = queryString;
}
}
/**
* Adds user input for "fuzzification"
* @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be generated
* @param fieldName the field against which the fuzzified terms will be matched
* @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
* @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
*/
public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
{
fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
}
private void addTerms(IndexReader reader,FieldVals f) throws IOException
{
if(f.queryString==null) return;
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
Token token=ts.next();
int corpusNumDocs=reader.numDocs();
Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects
while(token!=null)
{
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore=0;
Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
TermEnum origEnum = reader.terms(startTerm);
int df=0;
if(startTerm.equals(origEnum.term()))
{
df=origEnum.docFreq(); //store the df so all variants use same idf
}
int numVariants=0;
int totalVariantDocFreqs=0;
do
{
Term possibleMatch=fe.term();
if(possibleMatch!=null)
{
numVariants++;
totalVariantDocFreqs+=fe.docFreq();
float score=fe.difference();
if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
variantsQ.insert(st);
minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
}
}
}
while(fe.next());
if(numVariants==0)
{
//no variants for this token - move on to the next token rather than aborting the loop
token=ts.next();
continue;
}
int avgDf=totalVariantDocFreqs/numVariants;
if(df==0)//no direct match we can use as df for all variants
{
df=avgDf; //use avg df of all variants
}
// take the top variants (scored by edit distance) and reset the score
// to include an IDF factor then add to the global queue for ranking overall top query terms
int size = variantsQ.size();
for(int i = 0; i < size; i++)
{
ScoreTerm st = (ScoreTerm) variantsQ.pop();
st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
q.insert(st);
}
token=ts.next();
}
}
public Query rewrite(IndexReader reader) throws IOException
{
if(rewrittenQuery!=null)
{
return rewrittenQuery;
}
//load up the list of possible terms
for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
{
FieldVals f = (FieldVals) iter.next();
addTerms(reader,f);
}
//clear the list of fields
fieldVals.clear();
BooleanQuery bq=new BooleanQuery();
//create BooleanQueries to hold the variants for each token/field pair and ensure it
// has no coord factor
//Step 1: sort the termqueries by term/field
HashMap variantQueries=new HashMap();
int size = q.size();
for(int i = 0; i < size; i++)
{
ScoreTerm st = (ScoreTerm) q.pop();
ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
if(l==null)
{
l=new ArrayList();
variantQueries.put(st.fuzziedSourceTerm,l);
}
l.add(st);
}
//Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
{
ArrayList variants = (ArrayList) iter.next();
if(variants.size()==1)
{
//optimize where only one selected variant
ScoreTerm st=(ScoreTerm) variants.get(0);
TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
tq.setBoost(st.score); // set the boost to a mix of IDF and score
bq.add(tq, BooleanClause.Occur.SHOULD);
}
else
{
BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
for (Iterator iterator2 = variants.iterator(); iterator2
.hasNext();)
{
ScoreTerm st = (ScoreTerm) iterator2.next();
TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
tq.setBoost(st.score); // set the boost using the ScoreTerm's score
termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
}
bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
}
}
//TODO possible alternative step 3 - organize above booleans into a new layer of field-based
// booleans with a minimum-should-match of NumFields-1?
this.rewrittenQuery=bq;
return bq;
}
//Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
// term variants) then is reset with IDF for use in ranking against all other
// terms/fields
private static class ScoreTerm{
public Term term;
public float score;
Term fuzziedSourceTerm;
public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
this.term = term;
this.score = score;
this.fuzziedSourceTerm=fuzziedSourceTerm;
}
}
private static class ScoreTermQueue extends PriorityQueue {
public ScoreTermQueue(int size){
initialize(size);
}
/* (non-Javadoc)
* @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
*/
protected boolean lessThan(Object a, Object b) {
ScoreTerm termA = (ScoreTerm)a;
ScoreTerm termB = (ScoreTerm)b;
if (termA.score== termB.score)
return termA.term.compareTo(termB.term) > 0;
else
return termA.score < termB.score;
}
}
//overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
private static class FuzzyTermQuery extends TermQuery
{
boolean ignoreTF;
public FuzzyTermQuery(Term t, boolean ignoreTF)
{
super(t);
this.ignoreTF=ignoreTF;
}
public Similarity getSimilarity(Searcher searcher)
{
Similarity result = super.getSimilarity(searcher);
result = new SimilarityDelegator(result) {
public float tf(float freq)
{
if(ignoreTF)
{
return 1; //ignore tf
}
return super.tf(freq);
}
public float idf(int docFreq, int numDocs)
{
//IDF is already factored into individual term boosts
return 1;
}
};
return result;
}
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Query#toString(java.lang.String)
*/
public String toString(String field)
{
return null; //no useful string form - the expanded terms are only known after rewrite()
}
public boolean isIgnoreTF()
{
return ignoreTF;
}
public void setIgnoreTF(boolean ignoreTF)
{
this.ignoreTF = ignoreTF;
}
}
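
A minimal usage sketch (not part of this commit - index path, field name and the misspelt input are placeholders):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.FuzzyLikeThisQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;

public class FuzzyLikeThisExample {
    public static void main(String[] args) throws Exception {
        // keep only the 30 most distinguishing terms across all fuzzy variants
        FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(30, new StandardAnalyzer());
        // user-entered text with likely misspellings; minSimilarity 0.7, no required prefix
        flt.addTerms("appache lucine", "contents", 0.7f, 0);
        IndexSearcher searcher = new IndexSearcher("/path/to/index");
        Hits hits = searcher.search(flt); // the query is rewritten against the index at search time
        System.out.println(hits.length() + " fuzzy matches");
    }
}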


@@ -0,0 +1,51 @@
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
/**
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
* equivalent query (a BooleanQuery with many "should" TermQueries)
*
* @author maharwood
*/
public class TermsFilter extends Filter
{
ArrayList termsList=new ArrayList();
/**
* Adds a term to the list of acceptable terms
* @param term
*/
public void addTerm(Term term)
{
termsList.add(term);
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Filter#bits(org.apache.lucene.index.IndexReader)
*/
public BitSet bits(IndexReader reader) throws IOException
{
BitSet result=new BitSet(reader.maxDoc());
for (Iterator iter = termsList.iterator(); iter.hasNext();)
{
Term term = (Term) iter.next();
TermDocs td=reader.termDocs(term);
while (td.next())
{
result.set(td.doc());
}
}
return result;
}
}
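
A minimal usage sketch (not part of this commit - the "id" field and key values are placeholders):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;

public class TermsFilterExample {
    public static void main(String[] args) throws Exception {
        TermsFilter filter = new TermsFilter();
        // e.g. primary keys returned by an external database query
        filter.addTerm(new Term("id", "1001"));
        filter.addTerm(new Term("id", "2017"));
        filter.addTerm(new Term("id", "3049"));
        IndexSearcher searcher = new IndexSearcher("/path/to/index");
        // restrict an ordinary query to just those documents
        Hits hits = searcher.search(new TermQuery(new Term("contents", "lucene")), filter);
        System.out.println(hits.length() + " hits within the keyed subset");
    }
}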


@@ -0,0 +1,926 @@
/**
* Copyright 2004-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similar;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import java.util.Set;
import java.util.HashMap;
import java.util.Map;
import java.util.Collection;
import java.util.Iterator;
import java.io.IOException;
import java.io.Reader;
import java.io.File;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
/**
* Generate "more like this" similarity queries.
* Based on this mail:
* <code><pre>
* Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
* Term frequencies can be computed by re-tokenizing the text, which, for a single document,
* is usually fast enough. But looking up the docFreq() of every term in the document is
* probably too slow.
*
* You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
* or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
* in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
* reduce the number of terms under consideration. Another heuristic is that terms with a
* high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
* number of characters, not selecting anything less than, e.g., six or seven characters.
* With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
* that do a pretty good job of characterizing a document.
*
* It all depends on what you're trying to do. If you're trying to eke out that last percent
* of precision and recall regardless of computational difficulty so that you can win a TREC
* competition, then the techniques I mention above are useless. But if you're trying to
* provide a "more like this" button on a search results page that does a decent job and has
* good performance, such techniques might be useful.
*
* An efficient, effective "more-like-this" query generator would be a great contribution, if
* anyone's interested. I'd imagine that it would take a Reader or a String (the document's
* text), analyzer Analyzer, and return a set of representative terms using heuristics like those
* above. The frequency and length thresholds could be parameters, etc.
*
* Doug
* </pre></code>
*
*
* <p>
* <h3>Initial Usage</h3>
*
* This class has lots of options to try to make it efficient and flexible.
* See the body of {@link #main main()} below in the source for real code, or
* if you want pseudo code, the simplest possible usage is as follows. The bold
* fragment is specific to this class.
*
* <code><pre>
*
* IndexReader ir = ...
* IndexSearcher is = ...
* <b>
* MoreLikeThis mlt = new MoreLikeThis(ir);
* Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
* Query query = mlt.like( target);
* </b>
* Hits hits = is.search(query);
* <em>// now the usual iteration thru 'hits' - the only thing to watch for is to make sure
* you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
*
* </pre></code>
*
* Thus you:
* <ol>
* <li> do your normal, Lucene setup for searching,
* <li> create a MoreLikeThis,
* <li> get the text of the doc you want to find similaries to
* <li> then call one of the like() calls to generate a similarity query
* <li> call the searcher to find the similar docs
* </ol>
*
* <h3>More Advanced Usage</h3>
*
* You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
* multiple fields (e.g. body and title) for similarity.
* <p>
*
* Depending on the size of your index and the size and makeup of your documents you
* may want to call the other set methods to control how the similarity queries are
* generated:
* <ul>
* <li> {@link #setMinTermFreq setMinTermFreq(...)}
* <li> {@link #setMinDocFreq setMinDocFreq(...)}
* <li> {@link #setMinWordLen setMinWordLen(...)}
* <li> {@link #setMaxWordLen setMaxWordLen(...)}
* <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
* <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
* <li> {@link #setStopWords setStopWord(...)}
* </ul>
*
* <hr>
* <pre>
* Changes: Mark Harwood 29/02/04
* Some bugfixing, some refactoring, some optimisation.
* - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector - added missing code
* - bugfix: no significant terms were being created for fields with a termvector - because
* only one occurrence per term/field pair was being counted (i.e. frequency info from the TermVector was not included)
* - refactor: moved common code into isNoiseWord()
* - optimise: when no termvector support available - used maxNumTokensParsed to limit amount of tokenization
* </pre>
*
* @author David Spencer
* @author Bruce Ritchie
* @author Mark Harwood
*/
public final class MoreLikeThis {
/**
* Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
* @see #getMaxNumTokensParsed
*/
public static final int DEFAULT_MAX_NUM_TOKENS_PARSED=5000;
/**
* Default analyzer to parse source doc with.
* @see #getAnalyzer
*/
public static final Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
/**
* Ignore terms with less than this frequency in the source doc.
* @see #getMinTermFreq
* @see #setMinTermFreq
*/
public static final int DEFAULT_MIN_TERM_FREQ = 2;
/**
* Ignore words which do not occur in at least this many docs.
* @see #getMinDocFreq
* @see #setMinDocFreq
*/
public static final int DEFALT_MIN_DOC_FREQ = 5;
/**
* Boost terms in query based on score.
* @see #isBoost
* @see #setBoost
*/
public static final boolean DEFAULT_BOOST = false;
/**
* Default field names. Null is used to specify that the field names should be looked
* up at runtime from the provided reader.
*/
public static final String[] DEFAULT_FIELD_NAMES = new String[] { "contents"};
/**
* Ignore words less than this length or if 0 then this has no effect.
* @see #getMinWordLen
* @see #setMinWordLen
*/
public static final int DEFAULT_MIN_WORD_LENGTH = 0;
/**
* Ignore words greater than this length or if 0 then this has no effect.
* @see #getMaxWordLen
* @see #setMaxWordLen
*/
public static final int DEFAULT_MAX_WORD_LENGTH = 0;
/**
* Default set of stopwords.
* If null means to allow stop words.
*
* @see #setStopWords
* @see #getStopWords
*/
public static final Set DEFAULT_STOP_WORDS = null;
/**
* Current set of stop words.
*/
private Set stopWords = DEFAULT_STOP_WORDS;
/**
* Return a Query with no more than this many terms.
*
* @see BooleanQuery#getMaxClauseCount
* @see #getMaxQueryTerms
* @see #setMaxQueryTerms
*/
public static final int DEFAULT_MAX_QUERY_TERMS = 25;
/**
* Analyzer that will be used to parse the doc.
*/
private Analyzer analyzer = DEFAULT_ANALYZER;
/**
* Ignore words less frequent than this.
*/
private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
/**
* Ignore words which do not occur in at least this many docs.
*/
private int minDocFreq = DEFALT_MIN_DOC_FREQ;
/**
* Should we apply a boost to the Query based on the scores?
*/
private boolean boost = DEFAULT_BOOST;
/**
* Field names we'll analyze.
*/
private String[] fieldNames = DEFAULT_FIELD_NAMES;
/**
* The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
*/
private int maxNumTokensParsed=DEFAULT_MAX_NUM_TOKENS_PARSED;
/**
* Ignore words if less than this len.
*/
private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
/**
* Ignore words if greater than this len.
*/
private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
/**
* Don't return a query longer than this.
*/
private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
/**
* For idf() calculations.
*/
private Similarity similarity = new DefaultSimilarity();
/**
* IndexReader to use
*/
private final IndexReader ir;
/**
* Constructor requiring an IndexReader.
*/
public MoreLikeThis(IndexReader ir) {
this.ir = ir;
}
/**
* Returns the analyzer that will be used to parse the source doc. The default analyzer
* is the {@link #DEFAULT_ANALYZER}.
*
* @return the analyzer that will be used to parse the source doc.
* @see #DEFAULT_ANALYZER
*/
public Analyzer getAnalyzer() {
return analyzer;
}
/**
* Sets the analyzer to use. An analyzer is not required for generating a query with the
* {@link #like(int)} method; all other 'like' methods require an analyzer.
*
* @param analyzer the analyzer to use to tokenize text.
*/
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
/**
* Returns the frequency below which terms will be ignored in the source doc. The default
* frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
*
* @return the frequency below which terms will be ignored in the source doc.
*/
public int getMinTermFreq() {
return minTermFreq;
}
/**
* Sets the frequency below which terms will be ignored in the source doc.
*
* @param minTermFreq the frequency below which terms will be ignored in the source doc.
*/
public void setMinTermFreq(int minTermFreq) {
this.minTermFreq = minTermFreq;
}
/**
* Returns the minimum number of documents in which a word must occur to be considered;
* words occurring in fewer docs are ignored. The default is {@link #DEFALT_MIN_DOC_FREQ}.
*
* @return the minimum document frequency below which words are ignored.
*/
public int getMinDocFreq() {
return minDocFreq;
}
/**
* Sets the minimum number of documents in which a word must occur to be considered.
*
* @param minDocFreq words occurring in fewer docs than this are ignored.
*/
public void setMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
/**
* Returns whether to boost terms in query based on "score" or not. The default is
* {@link #DEFAULT_BOOST}.
*
* @return whether to boost terms in query based on "score" or not.
* @see #setBoost
*/
public boolean isBoost() {
return boost;
}
/**
* Sets whether to boost terms in query based on "score" or not.
*
* @param boost true to boost terms in query based on "score", false otherwise.
* @see #isBoost
*/
public void setBoost(boolean boost) {
this.boost = boost;
}
/**
* Returns the field names that will be used when generating the 'More Like This' query.
* The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
*
* @return the field names that will be used when generating the 'More Like This' query.
*/
public String[] getFieldNames() {
return fieldNames;
}
/**
* Sets the field names that will be used when generating the 'More Like This' query.
* Set this to null for the field names to be determined at runtime from the IndexReader
* provided in the constructor.
*
* @param fieldNames the field names that will be used when generating the 'More Like This'
* query.
*/
public void setFieldNames(String[] fieldNames) {
this.fieldNames = fieldNames;
}
/**
* Returns the minimum word length below which words will be ignored. Set this to 0 for no
* minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
*
* @return the minimum word length below which words will be ignored.
*/
public int getMinWordLen() {
return minWordLen;
}
/**
* Sets the minimum word length below which words will be ignored.
*
* @param minWordLen the minimum word length below which words will be ignored.
*/
public void setMinWordLen(int minWordLen) {
this.minWordLen = minWordLen;
}
/**
* Returns the maximum word length above which words will be ignored. Set this to 0 for no
* maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
*
* @return the maximum word length above which words will be ignored.
*/
public int getMaxWordLen() {
return maxWordLen;
}
/**
* Sets the maximum word length above which words will be ignored.
*
* @param maxWordLen the maximum word length above which words will be ignored.
*/
public void setMaxWordLen(int maxWordLen) {
this.maxWordLen = maxWordLen;
}
/**
* Set the set of stopwords.
* Any word in this set is considered "uninteresting" and ignored.
* Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
* for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
*
* @param stopWords set of stopwords, if null it means to allow stop words
*
* @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
* @see #getStopWords
*/
public void setStopWords(Set stopWords) {
this.stopWords = stopWords;
}
/**
* Get the current stop words being used.
* @see #setStopWords
*/
public Set getStopWords() {
return stopWords;
}
/**
* Returns the maximum number of query terms that will be included in any generated query.
* The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
*
* @return the maximum number of query terms that will be included in any generated query.
*/
public int getMaxQueryTerms() {
return maxQueryTerms;
}
/**
* Sets the maximum number of query terms that will be included in any generated query.
*
* @param maxQueryTerms the maximum number of query terms that will be included in any
* generated query.
*/
public void setMaxQueryTerms(int maxQueryTerms) {
this.maxQueryTerms = maxQueryTerms;
}
/**
* @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
* @see #DEFAULT_MAX_NUM_TOKENS_PARSED
*/
public int getMaxNumTokensParsed()
{
return maxNumTokensParsed;
}
/**
* @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
*/
public void setMaxNumTokensParsed(int i)
{
maxNumTokensParsed = i;
}
/**
* Return a query that will return docs like the passed lucene document ID.
*
* @param docNum the documentID of the lucene doc to generate the 'More Like This' query for.
* @return a query that will return docs like the passed lucene document ID.
*/
public Query like(int docNum) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
fieldNames = (String[]) fields.toArray(new String[fields.size()]);
}
return createQuery(retrieveTerms(docNum));
}
/**
* Return a query that will return docs like the passed file.
*
* @return a query that will return docs like the passed file.
*/
public Query like(File f) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
fieldNames = (String[]) fields.toArray(new String[fields.size()]);
}
return like(new FileReader(f));
}
/**
* Return a query that will return docs like the passed URL.
*
* @return a query that will return docs like the passed URL.
*/
public Query like(URL u) throws IOException {
return like(new InputStreamReader(u.openConnection().getInputStream()));
}
/**
* Return a query that will return docs like the passed stream.
*
* @return a query that will return docs like the passed stream.
*/
public Query like(java.io.InputStream is) throws IOException {
return like(new InputStreamReader(is));
}
/**
* Return a query that will return docs like the passed Reader.
*
* @return a query that will return docs like the passed Reader.
*/
public Query like(Reader r) throws IOException {
return createQuery(retrieveTerms(r));
}
/**
* Create the More like query from a PriorityQueue
*/
private Query createQuery(PriorityQueue q) {
BooleanQuery query = new BooleanQuery();
Object cur;
int qterms = 0;
float bestScore = 0;
while (((cur = q.pop()) != null)) {
Object[] ar = (Object[]) cur;
TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
if (boost) {
if (qterms == 0) {
bestScore = ((Float) ar[2]).floatValue();
}
float myScore = ((Float) ar[2]).floatValue();
tq.setBoost(myScore / bestScore);
}
try {
query.add(tq, BooleanClause.Occur.SHOULD);
}
catch (BooleanQuery.TooManyClauses ignore) {
break;
}
qterms++;
if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
break;
}
}
return query;
}
/**
* Create a PriorityQueue from a word->tf map.
*
* @param words a map of words keyed on the word (String) with Int objects as the values.
*/
private PriorityQueue createQueue(Map words) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
FreqQ res = new FreqQ(words.size()); // will order words by score
Iterator it = words.keySet().iterator();
while (it.hasNext()) { // for every word
String word = (String) it.next();
int tf = ((Int) words.get(word)).x; // term freq in the source doc
if (minTermFreq > 0 && tf < minTermFreq) {
continue; // filter out words that don't occur enough times in the source
}
// go through all the fields and find the largest document frequency
String topField = fieldNames[0];
int docFreq = 0;
for (int i = 0; i < fieldNames.length; i++) {
int freq = ir.docFreq(new Term(fieldNames[i], word));
topField = (freq > docFreq) ? fieldNames[i] : topField;
docFreq = (freq > docFreq) ? freq : docFreq;
}
if (minDocFreq > 0 && docFreq < minDocFreq) {
continue; // filter out words that don't occur in enough docs
}
if (docFreq == 0) {
continue; // index update problem?
}
float idf = similarity.idf(docFreq, numDocs);
float score = tf * idf;
// only really need 1st 3 entries, other ones are for troubleshooting
res.insert(new Object[]{word, // the word
topField, // the top field
new Float(score), // overall score
new Float(idf), // idf
new Integer(docFreq), // freq in all docs
new Integer(tf)
});
}
return res;
}
/**
* Describe the parameters that control how the "more like this" query is formed.
*/
public String describeParams() {
StringBuffer sb = new StringBuffer();
sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
sb.append("\t" + "minWordLen : " + minWordLen + "\n");
sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
sb.append("\t" + "fieldNames : \"");
String delim = "";
for (int i = 0; i < fieldNames.length; i++) {
String fieldName = fieldNames[i];
sb.append(delim).append(fieldName);
delim = ", ";
}
sb.append("\n");
sb.append("\t" + "boost : " + boost + "\n");
sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
return sb.toString();
}
/**
* Test driver.
* Pass in "-i INDEX" and then either "-fn FILE" or "-url URL".
*/
public static void main(String[] a) throws Throwable {
String indexName = "localhost_index";
String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
URL url = null;
for (int i = 0; i < a.length; i++) {
if (a[i].equals("-i")) {
indexName = a[++i];
}
else if (a[i].equals("-f")) {
fn = a[++i];
}
else if (a[i].equals("-url")) {
url = new URL(a[++i]);
}
}
PrintStream o = System.out;
IndexReader r = IndexReader.open(indexName);
o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");
MoreLikeThis mlt = new MoreLikeThis(r);
o.println("Query generation parameters:");
o.println(mlt.describeParams());
o.println();
Query query = null;
if (url != null) {
o.println("Parsing URL: " + url);
query = mlt.like(url);
}
else if (fn != null) {
o.println("Parsing file: " + fn);
query = mlt.like(new File(fn));
}
o.println("q: " + query);
o.println();
IndexSearcher searcher = new IndexSearcher(indexName);
Hits hits = searcher.search(query);
int len = hits.length();
o.println("found: " + len + " documents matching");
o.println();
for (int i = 0; i < Math.min(25, len); i++) {
Document d = hits.doc(i);
String summary = d.get( "summary");
o.println("score : " + hits.score(i));
o.println("url : " + d.get("url"));
o.println("\ttitle : " + d.get("title"));
if ( summary != null)
o.println("\tsummary: " + d.get("summary"));
o.println();
}
}
/**
* Find words for a more-like-this query former.
*
* @param docNum the id of the lucene document from which to find terms
*/
private PriorityQueue retrieveTerms(int docNum) throws IOException {
Map termFreqMap = new HashMap();
for (int i = 0; i < fieldNames.length; i++) {
String fieldName = fieldNames[i];
TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
// field does not store term vector info
if (vector == null) {
Document d=ir.document(docNum);
String text[]=d.getValues(fieldName);
if(text!=null)
{
for (int j = 0; j < text.length; j++) {
addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
}
}
}
else {
addTermFrequencies(termFreqMap, vector);
}
}
return createQueue(termFreqMap);
}
/**
* Adds terms and frequencies found in vector into the Map termFreqMap
* @param termFreqMap a Map of terms and their frequencies
* @param vector List of terms and their frequencies for a doc/field
*/
private void addTermFrequencies(Map termFreqMap, TermFreqVector vector)
{
String[] terms = vector.getTerms();
int freqs[]=vector.getTermFrequencies();
for (int j = 0; j < terms.length; j++) {
String term = terms[j];
if(isNoiseWord(term)){
continue;
}
// increment frequency
Int cnt = (Int) termFreqMap.get(term);
if (cnt == null) {
cnt=new Int();
termFreqMap.put(term, cnt);
cnt.x=freqs[j];
}
else {
cnt.x+=freqs[j];
}
}
}
/**
* Adds term frequencies found by tokenizing text from reader into the Map words
* @param r a source of text to be tokenized
* @param termFreqMap a Map of terms and their frequencies
* @param fieldName Used by analyzer for any special per-field analysis
*/
private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName)
throws IOException
{
TokenStream ts = analyzer.tokenStream(fieldName, r);
org.apache.lucene.analysis.Token token;
int tokenCount=0;
while ((token = ts.next()) != null) { // for every token
String word = token.termText();
tokenCount++;
if(tokenCount>maxNumTokensParsed)
{
break;
}
if(isNoiseWord(word)){
continue;
}
// increment frequency
Int cnt = (Int) termFreqMap.get(word);
if (cnt == null) {
termFreqMap.put(word, new Int());
}
else {
cnt.x++;
}
}
}
/** determines if the passed term is likely to be of interest in "more like" comparisons
*
* @param term The word being considered
* @return true if should be ignored, false if should be used in further analysis
*/
private boolean isNoiseWord(String term)
{
int len = term.length();
if (minWordLen > 0 && len < minWordLen) {
return true;
}
if (maxWordLen > 0 && len > maxWordLen) {
return true;
}
if (stopWords != null && stopWords.contains( term)) {
return true;
}
return false;
}
/**
* Find words for a more-like-this query former.
* The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
* Each array has 6 elements.
* The elements are:
* <ol>
* <li> The word (String)
* <li> The top field that this word comes from (String)
* <li> The score for this word (Float)
* <li> The IDF value (Float)
* <li> The frequency of this word in the index (Integer)
* <li> The frequency of this word in the source document (Integer)
* </ol>
* This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
* This method is exposed so that you can identify the "interesting words" in a document.
* For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
*
* @param r the reader that has the content of the document
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
*
* @see #retrieveInterestingTerms
*/
public PriorityQueue retrieveTerms(Reader r) throws IOException {
Map words = new HashMap();
for (int i = 0; i < fieldNames.length; i++) {
String fieldName = fieldNames[i];
addTermFrequencies(r, words, fieldName);
}
return createQueue(words);
}
/**
* Convenience routine to make it easy to return the most interesting words in a document.
* More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
* @param r the source document
* @return the most interesting words in the document
*
* @see #retrieveTerms(java.io.Reader)
* @see #setMaxQueryTerms
*/
public String[] retrieveInterestingTerms( Reader r) throws IOException {
ArrayList al = new ArrayList( maxQueryTerms);
PriorityQueue pq = retrieveTerms( r);
Object cur;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
while (((cur = pq.pop()) != null) && lim-- > 0) {
Object[] ar = (Object[]) cur;
al.add( ar[ 0]); // the 1st entry is the interesting word
}
String[] res = new String[ al.size()];
return (String[]) al.toArray( res);
}
/**
* PriorityQueue that orders words by score.
*/
private static class FreqQ extends PriorityQueue {
FreqQ (int s) {
initialize(s);
}
protected boolean lessThan(Object a, Object b) {
Object[] aa = (Object[]) a;
Object[] bb = (Object[]) b;
Float fa = (Float) aa[2];
Float fb = (Float) bb[2];
return fa.floatValue() > fb.floatValue();
}
}
/**
* Use for frequencies and to avoid renewing Integers.
*/
private static class Int {
int x;
Int() {
x = 1;
}
}
}
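
A compilable sketch of the programmatic API described in the class Javadoc above (index path, field name and example text are placeholders):

import java.io.StringReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;

public class MoreLikeThisExample {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open("/path/to/index");
        MoreLikeThis mlt = new MoreLikeThis(reader);
        mlt.setFieldNames(new String[] { "contents" });
        mlt.setMinTermFreq(1); // the example text is short, so accept single occurrences
        mlt.setMinDocFreq(1);
        Query query = mlt.like(new StringReader("lucene is a full-text search engine library"));
        Hits hits = new IndexSearcher(reader).search(query);
        // remember to skip the source document if it is in the index - it will match itself
        System.out.println(hits.length() + " similar docs; query: " + query);
    }
}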


@@ -0,0 +1,123 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.search.similar;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;
/**
* A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required, e.g.
* in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
* actual MoreLikeThis object and obtain the real Query object.
* @author maharwood
*/
public class MoreLikeThisQuery extends Query
{
private String likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
float percentTermsToMatch=0.3f;
int minTermFrequency=1;
int maxQueryTerms=5;
/**
* @param likeText the example text for which similar documents are sought
* @param moreLikeFields the document fields to compare against
* @param analyzer the analyzer used to tokenize likeText
*/
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
{
this.likeText=likeText;
this.moreLikeFields=moreLikeFields;
this.analyzer=analyzer;
}
public Query rewrite(IndexReader reader) throws IOException
{
MoreLikeThis mlt=new MoreLikeThis(reader);
mlt.setFieldNames(moreLikeFields);
mlt.setAnalyzer(analyzer);
mlt.setMinTermFreq(minTermFrequency);
mlt.setMaxQueryTerms(maxQueryTerms);
BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
BooleanClause[] clauses = bq.getClauses();
//require that a configurable percentage of the generated terms match (percentTermsToMatch, default 30%)
bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch));
return bq;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Query#toString(java.lang.String)
*/
public String toString(String field)
{
return "like:"+likeText;
}
public float getPercentTermsToMatch() {
return percentTermsToMatch;
}
public void setPercentTermsToMatch(float percentTermsToMatch) {
this.percentTermsToMatch = percentTermsToMatch;
}
public Analyzer getAnalyzer()
{
return analyzer;
}
public void setAnalyzer(Analyzer analyzer)
{
this.analyzer = analyzer;
}
public String getLikeText()
{
return likeText;
}
public void setLikeText(String likeText)
{
this.likeText = likeText;
}
public int getMaxQueryTerms()
{
return maxQueryTerms;
}
public void setMaxQueryTerms(int maxQueryTerms)
{
this.maxQueryTerms = maxQueryTerms;
}
public int getMinTermFrequency()
{
return minTermFrequency;
}
public void setMinTermFrequency(int minTermFrequency)
{
this.minTermFrequency = minTermFrequency;
}
public String[] getMoreLikeFields()
{
return moreLikeFields;
}
public void setMoreLikeFields(String[] moreLikeFields)
{
this.moreLikeFields = moreLikeFields;
}
}
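
A minimal usage sketch (field name, index path and example text are placeholders):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similar.MoreLikeThisQuery;

public class MoreLikeThisQueryExample {
    public static void main(String[] args) throws Exception {
        MoreLikeThisQuery q = new MoreLikeThisQuery(
                "open source java search engine",  // example text
                new String[] { "contents" },       // fields to compare against
                new StandardAnalyzer());
        q.setPercentTermsToMatch(0.3f); // the default, shown for clarity
        IndexSearcher searcher = new IndexSearcher("/path/to/index");
        Hits hits = searcher.search(q); // the real MoreLikeThis query is built at rewrite() time
        System.out.println(hits.length() + " similar docs");
    }
}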


@@ -0,0 +1,118 @@
/**
* Copyright 2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similar;
import java.io.*;
import java.util.*;
import java.net.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.*;
/**
* Simple similarity measures.
*
*
* @see MoreLikeThis
*/
public final class SimilarityQueries
{
/**
* Utility class - no instances.
*/
private SimilarityQueries()
{
}
/**
* Simple similarity query generators.
* Takes every unique word and forms a boolean query where all words are optional.
* After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
* The only caveat is that the first hit returned <b>should be</b> your source document - you'll
* then need to ignore it.
*
* <p>
*
* So, if you have a code fragment like this:
* <br>
* <code>
* Query q = formSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
* </code>
*
* <p>
*
* The query returned, in string form, will be <code>(i use lucene to search fast searchers are good)</code>.
*
* <p>
* The philosophy behind this method is "two documents are similar if they share lots of words".
* Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
*
* <P>
* This method is fail-safe in that if a long 'body' is passed in and
* {@link BooleanQuery#add BooleanQuery.add()} (used internally)
* throws
* {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
* query as it is will be returned.
*
* @param body the body of the document you want to find similar documents to
* @param a the analyzer to use to parse the body
* @param field the field you want to search on, probably something like "contents" or "body"
* @param stop optional set of stop words to ignore
* @return a query with all unique words in 'body'
* @throws IOException this can't happen...
*/
public static Query formSimilarQuery( String body,
Analyzer a,
String field,
Set stop)
throws IOException
{
TokenStream ts = a.tokenStream( field, new StringReader( body));
org.apache.lucene.analysis.Token t;
BooleanQuery tmp = new BooleanQuery();
Set already = new HashSet(); // ignore dups
while ( (t = ts.next()) != null)
{
String word = t.termText();
// ignore opt stop words
if ( stop != null &&
stop.contains( word)) continue;
// ignore dups
if ( ! already.add( word)) continue;
// add to query
TermQuery tq = new TermQuery( new Term( field, word));
try
{
tmp.add( tq, BooleanClause.Occur.SHOULD);
}
catch( BooleanQuery.TooManyClauses too)
{
// fail-safe, just return what we have, not the end of the world
break;
}
}
return tmp;
}
}
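
A minimal usage sketch of formSimilarQuery() (index path and body text are placeholders):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.SimilarityQueries;

public class SimilarityQueriesExample {
    public static void main(String[] args) throws Exception {
        Query q = SimilarityQueries.formSimilarQuery(
                "I use Lucene to search fast. Fast searchers are good",
                new StandardAnalyzer(), "contents", null); // null = no stop word set
        IndexSearcher searcher = new IndexSearcher("/path/to/index");
        Hits hits = searcher.search(q);
        // the top hit is typically the source document itself - skip it
        System.out.println(hits.length() + " similar docs");
    }
}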


@@ -0,0 +1,5 @@
<html>
<body>
Document similarity query generators.
</body>
</html>


@@ -0,0 +1,28 @@
<?xml version="1.0"?>
<project name="xml-query-parser" default="buildParser">
<description>
XML query parser
</description>
<import file="../contrib-build.xml"/>
<property name="queries.jar" location="../../build/contrib/queries/lucene-queries-${version}.jar"/>
<path id="classpath">
<pathelement path="${lucene.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${project.classpath}"/>
</path>
<target name="buildParser" depends="buildQueries,default" />
<target name="buildQueries" >
<echo>XML Parser building dependency ${queries.jar}</echo>
<ant antfile="../queries/build.xml" target="default" inheritall="false"/>
</target>
</project>


@@ -0,0 +1,33 @@
<html>
<body>
<h1>XML based query syntax
</h1>
<p>
This module contains:
<ul>
<li>a modular Lucene Query Parser where queries are expressed as XML</li>
<li>JUnit test</li>
<li>Example XML queries</li>
<li>Test index (subset of Reuters 21578)</li>
</ul>
</p>
<p>
The original motivation for creating this package was outlined and discussed <a href="http://marc.theaimsgroup.com/?l=lucene-dev&m=113355526731460&w=2">here</a>.
</p>
<p>
Parser support includes:
<ul>
<li>"Span" queries</li>
<li>"Like this" queries</li>
<li>Boolean, Term, and UserInput (parsed with existing query parser)</li>
<li>BoostingQuery - a class that can downgrade scores for hits on
certain terms rather than the hard-line approach taken by BooleanClause.Occurs.MUST_NOT</li>
<li>FilteredQuery, RangeFilter, and "TermsFilter" for non-sequential terms</li>
<li>"FuzzyLikeThis" a new query which is a cross between "LikeThis" and "fuzzy" but with
better scoring of fuzzy terms than standard fuzzy queries</li>
<li>A modular design with expandable support for new query/filter types</li>
</ul>
</p>
<p>This code is dependent on the "queries" contrib module although the "CoreParser" can be compiled with just Lucene core if required</p>
</body>
</html>


@@ -0,0 +1,124 @@
package org.apache.lucene.xmlparser;
import java.io.InputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.builders.BooleanQueryBuilder;
import org.apache.lucene.xmlparser.builders.ConstantScoreQueryBuilder;
import org.apache.lucene.xmlparser.builders.RangeFilterBuilder;
import org.apache.lucene.xmlparser.builders.SpanFirstBuilder;
import org.apache.lucene.xmlparser.builders.SpanNearBuilder;
import org.apache.lucene.xmlparser.builders.SpanNotBuilder;
import org.apache.lucene.xmlparser.builders.SpanOrBuilder;
import org.apache.lucene.xmlparser.builders.SpanOrTermsBuilder;
import org.apache.lucene.xmlparser.builders.SpanQueryBuilderFactory;
import org.apache.lucene.xmlparser.builders.SpanTermBuilder;
import org.apache.lucene.xmlparser.builders.TermQueryBuilder;
import org.apache.lucene.xmlparser.builders.UserInputQueryBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
/**
* Assembles a QueryBuilder which uses only core Lucene Query objects
* @author Mark
*
*/
public class CoreParser implements QueryBuilder
{
protected Analyzer analyzer;
protected QueryParser parser;
protected QueryBuilderFactory queryFactory;
protected FilterBuilderFactory filterFactory;
public CoreParser(Analyzer analyzer, QueryParser parser)
{
this.analyzer=analyzer;
this.parser=parser;
filterFactory = new FilterBuilderFactory();
filterFactory.addBuilder("RangeFilter",new RangeFilterBuilder());
queryFactory = new QueryBuilderFactory();
queryFactory.addBuilder("TermQuery",new TermQueryBuilder());
queryFactory.addBuilder("BooleanQuery",new BooleanQueryBuilder(queryFactory));
queryFactory.addBuilder("UserQuery",new UserInputQueryBuilder(new QueryParser("contents", analyzer)));
queryFactory.addBuilder("FilteredQuery",new FilteredQueryBuilder(filterFactory,queryFactory));
queryFactory.addBuilder("ConstantScoreQuery",new ConstantScoreQueryBuilder(filterFactory));
SpanQueryBuilderFactory sqof=new SpanQueryBuilderFactory();
SpanNearBuilder snb=new SpanNearBuilder(sqof);
sqof.addBuilder("SpanNear",snb);
queryFactory.addBuilder("SpanNear",snb);
SpanTermBuilder snt=new SpanTermBuilder();
sqof.addBuilder("SpanTerm",snt);
queryFactory.addBuilder("SpanTerm",snt);
SpanOrBuilder sot=new SpanOrBuilder(sqof);
sqof.addBuilder("SpanOr",sot);
queryFactory.addBuilder("SpanOr",sot);
SpanOrTermsBuilder sots=new SpanOrTermsBuilder(analyzer);
sqof.addBuilder("SpanOrTerms",sots);
queryFactory.addBuilder("SpanOrTerms",sots);
SpanFirstBuilder sft=new SpanFirstBuilder(sqof);
sqof.addBuilder("SpanFirst",sft);
queryFactory.addBuilder("SpanFirst",sft);
SpanNotBuilder snot=new SpanNotBuilder(sqof);
sqof.addBuilder("SpanNot",snot);
queryFactory.addBuilder("SpanNot",snot);
}
public Query parse(InputStream xmlStream) throws ParserException
{
return getQuery(parseXML(xmlStream).getDocumentElement());
}
public void addQueryBuilder(String nodeName,QueryBuilder builder)
{
queryFactory.addBuilder(nodeName,builder);
}
public void addFilterBuilder(String nodeName,FilterBuilder builder)
{
filterFactory.addBuilder(nodeName,builder);
}
private static Document parseXML(InputStream pXmlFile) throws ParserException
{
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = null;
try
{
db = dbf.newDocumentBuilder();
}
catch (Exception se)
{
throw new ParserException("XML Parser configuration error", se);
}
org.w3c.dom.Document doc = null;
try
{
doc = db.parse(pXmlFile);
}
catch (Exception se)
{
throw new ParserException("Error parsing XML stream:" + se, se);
}
return doc;
}
public Query getQuery(Element e) throws ParserException
{
return queryFactory.getQuery(e);
}
}
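
A minimal usage sketch. The TermQuery element name matches the builder registered above, but the exact attribute and text conventions (e.g. fieldName) are assumptions here, since TermQueryBuilder's source is not part of this excerpt:

import java.io.ByteArrayInputStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.CoreParser;

public class CoreParserExample {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        CoreParser parser = new CoreParser(analyzer, new QueryParser("contents", analyzer));
        String xml = "<TermQuery fieldName=\"contents\">lucene</TermQuery>"; // assumed element syntax
        Query q = parser.parse(new ByteArrayInputStream(xml.getBytes()));
        System.out.println("Parsed query: " + q);
    }
}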


@@ -0,0 +1,25 @@
package org.apache.lucene.xmlparser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;
public class CorePlusExtensionsParser extends CoreParser
{
public CorePlusExtensionsParser(Analyzer analyzer, QueryParser parser)
{
super(analyzer, parser);
filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
String fields[]={"contents"};
queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
queryFactory.addBuilder("FuzzyLikeThisQuery", new FuzzyLikeThisQueryBuilder(analyzer));
}
}
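
A sketch combining the extension builders with core ones. The <FilteredQuery>/<Filter>/<Query> nesting follows FilteredQueryBuilder below; the leaf conventions for UserQuery and TermsFilter are assumptions, since those builders' sources are not in this excerpt:

import java.io.ByteArrayInputStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.CorePlusExtensionsParser;

public class ExtensionsParserExample {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        CorePlusExtensionsParser parser =
                new CorePlusExtensionsParser(analyzer, new QueryParser("contents", analyzer));
        String xml =
              "<FilteredQuery>"
            + "<Query><UserQuery>apache AND lucene</UserQuery></Query>"
            + "<Filter><TermsFilter fieldName=\"status\">published</TermsFilter></Filter>" // assumed syntax
            + "</FilteredQuery>";
        Query q = parser.parse(new ByteArrayInputStream(xml.getBytes()));
        System.out.println("Parsed query: " + q);
    }
}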


@@ -0,0 +1,198 @@
package org.apache.lucene.xmlparser;
import java.io.Reader;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
public class DOMUtils
{
/* Convenience method where there is only one child Element of a given name */
public static Element getChildByTagName(Element e, String name)
{
for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
{
if( (kid.getNodeType()==Node.ELEMENT_NODE) && (name.equals(kid.getNodeName())) )
{
return (Element)kid;
}
}
return null;
}
/**
* Returns an attribute value from this node, or the first parent node with this attribute defined
* @param element the element from which to start the search
* @param attributeName the name of the attribute to look for
* @return A non-zero-length value if defined, otherwise null
*/
public static String getAttributeWithInheritance(Element element, String attributeName)
{
String result=element.getAttribute(attributeName);
if( (result==null)|| ("".equals(result) ) )
{
Node n=element.getParentNode();
if((n==null)||!(n instanceof Element)) //stop when the parent is not an Element (e.g. the Document node)
{
return null;
}
Element parent=(Element) n;
return getAttributeWithInheritance(parent,attributeName);
}
return result;
}
/* Convenience method where there is only one child Element of a given name */
public static String getChildTextByTagName(Element e, String tagName)
{
Element child=getChildByTagName(e,tagName);
if(child!=null)
{
return getText(child);
}
return null;
}
/* Convenience method to append a new child with text*/
public static Element insertChild(Element parent, String tagName, String text)
{
Element child = parent.getOwnerDocument().createElement(tagName);
parent.appendChild(child);
if(text!=null)
{
child.appendChild(child.getOwnerDocument().createTextNode(text));
}
return child;
}
public static String getAttribute(Element element, String attributeName, String deflt)
{
String result=element.getAttribute(attributeName);
if( (result==null)|| ("".equals(result) ) )
{
return deflt;
}
return result;
}
public static float getAttribute(Element element, String attributeName, float deflt)
{
String result=element.getAttribute(attributeName);
if( (result==null)|| ("".equals(result) ) )
{
return deflt;
}
return Float.parseFloat(result);
}
public static int getAttribute(Element element, String attributeName, int deflt)
{
String result=element.getAttribute(attributeName);
if( (result==null)|| ("".equals(result) ) )
{
return deflt;
}
return Integer.parseInt(result);
}
public static boolean getAttribute(Element element, String attributeName,
boolean deflt)
{
String result = element.getAttribute(attributeName);
if ((result == null) || ("".equals(result)))
{
return deflt;
}
return Boolean.valueOf(result).booleanValue(); //not Boolean.getBoolean(), which reads system properties
}
/* Returns text of node and all child nodes - without markup */
//MH changed to Node from Element 25/11/2005
public static String getText(Node e)
{
StringBuffer sb=new StringBuffer();
getTextBuffer(e, sb);
return sb.toString();
}
public static Element getFirstChildElement(Element element)
{
for (Node kid = element.getFirstChild(); kid != null; kid = kid
.getNextSibling())
{
if (kid.getNodeType() == Node.ELEMENT_NODE)
{
return (Element) kid;
}
}
return null;
}
private static void getTextBuffer(Node e, StringBuffer sb)
{
for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
{
switch(kid.getNodeType())
{
case Node.TEXT_NODE:
{
sb.append(kid.getNodeValue());
break;
}
case Node.ELEMENT_NODE:
{
getTextBuffer(kid, sb);
break;
}
case Node.ENTITY_REFERENCE_NODE:
{
getTextBuffer(kid, sb);
break;
}
}
}
}
/**
* Helper method to parse XML supplied by a Reader into a DOM tree.
* @param is reader over the XML content to be parsed
* @return an org.w3c.dom.Document object
*/
public static Document loadXML(Reader is)
{
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
DocumentBuilder db = null;
try
{
db = dbf.newDocumentBuilder();
}
catch (Exception se)
{
throw new RuntimeException("Parser configuration error", se);
}
// parse the input into a DOM tree
org.w3c.dom.Document doc = null;
try
{
doc = db.parse(new InputSource(is));
}
catch (Exception se)
{
throw new RuntimeException("Error parsing file:" + se, se);
}
return doc;
}
}
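
A quick illustration of the attribute-inheritance lookup above (a sketch, not part of this commit; the class name is hypothetical). The inner TermQuery element carries no fieldName of its own, so the lookup walks up to the enclosing BooleanQuery:

import java.io.StringReader;
import org.apache.lucene.xmlparser.DOMUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class DOMUtilsExample
{
public static void main(String[] args)
{
String xml="<BooleanQuery fieldName=\"contents\">"+
"<Clause occurs=\"must\"><TermQuery>bank</TermQuery></Clause>"+
"</BooleanQuery>";
Document doc=DOMUtils.loadXML(new StringReader(xml));
Element termQuery=(Element) doc.getElementsByTagName("TermQuery").item(0);
//prints "contents" - resolved from the BooleanQuery ancestor element
System.out.println(DOMUtils.getAttributeWithInheritance(termQuery,"fieldName"));
}
}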

View File

@ -0,0 +1,14 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser;
import org.apache.lucene.search.Filter;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public interface FilterBuilder {
public Filter getFilter(Element e) throws ParserException;
}

View File

@ -0,0 +1,31 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser;
import java.util.HashMap;
import org.apache.lucene.search.Filter;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class FilterBuilderFactory implements FilterBuilder {
HashMap builders=new HashMap();
public Filter getFilter(Element n) throws ParserException {
FilterBuilder builder=(FilterBuilder) builders.get(n.getNodeName());
if(builder==null)
{
throw new ParserException("No FilterBuilder defined for node "+n.getNodeName());
}
return builder.getFilter(n);
}
public void addBuilder(String nodeName,FilterBuilder builder)
{
builders.put(nodeName,builder);
}
}

View File

@ -0,0 +1,71 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Query;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class FilteredQueryBuilder implements QueryBuilder {
private FilterBuilder filterFactory;
private QueryBuilder queryFactory;
public FilteredQueryBuilder(FilterBuilder filterFactory, QueryBuilder queryFactory)
{
this.filterFactory=filterFactory;
this.queryFactory=queryFactory;
}
/* (non-Javadoc)
* @see org.apache.lucene.xmlparser.QueryBuilder#getQuery(org.w3c.dom.Element)
*/
public Query getQuery(Element e) throws ParserException {
Element filterElement=DOMUtils.getChildByTagName(e,"Filter");
if(filterElement==null)
{
throw new ParserException("FilteredQuery missing \"Filter\" child element");
}
filterElement=DOMUtils.getFirstChildElement(filterElement);
Filter f=null;
if(filterElement!=null)
{
f=filterFactory.getFilter(filterElement);
}
else
{
throw new ParserException("FilteredQuery \"Filter\" element missing child query element ");
}
Element queryElement=DOMUtils.getChildByTagName(e,"Query");
if(queryElement==null)
{
throw new ParserException("FilteredQuery missing \"Query\" child element");
}
queryElement=DOMUtils.getFirstChildElement(queryElement);
Query q=null;
if(queryElement!=null)
{
q=queryFactory.getQuery(queryElement);
}
else
{
throw new ParserException("FilteredQuery \"Query\" element missing child query element ");
}
FilteredQuery fq = new FilteredQuery(q,f);
fq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return fq;
}
}

View File

@ -0,0 +1,40 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser;
/**
* @author maharwood
*/
public class ParserException extends Exception {
public ParserException() {
super();
}
/**
* @param message
*/
public ParserException(String message) {
super(message);
}
/**
* @param message
* @param cause
*/
public ParserException(String message, Throwable cause) {
super(message, cause);
}
/**
* @param cause
*/
public ParserException(Throwable cause) {
super(cause);
}
}

View File

@ -0,0 +1,15 @@
package org.apache.lucene.xmlparser;
import org.apache.lucene.search.Query;
import org.w3c.dom.Element;
/**
* Implemented by objects that produce Lucene Query objects from XML streams. Implementations are
* expected to be thread-safe so that they can be used to simultaneously parse multiple XML documents.
* @author maharwood
*/
public interface QueryBuilder {
public Query getQuery(Element e) throws ParserException;
}

View File

@ -0,0 +1,31 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser;
import java.util.HashMap;
import org.apache.lucene.search.Query;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class QueryBuilderFactory implements QueryBuilder {
HashMap builders=new HashMap();
public Query getQuery(Element n) throws ParserException {
QueryBuilder builder=(QueryBuilder) builders.get(n.getNodeName());
if(builder==null)
{
throw new ParserException("No QueryObjectBuilder defined for node "+n.getNodeName());
}
return builder.getQuery(n);
}
public void addBuilder(String nodeName,QueryBuilder builder)
{
builders.put(nodeName,builder);
}
}
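
To extend the parser, client code registers a builder against an element name and the factory dispatches on node name. A minimal sketch (not part of this commit - the "MatchAllDocsQuery" element name and wrapper class are illustrative assumptions; MatchAllDocsQuery itself is core Lucene):

import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.apache.lucene.xmlparser.QueryBuilderFactory;
import org.w3c.dom.Element;
public class MatchAllDocsBuilderExample
{
public static QueryBuilderFactory newFactory()
{
QueryBuilderFactory factory=new QueryBuilderFactory();
factory.addBuilder("MatchAllDocsQuery",new QueryBuilder()
{
public Query getQuery(Element e) throws ParserException
{
Query q=new MatchAllDocsQuery(); //matches every document in the index
q.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return q;
}
});
//a <MatchAllDocsQuery boost="2.0"/> element is now handled by the builder above
return factory;
}
}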

View File

@ -0,0 +1,89 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* @author maharwood
*/
public class BooleanQueryBuilder implements QueryBuilder {
private QueryBuilder factory;
public BooleanQueryBuilder(QueryBuilder factory)
{
this.factory=factory;
}
/* (non-Javadoc)
* @see org.apache.lucene.xmlparser.QueryBuilder#getQuery(org.w3c.dom.Element)
*/
public Query getQuery(Element e) throws ParserException {
BooleanQuery bq=new BooleanQuery();
bq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
NodeList nl = e.getElementsByTagName("Clause");
for(int i=0;i<nl.getLength();i++)
{
Element clauseElem=(Element) nl.item(i);
BooleanClause.Occur occurs=getOccursValue(clauseElem);
//find the first element child which should contain a Query
Element clauseQuery=DOMUtils.getFirstChildElement(clauseElem);
if(clauseQuery!=null)
{
Query q=factory.getQuery(clauseQuery);
bq.add(new BooleanClause(q,occurs));
}
else
{
throw new ParserException("BooleanClause missing child query element ");
}
}
return bq;
}
private BooleanClause.Occur getOccursValue(Element clauseElem) throws ParserException
{
String occs=clauseElem.getAttribute("occurs");
if("must".equalsIgnoreCase(occs))
{
return BooleanClause.Occur.MUST;
}
else if("mustNot".equalsIgnoreCase(occs))
{
return BooleanClause.Occur.MUST_NOT;
}
else if((occs==null)||("should".equalsIgnoreCase(occs))||("".equals(occs)))
{
return BooleanClause.Occur.SHOULD;
}
throw new ParserException("Invalid value for \"occurs\" attribute of clause:"+occs);
}
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.BoostingQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
public class BoostingQueryBuilder implements QueryBuilder
{
private QueryBuilder factory;
float defaultBoost=0.01f;
public BoostingQueryBuilder (QueryBuilder factory)
{
this.factory=factory;
}
public Query getQuery(Element e) throws ParserException
{
Element mainQueryElem=DOMUtils.getChildByTagName(e,"Query");
if(mainQueryElem==null)
{
throw new ParserException("BoostingQuery missing a \"Query\" child element");
}
mainQueryElem=DOMUtils.getFirstChildElement(mainQueryElem);
if(mainQueryElem==null)
{
throw new ParserException("BoostingQuery \"Query\" element missing a child element");
}
Query mainQuery=factory.getQuery(mainQueryElem);
Element boostQueryElem=DOMUtils.getChildByTagName(e,"BoostQuery");
if(boostQueryElem==null)
{
throw new ParserException("BoostingQuery missing a \"BoostQuery\" child element");
}
//read the boost only after the null check to avoid a NullPointerException
float boost=DOMUtils.getAttribute(boostQueryElem,"boost",defaultBoost);
boostQueryElem=DOMUtils.getFirstChildElement(boostQueryElem);
if(boostQueryElem==null)
{
throw new ParserException("BoostingQuery \"BoostQuery\" element missing a child element");
}
Query boostQuery=factory.getQuery(boostQueryElem);
BoostingQuery bq = new BoostingQuery(mainQuery,boostQuery,boost);
bq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return bq;
}
}

View File

@ -0,0 +1,32 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilderFactory;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
public class ConstantScoreQueryBuilder implements QueryBuilder
{
private FilterBuilderFactory filterFactory;
public ConstantScoreQueryBuilder(FilterBuilderFactory filterFactory)
{
this.filterFactory=filterFactory;
}
public Query getQuery(Element e) throws ParserException
{
Element filterElem=DOMUtils.getFirstChildElement(e);
if(filterElem==null)
{
throw new ParserException("ConstantScoreQuery missing child element with filter");
}
Query q=new ConstantScoreQuery(filterFactory.getFilter(filterElem));
q.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return q;
}
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.FuzzyLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
public class FuzzyLikeThisQueryBuilder implements QueryBuilder
{
int defaultMaxNumTerms=50;
float defaultMinSimilarity=0.5f;
int defaultPrefixLength=1;
boolean defaultIgnoreTF=false;
private Analyzer analyzer;
public FuzzyLikeThisQueryBuilder(Analyzer analyzer)
{
this.analyzer=analyzer;
}
public Query getQuery(Element e) throws ParserException
{
NodeList nl = e.getElementsByTagName("Field");
int maxNumTerms=DOMUtils.getAttribute(e,"maxNumTerms",defaultMaxNumTerms);
FuzzyLikeThisQuery fbq=new FuzzyLikeThisQuery(maxNumTerms,analyzer);
fbq.setIgnoreTF(DOMUtils.getAttribute(e,"ignoreTF",defaultIgnoreTF));
for(int i=0;i<nl.getLength();i++)
{
Element fieldElem=(Element) nl.item(i);
float minSimilarity=DOMUtils.getAttribute(fieldElem,"minSimilarity",defaultMinSimilarity);
int prefixLength=DOMUtils.getAttribute(fieldElem,"prefixLength",defaultPrefixLength);
String fieldName=DOMUtils.getAttributeWithInheritance(fieldElem,"fieldName");
String value=DOMUtils.getText(fieldElem);
fbq.addTerms(value,fieldName,minSimilarity,prefixLength);
}
fbq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return fbq;
}
}

View File

@ -0,0 +1,58 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class LikeThisQueryBuilder implements QueryBuilder {
private Analyzer analyzer;
String defaultFieldNames [];
int defaultMaxQueryTerms=20;
int defaultMinTermFrequency=1;
float defaultPercentTermsToMatch=30; //default is that 30% of selected terms must match
public LikeThisQueryBuilder(Analyzer analyzer,String [] defaultFieldNames)
{
this.analyzer=analyzer;
this.defaultFieldNames=defaultFieldNames;
}
/* (non-Javadoc)
* @see org.apache.lucene.xmlparser.QueryBuilder#getQuery(org.w3c.dom.Element)
*/
public Query getQuery(Element e) throws ParserException {
String fieldsList=e.getAttribute("fieldNames"); //a comma-delimited list of fields
String fields[]=defaultFieldNames;
if((fieldsList!=null)&&(fieldsList.trim().length()>0))
{
fields=fieldsList.trim().split(",");
//trim the fieldnames
for (int i = 0; i < fields.length; i++) {
fields[i]=fields[i].trim();
}
}
MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer);
mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms));
mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency));
mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);
mlt.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return mlt;
}
}

View File

@ -0,0 +1,32 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilder;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class RangeFilterBuilder implements FilterBuilder {
public Filter getFilter(Element e) throws ParserException {
String fieldName=DOMUtils.getAttributeWithInheritance(e,"fieldName");
String lowerTerm=e.getAttribute("lowerTerm");
String upperTerm=e.getAttribute("upperTerm");
boolean includeLower=DOMUtils.getAttribute(e,"includeLower",true);
boolean includeUpper=DOMUtils.getAttribute(e,"includeUpper",true);
return new RangeFilter(fieldName,lowerTerm,upperTerm,includeLower,includeUpper);
}
}

View File

@ -0,0 +1,14 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
public abstract class SpanBuilderBase implements SpanQueryBuilder
{
public Query getQuery(Element e) throws ParserException
{
return getSpanQuery(e);
}
}

View File

@ -0,0 +1,31 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
public class SpanFirstBuilder extends SpanBuilderBase
{
SpanQueryBuilder factory;
public SpanFirstBuilder(SpanQueryBuilder factory)
{
super();
this.factory = factory;
}
public SpanQuery getSpanQuery(Element e) throws ParserException
{
int end=DOMUtils.getAttribute(e,"end",1);
Element child=DOMUtils.getFirstChildElement(e);
SpanQuery q=factory.getSpanQuery(child);
SpanFirstQuery sfq = new SpanFirstQuery(q,end);
sfq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return sfq;
}
}

View File

@ -0,0 +1,42 @@
package org.apache.lucene.xmlparser.builders;
import java.util.ArrayList;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
public class SpanNearBuilder extends SpanBuilderBase
{
SpanQueryBuilder factory;
public SpanNearBuilder(SpanQueryBuilder factory)
{
this.factory=factory;
}
public SpanQuery getSpanQuery(Element e) throws ParserException
{
String slopString=e.getAttribute("slop");
if((slopString==null)||(slopString.length()==0))
{
throw new ParserException("SpanTermQuery missing slop property ");
}
int slop=Integer.parseInt(slopString);
boolean inOrder=DOMUtils.getAttribute(e,"inOrder",false);
ArrayList spans=new ArrayList();
for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
{
if (kid.getNodeType() == Node.ELEMENT_NODE)
{
spans.add(factory.getSpanQuery((Element) kid));
}
}
SpanQuery[] spanQueries=(SpanQuery[]) spans.toArray(new SpanQuery[spans.size()]);
SpanNearQuery snq=new SpanNearQuery(spanQueries,slop,inOrder);
return snq;
}
}

View File

@ -0,0 +1,51 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
public class SpanNotBuilder extends SpanBuilderBase
{
SpanQueryBuilder factory;
/**
* @param factory
*/
public SpanNotBuilder(SpanQueryBuilder factory)
{
super();
this.factory = factory;
}
public SpanQuery getSpanQuery(Element e) throws ParserException
{
Element includeElem=DOMUtils.getChildByTagName(e,"Include");
if(includeElem!=null)
{
includeElem=DOMUtils.getFirstChildElement(includeElem);
}
if(includeElem==null)
{
throw new ParserException("SpanNotQuery missing Include child Element");
}
Element excludeElem=DOMUtils.getChildByTagName(e,"Exclude");
if(excludeElem!=null)
{
excludeElem=DOMUtils.getFirstChildElement(excludeElem);
}
if(excludeElem==null)
{
throw new ParserException("SpanNotQuery missing Exclude child Element");
}
SpanQuery include=factory.getSpanQuery(includeElem);
SpanQuery exclude=factory.getSpanQuery(excludeElem);
SpanNotQuery snq = new SpanNotQuery(include,exclude);
snq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return snq;
}
}

View File

@ -0,0 +1,40 @@
package org.apache.lucene.xmlparser.builders;
import java.util.ArrayList;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
public class SpanOrBuilder extends SpanBuilderBase
{
SpanQueryBuilder factory;
public SpanOrBuilder(SpanQueryBuilder factory)
{
super();
this.factory = factory;
}
public SpanQuery getSpanQuery(Element e) throws ParserException
{
ArrayList clausesList=new ArrayList();
for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
{
if (kid.getNodeType() == Node.ELEMENT_NODE)
{
SpanQuery clause=factory.getSpanQuery((Element) kid);
clausesList.add(clause);
}
}
SpanQuery[] clauses=(SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]);
SpanOrQuery soq = new SpanOrQuery(clauses);
soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return soq;
}
}

View File

@ -0,0 +1,62 @@
package org.apache.lucene.xmlparser.builders;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
public class SpanOrTermsBuilder extends SpanBuilderBase
{
Analyzer analyzer;
/**
* @param analyzer
*/
public SpanOrTermsBuilder(Analyzer analyzer)
{
super();
this.analyzer = analyzer;
}
public SpanQuery getSpanQuery(Element e) throws ParserException
{
String fieldName=DOMUtils.getAttributeWithInheritance(e,"fieldName");
if(fieldName==null)
{
throw new ParserException("Error: SpanOrTermsBuilder missing \"fieldName\" property");
}
String value=DOMUtils.getText(e);
try
{
ArrayList clausesList=new ArrayList();
TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
Token token=ts.next();
while(token!=null)
{
SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.termText()));
clausesList.add(stq);
token=ts.next();
}
SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return soq;
}
catch(IOException ioe)
{
throw new ParserException("IOException parsing value:"+value);
}
}
}

View File

@ -0,0 +1,18 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public interface SpanQueryBuilder extends QueryBuilder{
public SpanQuery getSpanQuery(Element e) throws ParserException;
}

View File

@ -0,0 +1,34 @@
package org.apache.lucene.xmlparser.builders;
import java.util.HashMap;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class SpanQueryBuilderFactory implements SpanQueryBuilder {
HashMap builders=new HashMap();
public Query getQuery(Element e) throws ParserException {
return getSpanQuery(e);
}
public void addBuilder(String nodeName,SpanQueryBuilder builder)
{
builders.put(nodeName,builder);
}
public SpanQuery getSpanQuery(Element e) throws ParserException
{
SpanQueryBuilder builder=(SpanQueryBuilder) builders.get(e.getNodeName());
if(builder==null)
{
throw new ParserException("No SpanQueryObjectBuilder defined for node "+e.getNodeName());
}
return builder.getSpanQuery(e);
}
}

View File

@ -0,0 +1,31 @@
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
public class SpanTermBuilder extends SpanBuilderBase
{
public SpanQuery getSpanQuery(Element e) throws ParserException
{
String fieldName=DOMUtils.getAttributeWithInheritance(e,"fieldName");
String value=DOMUtils.getText(e);
if((fieldName==null)||(fieldName.length()==0))
{
throw new ParserException("SpanTermQuery missing fieldName property ");
}
if((value==null)||(value.length()==0))
{
throw new ParserException("TermQuery missing value property ");
}
SpanTermQuery stq = new SpanTermQuery(new Term(fieldName,value));
stq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return stq;
}
}

View File

@ -0,0 +1,37 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class TermQueryBuilder implements QueryBuilder {
public Query getQuery(Element e) throws ParserException {
String field=DOMUtils.getAttributeWithInheritance(e,"fieldName");
String value=DOMUtils.getText(e);
if((field==null)||(field.length()==0))
{
throw new ParserException("TermQuery element missing fieldName attribute");
}
if((value==null)||(value.length()==0))
{
throw new ParserException("TermQuery element missing child text property ");
}
TermQuery tq = new TermQuery(new Term(field,value));
tq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return tq;
}
}

View File

@ -0,0 +1,79 @@
package org.apache.lucene.xmlparser.builders;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.TermsFilter;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilder;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* @author maharwood
*/
public class TermsFilterBuilder implements FilterBuilder
{
Analyzer analyzer;
/**
* @param analyzer
*/
public TermsFilterBuilder(Analyzer analyzer)
{
this.analyzer = analyzer;
}
/* (non-Javadoc)
* @see org.apache.lucene.xmlparser.FilterBuilder#getFilter(org.w3c.dom.Element)
*/
public Filter getFilter(Element e) throws ParserException
{
TermsFilter tf=new TermsFilter();
NodeList nl = e.getElementsByTagName("Field");
for(int i=0;i<nl.getLength();i++)
{
Element fieldElem=(Element) nl.item(i);
String fieldName=DOMUtils.getAttributeWithInheritance(fieldElem,"fieldName");
if(fieldName==null)
{
throw new ParserException("TermsFilter missing \"fieldName\" element");
}
String text=DOMUtils.getText(fieldElem).trim();
TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
try
{
Token token=ts.next();
Term term=null;
while(token!=null)
{
if(term==null)
{
term=new Term(fieldName,token.termText());
}
else
{
term=term.createTerm(token.termText()); //create from previous to save fieldName.intern overhead
}
tf.addTerm(term);
token=ts.next();
}
}
catch(IOException ioe)
{
throw new RuntimeException("Error constructing terms from index:"+ioe);
}
}
return tf;
}
}

View File

@ -0,0 +1,44 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser.builders;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
/**
* @author maharwood
*/
public class UserInputQueryBuilder implements QueryBuilder {
QueryParser parser;
/**
* @param parser
*/
public UserInputQueryBuilder(QueryParser parser) {
this.parser = parser;
}
/* (non-Javadoc)
* @see org.apache.lucene.xmlparser.QueryBuilder#getQuery(org.w3c.dom.Element)
*/
public Query getQuery(Element e) throws ParserException {
String text=DOMUtils.getText(e);
try {
Query q = parser.parse(text);
q.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
return q;
} catch (ParseException e1) {
throw new ParserException(e1.getMessage(), e1);
}
}
}

View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<BooleanQuery fieldName="contents">
<Clause occurs="should">
<TermQuery>merger</TermQuery>
</Clause>
<Clause occurs="mustnot">
<TermQuery>sumitomo</TermQuery>
</Clause>
<Clause occurs="must">
<TermQuery>bank</TermQuery>
</Clause>
</BooleanQuery>

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<BoostingQuery>
<!-- Find docs about banks, preferably merger info and preferably not "World bank" -->
<Query>
<BooleanQuery fieldName="contents">
<Clause occurs="should">
<TermQuery>merger</TermQuery>
</Clause>
<Clause occurs="must">
<TermQuery>bank</TermQuery>
</Clause>
</BooleanQuery>
</Query>
<BoostQuery boost="0.01">
<UserQuery>"world bank"</UserQuery>
</BoostQuery>
</BoostingQuery>

View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<ConstantScoreQuery>
<RangeFilter fieldName="date" lowerTerm="19870409" upperTerm="19870412"/>
</ConstantScoreQuery>

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<FuzzyLikeThisQuery>
<!-- Matches on misspelt "Sumitomo" bank -->
<Field fieldName="contents">
Sumitimo bank
</Field>
</FuzzyLikeThisQuery>

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<LikeThisQuery percentTermsToMatch="5">
IRAQI TROOPS REPORTED PUSHING BACK IRANIANS Iraq said today its troops were pushing Iranian forces out of
positions they had initially occupied when they launched a new offensive near the southern port of
Basra early yesterday. A High Command communique said Iraqi troops had won a significant victory
and were continuing to advance. Iraq said it had foiled a three-pronged thrust some 10 km
(six miles) from Basra, but admitted the Iranians had occupied ground held by the Mohammed al-Qassem
unit, one of three divisions attacked. The communique said Iranian Revolutionary Guards were under
assault from warplanes, helicopter gunships, heavy artillery and tanks. "Our forces are continuing
their advance until they purge the last foothold" occupied by the Iranians, it said.
(Iran said its troops had killed or wounded more than 4,000 Iraqis and were stabilising their new positions.)
The Baghdad communique said Iraqi planes also destroyed oil installations at Iran's southwestern Ahvaz field
during a raid today. It denied an Iranian report that an Iraqi jet was shot down.
Iraq also reported a naval battle at the northern tip of the Gulf. Iraqi naval units and forces defending an
offshore terminal sank six Iranian out of 28 Iranian boats attempting to attack an offshore terminal,
the communique said. Reuter 3;
</LikeThisQuery>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
<Query>
<BooleanQuery fieldName="contents">
<Clause occurs="should">
<TermQuery>merger</TermQuery>
</Clause>
<Clause occurs="mustnot">
<TermQuery >sumitomo</TermQuery>
</Clause>
<Clause occurs="must">
<TermQuery>bank</TermQuery>
</Clause>
</BooleanQuery>
</Query>
<Filter>
<RangeFilter fieldName="date" lowerTerm="19870409" upperTerm="19870412"/>
</Filter>
</FilteredQuery>

View File

@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<SpanOr fieldName="contents">
<SpanNear slop="8" inOrder="false" >
<SpanOr>
<SpanTerm>killed</SpanTerm>
<SpanTerm>died</SpanTerm>
<SpanTerm>dead</SpanTerm>
</SpanOr>
<SpanOr>
<!-- a less verbose way of declaring SpanTerm declarations - these are analyzed
into a series of Tokens which are added as SpanTerm elements of a SpanOr
-->
<SpanOrTerms>miner miners</SpanOrTerms>
<!-- finds mine near worker or workers -->
<SpanNear slop="6" inOrder="false">
<SpanTerm>mine</SpanTerm>
<SpanOrTerms>worker workers</SpanOrTerms>
</SpanNear>
</SpanOr>
</SpanNear>
<SpanFirst end="10">
<SpanOrTerms>fire burn</SpanOrTerms>
</SpanFirst>
<!-- Other Span examples....
<SpanNot>
<Include>
<SpanNear slop="2" inOrder="2">
<SpanTerm>social</SpanTerm>
<SpanTerm>services</SpanTerm>
</SpanNear>
</Include>
<Exclude>
<SpanTerm>public</SpanTerm>
</Exclude>
</SpanNot>
-->
</SpanOr>

View File

@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<TermQuery fieldName="contents">sumitomo</TermQuery>

View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
<Query>
<BooleanQuery fieldName="contents">
<Clause occurs="should">
<TermQuery>merger</TermQuery>
</Clause>
<Clause occurs="must">
<TermQuery>bank</TermQuery>
</Clause>
</BooleanQuery>
</Query>
<Filter>
<!-- TermsFilter uses an analyzer to tokenize Field text and creates a filter for docs which
have ANY of the supplied terms. Unlike a RangeFilter this can be used for filtering on
multiple terms that are not necessarily in a sequence. An example might be a list of primary
keys from a database query result or perhaps a choice of "category" labels picked by the end
user.
As a filter, this is much faster than the equivalent query (a BooleanQuery with many
"should" TermQueries).
This example is just a list of Saturdays, i.e. not the contiguous range of values
that a RangeFilter could handle.
-->
<TermsFilter>
<Field fieldName="date" >19870601 19870608 19870615</Field>
</TermsFilter>
</Filter>
</FilteredQuery>

View File

@ -0,0 +1,166 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.xmlparser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
/**
* @author maharwood
*/
public class TestParser extends TestCase {
CoreParser builder;
static Directory dir;
Analyzer analyzer=new StandardAnalyzer();
IndexReader reader;
private IndexSearcher searcher;
//CHANGE THIS TO SEE OUTPUT
boolean printResults=false;
/*
* @see TestCase#setUp()
*/
protected void setUp() throws Exception {
super.setUp();
//initialize the parser
builder=new CorePlusExtensionsParser(analyzer,new QueryParser("contents", analyzer));
//initialize the index (done once, then cached in static data for use with ALL tests)
if(dir==null)
{
BufferedReader d = new BufferedReader(new InputStreamReader(TestParser.class.getResourceAsStream("reuters21578.txt")));
dir=new RAMDirectory();
IndexWriter writer=new IndexWriter(dir,analyzer,true);
String line = d.readLine();
while(line!=null)
{
int endOfDate=line.indexOf('\t');
String date=line.substring(0,endOfDate).trim();
String content=line.substring(endOfDate).trim();
org.apache.lucene.document.Document doc =new org.apache.lucene.document.Document();
doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
doc.add(new Field("contents",content,Field.Store.YES,Field.Index.TOKENIZED));
writer.addDocument(doc);
line=d.readLine();
}
d.close();
}
reader=IndexReader.open(dir);
searcher=new IndexSearcher(reader);
}
protected void tearDown() throws Exception {
reader.close();
searcher.close();
// dir.close();
}
public void testSimpleXML() throws ParserException, IOException
{
Query q=parse("TermQuery.xml");
dumpResults("TermQuery", q, 5);
}
public void testBooleanQueryXML() throws ParserException, IOException
{
Query q=parse("BooleanQuery.xml");
dumpResults("BooleanQuery", q, 5);
}
public void testRangeFilterQueryXML() throws ParserException, IOException
{
Query q=parse("RangeFilterQuery.xml");
dumpResults("RangeFilter", q, 5);
}
public void testUserQueryXML() throws ParserException, IOException
{
Query q=parse("UserInputQuery.xml");
dumpResults("UserInput with Filter", q, 5);
}
public void testLikeThisQueryXML() throws Exception
{
Query q=parse("LikeThisQuery.xml");
dumpResults("like this", q, 5);
}
public void testBoostingQueryXML() throws Exception
{
Query q=parse("BoostingQuery.xml");
dumpResults("boosting ",q, 5);
}
public void testFuzzyLikeThisQueryXML() throws Exception
{
Query q=parse("FuzzyLikeThisQuery.xml");
//show rewritten fuzzyLikeThisQuery - see what is being matched on
if(printResults)
{
System.out.println(q.rewrite(reader));
}
dumpResults("FuzzyLikeThis", q, 5);
}
public void testTermsFilterXML() throws Exception
{
Query q=parse("TermsFilterQuery.xml");
dumpResults("Terms Filter",q, 5);
}
public void testSpanTermXML() throws Exception
{
Query q=parse("SpanQuery.xml");
dumpResults("Span Query",q, 5);
}
public void testConstantScoreQueryXML() throws Exception
{
Query q=parse("ConstantScoreQuery.xml");
dumpResults("ConstantScoreQuery",q, 5);
}
//================= Helper methods ===================================
private Query parse(String xmlFileName) throws ParserException, IOException
{
InputStream xmlStream=TestParser.class.getResourceAsStream(xmlFileName);
Query result=builder.parse(xmlStream);
xmlStream.close();
return result;
}
private void dumpResults(String qType,Query q, int numDocs) throws IOException
{
Hits h = searcher.search(q);
assertTrue(qType +" should produce results ", h.length()>0);
if(printResults)
{
System.out.println("========="+qType+"============");
for(int i=0;i<Math.min(numDocs,h.length());i++)
{
org.apache.lucene.document.Document ldoc=h.doc(i);
System.out.println("["+ldoc.get("date")+"]"+ldoc.get("contents"));
}
System.out.println();
}
}
}
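
Distilled from the test above, typical client usage is just a few lines. A sketch (the parse(InputStream) signature is as used in this test; CoreParser and CorePlusExtensionsParser are defined elsewhere in this commit, and an already-open IndexSearcher is assumed):

import java.io.InputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.CoreParser;
import org.apache.lucene.xmlparser.CorePlusExtensionsParser;
public class XmlQuerySearchExample
{
public static Hits search(IndexSearcher searcher, InputStream xml) throws Exception
{
Analyzer analyzer=new StandardAnalyzer();
//wire up the standard builders plus the contrib extensions (FuzzyLikeThis etc)
CoreParser parser=new CorePlusExtensionsParser(analyzer,new QueryParser("contents",analyzer));
Query q=parser.parse(xml);
return searcher.search(q);
}
}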

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
<Query>
<UserQuery>"Bank of England"</UserQuery>
</Query>
<Filter>
<RangeFilter fieldName="date" lowerTerm="19870409" upperTerm="19870412"/>
</Filter>
</FilteredQuery>

File diff suppressed because one or more lines are too long