mirror of https://github.com/apache/lucene.git
Added XML-query-parser module for new extensible query parser that handles queries expressed as XML. Is dependent on new "queries" contrib module.
Added "queries" contrib module for various new query/filter classes. This area is also intended to consolidate existing query classes so have moved a copy of MoreLikeThis into here. Probably need to remove "similarity" module as a result, if no one objects. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@380874 13f79535-47bb-0310-9956-ffa450edef68
parent c73de87a8f
commit 87768c51c6
@ -0,0 +1,10 @@
<?xml version="1.0"?>

<project name="queries" default="default">

  <description>
    Queries - various query object exotica not in core
  </description>

  <import file="../contrib-build.xml"/>
</project>
@ -0,0 +1,27 @@
This module contains a number of filter and query objects that add to core Lucene.

The "MoreLikeThis" class from the "similarity" module has been copied into here.
If people are generally happy with this move then the similarity module can be deleted, or at least a
"Moved to queries module..." note left in its place.

==== FuzzyLikeThis - mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
of fuzzy scoring factors. This generally produces good results for queries where users may provide details in a number of
fields, have no knowledge of boolean query syntax, and also want a degree of fuzzy matching. The query is fast because, like
MoreLikeThis, it optimizes the query to only the most distinguishing terms.

==== BoostingQuery - effectively demotes search results that match a given query.
Unlike the "NOT" clause, this still selects documents that contain undesirable terms,
but reduces the overall score of docs containing these terms.

==== TermsFilter - unlike a RangeFilter, this can be used for filtering on multiple terms that are not necessarily in
a sequence. An example might be a collection of primary keys from a database query result or perhaps
a choice of "category" labels picked by the end user.

Mark Harwood
25/02/2006
@ -0,0 +1,71 @@
package org.apache.lucene.search;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;

/**
 * The BoostingQuery class can be used to effectively demote results that match a given query.
 * Unlike the "NOT" clause, this still selects documents that contain undesirable terms,
 * but reduces their overall score:
 *
 *     Query balancedQuery = new BoostingQuery(positiveQuery, negativeQuery, 0.01f);
 *
 * In this scenario the positiveQuery contains the mandatory, desirable criteria which are used to
 * select all matching documents, and the negativeQuery contains the undesirable elements which
 * are simply used to lessen the scores. Documents that match the negativeQuery have their score
 * multiplied by the supplied "boost" parameter, so this should be less than 1 to achieve a
 * demoting effect.
 *
 * This code was originally made available here: http://marc.theaimsgroup.com/?l=lucene-user&m=108058407130459&w=2
 * and is documented here: http://wiki.apache.org/jakarta-lucene/CommunityContributions
 */
public class BoostingQuery extends Query {
    private float boost;    // the amount to boost by
    private Query match;    // query to match
    private Query context;  // boost when matches too

    public BoostingQuery(Query match, Query context, float boost) {
      this.match = match;
      this.context = (Query) context.clone();  // clone before boost
      this.boost = boost;

      this.context.setBoost(0.0f);  // ignore context-only matches (zero the clone's boost, not the caller's query)
    }

    public Query rewrite(IndexReader reader) throws IOException {
      BooleanQuery result = new BooleanQuery() {

        public Similarity getSimilarity(Searcher searcher) {
          return new DefaultSimilarity() {

            public float coord(int overlap, int max) {
              switch (overlap) {

              case 1:          // matched only one clause
                return 1.0f;   // use the score as-is

              case 2:          // matched both clauses
                return boost;  // multiply by boost

              default:
                return 0.0f;

              }
            }
          };
        }
      };

      result.add(match, BooleanClause.Occur.MUST);
      result.add(context, BooleanClause.Occur.SHOULD);

      return result;
    }

    public String toString(String field) {
      return match.toString(field) + "/" + context.toString(field);
    }
}
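A worked example of the scoring above: with new BoostingQuery(positiveQuery, negativeQuery, 0.1f), a document matching only positiveQuery has overlap 1, so coord() returns 1.0 and its score is unchanged; a document that also matches negativeQuery has overlap 2, so coord() returns 0.1 and a raw score of, say, 0.8 becomes 0.08. The negative clause itself contributes nothing to the score sum because its boost was zeroed in the constructor.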
@ -0,0 +1,302 @@
package org.apache.lucene.search;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.util.PriorityQueue;

/**
 * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
 * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
 * of fuzzy scoring factors.
 * This generally produces good results for queries where users may provide details in a number of
 * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
 * a fast query.
 *
 * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
 * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
 * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
 * terms, e.g. misspellings. Instead, all variants use the same IDF ranking (the one for the source query
 * term) and this is factored into the variant's boost. If the source query term does not exist in the
 * index the average IDF of the variants is used.
 * @author maharwood
 */
public class FuzzyLikeThisQuery extends Query
{
    static Similarity sim=new DefaultSimilarity();
    Query rewrittenQuery=null;
    ArrayList fieldVals=new ArrayList();
    Analyzer analyzer;

    ScoreTermQueue q;
    int MAX_VARIANTS_PER_TERM=50;
    boolean ignoreTF=false;


    /**
     *
     * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
     * @param analyzer
     */
    public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
    {
        q=new ScoreTermQueue(maxNumTerms);
        this.analyzer=analyzer;
    }

    class FieldVals
    {
        String queryString;
        String fieldName;
        float minSimilarity;
        int prefixLength;
        public FieldVals(String name, float similarity, int length, String queryString)
        {
            fieldName = name;
            minSimilarity = similarity;
            prefixLength = length;
            this.queryString = queryString;
        }

    }

    /**
     * Adds user input for "fuzzification"
     * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
     * @param fieldName
     * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
     * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
     */
    public void addTerms(String queryString, String fieldName, float minSimilarity, int prefixLength)
    {
        fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
    }


    private void addTerms(IndexReader reader, FieldVals f) throws IOException
    {
        if(f.queryString==null) return;
        TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
        Token token=ts.next();
        int corpusNumDocs=reader.numDocs();
        Term internSavingTemplateTerm =new Term(f.fieldName,""); //optimization to avoid constructing new Term() objects

        while(token!=null)
        {
            ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
            float minScore=0;
            Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
            FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
            TermEnum origEnum = reader.terms(startTerm);
            int df=0;
            if(startTerm.equals(origEnum.term()))
            {
                df=origEnum.docFreq(); //store the df so all variants use same idf
            }
            int numVariants=0;
            int totalVariantDocFreqs=0;
            do
            {
                Term possibleMatch=fe.term();
                if(possibleMatch!=null)
                {
                    numVariants++;
                    totalVariantDocFreqs+=fe.docFreq();
                    float score=fe.difference();
                    if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
                        ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
                        variantsQ.insert(st);
                        minScore = ((ScoreTerm)variantsQ.top()).score; // maintain minScore
                    }
                }
            }
            while(fe.next());
            // a term with no fuzzy variants must not abort processing of the remaining terms
            // (the original "break" here silently dropped every term after the first unmatched one)
            if(numVariants>0)
            {
                int avgDf=totalVariantDocFreqs/numVariants;
                if(df==0)//no direct match we can use as df for all variants
                {
                    df=avgDf; //use avg df of all variants
                }

                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking overall top query terms
                int size = variantsQ.size();
                for(int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm) variantsQ.pop();
                    st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
                    q.insert(st);
                }
            }
            token=ts.next();
        }
    }

    public Query rewrite(IndexReader reader) throws IOException
    {
        if(rewrittenQuery!=null)
        {
            return rewrittenQuery;
        }
        //load up the list of possible terms
        for (Iterator iter = fieldVals.iterator(); iter.hasNext();)
        {
            FieldVals f = (FieldVals) iter.next();
            addTerms(reader,f);
        }
        //clear the list of fields
        fieldVals.clear();

        BooleanQuery bq=new BooleanQuery();


        //create BooleanQueries to hold the variants for each token/field pair and ensure it
        // has no coord factor
        //Step 1: sort the termqueries by term/field
        HashMap variantQueries=new HashMap();
        int size = q.size();
        for(int i = 0; i < size; i++)
        {
            ScoreTerm st = (ScoreTerm) q.pop();
            ArrayList l=(ArrayList) variantQueries.get(st.fuzziedSourceTerm);
            if(l==null)
            {
                l=new ArrayList();
                variantQueries.put(st.fuzziedSourceTerm,l);
            }
            l.add(st);
        }
        //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
        for (Iterator iter = variantQueries.values().iterator(); iter.hasNext();)
        {
            ArrayList variants = (ArrayList) iter.next();
            if(variants.size()==1)
            {
                //optimize where only one selected variant
                ScoreTerm st=(ScoreTerm) variants.get(0);
                TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
                tq.setBoost(st.score); // set the boost to a mix of IDF and score
                bq.add(tq, BooleanClause.Occur.SHOULD);
            }
            else
            {
                BooleanQuery termVariants=new BooleanQuery(true); //disable coord for these term variants (IDF is handled by FuzzyTermQuery)
                for (Iterator iterator2 = variants.iterator(); iterator2.hasNext();)
                {
                    ScoreTerm st = (ScoreTerm) iterator2.next();
                    TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
                    tq.setBoost(st.score); // set the boost using the ScoreTerm's score
                    termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
                }
                bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
            }
        }
        //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
        // booleans with a minimum-should-match of NumFields-1?

        this.rewrittenQuery=bq;
        return bq;
    }

    //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
    // term variants) then is reset with IDF for use in ranking against all other
    // terms/fields
    private static class ScoreTerm{
        public Term term;
        public float score;
        Term fuzziedSourceTerm;

        public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
            this.term = term;
            this.score = score;
            this.fuzziedSourceTerm=fuzziedSourceTerm;
        }
    }

    private static class ScoreTermQueue extends PriorityQueue {
        public ScoreTermQueue(int size){
            initialize(size);
        }

        /* (non-Javadoc)
         * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
         */
        protected boolean lessThan(Object a, Object b) {
            ScoreTerm termA = (ScoreTerm)a;
            ScoreTerm termB = (ScoreTerm)b;
            if (termA.score == termB.score)
                return termA.term.compareTo(termB.term) > 0;
            else
                return termA.score < termB.score;
        }

    }

    //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
    private static class FuzzyTermQuery extends TermQuery
    {
        boolean ignoreTF;
        public FuzzyTermQuery(Term t, boolean ignoreTF)
        {
            super(t);
            this.ignoreTF=ignoreTF;
        }
        public Similarity getSimilarity(Searcher searcher)
        {
            Similarity result = super.getSimilarity(searcher);
            result = new SimilarityDelegator(result) {

                public float tf(float freq)
                {
                    if(ignoreTF)
                    {
                        return 1; //ignore tf
                    }
                    return super.tf(freq);
                }
                public float idf(int docFreq, int numDocs)
                {
                    //IDF is already factored into individual term boosts
                    return 1;
                }
            };
            return result;
        }
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.Query#toString(java.lang.String)
     */
    public String toString(String field)
    {
        return null;
    }


    public boolean isIgnoreTF()
    {
        return ignoreTF;
    }


    public void setIgnoreTF(boolean ignoreTF)
    {
        this.ignoreTF = ignoreTF;
    }

}
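A minimal usage sketch for the class above (not part of this commit; the index path, field name, and example values are assumptions):

    IndexSearcher searcher = new IndexSearcher("/path/to/index");       // assumed index location
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(24, new StandardAnalyzer());
    flt.addTerms("jonathon smythe", "surname", 0.6f, 2);                // loose, possibly misspelled user input - no query syntax needed
    Hits hits = searcher.search(flt);                                   // rewrite() expands this into scored fuzzy variants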
@ -0,0 +1,51 @@
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Iterator;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

/**
 * Constructs a filter for docs matching any of the terms added to this class.
 * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
 * a sequence. An example might be a collection of primary keys from a database query result or perhaps
 * a choice of "category" labels picked by the end user. As a filter, this is much faster than the
 * equivalent query (a BooleanQuery with many "should" TermQueries).
 *
 * @author maharwood
 */
public class TermsFilter extends Filter
{
    ArrayList termsList=new ArrayList();

    /**
     * Adds a term to the list of acceptable terms
     * @param term
     */
    public void addTerm(Term term)
    {
        termsList.add(term);
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.search.Filter#bits(org.apache.lucene.index.IndexReader)
     */
    public BitSet bits(IndexReader reader) throws IOException
    {
        BitSet result=new BitSet(reader.maxDoc());
        for (Iterator iter = termsList.iterator(); iter.hasNext();)
        {
            Term term = (Term) iter.next();
            TermDocs td=reader.termDocs(term);
            while (td.next())
            {
                result.set(td.doc());
            }
        }
        return result;
    }
}
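A sketch of the "category labels" use case from the javadoc (field name and values are assumptions):

    TermsFilter categoryFilter = new TermsFilter();
    categoryFilter.addTerm(new Term("category", "sport"));
    categoryFilter.addTerm(new Term("category", "politics"));   // the terms need not be contiguous, unlike RangeFilter
    Hits hits = searcher.search(query, categoryFilter);         // restrict any query to the selected categories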
@ -0,0 +1,926 @@
/**
 * Copyright 2004-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similar;

import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;

import java.util.Set;
import java.util.HashMap;
import java.util.Map;
import java.util.Collection;
import java.util.Iterator;
import java.io.IOException;
import java.io.Reader;
import java.io.File;
import java.io.PrintStream;
import java.io.StringReader;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;


/**
 * Generate "more like this" similarity queries.
 * Based on this mail:
 * <code><pre>
 * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
 * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
 * is usually fast enough.  But looking up the docFreq() of every term in the document is
 * probably too slow.
 *
 * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
 * or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
 * in terms with a high tf.  Choosing a tf threshold even as low as two or three will radically
 * reduce the number of terms under consideration.  Another heuristic is that terms with a
 * high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
 * number of characters, not selecting anything less than, e.g., six or seven characters.
 * With these sorts of heuristics you can usually find a small set of, e.g., ten or fewer terms
 * that do a pretty good job of characterizing a document.
 *
 * It all depends on what you're trying to do.  If you're trying to eke out that last percent
 * of precision and recall regardless of computational difficulty so that you can win a TREC
 * competition, then the techniques I mention above are useless.  But if you're trying to
 * provide a "more like this" button on a search results page that does a decent job and has
 * good performance, such techniques might be useful.
 *
 * An efficient, effective "more-like-this" query generator would be a great contribution, if
 * anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
 * text), an Analyzer, and return a set of representative terms using heuristics like those
 * above.  The frequency and length thresholds could be parameters, etc.
 *
 * Doug
 * </pre></code>
 *
 * <p>
 * <h3>Initial Usage</h3>
 *
 * This class has lots of options to try to make it efficient and flexible.
 * See the body of {@link #main main()} below in the source for real code, or
 * if you want pseudo code, the simplest possible usage is as follows. The bold
 * fragment is specific to this class.
 *
 * <code><pre>
 *
 * IndexReader ir = ...
 * IndexSearcher is = ...
 * <b>
 * MoreLikeThis mlt = new MoreLikeThis(ir);
 * Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
 * Query query = mlt.like( target);
 * </b>
 * Hits hits = is.search(query);
 * <em>// now the usual iteration through 'hits' - the only thing to watch for is to make sure
 * you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
 *
 * </pre></code>
 *
 * Thus you:
 * <ol>
 * <li> do your normal, Lucene setup for searching,
 * <li> create a MoreLikeThis,
 * <li> get the text of the doc you want to find similarities to
 * <li> then call one of the like() calls to generate a similarity query
 * <li> call the searcher to find the similar docs
 * </ol>
 *
 * <h3>More Advanced Usage</h3>
 *
 * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
 * multiple fields (e.g. body and title) for similarity.
 * <p>
 *
 * Depending on the size of your index and the size and makeup of your documents you
 * may want to call the other set methods to control how the similarity queries are
 * generated:
 * <ul>
 * <li> {@link #setMinTermFreq setMinTermFreq(...)}
 * <li> {@link #setMinDocFreq setMinDocFreq(...)}
 * <li> {@link #setMinWordLen setMinWordLen(...)}
 * <li> {@link #setMaxWordLen setMaxWordLen(...)}
 * <li> {@link #setMaxQueryTerms setMaxQueryTerms(...)}
 * <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
 * <li> {@link #setStopWords setStopWords(...)}
 * </ul>
 *
 * <hr>
 * <pre>
 * Changes: Mark Harwood 29/02/04
 * Some bugfixing, some refactoring, some optimisation.
 * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector - added missing code
 * - bugfix: no significant terms being created for fields with a termvector - because
 *   only one occurrence per term/field pair was being counted (i.e. not including frequency info from the TermVector)
 * - refactor: moved common code into isNoiseWord()
 * - optimise: when no termvector support available - used maxNumTokensParsed to limit amount of tokenization
 * </pre>
 *
 * @author David Spencer
 * @author Bruce Ritchie
 * @author Mark Harwood
 */
public final class MoreLikeThis {

    /**
     * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
     * @see #getMaxNumTokensParsed
     */
    public static final int DEFAULT_MAX_NUM_TOKENS_PARSED=5000;


    /**
     * Default analyzer to parse source doc with.
     * @see #getAnalyzer
     */
    public static final Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();

    /**
     * Ignore terms with less than this frequency in the source doc.
     * @see #getMinTermFreq
     * @see #setMinTermFreq
     */
    public static final int DEFAULT_MIN_TERM_FREQ = 2;

    /**
     * Ignore words which do not occur in at least this many docs.
     * @see #getMinDocFreq
     * @see #setMinDocFreq
     */
    public static final int DEFALT_MIN_DOC_FREQ = 5;

    /**
     * Boost terms in query based on score.
     * @see #isBoost
     * @see #setBoost
     */
    public static final boolean DEFAULT_BOOST = false;

    /**
     * Default field names. Null is used to specify that the field names should be looked
     * up at runtime from the provided reader.
     */
    public static final String[] DEFAULT_FIELD_NAMES = new String[] { "contents"};

    /**
     * Ignore words less than this length, or if 0 then this has no effect.
     * @see #getMinWordLen
     * @see #setMinWordLen
     */
    public static final int DEFAULT_MIN_WORD_LENGTH = 0;

    /**
     * Ignore words greater than this length, or if 0 then this has no effect.
     * @see #getMaxWordLen
     * @see #setMaxWordLen
     */
    public static final int DEFAULT_MAX_WORD_LENGTH = 0;

    /**
     * Default set of stopwords.
     * If null, stop words are allowed.
     *
     * @see #setStopWords
     * @see #getStopWords
     */
    public static final Set DEFAULT_STOP_WORDS = null;

    /**
     * Current set of stop words.
     */
    private Set stopWords = DEFAULT_STOP_WORDS;

    /**
     * Return a Query with no more than this many terms.
     *
     * @see BooleanQuery#getMaxClauseCount
     * @see #getMaxQueryTerms
     * @see #setMaxQueryTerms
     */
    public static final int DEFAULT_MAX_QUERY_TERMS = 25;

    /**
     * Analyzer that will be used to parse the doc.
     */
    private Analyzer analyzer = DEFAULT_ANALYZER;

    /**
     * Ignore words less frequent than this.
     */
    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;

    /**
     * Ignore words which do not occur in at least this many docs.
     */
    private int minDocFreq = DEFALT_MIN_DOC_FREQ;

    /**
     * Should we apply a boost to the Query based on the scores?
     */
    private boolean boost = DEFAULT_BOOST;

    /**
     * Field names we'll analyze.
     */
    private String[] fieldNames = DEFAULT_FIELD_NAMES;

    /**
     * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
     */
    private int maxNumTokensParsed=DEFAULT_MAX_NUM_TOKENS_PARSED;


    /**
     * Ignore words if less than this len.
     */
    private int minWordLen = DEFAULT_MIN_WORD_LENGTH;

    /**
     * Ignore words if greater than this len.
     */
    private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;

    /**
     * Don't return a query longer than this.
     */
    private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;

    /**
     * For idf() calculations.
     */
    private Similarity similarity = new DefaultSimilarity();

    /**
     * IndexReader to use
     */
    private final IndexReader ir;

    /**
     * Constructor requiring an IndexReader.
     */
    public MoreLikeThis(IndexReader ir) {
        this.ir = ir;
    }

    /**
     * Returns an analyzer that will be used to parse source doc with. The default analyzer
     * is the {@link #DEFAULT_ANALYZER}.
     *
     * @return the analyzer that will be used to parse source doc with.
     * @see #DEFAULT_ANALYZER
     */
    public Analyzer getAnalyzer() {
        return analyzer;
    }

    /**
     * Sets the analyzer to use. An analyzer is not required for generating a query with the
     * {@link #like(int)} method; all other 'like' methods require an analyzer.
     *
     * @param analyzer the analyzer to use to tokenize text.
     */
    public void setAnalyzer(Analyzer analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * Returns the frequency below which terms will be ignored in the source doc. The default
     * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
     *
     * @return the frequency below which terms will be ignored in the source doc.
     */
    public int getMinTermFreq() {
        return minTermFreq;
    }

    /**
     * Sets the frequency below which terms will be ignored in the source doc.
     *
     * @param minTermFreq the frequency below which terms will be ignored in the source doc.
     */
    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    /**
     * Returns the minimum number of documents a word must occur in, below which the word
     * will be ignored. The default frequency is {@link #DEFALT_MIN_DOC_FREQ}.
     *
     * @return the minimum number of documents a word must occur in, below which it will be
     * ignored.
     */
    public int getMinDocFreq() {
        return minDocFreq;
    }

    /**
     * Sets the minimum number of documents a word must occur in, below which the word
     * will be ignored.
     *
     * @param minDocFreq the minimum number of documents a word must occur in, below which
     * it will be ignored.
     */
    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }

    /**
     * Returns whether to boost terms in query based on "score" or not. The default is
     * {@link #DEFAULT_BOOST}.
     *
     * @return whether to boost terms in query based on "score" or not.
     * @see #setBoost
     */
    public boolean isBoost() {
        return boost;
    }

    /**
     * Sets whether to boost terms in query based on "score" or not.
     *
     * @param boost true to boost terms in query based on "score", false otherwise.
     * @see #isBoost
     */
    public void setBoost(boolean boost) {
        this.boost = boost;
    }

    /**
     * Returns the field names that will be used when generating the 'More Like This' query.
     * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
     *
     * @return the field names that will be used when generating the 'More Like This' query.
     */
    public String[] getFieldNames() {
        return fieldNames;
    }

    /**
     * Sets the field names that will be used when generating the 'More Like This' query.
     * Set this to null for the field names to be determined at runtime from the IndexReader
     * provided in the constructor.
     *
     * @param fieldNames the field names that will be used when generating the 'More Like This'
     * query.
     */
    public void setFieldNames(String[] fieldNames) {
        this.fieldNames = fieldNames;
    }

    /**
     * Returns the minimum word length below which words will be ignored. Set this to 0 for no
     * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
     *
     * @return the minimum word length below which words will be ignored.
     */
    public int getMinWordLen() {
        return minWordLen;
    }

    /**
     * Sets the minimum word length below which words will be ignored.
     *
     * @param minWordLen the minimum word length below which words will be ignored.
     */
    public void setMinWordLen(int minWordLen) {
        this.minWordLen = minWordLen;
    }

    /**
     * Returns the maximum word length above which words will be ignored. Set this to 0 for no
     * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
     *
     * @return the maximum word length above which words will be ignored.
     */
    public int getMaxWordLen() {
        return maxWordLen;
    }

    /**
     * Sets the maximum word length above which words will be ignored.
     *
     * @param maxWordLen the maximum word length above which words will be ignored.
     */
    public void setMaxWordLen(int maxWordLen) {
        this.maxWordLen = maxWordLen;
    }

    /**
     * Set the set of stopwords.
     * Any word in this set is considered "uninteresting" and ignored.
     * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
     * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
     *
     * @param stopWords set of stopwords, if null it means to allow stop words
     *
     * @see org.apache.lucene.analysis.StopFilter#makeStopSet StopFilter.makeStopSet()
     * @see #getStopWords
     */
    public void setStopWords(Set stopWords) {
        this.stopWords = stopWords;
    }

    /**
     * Get the current stop words being used.
     * @see #setStopWords
     */
    public Set getStopWords() {
        return stopWords;
    }


    /**
     * Returns the maximum number of query terms that will be included in any generated query.
     * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
     *
     * @return the maximum number of query terms that will be included in any generated query.
     */
    public int getMaxQueryTerms() {
        return maxQueryTerms;
    }

    /**
     * Sets the maximum number of query terms that will be included in any generated query.
     *
     * @param maxQueryTerms the maximum number of query terms that will be included in any
     * generated query.
     */
    public void setMaxQueryTerms(int maxQueryTerms) {
        this.maxQueryTerms = maxQueryTerms;
    }

    /**
     * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
     * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
     */
    public int getMaxNumTokensParsed()
    {
        return maxNumTokensParsed;
    }

    /**
     * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
     */
    public void setMaxNumTokensParsed(int i)
    {
        maxNumTokensParsed = i;
    }


    /**
     * Return a query that will return docs like the passed lucene document ID.
     *
     * @param docNum the documentID of the lucene doc to generate the 'More Like This' query for.
     * @return a query that will return docs like the passed lucene document ID.
     */
    public Query like(int docNum) throws IOException {
        if (fieldNames == null) {
            // gather list of valid fields from lucene
            Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
            fieldNames = (String[]) fields.toArray(new String[fields.size()]);
        }

        return createQuery(retrieveTerms(docNum));
    }

    /**
     * Return a query that will return docs like the passed file.
     *
     * @return a query that will return docs like the passed file.
     */
    public Query like(File f) throws IOException {
        if (fieldNames == null) {
            // gather list of valid fields from lucene
            Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
            fieldNames = (String[]) fields.toArray(new String[fields.size()]);
        }

        return like(new FileReader(f));
    }

    /**
     * Return a query that will return docs like the passed URL.
     *
     * @return a query that will return docs like the passed URL.
     */
    public Query like(URL u) throws IOException {
        return like(new InputStreamReader(u.openConnection().getInputStream()));
    }

    /**
     * Return a query that will return docs like the passed stream.
     *
     * @return a query that will return docs like the passed stream.
     */
    public Query like(java.io.InputStream is) throws IOException {
        return like(new InputStreamReader(is));
    }

    /**
     * Return a query that will return docs like the passed Reader.
     *
     * @return a query that will return docs like the passed Reader.
     */
    public Query like(Reader r) throws IOException {
        return createQuery(retrieveTerms(r));
    }

    /**
     * Create the More like query from a PriorityQueue
     */
    private Query createQuery(PriorityQueue q) {
        BooleanQuery query = new BooleanQuery();
        Object cur;
        int qterms = 0;
        float bestScore = 0;

        while (((cur = q.pop()) != null)) {
            Object[] ar = (Object[]) cur;
            TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));

            if (boost) {
                if (qterms == 0) {
                    bestScore = ((Float) ar[2]).floatValue();
                }
                float myScore = ((Float) ar[2]).floatValue();

                tq.setBoost(myScore / bestScore);
            }

            try {
                query.add(tq, BooleanClause.Occur.SHOULD);
            }
            catch (BooleanQuery.TooManyClauses ignore) {
                break;
            }

            qterms++;
            if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
                break;
            }
        }

        return query;
    }

    /**
     * Create a PriorityQueue from a word->tf map.
     *
     * @param words a map of words keyed on the word (String) with Int objects as the values.
     */
    private PriorityQueue createQueue(Map words) throws IOException {
        // have collected all words in doc and their freqs
        int numDocs = ir.numDocs();
        FreqQ res = new FreqQ(words.size()); // will order words by score

        Iterator it = words.keySet().iterator();
        while (it.hasNext()) { // for every word
            String word = (String) it.next();

            int tf = ((Int) words.get(word)).x; // term freq in the source doc
            if (minTermFreq > 0 && tf < minTermFreq) {
                continue; // filter out words that don't occur enough times in the source
            }

            // go through all the fields and find the largest document frequency
            String topField = fieldNames[0];
            int docFreq = 0;
            for (int i = 0; i < fieldNames.length; i++) {
                int freq = ir.docFreq(new Term(fieldNames[i], word));
                topField = (freq > docFreq) ? fieldNames[i] : topField;
                docFreq = (freq > docFreq) ? freq : docFreq;
            }

            if (minDocFreq > 0 && docFreq < minDocFreq) {
                continue; // filter out words that don't occur in enough docs
            }

            if (docFreq == 0) {
                continue; // index update problem?
            }

            float idf = similarity.idf(docFreq, numDocs);
            float score = tf * idf;

            // only really need 1st 3 entries, other ones are for troubleshooting
            res.insert(new Object[]{word,                   // the word
                                    topField,               // the top field
                                    new Float(score),       // overall score
                                    new Float(idf),         // idf
                                    new Integer(docFreq),   // freq in all docs
                                    new Integer(tf)
            });
        }
        return res;
    }

    /**
     * Describe the parameters that control how the "more like this" query is formed.
     */
    public String describeParams() {
        StringBuffer sb = new StringBuffer();
        sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
        sb.append("\t" + "minWordLen    : " + minWordLen + "\n");
        sb.append("\t" + "maxWordLen    : " + maxWordLen + "\n");
        sb.append("\t" + "fieldNames    : ");
        String delim = "";
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            sb.append(delim).append(fieldName);
            delim = ", ";
        }
        sb.append("\n");
        sb.append("\t" + "boost         : " + boost + "\n");
        sb.append("\t" + "minTermFreq   : " + minTermFreq + "\n");
        sb.append("\t" + "minDocFreq    : " + minDocFreq + "\n");
        return sb.toString();
    }

    /**
     * Test driver.
     * Pass in "-i INDEX" and then either "-f FILE" or "-url URL".
     */
    public static void main(String[] a) throws Throwable {
        String indexName = "localhost_index";
        String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
        URL url = null;
        for (int i = 0; i < a.length; i++) {
            if (a[i].equals("-i")) {
                indexName = a[++i];
            }
            else if (a[i].equals("-f")) {
                fn = a[++i];
            }
            else if (a[i].equals("-url")) {
                url = new URL(a[++i]);
            }
        }

        PrintStream o = System.out;
        IndexReader r = IndexReader.open(indexName);
        o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");

        MoreLikeThis mlt = new MoreLikeThis(r);

        o.println("Query generation parameters:");
        o.println(mlt.describeParams());
        o.println();

        Query query = null;
        if (url != null) {
            o.println("Parsing URL: " + url);
            query = mlt.like(url);
        }
        else if (fn != null) {
            o.println("Parsing file: " + fn);
            query = mlt.like(new File(fn));
        }

        o.println("q: " + query);
        o.println();
        IndexSearcher searcher = new IndexSearcher(indexName);

        Hits hits = searcher.search(query);
        int len = hits.length();
        o.println("found: " + len + " documents matching");
        o.println();
        for (int i = 0; i < Math.min(25, len); i++) {
            Document d = hits.doc(i);
            String summary = d.get( "summary");
            o.println("score  : " + hits.score(i));
            o.println("url    : " + d.get("url"));
            o.println("\ttitle  : " + d.get("title"));
            if ( summary != null)
                o.println("\tsummary: " + d.get("summary"));
            o.println();
        }
    }

    /**
     * Find words for a more-like-this query former.
     *
     * @param docNum the id of the lucene document from which to find terms
     */
    private PriorityQueue retrieveTerms(int docNum) throws IOException {
        Map termFreqMap = new HashMap();
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);

            // field does not store term vector info
            if (vector == null) {
                Document d=ir.document(docNum);
                String text[]=d.getValues(fieldName);
                if(text!=null)
                {
                    for (int j = 0; j < text.length; j++) {
                        addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
                    }
                }
            }
            else {
                addTermFrequencies(termFreqMap, vector);
            }

        }

        return createQueue(termFreqMap);
    }

    /**
     * Adds terms and frequencies found in vector into the Map termFreqMap
     * @param termFreqMap a Map of terms and their frequencies
     * @param vector List of terms and their frequencies for a doc/field
     */
    private void addTermFrequencies(Map termFreqMap, TermFreqVector vector)
    {
        String[] terms = vector.getTerms();
        int freqs[]=vector.getTermFrequencies();
        for (int j = 0; j < terms.length; j++) {
            String term = terms[j];

            if(isNoiseWord(term)){
                continue;
            }
            // increment frequency
            Int cnt = (Int) termFreqMap.get(term);
            if (cnt == null) {
                cnt=new Int();
                termFreqMap.put(term, cnt);
                cnt.x=freqs[j];
            }
            else {
                cnt.x+=freqs[j];
            }
        }
    }
    /**
     * Adds term frequencies found by tokenizing text from reader into the Map words
     * @param r a source of text to be tokenized
     * @param termFreqMap a Map of terms and their frequencies
     * @param fieldName Used by analyzer for any special per-field analysis
     */
    private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName)
            throws IOException
    {
        TokenStream ts = analyzer.tokenStream(fieldName, r);
        org.apache.lucene.analysis.Token token;
        int tokenCount=0;
        while ((token = ts.next()) != null) { // for every token
            String word = token.termText();
            tokenCount++;
            if(tokenCount>maxNumTokensParsed)
            {
                break;
            }
            if(isNoiseWord(word)){
                continue;
            }

            // increment frequency
            Int cnt = (Int) termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            }
            else {
                cnt.x++;
            }
        }
    }


    /** determines if the passed term should be skipped in "more like" comparisons
     *
     * @param term The word being considered
     * @return true if it should be ignored, false if it should be used in further analysis
     */
    private boolean isNoiseWord(String term)
    {
        int len = term.length();
        if (minWordLen > 0 && len < minWordLen) {
            return true;
        }
        if (maxWordLen > 0 && len > maxWordLen) {
            return true;
        }
        if (stopWords != null && stopWords.contains( term)) {
            return true;
        }
        return false;
    }


    /**
     * Find words for a more-like-this query former.
     * The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
     * Each array has 6 elements.
     * The elements are:
     * <ol>
     * <li> The word (String)
     * <li> The top field that this word comes from (String)
     * <li> The score for this word (Float)
     * <li> The IDF value (Float)
     * <li> The frequency of this word in the index (Integer)
     * <li> The frequency of this word in the source document (Integer)
     * </ol>
     * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
     * This method is exposed so that you can identify the "interesting words" in a document.
     * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
     *
     * @param r the reader that has the content of the document
     * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
     *
     * @see #retrieveInterestingTerms
     */
    public PriorityQueue retrieveTerms(Reader r) throws IOException {
        Map words = new HashMap();
        for (int i = 0; i < fieldNames.length; i++) {
            String fieldName = fieldNames[i];
            addTermFrequencies(r, words, fieldName);
        }
        return createQueue(words);
    }

    /**
     * Convenience routine to make it easy to return the most interesting words in a document.
     * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
     * @param r the source document
     * @return the most interesting words in the document
     *
     * @see #retrieveTerms(java.io.Reader)
     * @see #setMaxQueryTerms
     */
    public String[] retrieveInterestingTerms( Reader r) throws IOException {
        ArrayList al = new ArrayList( maxQueryTerms);
        PriorityQueue pq = retrieveTerms( r);
        Object cur;
        int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
        // we just want to return the top words
        while (((cur = pq.pop()) != null) && lim-- > 0) {
            Object[] ar = (Object[]) cur;
            al.add( ar[ 0]); // the 1st entry is the interesting word
        }
        String[] res = new String[ al.size()];
        return (String[]) al.toArray( res);
    }

    /**
     * PriorityQueue that orders words by score.
     */
    private static class FreqQ extends PriorityQueue {
        FreqQ (int s) {
            initialize(s);
        }

        protected boolean lessThan(Object a, Object b) {
            Object[] aa = (Object[]) a;
            Object[] bb = (Object[]) b;
            Float fa = (Float) aa[2];
            Float fb = (Float) bb[2];
            return fa.floatValue() > fb.floatValue();
        }
    }

    /**
     * Used for frequencies, to avoid allocating new Integer objects.
     */
    private static class Int {
        int x;

        Int() {
            x = 1;
        }
    }

}
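Beyond the like() usage shown in the javadoc, the class can also surface a document's characteristic vocabulary. A sketch (the reader, text, and field names here are assumptions):

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setFieldNames(new String[] {"title", "body"});    // assumed field names
    mlt.setMinTermFreq(1);                                // loosen the defaults for short documents
    mlt.setMinDocFreq(2);
    String[] interesting = mlt.retrieveInterestingTerms(new StringReader(text));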
@ -0,0 +1,123 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.search.similar;

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

/**
 * A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required, e.g.
 * in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
 * actual MoreLikeThis object and obtain the real Query object.
 * @author maharwood
 */
public class MoreLikeThisQuery extends Query
{

    private String likeText;
    private String[] moreLikeFields;
    private Analyzer analyzer;
    float percentTermsToMatch=0.3f;
    int minTermFrequency=1;
    int maxQueryTerms=5;


    /**
     * @param likeText the example text for which similar documents are sought
     * @param moreLikeFields the fields to consider when generating the query
     * @param analyzer the analyzer used to tokenize likeText
     */
    public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
    {
        this.likeText=likeText;
        this.moreLikeFields=moreLikeFields;
        this.analyzer=analyzer;
    }

    public Query rewrite(IndexReader reader) throws IOException
    {
        MoreLikeThis mlt=new MoreLikeThis(reader);

        mlt.setFieldNames(moreLikeFields);
        mlt.setAnalyzer(analyzer);
        mlt.setMinTermFreq(minTermFrequency);
        mlt.setMaxQueryTerms(maxQueryTerms);
        BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
        BooleanClause[] clauses = bq.getClauses();
        //require that a proportion of the generated terms match (percentTermsToMatch, 30% by default)
        bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch));
        return bq;
    }
    /* (non-Javadoc)
     * @see org.apache.lucene.search.Query#toString(java.lang.String)
     */
    public String toString(String field)
    {
        return "like:"+likeText;
    }

    public float getPercentTermsToMatch() {
        return percentTermsToMatch;
    }
    public void setPercentTermsToMatch(float percentTermsToMatch) {
        this.percentTermsToMatch = percentTermsToMatch;
    }

    public Analyzer getAnalyzer()
    {
        return analyzer;
    }

    public void setAnalyzer(Analyzer analyzer)
    {
        this.analyzer = analyzer;
    }

    public String getLikeText()
    {
        return likeText;
    }

    public void setLikeText(String likeText)
    {
        this.likeText = likeText;
    }

    public int getMaxQueryTerms()
    {
        return maxQueryTerms;
    }

    public void setMaxQueryTerms(int maxQueryTerms)
    {
        this.maxQueryTerms = maxQueryTerms;
    }

    public int getMinTermFrequency()
    {
        return minTermFrequency;
    }

    public void setMinTermFrequency(int minTermFrequency)
    {
        this.minTermFrequency = minTermFrequency;
    }

    public String[] getMoreLikeFields()
    {
        return moreLikeFields;
    }

    public void setMoreLikeFields(String[] moreLikeFields)
    {
        this.moreLikeFields = moreLikeFields;
    }
}
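A minimal usage sketch for the wrapper (example text, field name, and searcher are assumptions):

    MoreLikeThisQuery mltq = new MoreLikeThisQuery("apache lucene text search",
            new String[] {"contents"}, new StandardAnalyzer());
    mltq.setMaxQueryTerms(10);
    Hits hits = searcher.search(mltq);   // the wrapped MoreLikeThis is only built at rewrite() time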
@@ -0,0 +1,118 @@
/**
 * Copyright 2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similar;

import java.io.*;
import java.util.*;
import java.net.*;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.*;

/**
 * Simple similarity measures.
 *
 * @see MoreLikeThis
 */
public final class SimilarityQueries
{
	private SimilarityQueries()
	{
	}

	/**
	 * Simple similarity query generator.
	 * Takes every unique word and forms a boolean query where all words are optional.
	 * After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
	 * The only caveat is the first hit returned <b>should be</b> your source document - you'll
	 * need to then ignore that.
	 *
	 * <p>
	 * So, if you have a code fragment like this:
	 * <br>
	 * <code>
	 * Query q = formSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
	 * </code>
	 *
	 * <p>
	 * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
	 *
	 * <p>
	 * The philosophy behind this method is "two documents are similar if they share lots of words".
	 * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
	 *
	 * <P>
	 * This method is fail-safe in that if a long 'body' is passed in and
	 * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
	 * throws
	 * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
	 * query as it is will be returned.
	 *
	 * @param body the body of the document you want to find similar documents to
	 * @param a the analyzer to use to parse the body
	 * @param field the field you want to search on, probably something like "contents" or "body"
	 * @param stop optional set of stop words to ignore
	 * @return a query with all unique words in 'body'
	 * @throws IOException this can't happen in practice, as the body is read from a StringReader rather than from disk
	 */
	public static Query formSimilarQuery( String body,
										  Analyzer a,
										  String field,
										  Set stop)
										  throws IOException
	{
		TokenStream ts = a.tokenStream( field, new StringReader( body));
		org.apache.lucene.analysis.Token t;
		BooleanQuery tmp = new BooleanQuery();
		Set already = new HashSet(); // ignore dups
		while ( (t = ts.next()) != null)
		{
			String word = t.termText();
			// ignore opt stop words
			if ( stop != null &&
				 stop.contains( word)) continue;
			// ignore dups
			if ( ! already.add( word)) continue;
			// add to query as an optional, non-prohibited clause
			TermQuery tq = new TermQuery( new Term( field, word));
			try
			{
				tmp.add( tq, false, false);
			}
			catch( BooleanQuery.TooManyClauses too)
			{
				// fail-safe, just return what we have, not the end of the world
				break;
			}
		}
		return tmp;
	}
}

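As an illustration only (not part of this commit), a typical "find similar documents" call might look like the sketch below; the index path and field name are assumptions:

    // Hypothetical usage sketch - assumes an existing index with a "contents" field.
    IndexSearcher searcher = new IndexSearcher("/path/to/index");
    Query q = SimilarityQueries.formSimilarQuery(
            "I use Lucene to search fast. Fast searchers are good",
            new StandardAnalyzer(), "contents", null);
    Hits hits = searcher.search(q);
    // Skip hit 0 if it is the source document itself, as the javadoc warns.
    for (int i = 1; i < hits.length(); i++)
    {
        System.out.println(hits.doc(i).get("contents"));
    }
    searcher.close();
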
@@ -0,0 +1,5 @@
<html>
<body>
Document similarity query generators.
</body>
</html>

@@ -0,0 +1,28 @@
<?xml version="1.0"?>

<project name="xml-query-parser" default="buildParser">

    <description>
        XML query parser
    </description>

    <import file="../contrib-build.xml"/>

    <property name="queries.jar" location="../../build/contrib/queries/lucene-queries-${version}.jar"/>

    <path id="classpath">
        <pathelement path="${lucene.jar}"/>
        <pathelement path="${queries.jar}"/>
        <pathelement path="${project.classpath}"/>
    </path>

    <target name="buildParser" depends="buildQueries,default" />

    <target name="buildQueries" >
        <echo>XML Parser building dependency ${queries.jar}</echo>
        <ant antfile="../queries/build.xml" target="default" inheritall="false"/>
    </target>

</project>

@@ -0,0 +1,33 @@
<html>
<body>
<h1>XML-based query syntax</h1>
<p>
This module contains:
<ul>
<li>a modular Lucene Query Parser where queries are expressed as XML</li>
<li>JUnit tests</li>
<li>Example XML queries</li>
<li>Test index (subset of Reuters 21578)</li>
</ul>
</p>
<p>
The original motivation for creating this package was outlined and discussed <a href="http://marc.theaimsgroup.com/?l=lucene-dev&m=113355526731460&w=2">here</a>.
</p>
<p>
Parser support includes:
<ul>
<li>"Span" queries</li>
<li>"Like this" queries</li>
<li>Boolean, Term, and UserInput (parsed with the existing query parser)</li>
<li>BoostingQuery - a class that can downgrade scores for hits on
certain terms rather than the hard-line approach taken by BooleanClause.Occur.MUST_NOT</li>
<li>FilteredQuery, RangeFilter, and "TermsFilter" for non-sequential terms</li>
<li>"FuzzyLikeThis" - a new query which is a cross between "LikeThis" and "fuzzy" but with
better scoring of fuzzy terms than standard fuzzy queries</li>
<li>A modular design with expandable support for new query/filter types</li>
</ul>
</p>
<p>This code is dependent on the "queries" contrib module, although the "CoreParser" can be compiled with just Lucene core if required.</p>
</body>
</html>

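To make the intended usage concrete, here is a minimal sketch (not itself part of this commit) of feeding one of the example XML files through the parser; the index location is an assumption, and the field name mirrors the test code below:

    // Minimal usage sketch - parse an XML query file and run it against an index.
    Analyzer analyzer = new StandardAnalyzer();
    CoreParser parser = new CorePlusExtensionsParser(analyzer,
            new QueryParser("contents", analyzer));
    InputStream xml = new FileInputStream("BooleanQuery.xml");
    Query q = parser.parse(xml);
    xml.close();
    Hits hits = new IndexSearcher("/path/to/index").search(q);
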
@@ -0,0 +1,124 @@
package org.apache.lucene.xmlparser;

import java.io.InputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.builders.BooleanQueryBuilder;
import org.apache.lucene.xmlparser.builders.ConstantScoreQueryBuilder;
import org.apache.lucene.xmlparser.builders.RangeFilterBuilder;
import org.apache.lucene.xmlparser.builders.SpanFirstBuilder;
import org.apache.lucene.xmlparser.builders.SpanNearBuilder;
import org.apache.lucene.xmlparser.builders.SpanNotBuilder;
import org.apache.lucene.xmlparser.builders.SpanOrBuilder;
import org.apache.lucene.xmlparser.builders.SpanOrTermsBuilder;
import org.apache.lucene.xmlparser.builders.SpanQueryBuilderFactory;
import org.apache.lucene.xmlparser.builders.SpanTermBuilder;
import org.apache.lucene.xmlparser.builders.TermQueryBuilder;
import org.apache.lucene.xmlparser.builders.UserInputQueryBuilder;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * Assembles a QueryBuilder which uses only core Lucene Query objects
 * @author Mark
 */
public class CoreParser implements QueryBuilder
{
	protected Analyzer analyzer;
	protected QueryParser parser;
	protected QueryBuilderFactory queryFactory;
	protected FilterBuilderFactory filterFactory;

	public CoreParser(Analyzer analyzer, QueryParser parser)
	{
		this.analyzer=analyzer;
		this.parser=parser;
		filterFactory = new FilterBuilderFactory();
		filterFactory.addBuilder("RangeFilter",new RangeFilterBuilder());

		queryFactory = new QueryBuilderFactory();
		queryFactory.addBuilder("TermQuery",new TermQueryBuilder());
		queryFactory.addBuilder("BooleanQuery",new BooleanQueryBuilder(queryFactory));
		//use the QueryParser supplied by the caller for UserQuery clauses
		queryFactory.addBuilder("UserQuery",new UserInputQueryBuilder(parser));
		queryFactory.addBuilder("FilteredQuery",new FilteredQueryBuilder(filterFactory,queryFactory));
		queryFactory.addBuilder("ConstantScoreQuery",new ConstantScoreQueryBuilder(filterFactory));

		SpanQueryBuilderFactory sqof=new SpanQueryBuilderFactory();

		SpanNearBuilder snb=new SpanNearBuilder(sqof);
		sqof.addBuilder("SpanNear",snb);
		queryFactory.addBuilder("SpanNear",snb);

		SpanTermBuilder snt=new SpanTermBuilder();
		sqof.addBuilder("SpanTerm",snt);
		queryFactory.addBuilder("SpanTerm",snt);

		SpanOrBuilder sot=new SpanOrBuilder(sqof);
		sqof.addBuilder("SpanOr",sot);
		queryFactory.addBuilder("SpanOr",sot);

		SpanOrTermsBuilder sots=new SpanOrTermsBuilder(analyzer);
		sqof.addBuilder("SpanOrTerms",sots);
		queryFactory.addBuilder("SpanOrTerms",sots);

		SpanFirstBuilder sft=new SpanFirstBuilder(sqof);
		sqof.addBuilder("SpanFirst",sft);
		queryFactory.addBuilder("SpanFirst",sft);

		SpanNotBuilder snot=new SpanNotBuilder(sqof);
		sqof.addBuilder("SpanNot",snot);
		queryFactory.addBuilder("SpanNot",snot);
	}

	public Query parse(InputStream xmlStream) throws ParserException
	{
		return getQuery(parseXML(xmlStream).getDocumentElement());
	}

	public void addQueryBuilder(String nodeName,QueryBuilder builder)
	{
		queryFactory.addBuilder(nodeName,builder);
	}

	public void addFilterBuilder(String nodeName,FilterBuilder builder)
	{
		filterFactory.addBuilder(nodeName,builder);
	}

	private static Document parseXML(InputStream pXmlFile) throws ParserException
	{
		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
		DocumentBuilder db = null;
		try
		{
			db = dbf.newDocumentBuilder();
		}
		catch (Exception se)
		{
			throw new ParserException("XML Parser configuration error", se);
		}
		org.w3c.dom.Document doc = null;
		try
		{
			doc = db.parse(pXmlFile);
		}
		catch (Exception se)
		{
			throw new ParserException("Error parsing XML stream: " + se, se);
		}
		return doc;
	}

	public Query getQuery(Element e) throws ParserException
	{
		return queryFactory.getQuery(e);
	}
}

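Because builders are looked up by XML tag name, third parties can plug in their own query types via addQueryBuilder. A hypothetical sketch (the tag and field names are invented for illustration):

    // Hypothetical sketch - register a builder for a custom <CategoryQuery> tag.
    CoreParser parser = new CoreParser(new StandardAnalyzer(),
            new QueryParser("contents", new StandardAnalyzer()));
    parser.addQueryBuilder("CategoryQuery", new QueryBuilder()
    {
        public Query getQuery(Element e) throws ParserException
        {
            // Interpret the element text as a term in a hypothetical "category" field.
            return new TermQuery(new Term("category", DOMUtils.getText(e)));
        }
    });
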
@@ -0,0 +1,25 @@
package org.apache.lucene.xmlparser;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.xmlparser.builders.BoostingQueryBuilder;
import org.apache.lucene.xmlparser.builders.FuzzyLikeThisQueryBuilder;
import org.apache.lucene.xmlparser.builders.LikeThisQueryBuilder;
import org.apache.lucene.xmlparser.builders.TermsFilterBuilder;

public class CorePlusExtensionsParser extends CoreParser
{
	public CorePlusExtensionsParser(Analyzer analyzer, QueryParser parser)
	{
		super(analyzer, parser);
		filterFactory.addBuilder("TermsFilter",new TermsFilterBuilder(analyzer));
		String fields[]={"contents"};
		queryFactory.addBuilder("LikeThisQuery",new LikeThisQueryBuilder(analyzer,fields));
		queryFactory.addBuilder("BoostingQuery", new BoostingQueryBuilder(queryFactory));
		queryFactory.addBuilder("FuzzyLikeThisQuery", new FuzzyLikeThisQueryBuilder(analyzer));
	}
}

@@ -0,0 +1,198 @@
package org.apache.lucene.xmlparser;
import java.io.Reader;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;

public class DOMUtils
{
	/* Convenience method where there is only one child Element of a given name */
	public static Element getChildByTagName(Element e, String name)
	{
		for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
		{
			if( (kid.getNodeType()==Node.ELEMENT_NODE) && (name.equals(kid.getNodeName())) )
			{
				return (Element)kid;
			}
		}
		return null;
	}

	/**
	 * Returns an attribute value from this node, or the first parent node with this attribute defined
	 * @param element
	 * @param attributeName
	 * @return A non-zero-length value if defined, otherwise null
	 */
	public static String getAttributeWithInheritance(Element element, String attributeName)
	{
		String result=element.getAttribute(attributeName);
		if( (result==null)|| ("".equals(result) ) )
		{
			Node n=element.getParentNode();
			if((n==element)||(n==null))
			{
				return null;
			}
			Element parent=(Element) n;
			return getAttributeWithInheritance(parent,attributeName);
		}
		return result;
	}

	/* Convenience method where there is only one child Element of a given name */
	public static String getChildTextByTagName(Element e, String tagName)
	{
		Element child=getChildByTagName(e,tagName);
		if(child!=null)
		{
			return getText(child);
		}
		return null;
	}

	/* Convenience method to append a new child with text */
	public static Element insertChild(Element parent, String tagName, String text)
	{
		Element child = parent.getOwnerDocument().createElement(tagName);
		parent.appendChild(child);
		if(text!=null)
		{
			child.appendChild(child.getOwnerDocument().createTextNode(text));
		}
		return child;
	}

	public static String getAttribute(Element element, String attributeName, String deflt)
	{
		String result=element.getAttribute(attributeName);
		if( (result==null)|| ("".equals(result) ) )
		{
			return deflt;
		}
		return result;
	}

	public static float getAttribute(Element element, String attributeName, float deflt)
	{
		String result=element.getAttribute(attributeName);
		if( (result==null)|| ("".equals(result) ) )
		{
			return deflt;
		}
		return Float.parseFloat(result);
	}

	public static int getAttribute(Element element, String attributeName, int deflt)
	{
		String result=element.getAttribute(attributeName);
		if( (result==null)|| ("".equals(result) ) )
		{
			return deflt;
		}
		return Integer.parseInt(result);
	}

	public static boolean getAttribute(Element element, String attributeName,
			boolean deflt)
	{
		String result = element.getAttribute(attributeName);
		if ((result == null) || ("".equals(result)))
		{
			return deflt;
		}
		// Boolean.getBoolean(String) reads a system property, which is not what is
		// wanted here - parse the attribute value itself instead.
		return Boolean.valueOf(result).booleanValue();
	}

	/* Returns text of node and all child nodes - without markup */
	//MH changed to Node from Element 25/11/2005
	public static String getText(Node e)
	{
		StringBuffer sb=new StringBuffer();
		getTextBuffer(e, sb);
		return sb.toString();
	}

	public static Element getFirstChildElement(Element element)
	{
		for (Node kid = element.getFirstChild(); kid != null; kid = kid.getNextSibling())
		{
			if (kid.getNodeType() == Node.ELEMENT_NODE)
			{
				return (Element) kid;
			}
		}
		return null;
	}

	private static void getTextBuffer(Node e, StringBuffer sb)
	{
		for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
		{
			switch(kid.getNodeType())
			{
				case Node.TEXT_NODE:
				{
					sb.append(kid.getNodeValue());
					break;
				}
				case Node.ELEMENT_NODE:
				{
					getTextBuffer(kid, sb);
					break;
				}
				case Node.ENTITY_REFERENCE_NODE:
				{
					getTextBuffer(kid, sb);
					break;
				}
			}
		}
	}

	/**
	 * Helper method to parse XML from a Reader into a DOM tree.
	 * @param is reader providing the XML to be parsed
	 * @return an org.w3c.dom.Document object
	 */
	public static Document loadXML(Reader is)
	{
		DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
		DocumentBuilder db = null;

		try
		{
			db = dbf.newDocumentBuilder();
		}
		catch (Exception se)
		{
			throw new RuntimeException("Parser configuration error", se);
		}

		// parse the input
		org.w3c.dom.Document doc = null;
		try
		{
			doc = db.parse(new InputSource(is));
		}
		catch (Exception se)
		{
			throw new RuntimeException("Error parsing file:" + se, se);
		}

		return doc;
	}
}

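The attribute inheritance above is what lets the example queries declare fieldName once on an outer element and have inner elements pick it up. A minimal sketch (the query text is invented for illustration):

    // Hypothetical sketch - fieldName declared on the parent is found from the child.
    Document doc = DOMUtils.loadXML(new StringReader(
        "<BooleanQuery fieldName='contents'><Clause occurs='must'>" +
        "<TermQuery>bank</TermQuery></Clause></BooleanQuery>"));
    Element term = (Element) doc.getElementsByTagName("TermQuery").item(0);
    // Prints "contents", inherited from the enclosing BooleanQuery element.
    System.out.println(DOMUtils.getAttributeWithInheritance(term, "fieldName"));
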
@@ -0,0 +1,14 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser;

import org.apache.lucene.search.Filter;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public interface FilterBuilder {
	public Filter getFilter(Element e) throws ParserException;
}

@@ -0,0 +1,31 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser;

import java.util.HashMap;

import org.apache.lucene.search.Filter;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class FilterBuilderFactory implements FilterBuilder {

	HashMap builders=new HashMap();

	public Filter getFilter(Element n) throws ParserException {
		FilterBuilder builder=(FilterBuilder) builders.get(n.getNodeName());
		if(builder==null)
		{
			throw new ParserException("No FilterBuilder defined for node "+n.getNodeName());
		}
		return builder.getFilter(n);
	}

	public void addBuilder(String nodeName,FilterBuilder builder)
	{
		builders.put(nodeName,builder);
	}
}

@@ -0,0 +1,71 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser;

import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Query;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class FilteredQueryBuilder implements QueryBuilder {

	private FilterBuilder filterFactory;
	private QueryBuilder queryFactory;

	public FilteredQueryBuilder(FilterBuilder filterFactory, QueryBuilder queryFactory)
	{
		this.filterFactory=filterFactory;
		this.queryFactory=queryFactory;
	}

	/* (non-Javadoc)
	 * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
	 */
	public Query getQuery(Element e) throws ParserException {
		Element filterElement=DOMUtils.getChildByTagName(e,"Filter");
		if(filterElement==null)
		{
			throw new ParserException("FilteredQuery missing \"Filter\" child element");
		}
		filterElement=DOMUtils.getFirstChildElement(filterElement);
		Filter f=null;
		if(filterElement!=null)
		{
			f=filterFactory.getFilter(filterElement);
		}
		else
		{
			throw new ParserException("FilteredQuery \"Filter\" element missing child filter element");
		}

		Element queryElement=DOMUtils.getChildByTagName(e,"Query");
		if(queryElement==null)
		{
			throw new ParserException("FilteredQuery missing \"Query\" child element");
		}
		queryElement=DOMUtils.getFirstChildElement(queryElement);
		Query q=null;
		if(queryElement!=null)
		{
			q=queryFactory.getQuery(queryElement);
		}
		else
		{
			throw new ParserException("FilteredQuery \"Query\" element missing child query element");
		}

		FilteredQuery fq = new FilteredQuery(q,f);
		fq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return fq;
	}
}

@@ -0,0 +1,40 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser;

/**
 * @author maharwood
 */
public class ParserException extends Exception {

	public ParserException() {
		super();
	}

	/**
	 * @param message
	 */
	public ParserException(String message) {
		super(message);
	}

	/**
	 * @param message
	 * @param cause
	 */
	public ParserException(String message, Throwable cause) {
		super(message, cause);
	}

	/**
	 * @param cause
	 */
	public ParserException(Throwable cause) {
		super(cause);
	}
}

@@ -0,0 +1,15 @@
package org.apache.lucene.xmlparser;

import org.apache.lucene.search.Query;
import org.w3c.dom.Element;

/**
 * Implemented by objects that produce Lucene Query objects from XML streams. Implementations are
 * expected to be thread-safe so that they can be used to simultaneously parse multiple XML documents.
 * @author maharwood
 */
public interface QueryBuilder {

	public Query getQuery(Element e) throws ParserException;

}

@@ -0,0 +1,31 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser;

import java.util.HashMap;

import org.apache.lucene.search.Query;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class QueryBuilderFactory implements QueryBuilder {

	HashMap builders=new HashMap();

	public Query getQuery(Element n) throws ParserException {
		QueryBuilder builder=(QueryBuilder) builders.get(n.getNodeName());
		if(builder==null)
		{
			throw new ParserException("No QueryObjectBuilder defined for node "+n.getNodeName());
		}
		return builder.getQuery(n);
	}

	public void addBuilder(String nodeName,QueryBuilder builder)
	{
		builders.put(nodeName,builder);
	}
}

@@ -0,0 +1,89 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * @author maharwood
 */
public class BooleanQueryBuilder implements QueryBuilder {

	private QueryBuilder factory;

	public BooleanQueryBuilder(QueryBuilder factory)
	{
		this.factory=factory;
	}

	/* (non-Javadoc)
	 * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
	 */
	public Query getQuery(Element e) throws ParserException {
		BooleanQuery bq=new BooleanQuery();
		bq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		NodeList nl = e.getElementsByTagName("Clause");
		for(int i=0;i<nl.getLength();i++)
		{
			Element clauseElem=(Element) nl.item(i);
			BooleanClause.Occur occurs=getOccursValue(clauseElem);

			//find the first element child which should contain a Query
			Element clauseQuery=DOMUtils.getFirstChildElement(clauseElem);
			if(clauseQuery!=null)
			{
				Query q=factory.getQuery(clauseQuery);
				bq.add(new BooleanClause(q,occurs));
			}
			else
			{
				throw new ParserException("BooleanClause missing child query element");
			}
		}

		return bq;
	}

	private BooleanClause.Occur getOccursValue(Element clauseElem) throws ParserException
	{
		String occs=clauseElem.getAttribute("occurs");
		if("must".equalsIgnoreCase(occs))
		{
			return BooleanClause.Occur.MUST;
		}
		if("mustNot".equalsIgnoreCase(occs))
		{
			return BooleanClause.Occur.MUST_NOT;
		}
		if((occs==null)||("should".equalsIgnoreCase(occs))||("".equals(occs)))
		{
			return BooleanClause.Occur.SHOULD;
		}
		throw new ParserException("Invalid value for \"occurs\" attribute of clause:"+occs);
	}
}

@@ -0,0 +1,58 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.BoostingQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;

public class BoostingQueryBuilder implements QueryBuilder
{
	private QueryBuilder factory;
	float defaultBoost=0.01f;

	public BoostingQueryBuilder (QueryBuilder factory)
	{
		this.factory=factory;
	}

	public Query getQuery(Element e) throws ParserException
	{
		Element mainQueryElem=DOMUtils.getChildByTagName(e,"Query");
		if(mainQueryElem==null)
		{
			throw new ParserException("BoostingQuery missing a \"Query\" child element");
		}
		mainQueryElem=DOMUtils.getFirstChildElement(mainQueryElem);
		if(mainQueryElem==null)
		{
			throw new ParserException("BoostingQuery \"Query\" element missing a child element");
		}
		Query mainQuery=factory.getQuery(mainQueryElem);

		Element boostQueryElem=DOMUtils.getChildByTagName(e,"BoostQuery");
		if(boostQueryElem==null)
		{
			throw new ParserException("BoostingQuery missing a \"BoostQuery\" child element");
		}
		//read the boost before descending to the child query element -
		//the attribute lives on the BoostQuery element itself
		float boost=DOMUtils.getAttribute(boostQueryElem,"boost",defaultBoost);
		boostQueryElem=DOMUtils.getFirstChildElement(boostQueryElem);
		if(boostQueryElem==null)
		{
			throw new ParserException("BoostingQuery \"BoostQuery\" element missing a child element");
		}
		Query boostQuery=factory.getQuery(boostQueryElem);

		BoostingQuery bq = new BoostingQuery(mainQuery,boostQuery,boost);
		bq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return bq;
	}
}

@@ -0,0 +1,32 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilderFactory;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;

public class ConstantScoreQueryBuilder implements QueryBuilder
{
	private FilterBuilderFactory filterFactory;

	public ConstantScoreQueryBuilder(FilterBuilderFactory filterFactory)
	{
		this.filterFactory=filterFactory;
	}

	public Query getQuery(Element e) throws ParserException
	{
		Element filterElem=DOMUtils.getFirstChildElement(e);
		if(filterElem==null)
		{
			throw new ParserException("ConstantScoreQuery missing child element with filter");
		}
		Query q=new ConstantScoreQuery(filterFactory.getFilter(filterElem));
		q.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return q;
	}
}

@@ -0,0 +1,47 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.FuzzyLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

public class FuzzyLikeThisQueryBuilder implements QueryBuilder
{
	int defaultMaxNumTerms=50;
	float defaultMinSimilarity=0.5f;
	int defaultPrefixLength=1;
	boolean defaultIgnoreTF=false;
	private Analyzer analyzer;

	public FuzzyLikeThisQueryBuilder(Analyzer analyzer)
	{
		this.analyzer=analyzer;
	}

	public Query getQuery(Element e) throws ParserException
	{
		NodeList nl = e.getElementsByTagName("Field");
		int maxNumTerms=DOMUtils.getAttribute(e,"maxNumTerms",defaultMaxNumTerms);
		FuzzyLikeThisQuery fbq=new FuzzyLikeThisQuery(maxNumTerms,analyzer);
		fbq.setIgnoreTF(DOMUtils.getAttribute(e,"ignoreTF",defaultIgnoreTF));
		for(int i=0;i<nl.getLength();i++)
		{
			Element fieldElem=(Element) nl.item(i);
			float minSimilarity=DOMUtils.getAttribute(fieldElem,"minSimilarity",defaultMinSimilarity);
			int prefixLength=DOMUtils.getAttribute(fieldElem,"prefixLength",defaultPrefixLength);
			String fieldName=DOMUtils.getAttributeWithInheritance(fieldElem,"fieldName");

			String value=DOMUtils.getText(fieldElem);
			fbq.addTerms(value,fieldName,minSimilarity,prefixLength);
		}
		fbq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));

		return fbq;
	}
}

@@ -0,0 +1,58 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class LikeThisQueryBuilder implements QueryBuilder {

	private Analyzer analyzer;
	String defaultFieldNames [];
	int defaultMaxQueryTerms=20;
	int defaultMinTermFrequency=1;
	float defaultPercentTermsToMatch=30; //by default, 30% of the selected terms must match

	public LikeThisQueryBuilder(Analyzer analyzer,String [] defaultFieldNames)
	{
		this.analyzer=analyzer;
		this.defaultFieldNames=defaultFieldNames;
	}

	/* (non-Javadoc)
	 * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
	 */
	public Query getQuery(Element e) throws ParserException {
		String fieldsList=e.getAttribute("fieldNames"); //a comma-delimited list of fields
		String fields[]=defaultFieldNames;
		if((fieldsList!=null)&&(fieldsList.trim().length()>0))
		{
			fields=fieldsList.trim().split(",");
			//trim the fieldnames
			for (int i = 0; i < fields.length; i++) {
				fields[i]=fields[i].trim();
			}
		}
		MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer);
		mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms));
		mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency));
		mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);

		mlt.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));

		return mlt;
	}
}

@@ -0,0 +1,32 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.Filter;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilder;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class RangeFilterBuilder implements FilterBuilder {

	public Filter getFilter(Element e) throws ParserException {
		String fieldName=DOMUtils.getAttributeWithInheritance(e,"fieldName");

		String lowerTerm=e.getAttribute("lowerTerm");
		String upperTerm=e.getAttribute("upperTerm");
		boolean includeLower=DOMUtils.getAttribute(e,"includeLower",true);
		boolean includeUpper=DOMUtils.getAttribute(e,"includeUpper",true);
		return new RangeFilter(fieldName,lowerTerm,upperTerm,includeLower,includeUpper);
	}
}

@@ -0,0 +1,14 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

public abstract class SpanBuilderBase implements SpanQueryBuilder
{
	public Query getQuery(Element e) throws ParserException
	{
		return getSpanQuery(e);
	}
}

@@ -0,0 +1,31 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.spans.SpanFirstQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

public class SpanFirstBuilder extends SpanBuilderBase
{
	SpanQueryBuilder factory;

	public SpanFirstBuilder(SpanQueryBuilder factory)
	{
		super();
		this.factory = factory;
	}

	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		int end=DOMUtils.getAttribute(e,"end",1);
		Element child=DOMUtils.getFirstChildElement(e);
		SpanQuery q=factory.getSpanQuery(child);

		SpanFirstQuery sfq = new SpanFirstQuery(q,end);

		sfq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return sfq;
	}
}

@@ -0,0 +1,42 @@
package org.apache.lucene.xmlparser.builders;

import java.util.ArrayList;

import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

public class SpanNearBuilder extends SpanBuilderBase
{
	SpanQueryBuilder factory;

	public SpanNearBuilder(SpanQueryBuilder factory)
	{
		this.factory=factory;
	}

	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		String slopString=e.getAttribute("slop");
		if((slopString==null)||(slopString.length()==0))
		{
			throw new ParserException("SpanNearQuery missing slop property");
		}
		int slop=Integer.parseInt(slopString);
		boolean inOrder=DOMUtils.getAttribute(e,"inOrder",false);
		ArrayList spans=new ArrayList();
		for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
		{
			if (kid.getNodeType() == Node.ELEMENT_NODE)
			{
				spans.add(factory.getSpanQuery((Element) kid));
			}
		}
		SpanQuery[] spanQueries=(SpanQuery[]) spans.toArray(new SpanQuery[spans.size()]);
		return new SpanNearQuery(spanQueries,slop,inOrder);
	}
}

@@ -0,0 +1,51 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

public class SpanNotBuilder extends SpanBuilderBase
{
	SpanQueryBuilder factory;

	/**
	 * @param factory
	 */
	public SpanNotBuilder(SpanQueryBuilder factory)
	{
		super();
		this.factory = factory;
	}

	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		Element includeElem=DOMUtils.getChildByTagName(e,"Include");
		if(includeElem!=null)
		{
			includeElem=DOMUtils.getFirstChildElement(includeElem);
		}
		if(includeElem==null)
		{
			throw new ParserException("SpanNotQuery missing Include child Element");
		}
		Element excludeElem=DOMUtils.getChildByTagName(e,"Exclude");
		if(excludeElem!=null)
		{
			excludeElem=DOMUtils.getFirstChildElement(excludeElem);
		}
		if(excludeElem==null)
		{
			throw new ParserException("SpanNotQuery missing Exclude child Element");
		}
		SpanQuery include=factory.getSpanQuery(includeElem);
		SpanQuery exclude=factory.getSpanQuery(excludeElem);

		SpanNotQuery snq = new SpanNotQuery(include,exclude);

		snq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return snq;
	}
}

@@ -0,0 +1,40 @@
package org.apache.lucene.xmlparser.builders;

import java.util.ArrayList;

import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

public class SpanOrBuilder extends SpanBuilderBase
{
	SpanQueryBuilder factory;

	public SpanOrBuilder(SpanQueryBuilder factory)
	{
		super();
		this.factory = factory;
	}

	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		ArrayList clausesList=new ArrayList();
		for (Node kid = e.getFirstChild(); kid != null; kid = kid.getNextSibling())
		{
			if (kid.getNodeType() == Node.ELEMENT_NODE)
			{
				SpanQuery clause=factory.getSpanQuery((Element) kid);
				clausesList.add(clause);
			}
		}
		SpanQuery[] clauses=(SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]);
		SpanOrQuery soq = new SpanOrQuery(clauses);
		soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return soq;
	}
}

@@ -0,0 +1,62 @@
package org.apache.lucene.xmlparser.builders;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

public class SpanOrTermsBuilder extends SpanBuilderBase
{
	Analyzer analyzer;

	/**
	 * @param analyzer
	 */
	public SpanOrTermsBuilder(Analyzer analyzer)
	{
		super();
		this.analyzer = analyzer;
	}

	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		String fieldName=DOMUtils.getAttributeWithInheritance(e,"fieldName");
		if(fieldName==null)
		{
			throw new ParserException("Error: SpanOrTermsBuilder missing \"fieldName\" property");
		}

		String value=DOMUtils.getText(e);

		try
		{
			ArrayList clausesList=new ArrayList();
			TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
			Token token=ts.next();
			while(token!=null)
			{
				SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.termText()));
				clausesList.add(stq);
				token=ts.next();
			}
			SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
			soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
			return soq;
		}
		catch(IOException ioe)
		{
			throw new ParserException("IOException parsing value:"+value, ioe);
		}
	}
}

@@ -0,0 +1,18 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public interface SpanQueryBuilder extends QueryBuilder {

	public SpanQuery getSpanQuery(Element e) throws ParserException;

}

@@ -0,0 +1,34 @@
package org.apache.lucene.xmlparser.builders;

import java.util.HashMap;

import org.apache.lucene.search.Query;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class SpanQueryBuilderFactory implements SpanQueryBuilder {

	HashMap builders=new HashMap();

	public Query getQuery(Element e) throws ParserException {
		return getSpanQuery(e);
	}

	public void addBuilder(String nodeName,SpanQueryBuilder builder)
	{
		builders.put(nodeName,builder);
	}

	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		SpanQueryBuilder builder=(SpanQueryBuilder) builders.get(e.getNodeName());
		if(builder==null)
		{
			throw new ParserException("No SpanQueryObjectBuilder defined for node "+e.getNodeName());
		}
		return builder.getSpanQuery(e);
	}
}

@@ -0,0 +1,31 @@
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;

public class SpanTermBuilder extends SpanBuilderBase
{
	public SpanQuery getSpanQuery(Element e) throws ParserException
	{
		String fieldName=DOMUtils.getAttributeWithInheritance(e,"fieldName");
		String value=DOMUtils.getText(e);
		if((fieldName==null)||(fieldName.length()==0))
		{
			throw new ParserException("SpanTermQuery missing fieldName property");
		}
		if((value==null)||(value.length()==0))
		{
			throw new ParserException("SpanTermQuery missing value property");
		}
		SpanTermQuery stq = new SpanTermQuery(new Term(fieldName,value));

		stq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return stq;
	}
}

@@ -0,0 +1,37 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class TermQueryBuilder implements QueryBuilder {

	public Query getQuery(Element e) throws ParserException {
		String field=DOMUtils.getAttributeWithInheritance(e,"fieldName");
		String value=DOMUtils.getText(e);
		if((field==null)||(field.length()==0))
		{
			throw new ParserException("TermQuery element missing fieldName attribute");
		}
		if((value==null)||(value.length()==0))
		{
			throw new ParserException("TermQuery element missing child text property");
		}
		TermQuery tq = new TermQuery(new Term(field,value));

		tq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
		return tq;
	}
}

@@ -0,0 +1,79 @@
package org.apache.lucene.xmlparser.builders;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.TermsFilter;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.FilterBuilder;
import org.apache.lucene.xmlparser.ParserException;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * @author maharwood
 */
public class TermsFilterBuilder implements FilterBuilder
{
	Analyzer analyzer;

	/**
	 * @param analyzer
	 */
	public TermsFilterBuilder(Analyzer analyzer)
	{
		this.analyzer = analyzer;
	}

	/* (non-Javadoc)
	 * @see org.apache.lucene.xmlparser.FilterBuilder#process(org.w3c.dom.Element)
	 */
	public Filter getFilter(Element e) throws ParserException
	{
		TermsFilter tf=new TermsFilter();
		NodeList nl = e.getElementsByTagName("Field");
		for(int i=0;i<nl.getLength();i++)
		{
			Element fieldElem=(Element) nl.item(i);
			String fieldName=DOMUtils.getAttributeWithInheritance(fieldElem,"fieldName");

			if(fieldName==null)
			{
				throw new ParserException("TermsFilter missing \"fieldName\" element");
			}
			String text=DOMUtils.getText(fieldElem).trim();
			TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
			try
			{
				Token token=ts.next();
				Term term=null;
				while(token!=null)
				{
					if(term==null)
					{
						term=new Term(fieldName,token.termText());
					}
					else
					{
						term=term.createTerm(token.termText()); //create from previous to save fieldName.intern overhead
					}
					tf.addTerm(term);
					token=ts.next();
				}
			}
			catch(IOException ioe)
			{
				throw new ParserException("Error constructing terms from field text:"+ioe, ioe);
			}
		}
		return tf;
	}
}

@@ -0,0 +1,44 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser.builders;

import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;
import org.apache.lucene.xmlparser.QueryBuilder;
import org.w3c.dom.Element;

/**
 * @author maharwood
 */
public class UserInputQueryBuilder implements QueryBuilder {

	QueryParser parser;

	/**
	 * @param parser
	 */
	public UserInputQueryBuilder(QueryParser parser) {
		this.parser = parser;
	}

	/* (non-Javadoc)
	 * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
	 */
	public Query getQuery(Element e) throws ParserException {
		String text=DOMUtils.getText(e);
		try {
			Query q = parser.parse(text);
			q.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
			return q;
		} catch (ParseException e1) {
			throw new ParserException(e1.getMessage(), e1);
		}
	}
}

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<BooleanQuery fieldName="contents">
    <Clause occurs="should">
        <TermQuery>merger</TermQuery>
    </Clause>
    <Clause occurs="mustnot">
        <TermQuery>sumitomo</TermQuery>
    </Clause>
    <Clause occurs="must">
        <TermQuery>bank</TermQuery>
    </Clause>
</BooleanQuery>

@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<BoostingQuery>
    <!-- Find docs about banks, preferably merger info and preferably not "World bank" -->
    <Query>
        <BooleanQuery fieldName="contents">
            <Clause occurs="should">
                <TermQuery>merger</TermQuery>
            </Clause>
            <Clause occurs="must">
                <TermQuery>bank</TermQuery>
            </Clause>
        </BooleanQuery>
    </Query>
    <BoostQuery boost="0.01">
        <UserQuery>"world bank"</UserQuery>
    </BoostQuery>
</BoostingQuery>

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<ConstantScoreQuery>
    <RangeFilter fieldName="date" lowerTerm="19870409" upperTerm="19870412"/>
</ConstantScoreQuery>

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<FuzzyLikeThisQuery>
    <!-- Matches on misspelt "Sumitomo" bank -->
    <Field fieldName="contents">
        Sumitimo bank
    </Field>
</FuzzyLikeThisQuery>

@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<LikeThisQuery percentTermsToMatch="5">
IRAQI TROOPS REPORTED PUSHING BACK IRANIANS Iraq said today its troops were pushing Iranian forces out of
positions they had initially occupied when they launched a new offensive near the southern port of
Basra early yesterday. A High Command communique said Iraqi troops had won a significant victory
and were continuing to advance. Iraq said it had foiled a three-pronged thrust some 10 km
(six miles) from Basra, but admitted the Iranians had occupied ground held by the Mohammed al-Qassem
unit, one of three divisions attacked. The communique said Iranian Revolutionary Guards were under
assault from warplanes, helicopter gunships, heavy artillery and tanks. "Our forces are continuing
their advance until they purge the last foothold" occupied by the Iranians, it said.
(Iran said its troops had killed or wounded more than 4,000 Iraqis and were stabilising their new positions.)
The Baghdad communique said Iraqi planes also destroyed oil installations at Iran's southwestern Ahvaz field
during a raid today. It denied an Iranian report that an Iraqi jet was shot down.
Iraq also reported a naval battle at the northern tip of the Gulf. Iraqi naval units and forces defending an
offshore terminal sank six Iranian out of 28 Iranian boats attempting to attack an offshore terminal,
the communique said. Reuter 3;
</LikeThisQuery>

@@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
    <Query>
        <BooleanQuery fieldName="contents">
            <Clause occurs="should">
                <TermQuery>merger</TermQuery>
            </Clause>
            <Clause occurs="mustnot">
                <TermQuery>sumitomo</TermQuery>
            </Clause>
            <Clause occurs="must">
                <TermQuery>bank</TermQuery>
            </Clause>
        </BooleanQuery>
    </Query>

    <Filter>
        <RangeFilter fieldName="date" lowerTerm="19870409" upperTerm="19870412"/>
    </Filter>

</FilteredQuery>

@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<SpanOr fieldName="contents">
    <SpanNear slop="8" inOrder="false">
        <SpanOr>
            <SpanTerm>killed</SpanTerm>
            <SpanTerm>died</SpanTerm>
            <SpanTerm>dead</SpanTerm>
        </SpanOr>
        <SpanOr>
            <!-- a less verbose way of making SpanTerm declarations - these are analyzed
                into a series of Tokens which are added as SpanTerm elements of a SpanOr
            -->
            <SpanOrTerms>miner miners</SpanOrTerms>
            <!-- finds mine near worker or workers -->
            <SpanNear slop="6" inOrder="false">
                <SpanTerm>mine</SpanTerm>
                <SpanOrTerms>worker workers</SpanOrTerms>
            </SpanNear>
        </SpanOr>
    </SpanNear>
    <SpanFirst end="10">
        <SpanOrTerms>fire burn</SpanOrTerms>
    </SpanFirst>
    <!-- Other Span examples....

    <SpanNot>
        <Include>
            <SpanNear slop="2" inOrder="true">
                <SpanTerm>social</SpanTerm>
                <SpanTerm>services</SpanTerm>
            </SpanNear>
        </Include>
        <Exclude>
            <SpanTerm>public</SpanTerm>
        </Exclude>
    </SpanNot>
    -->
</SpanOr>

@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<TermQuery fieldName="contents">sumitomo</TermQuery>

@@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
    <Query>
        <BooleanQuery fieldName="contents">
            <Clause occurs="should">
                <TermQuery>merger</TermQuery>
            </Clause>
            <Clause occurs="must">
                <TermQuery>bank</TermQuery>
            </Clause>
        </BooleanQuery>
    </Query>
    <Filter>
        <!-- TermsFilter uses an analyzer to tokenize Field text and creates a filter for docs which
            have ANY of the supplied terms. Unlike a RangeFilter this can be used for filtering on
            multiple terms that are not necessarily in a sequence. An example might be a list of primary
            keys from a database query result or perhaps a choice of "category" labels picked by the end
            user.

            As a filter, this is much faster than the equivalent query (a BooleanQuery with many
            "should" TermQueries).

            This example might be just a list of Saturdays, i.e. not a contiguous range of values,
            which could be handled by a RangeFilter.
        -->
        <TermsFilter>
            <Field fieldName="date">19870601 19870608 19870615</Field>
        </TermsFilter>
    </Filter>

</FilteredQuery>

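The comment above also describes what TermsFilter does when built directly in Java; a hypothetical sketch of the programmatic equivalent of the filter in this file (the date values are taken from the example itself, and someQuery stands in for any query built elsewhere):

    // Hypothetical sketch - the programmatic equivalent of the TermsFilter above.
    TermsFilter dates = new TermsFilter();
    dates.addTerm(new Term("date", "19870601"));
    dates.addTerm(new Term("date", "19870608"));
    dates.addTerm(new Term("date", "19870615"));
    Query filtered = new FilteredQuery(someQuery, dates);
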
@@ -0,0 +1,166 @@
/*
 * Created on 25-Jan-2006
 */
package org.apache.lucene.xmlparser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

/**
 * @author maharwood
 */
public class TestParser extends TestCase {

	CoreParser builder;
	static Directory dir;
	Analyzer analyzer=new StandardAnalyzer();
	IndexReader reader;
	private IndexSearcher searcher;

	//CHANGE THIS TO SEE OUTPUT
	boolean printResults=false;

	/*
	 * @see TestCase#setUp()
	 */
	protected void setUp() throws Exception {
		super.setUp();

		//initialize the parser
		builder=new CorePlusExtensionsParser(analyzer,new QueryParser("contents", analyzer));

		//initialize the index (done once, then cached in static data for use with ALL tests)
		if(dir==null)
		{
			BufferedReader d = new BufferedReader(new InputStreamReader(TestParser.class.getResourceAsStream("reuters21578.txt")));
			dir=new RAMDirectory();
			IndexWriter writer=new IndexWriter(dir,analyzer,true);
			String line = d.readLine();
			while(line!=null)
			{
				int endOfDate=line.indexOf('\t');
				String date=line.substring(0,endOfDate).trim();
				String content=line.substring(endOfDate).trim();
				org.apache.lucene.document.Document doc =new org.apache.lucene.document.Document();
				doc.add(new Field("date",date,Field.Store.YES,Field.Index.TOKENIZED));
				doc.add(new Field("contents",content,Field.Store.YES,Field.Index.TOKENIZED));
				writer.addDocument(doc);
				line=d.readLine();
			}
			d.close();
			//close the writer so buffered documents are flushed to the directory
			writer.close();
		}
		reader=IndexReader.open(dir);
		searcher=new IndexSearcher(reader);
	}

	protected void tearDown() throws Exception {
		reader.close();
		searcher.close();
		// dir.close();
	}

	public void testSimpleXML() throws ParserException, IOException
	{
		Query q=parse("TermQuery.xml");
		dumpResults("TermQuery", q, 5);
	}

	public void testBooleanQueryXML() throws ParserException, IOException
	{
		Query q=parse("BooleanQuery.xml");
		dumpResults("BooleanQuery", q, 5);
	}

	public void testRangeFilterQueryXML() throws ParserException, IOException
	{
		Query q=parse("RangeFilterQuery.xml");
		dumpResults("RangeFilter", q, 5);
	}

	public void testUserQueryXML() throws ParserException, IOException
	{
		Query q=parse("UserInputQuery.xml");
		dumpResults("UserInput with Filter", q, 5);
	}

	public void testLikeThisQueryXML() throws Exception
	{
		Query q=parse("LikeThisQuery.xml");
		dumpResults("like this", q, 5);
	}

	public void testBoostingQueryXML() throws Exception
	{
		Query q=parse("BoostingQuery.xml");
		dumpResults("boosting", q, 5);
	}

	public void testFuzzyLikeThisQueryXML() throws Exception
	{
		Query q=parse("FuzzyLikeThisQuery.xml");
		//show rewritten fuzzyLikeThisQuery - see what is being matched on
		if(printResults)
		{
			System.out.println(q.rewrite(reader));
		}
		dumpResults("FuzzyLikeThis", q, 5);
	}

	public void testTermsFilterXML() throws Exception
	{
		Query q=parse("TermsFilterQuery.xml");
		dumpResults("Terms Filter", q, 5);
	}

	public void testSpanTermXML() throws Exception
	{
		Query q=parse("SpanQuery.xml");
		dumpResults("Span Query", q, 5);
	}

	public void testConstantScoreQueryXML() throws Exception
	{
		Query q=parse("ConstantScoreQuery.xml");
		dumpResults("ConstantScoreQuery", q, 5);
	}

	//================= Helper methods ===================================
	private Query parse(String xmlFileName) throws ParserException, IOException
	{
		InputStream xmlStream=TestParser.class.getResourceAsStream(xmlFileName);
		Query result=builder.parse(xmlStream);
		xmlStream.close();
		return result;
	}

	private void dumpResults(String qType, Query q, int numDocs) throws IOException
	{
		Hits h = searcher.search(q);
		assertTrue(qType + " should produce results", h.length()>0);
		if(printResults)
		{
			System.out.println("========="+qType+"============");
			for(int i=0;i<Math.min(numDocs,h.length());i++)
			{
				org.apache.lucene.document.Document ldoc=h.doc(i);
				System.out.println("["+ldoc.get("date")+"]"+ldoc.get("contents"));
			}
			System.out.println();
		}
	}
}

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<FilteredQuery>
    <Query>
        <UserQuery>"Bank of England"</UserQuery>
    </Query>
    <Filter>
        <RangeFilter fieldName="date" lowerTerm="19870409" upperTerm="19870412"/>
    </Filter>

</FilteredQuery>

File diff suppressed because one or more lines are too long