From 33345f7af96337c7df1936db17264554fbd1a3fd Mon Sep 17 00:00:00 2001 From: Erik Hatcher Date: Fri, 9 Apr 2004 00:34:31 +0000 Subject: [PATCH] Initial commit of Mark Harwood's Highlighter package git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150972 13f79535-47bb-0310-9956-ffa450edef68 --- sandbox/contributions/highlighter/build.xml | 10 + .../lucene/search/highlight/Formatter.java | 38 ++ .../lucene/search/highlight/Fragmenter.java | 40 ++ .../lucene/search/highlight/Highlighter.java | 430 +++++++++++++++++ .../search/highlight/HighlighterTest.java | 442 ++++++++++++++++++ .../lucene/search/highlight/QueryScorer.java | 113 +++++ .../search/highlight/QueryTermExtractor.java | 115 +++++ .../lucene/search/highlight/Scorer.java | 48 ++ .../search/highlight/SimpleFragmenter.java | 84 ++++ .../search/highlight/SimpleHTMLFormatter.java | 57 +++ .../lucene/search/highlight/TextFragment.java | 70 +++ .../lucene/search/highlight/WeightedTerm.java | 64 +++ .../lucene/search/highlight/package.html | 28 ++ 13 files changed, 1539 insertions(+) create mode 100644 sandbox/contributions/highlighter/build.xml create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/HighlighterTest.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java create mode 100644 
sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java create mode 100644 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java create mode 100755 sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/package.html diff --git a/sandbox/contributions/highlighter/build.xml b/sandbox/contributions/highlighter/build.xml new file mode 100644 index 00000000000..11e95afddb3 --- /dev/null +++ b/sandbox/contributions/highlighter/build.xml @@ -0,0 +1,10 @@ + + + + + + Hits highlighter + + + + diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java new file mode 100644 index 00000000000..dd9da353a87 --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java @@ -0,0 +1,38 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +/** + * Processes terms found in the original text, typically by applying some form + * of mark-up to highlight terms in HTML search results pages. + * + */ +public interface Formatter +{ + /** + * Highlights a search term. For example, an HTML Formatter could simply do: + * + *

 return "<b>" + originalTermText + "</b>";
+ * + * @param originalTermText (unstemmed) term text to highlight + * @param stemmedTerm the stemmed form of the originalTermText + * @param startOffset the position of the originalTermText in the text being highlighted + * + * @return highlighted term text + */ + String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset); +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java new file mode 100644 index 00000000000..a1bd01c4037 --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java @@ -0,0 +1,40 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; + +/** + * Implements the policy for breaking text into multiple fragments for consideration + * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis + * of detecting end of sentences in the text. 
+ * @author mark@searcharea.co.uk + */ +public interface Fragmenter +{ + /** + * Initializes the Fragmenter + * @param originalText + */ + public void start(String originalText); + + /** + * Test to see if this token from the stream should be held in a new TextFragment + * @param token + * @return + */ + public boolean isNewFragment(Token nextToken); +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java new file mode 100644 index 00000000000..2058ec2293d --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -0,0 +1,430 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.util.PriorityQueue; + +/** + * Class used to markup highlighted terms found in the best sections of a + * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter} + * and tokenizers. 
+ * @author mark@searcharea.co.uk + */ +public class Highlighter +{ + + public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; + private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; + private Formatter formatter; + private Fragmenter textFragmenter=new SimpleFragmenter(); + private Scorer fragmentScorer=null; + + public Highlighter(Scorer fragmentScorer) + { + this(new SimpleHTMLFormatter(),fragmentScorer); + } + + + public Highlighter(Formatter formatter, Scorer fragmentScorer) + { + this.formatter = formatter; + this.fragmentScorer = fragmentScorer; + } + + + + + /** + * Highlights chosen terms in a text, extracting the most relevant section. + * The document text is analysed in chunks to record hit statistics + * across the document. After accumulating stats, the fragment with the highest score + * is returned + * + * @param tokenStream a stream of tokens identified in the text parameter, including offset information. + * This is typically produced by an analyzer re-parsing a document's + * text. Some work may be done on retrieving TokenStreams more efficently + * by adding support for storing original text position data in the Lucene + * index but this support is not currently available (as of Lucene 1.4 rc2). + * @param text text to highlight terms in + * + * @return highlighted text fragment or null if no terms found + */ + public final String getBestFragment(TokenStream tokenStream, String text) + throws IOException + { + String[] results = getBestFragments(tokenStream,text, 1); + if (results.length > 0) + { + return results[0]; + } + return null; + } + /** + * Highlights chosen terms in a text, extracting the most relevant sections. + * The document text is analysed in chunks to record hit statistics + * across the document. 
After accumulating stats, the fragments with the highest scores + * are returned as an array of strings in order of score (contiguous fragments are merged into + * one in their original order to improve readability) + * + * @param text text to highlight terms in + * @param maxNumFragments the maximum number of fragments. + * + * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) + */ + public final String[] getBestFragments( + TokenStream tokenStream, + String text, + int maxNumFragments) + throws IOException + { + maxNumFragments = Math.max(1, maxNumFragments); //sanity check + StringBuffer newText = new StringBuffer(); + + TextFragment[] frag =getBestDocFragments(tokenStream,text, newText, maxNumFragments); + + mergeContiguousFragments(frag); + + //Get text + ArrayList fragTexts = new ArrayList(); + int n = 0; + for (int i = 0; i < frag.length; i++) + { + if ((frag[i] != null) && (frag[i].getScore() > 0)) + { + fragTexts.add( + newText.substring( + frag[i].textStartPos, + frag[i].textEndPos)); + } + } + return (String[]) fragTexts.toArray(new String[0]); + } + + /** + * Low level api to get the most relevant sections of the document + * @param tokenStream + * @param text + * @param maxNumFragments + * @return + * @throws IOException + */ + private final TextFragment[] getBestDocFragments( + TokenStream tokenStream, + String text, + StringBuffer newText, + int maxNumFragments) + throws IOException + { + ArrayList docFrags = new ArrayList(); + + TextFragment currentFrag = new TextFragment(newText.length(), docFrags.size()); + fragmentScorer.startFragment(currentFrag); + docFrags.add(currentFrag); + + FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); + + try + { + org.apache.lucene.analysis.Token token; + String tokenText; + int startOffset; + int endOffset; + int lastEndOffset = 0; + textFragmenter.start(text); + + while ((token = tokenStream.next()) != null) + { + + startOffset = token.startOffset(); + endOffset = 
token.endOffset(); + //FIXME an issue was reported with CJKTokenizer that I couldnt reproduce + // where the analyzer was producing overlapping tokens. + // I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1) + // but cant be sure so I'll just leave this comment in for now + tokenText = text.substring(startOffset, endOffset); + + + // append text between end of last token (or beginning of text) and start of current token + if (startOffset > lastEndOffset) + newText.append(text.substring(lastEndOffset, startOffset)); + + // does query contain current token? + float score=fragmentScorer.getTokenScore(token); + newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset)); + + + if(textFragmenter.isNewFragment(token)) + { + currentFrag.setScore(fragmentScorer.getFragmentScore()); + //record stats for a new fragment + currentFrag.textEndPos = newText.length(); + currentFrag =new TextFragment(newText.length(), docFrags.size()); + fragmentScorer.startFragment(currentFrag); + docFrags.add(currentFrag); + } + + lastEndOffset = endOffset; + if(lastEndOffset>maxDocBytesToAnalyze) + { + break; + } + } + currentFrag.setScore(fragmentScorer.getFragmentScore()); + + + // append text after end of last token + if (lastEndOffset < text.length()) + newText.append(text.substring(lastEndOffset)); + + currentFrag.textEndPos = newText.length(); + + //sort the most relevant sections of the text + int minScore = 0; + for (Iterator i = docFrags.iterator(); i.hasNext();) + { + currentFrag = (TextFragment) i.next(); + + //If you are running with a version of Lucene before 11th Sept 03 + // you do not have PriorityQueue.insert() - so uncomment the code below + /* + if (currentFrag.getScore() >= minScore) + { + fragQueue.put(currentFrag); + if (fragQueue.size() > maxNumFragments) + { // if hit queue overfull + fragQueue.pop(); // remove lowest in hit queue + minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore + } + + + 
} + */ + //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 + //fix to PriorityQueue. The correct method to use here is the new "insert" method + // USE ABOVE CODE IF THIS DOES NOT COMPILE! + fragQueue.insert(currentFrag); + } + + //return the most relevant fragments + TextFragment frag[] = new TextFragment[fragQueue.size()]; + for (int i = frag.length - 1; i >= 0; i--) + { + frag[i] = (TextFragment) fragQueue.pop(); + } + return frag; + + } + finally + { + if (tokenStream != null) + { + try + { + tokenStream.close(); + } + catch (Exception e) + { + } + } + } + } + + + /** Improves readability of a score-sorted list of TextFragments by merging any fragments + * that were contiguous in the original text into one larger fragment with the correct order. + * This will leave a "null" in the array entry for the lesser scored fragment. + * + * @param frag An array of document fragments in descending score + */ + private void mergeContiguousFragments(TextFragment[] frag) + { + boolean mergingStillBeingDone; + if (frag.length > 1) + do + { + mergingStillBeingDone = false; //initialise loop control flag + //for each fragment, scan other frags looking for contiguous blocks + for (int i = 0; i < frag.length; i++) + { + if (frag[i] == null) + { + continue; + } + //merge any contiguous blocks + for (int x = 0; x < frag.length; x++) + { + if (frag[x] == null) + { + continue; + } + if (frag[i] == null) + { + break; + } + TextFragment frag1 = null; + TextFragment frag2 = null; + int frag1Num = 0; + int frag2Num = 0; + int bestScoringFragNum; + int worstScoringFragNum; + //if blocks are contiguous.... + if (frag[i].follows(frag[x])) + { + frag1 = frag[x]; + frag1Num = x; + frag2 = frag[i]; + frag2Num = i; + } + else + if (frag[x].follows(frag[i])) + { + frag1 = frag[i]; + frag1Num = i; + frag2 = frag[x]; + frag2Num = x; + } + //merging required.. 
+ if (frag1 != null) + { + if (frag1.getScore() > frag2.getScore()) + { + bestScoringFragNum = frag1Num; + worstScoringFragNum = frag2Num; + } + else + { + bestScoringFragNum = frag2Num; + worstScoringFragNum = frag1Num; + } + frag1.merge(frag2); + frag[worstScoringFragNum] = null; + mergingStillBeingDone = true; + frag[bestScoringFragNum] = frag1; + } + } + } + } + while (mergingStillBeingDone); + } + + + /** + * Highlights terms in the text , extracting the most relevant sections + * and concatenating the chosen fragments with a separator (typically "..."). + * The document text is analysed in chunks to record hit statistics + * across the document. After accumulating stats, the fragments with the highest scores + * are returned in order as "separator" delimited strings. + * + * @param text text to highlight terms in + * @param maxNumFragments the maximum number of fragments. + * @param separator the separator used to intersperse the document fragments (typically "...") + * + * @return highlighted text + */ + public final String getBestFragments( + TokenStream tokenStream, + String text, + int maxNumFragments, + String separator) + throws IOException + { + String sections[] = getBestFragments(tokenStream,text, maxNumFragments); + StringBuffer result = new StringBuffer(); + for (int i = 0; i < sections.length; i++) + { + if (i > 0) + { + result.append(separator); + } + result.append(sections[i]); + } + return result.toString(); + } + + /** + * @return the maximum number of bytes to be tokenized per doc + */ + public int getMaxDocBytesToAnalyze() + { + return maxDocBytesToAnalyze; + } + + /** + * @param byteCount the maximum number of bytes to be tokenized per doc + * (This can improve performance with large documents) + */ + public void setMaxDocBytesToAnalyze(int byteCount) + { + maxDocBytesToAnalyze = byteCount; + } + + /** + * @return + */ + public Fragmenter getTextFragmenter() + { + return textFragmenter; + } + + /** + * @param fragmenter + */ + public void 
setTextFragmenter(Fragmenter fragmenter) + { + textFragmenter = fragmenter; + } + + /** + * @return Object used to score each text fragment + */ + public Scorer getFragmentScorer() + { + return fragmentScorer; + } + + + /** + * @param scorer + */ + public void setFragmentScorer(Scorer scorer) + { + fragmentScorer = scorer; + } + + +} +class FragmentQueue extends PriorityQueue +{ + public FragmentQueue(int size) + { + initialize(size); + } + + public final boolean lessThan(Object a, Object b) + { + TextFragment fragA = (TextFragment) a; + TextFragment fragB = (TextFragment) b; + if (fragA.getScore() == fragB.getScore()) + return fragA.fragNum > fragB.fragNum; + else + return fragA.getScore() < fragB.getScore(); + } +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/HighlighterTest.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/HighlighterTest.java new file mode 100644 index 00000000000..82b9fb7e82d --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/HighlighterTest.java @@ -0,0 +1,442 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +//import org.apache.lucene.analysis.cjk.CJKAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.store.RAMDirectory; + +/** + * JUnit Test for Highlighter class. + * @author mark@searcharea.co.uk + */ +public class HighlighterTest extends TestCase implements Formatter +{ + private IndexReader reader; + private static final String FIELD_NAME = "contents"; + private Query query; + RAMDirectory ramDir; + public Searcher searcher = null; + public Hits hits = null; + int numHighlights = 0; + Analyzer analyzer=new StandardAnalyzer(); + + String texts[] = + { + "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot", + "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy", + "JFK has been shot", + "John Kennedy has been shot", + "This text has a typo in referring to Keneddy" }; + + /** + * Constructor for HighlightExtractorTest. 
+ * @param arg0 + */ + public HighlighterTest(String arg0) + { + super(arg0); + } + + public void testSimpleHighlighter() throws Exception + { + doSearching("Kennedy"); + Highlighter highlighter = new Highlighter(new QueryScorer(query)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + int maxNumFragmentsRequired = 2; + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + + String result = + highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired, "..."); + System.out.println("\t" + result); + } + //Not sure we can assert anything here - just running to check we dont throw any exceptions + } + + + + public void testGetBestFragmentsSimpleQuery() throws Exception + { + doSearching("Kennedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + } + public void testGetFuzzyFragments() throws Exception + { + doSearching("Kinnedy~"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + } + + public void testGetWildCardFragments() throws Exception + { + doSearching("K?nnedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + } + public void testGetMidWildCardFragments() throws Exception + { + doSearching("K*dy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); + } + public void testGetRangeFragments() throws Exception + { + doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); + } + + public void testGetBestFragmentsPhrase() throws 
Exception + { + doSearching("\"John Kennedy\""); + doStandardHighlights(); + //Currently highlights "John" and "Kennedy" separately + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); + } + + public void testGetBestFragmentsMultiTerm() throws Exception + { + doSearching("John Kenn*"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); + } + public void testGetBestFragmentsWithOr() throws Exception + { + doSearching("JFK OR Kennedy"); + doStandardHighlights(); + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); + } + + + public void testGetBestSingleFragment() throws Exception + { + doSearching("Kennedy"); +// QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer()); + Highlighter highlighter =new Highlighter(this,new QueryScorer(query)); + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + String result = highlighter.getBestFragment(tokenStream,text); + System.out.println("\t" + result); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + } + + public void testGetBestSingleFragmentWithWeights() throws Exception + { + WeightedTerm[]wTerms=new WeightedTerm[2]; + wTerms[0]=new WeightedTerm(10f,"hello"); + wTerms[1]=new WeightedTerm(1f,"kennedy"); + Highlighter highlighter =new Highlighter(new QueryScorer(wTerms)); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + String result = highlighter.getBestFragment(tokenStream,texts[0]).trim(); + assertTrue("Failed to find best section using 
weighted terms. Found: "+result + , "Hello".equals(result)); + + //readjust weights + wTerms[1].setWeight(50f); + tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); + highlighter =new Highlighter(new QueryScorer(wTerms)); + highlighter.setTextFragmenter(new SimpleFragmenter(2)); + + result = highlighter.getBestFragment(tokenStream,texts[0]).trim(); + assertTrue("Failed to find best section using weighted terms. Found: "+result + , "kennedy".equals(result)); + } + + + + public void testGetSimpleHighlight() throws Exception + { + doSearching("Kennedy"); + Highlighter highlighter = + new Highlighter(this,new QueryScorer(query)); + + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + + String result = highlighter.getBestFragment(tokenStream,text); + System.out.println("\t" + result); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + } + + public void testMaxSizeHighlight() throws Exception + { + doSearching("meat"); + Highlighter highlighter = + new Highlighter(this,new QueryScorer(query)); + highlighter.setMaxDocBytesToAnalyze(30); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); + String result = highlighter.getBestFragment(tokenStream,texts[0]); + assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + + "us from finding matches for this record" + numHighlights + + " found", numHighlights == 0); + } + + + + public void testUnRewrittenQuery() throws IOException, ParseException + { + //test to show how rewritten query can still be used + searcher = new IndexSearcher(ramDir); + Analyzer analyzer=new StandardAnalyzer(); + Query query = QueryParser.parse("JF? or Kenned*", FIELD_NAME, analyzer); + System.out.println("Searching with primitive query"); + //forget to set this and... 
+ //query=query.rewrite(reader); + Hits hits = searcher.search(query); + + //create an instance of the highlighter with the tags used to surround highlighted text +// QueryHighlightExtractor highlighter = new QueryHighlightExtractor(this, query, new StandardAnalyzer()); + Highlighter highlighter = + new Highlighter(this,new QueryScorer(query)); + + highlighter.setTextFragmenter(new SimpleFragmenter(40)); + + int maxNumFragmentsRequired = 3; + + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + + String highlightedText = highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired,"..."); + System.out.println(highlightedText); + } + //We expect to have zero highlights if the query is multi-terms and is not rewritten! + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0); + } + + public void testNoFragments() throws Exception + { + doSearching("AnInvalidQueryWhichShouldYieldNoResults"); + Highlighter highlighter = + new Highlighter(this,new QueryScorer(query)); + + int highlightFragmentSizeInBytes = 40; + for (int i = 0; i < texts.length; i++) + { + String text = texts[i]; + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + + String result = highlighter.getBestFragment(tokenStream,text); + assertNull("The highlight result should be null for text with no query terms", result); + } + } + + public void testMultiSearcher() throws Exception + { + //setup index 1 + RAMDirectory ramDir1 = new RAMDirectory(); + IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true); + Document d = new Document(); + Field f = new Field(FIELD_NAME, "multiOne", true, true, true); + d.add(f); + writer1.addDocument(d); + writer1.optimize(); + writer1.close(); + IndexReader reader1 = IndexReader.open(ramDir1); + + //setup index 2 + RAMDirectory ramDir2 = new 
RAMDirectory(); + IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true); + d = new Document(); + f = new Field(FIELD_NAME, "multiTwo", true, true, true); + d.add(f); + writer2.addDocument(d); + writer2.optimize(); + writer2.close(); + IndexReader reader2 = IndexReader.open(ramDir2); + + + + IndexSearcher searchers[]=new IndexSearcher[2]; + searchers[0] = new IndexSearcher(ramDir1); + searchers[1] = new IndexSearcher(ramDir2); + MultiSearcher multiSearcher=new MultiSearcher(searchers); + query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); + System.out.println("Searching for: " + query.toString(FIELD_NAME)); + //at this point the multisearcher calls combine(query[]) + hits = multiSearcher.search(query); + + //query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer()); + Query expandedQueries[]=new Query[2]; + expandedQueries[0]=query.rewrite(reader1); + expandedQueries[1]=query.rewrite(reader2); + query=query.combine(expandedQueries); + + + //create an instance of the highlighter with the tags used to surround highlighted text + Highlighter highlighter = + new Highlighter(this,new QueryScorer(query)); + + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text)); + String highlightedText = highlighter.getBestFragment(tokenStream,text); + System.out.println(highlightedText); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2); + + + + } + +/* + + public void testBigramAnalyzer() throws IOException, ParseException + { + //test to ensure analyzers with none-consecutive start/end offsets + //dont double-highlight text + //setup index 1 + RAMDirectory ramDir = new RAMDirectory(); + Analyzer bigramAnalyzer=new CJKAnalyzer(); + IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true); + Document d = new Document(); + Field f = new 
Field(FIELD_NAME, "java abc def", true, true, true);
		d.add(f);
		writer.addDocument(d);
		writer.close();
		IndexReader reader = IndexReader.open(ramDir);

		IndexSearcher searcher=new IndexSearcher(reader);
		query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		hits = searcher.search(query);

		Highlighter highlighter =
			new Highlighter(this,new QueryFragmentScorer(query));

		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
			String highlightedText = highlighter.getBestFragment(tokenStream,text);
			System.out.println(highlightedText);
		}

	}
*/

	/**
	 * Formatter callback invoked by the Highlighter once per scored token.
	 * Increments numHighlights so tests can assert how many terms were marked up.
	 * NOTE(review): the markup around the returned text appears to have been
	 * stripped in this copy (likely "<B>"/"</B>") — as written the text is
	 * returned unwrapped; confirm against the original source.
	 */
	public String highlightTerm(String originalText , String weightedTerm, float score, int startOffset)
	{
		if(score<=0)
		{
			return originalText;
		}
		numHighlights++; //update stats used in assertions
		return "" + originalText + "";
	}

	/**
	 * Parses queryString, rewrites it against the index reader and runs the
	 * search, storing results in the searcher/query/hits fields shared by the
	 * individual test methods.
	 */
	public void doSearching(String queryString) throws Exception
	{
		searcher = new IndexSearcher(ramDir);
		query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
		//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
		query=query.rewrite(reader);
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		hits = searcher.search(query);
	}

	/**
	 * Runs the highlighter over every hit, printing at most 2 fragments of
	 * ~20 chars each, joined with "...".
	 */
	void doStandardHighlights() throws Exception
	{
		Highlighter highlighter =new Highlighter(this,new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(20));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			int maxNumFragmentsRequired = 2;
			String fragmentSeparator = "...";
			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));

			String result =
				highlighter.getBestFragments(
					tokenStream,
					text,
					maxNumFragmentsRequired,
					fragmentSeparator);
			System.out.println("\t" + result);
		}
	}

	/*
	 * Builds a fresh RAMDirectory index over the texts fixture and resets the
	 * highlight counter before each test.
	 * @see TestCase#setUp()
	 */
	protected void setUp() throws Exception
	{
		ramDir = new RAMDirectory();
		IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true);
		for (int i = 0; i < texts.length; i++)
		{
			addDoc(writer, texts[i]);
		}

		writer.optimize();
		writer.close();
		reader = IndexReader.open(ramDir);
		numHighlights = 0;
	}

	/** Adds one stored, indexed, tokenized document holding text. */
	private void addDoc(IndexWriter writer, String text) throws IOException
	{
		Document d = new Document();
		Field f = new Field(FIELD_NAME, text, true, true, true);
		d.add(f);
		writer.addDocument(d);

	}

	/*
	 * @see TestCase#tearDown()
	 */
	protected void tearDown() throws Exception
	{
		super.tearDown();
	}

}
// ===== new file (from patch): QueryScorer.java =====
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.HashSet; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.search.Query; + +/** + * {@link Scorer} implementation which scores text fragments by the number of unique query terms found. + * This class uses the {@link QueryTermExtractor} class to process determine the query terms and + * their boosts to be used. + * @author mark@searcharea.co.uk + */ +//TODO: provide option to roll idf into the scoring equation by passing a IndexReader. 
+//TODO: provide option to boost score of fragments near beginning of document +// based on fragment.getFragNum() +public class QueryScorer implements Scorer +{ + TextFragment currentTextFragment=null; + HashSet uniqueTermsInFragment; + float totalScore=0; + private HashMap termsToFind; + + + /** + * + * @param query a Lucene query (ideally rewritten using query.rewrite + * before being passed to this class and the searcher) + */ + public QueryScorer(Query query) + { + this(QueryTermExtractor.getTerms(query)); + } + + + public QueryScorer(WeightedTerm []weightedTerms ) + { + termsToFind = new HashMap(); + for (int i = 0; i < weightedTerms.length; i++) + { + termsToFind.put(weightedTerms[i].term,weightedTerms[i]); + } + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment) + */ + public void startFragment(TextFragment newFragment) + { + uniqueTermsInFragment = new HashSet(); + currentTextFragment=newFragment; + totalScore=0; + + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token) + */ + public float getTokenScore(Token token) + { + String termText=token.termText(); + + WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText); + if(queryTerm==null) + { + //not a query term - return + return 0; + } + //found a query term - is it unique in this doc? 
+ if(!uniqueTermsInFragment.contains(termText)) + { + totalScore+=queryTerm.getWeight(); + uniqueTermsInFragment.add(termText); + } + return queryTerm.getWeight(); + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment) + */ + public float getFragmentScore() + { + return totalScore; + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() + */ + public void allFragmentsProcessed() + { + //this class has no special operations to perform at end of processing + } + +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java new file mode 100644 index 00000000000..7f6168f71a2 --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java @@ -0,0 +1,115 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.HashSet; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +/** + * Utility class used to extract the terms used in a query, plus any weights. + * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes + * so the caller must pass a rewritten query (see query.rewrite) to obtain a list of + * expanded terms. + * + */ +public final class QueryTermExtractor +{ + + /** + * Extracts all terms texts of a given Query into an array of WeightedTerms + * + * @param query Query to extract term texts from + * @return an array of the terms used in a query, plus their weights. + * @throws IOException + */ + public static final WeightedTerm[] getTerms(Query query) + { + return getTerms(query,false); + } + + + /** + * Extracts all terms texts of a given Query into an array of WeightedTerms + * + * @param query Query to extract term texts from + * @param prohibited true to extract "prohibited" terms, too + * @return an array of the terms used in a query, plus their weights. 
+ * @throws IOException + */ + public static final WeightedTerm[] getTerms(Query query, boolean prohibited) + { + HashSet terms=new HashSet(); + getTerms(query,terms,prohibited); + return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]); + } + + private static final void getTerms(Query query, HashSet terms,boolean prohibited) + { + if (query instanceof BooleanQuery) + getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited); + else + if (query instanceof PhraseQuery) + getTermsFromPhraseQuery((PhraseQuery) query, terms); + else + if (query instanceof TermQuery) + getTermsFromTermQuery((TermQuery) query, terms); +// else +// if ((query instanceof PrefixQuery) +// || (query instanceof RangeQuery) +// || (query instanceof MultiTermQuery)) +// { +// //client should call rewrite BEFORE calling highlighter +// // Query expandedQuery = rewrite(reader, query); +// // getTerms(reader, expandedQuery, terms, prohibited); +// } + } + + private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited) + { + BooleanClause[] queryClauses = query.getClauses(); + int i; + + for (i = 0; i < queryClauses.length; i++) + { + if (prohibited || !queryClauses[i].prohibited) + getTerms(queryClauses[i].query, terms, prohibited); + } + } + + private static final void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms) + { + Term[] queryTerms = query.getTerms(); + int i; + + for (i = 0; i < queryTerms.length; i++) + { + terms.add(new WeightedTerm(query.getBoost(),queryTerms[i].text())); + } + } + + private static final void getTermsFromTermQuery(TermQuery query, HashSet terms) + { + terms.add(new WeightedTerm(query.getBoost(),query.getTerm().text())); + } + + +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java new file mode 100644 index 00000000000..548f34e5d44 --- /dev/null +++ 
// ===== new file (from patch): Scorer.java =====
package org.apache.lucene.search.highlight;
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Token;

/**
 * Adds to the score for a fragment based on its tokens.
 * Implementations are driven by the Highlighter: startFragment, then one
 * getTokenScore call per token, then getFragmentScore.
 * @author mark@searcharea.co.uk
 */
public interface Scorer
{
	/**
	 * Called when a new fragment is started for consideration.
	 * @param newFragment the fragment about to be tokenized and scored
	 */
	public void startFragment(TextFragment newFragment);

	/**
	 * Called for each token in the current fragment.
	 * @param token The token to be scored
	 * @return a score which is passed to the TermHighlighter class to influence the mark-up of the text
	 * (this return value is NOT used to score the fragment)
	 */
	public float getTokenScore(Token token);


	/**
	 * Called when the highlighter has no more tokens for the current fragment - the scorer will typically
	 * call setScore() on the fragment passed in startFragment to record total info.
	 * @return the total score for the fragment just completed
	 */
	public float getFragmentScore();

}
// ===== new file (from patch): SimpleFragmenter.java =====
b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java @@ -0,0 +1,84 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Token; + +/** + * {@link Fragmenter} implementation which breaks text up into same-size + * fragments with no concerns over spotting sentence boundaries. + * @author mark@searcharea.co.uk + */ +public class SimpleFragmenter implements Fragmenter +{ + private static final int DEFAULT_FRAGMENT_SIZE =100; + private int currentNumFrags; + private int fragmentSize; + + + public SimpleFragmenter() + { + this(DEFAULT_FRAGMENT_SIZE); + } + + + /** + * + * @param fragmentSize size in bytes of each fragment + */ + public SimpleFragmenter(int fragmentSize) + { + this.fragmentSize=fragmentSize; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) + */ + public void start(String originalText) + { + currentNumFrags=1; + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) + */ + public boolean isNewFragment(Token token) + { + boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags); + if(isNewFrag) + { + currentNumFrags++; + } + return isNewFrag; + } + + /** + * @return size in bytes of each fragment + */ + public int 
getFragmentSize() + { + return fragmentSize; + } + + /** + * @param size size in bytes of each fragment + */ + public void setFragmentSize(int size) + { + fragmentSize = size; + } + +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java new file mode 100644 index 00000000000..aa8a39ef13d --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java @@ -0,0 +1,57 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Simple {@link Formatter} implementation to highlight terms with a pre and post tag + * @author MAHarwood + * + */ +public class SimpleHTMLFormatter implements Formatter +{ + String preTag; + String postTag; + + public SimpleHTMLFormatter(String preTag, String postTag) + { + this.preTag = preTag; + this.postTag = postTag; + } + + /** + * Default constructor uses HTML: <B> tags to markup terms + * + **/ + public SimpleHTMLFormatter() + { + this.preTag = ""; + this.postTag = ""; + } + + public String highlightTerm(String originalText, String term, float score, int startOffset) + { + if(score<=0) + { + return originalText; + } + StringBuffer sb = new StringBuffer(); + sb.append(preTag); + sb.append(originalText); + sb.append(postTag); + return sb.toString(); + } + +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java new file mode 100644 index 00000000000..e264a56e560 --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java @@ -0,0 +1,70 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Low-level class used to record information about a section of a document + * with a score. 
+ * @author MAHarwood + * + * + */ +public class TextFragment +{ + int fragNum; + int textStartPos; + int textEndPos; + float score; + + public TextFragment(int textStartPos, int fragNum) + { + this.textStartPos = textStartPos; + this.fragNum = fragNum; + } + void setScore(float score) + { + this.score=score; + } + public float getScore() + { + return score; + } + /** + * @param frag2 Fragment to be merged into this one + */ + public void merge(TextFragment frag2) + { + textEndPos = frag2.textEndPos; + } + /** + * @param fragment + * @return true if this fragment follows the one passed + */ + public boolean follows(TextFragment fragment) + { + return textStartPos == fragment.textEndPos; + } + + /** + * @return the fragment sequence number + */ + public int getFragNum() + { + return fragNum; + } + +} diff --git a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java new file mode 100644 index 00000000000..eed4164ff24 --- /dev/null +++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java @@ -0,0 +1,64 @@ +package org.apache.lucene.search.highlight; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Lightweight pairing of a (stemmed) query term with the weight used when
 * scoring its occurrences.
 * @author Mark Harwood
 */
public class WeightedTerm
{
	float weight; // multiplier applied when this term is matched
	String term;  // stemmed form

	public WeightedTerm (float weight,String term)
	{
		this.weight=weight;
		this.term=term;
	}

	/**
	 * @return the term value (stemmed)
	 */
	public String getTerm()
	{
		return term;
	}

	/**
	 * @param term the term value (stemmed)
	 */
	public void setTerm(String term)
	{
		this.term = term;
	}

	/**
	 * @return the weight associated with this term
	 */
	public float getWeight()
	{
		return weight;
	}

	/**
	 * @param weight the weight associated with this term
	 */
	public void setWeight(float weight)
	{
		this.weight = weight;
	}
}
// ===== new file (from patch): package.html =====
// The highlight package contains classes to provide "keyword in context" features
// typically used to highlight search terms in the text of results pages.
+The Highlighter class is the central component and can be used to extract the
+most interesting sections of a piece of text and highlight them, with the help of
+Fragmenter, Scorer and Formatter classes.

Example Usage

+ +
+		IndexSearcher searcher = new IndexSearcher(ramDir);
+		Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
+		query=query.rewrite(reader); //required to expand search terms
+		Hits hits = searcher.search(query);
+
+		Highlighter highlighter =new Highlighter(this,new QueryScorer(query));
+		for (int i = 0; i < hits.length(); i++)
+		{
+			String text = hits.doc(i).get(FIELD_NAME);
+			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
+			// Get 3 best fragments and separate with a "..."
+			String result = highlighter.getBestFragments(tokenStream,text,3,"...");
+			System.out.println(result);
+		}
+
+ + + \ No newline at end of file