Initial commit of Mark Harwood's Highlighter package

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150972 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2004-04-09 00:34:31 +00:00
parent 45898ec436
commit 33345f7af9
13 changed files with 1539 additions and 0 deletions

View File

@ -0,0 +1,10 @@
<?xml version="1.0"?>
<!-- Ant build file for the "highlighter" contrib module.
     All real targets (including "default") are inherited from ../common.xml. -->
<project name="highlighter" default="default">
<description>
Hits highlighter
</description>
<import file="../common.xml"/>
</project>

View File

@ -0,0 +1,38 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Processes terms found in the original text, typically by applying some form
* of mark-up to highlight terms in HTML search results pages.
*
*/
public interface Formatter
{
	/**
	 * Highlights a search term. For example, an HTML Formatter could simply do:
	 *
	 * <p><dl><dt></dt><dd><code>return "&lt;b&gt;" + term + "&lt;/b&gt;";</code></dd></dl>
	 *
	 * @param originalTermText (unstemmed) term text to highlight
	 * @param stemmedTerm the stemmed form of the originalTermText
	 * @param score the score for this term supplied by the Scorer; implementations
	 *              typically leave the text unmarked when it is not positive
	 *              (NOTE(review): exact score semantics depend on the Scorer in use — confirm)
	 * @param startOffset the position of the originalTermText in the text being highlighted
	 *
	 * @return highlighted term text
	 */
	String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
}

View File

@ -0,0 +1,40 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
* Implements the policy for breaking text into multiple fragments for consideration
* by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
* of detecting end of sentences in the text.
* @author mark@searcharea.co.uk
*/
public interface Fragmenter
{
	/**
	 * Initializes the Fragmenter. Called once, with the full text, before any
	 * tokens are offered to {@link #isNewFragment(Token)}.
	 * @param originalText the complete original text being fragmented
	 */
	public void start(String originalText);

	/**
	 * Test to see if this token from the stream should be held in a new TextFragment
	 * @param nextToken the next token produced by the token stream
	 * @return <code>true</code> if a new fragment should begin at this token
	 */
	public boolean isNewFragment(Token nextToken);
}

View File

@ -0,0 +1,430 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
/**
* Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}
* and tokenizers.
* @author mark@searcharea.co.uk
*/
public class Highlighter
{
	/** Default cap on the number of bytes of a document that will be tokenized. */
	public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE = 50 * 1024;

	private int maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
	private Formatter formatter;
	private Fragmenter textFragmenter = new SimpleFragmenter();
	private Scorer fragmentScorer = null;

	/**
	 * Creates a highlighter which marks up hits using the default
	 * {@link SimpleHTMLFormatter}.
	 *
	 * @param fragmentScorer used to score each candidate text fragment
	 */
	public Highlighter(Scorer fragmentScorer)
	{
		this(new SimpleHTMLFormatter(), fragmentScorer);
	}

	/**
	 * @param formatter used to mark up each highlighted term occurrence
	 * @param fragmentScorer used to score each candidate text fragment
	 */
	public Highlighter(Formatter formatter, Scorer fragmentScorer)
	{
		this.formatter = formatter;
		this.fragmentScorer = fragmentScorer;
	}

	/**
	 * Highlights chosen terms in a text, extracting the most relevant section.
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragment with the highest score
	 * is returned.
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
	 *        This is typically produced by an analyzer re-parsing a document's
	 *        text. Some work may be done on retrieving TokenStreams more efficiently
	 *        by adding support for storing original text position data in the Lucene
	 *        index but this support is not currently available (as of Lucene 1.4 rc2).
	 * @param text text to highlight terms in
	 *
	 * @return highlighted text fragment or null if no terms found
	 * @throws IOException if the token stream cannot be read
	 */
	public final String getBestFragment(TokenStream tokenStream, String text)
		throws IOException
	{
		String[] results = getBestFragments(tokenStream, text, 1);
		if (results.length > 0)
		{
			return results[0];
		}
		return null;
	}

	/**
	 * Highlights chosen terms in a text, extracting the most relevant sections.
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragments with the highest scores
	 * are returned as an array of strings in order of score (contiguous fragments are merged into
	 * one in their original order to improve readability)
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter, including offset information
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments (values below 1 are treated as 1)
	 *
	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
	 * @throws IOException if the token stream cannot be read
	 */
	public final String[] getBestFragments(
		TokenStream tokenStream,
		String text,
		int maxNumFragments)
		throws IOException
	{
		maxNumFragments = Math.max(1, maxNumFragments); //sanity check

		StringBuffer newText = new StringBuffer();
		TextFragment[] frag = getBestDocFragments(tokenStream, text, newText, maxNumFragments);
		mergeContiguousFragments(frag);

		//collect the marked-up text of the surviving, positively-scored fragments
		ArrayList fragTexts = new ArrayList();
		for (int i = 0; i < frag.length; i++)
		{
			if ((frag[i] != null) && (frag[i].getScore() > 0))
			{
				fragTexts.add(
					newText.substring(
						frag[i].textStartPos,
						frag[i].textEndPos));
			}
		}
		return (String[]) fragTexts.toArray(new String[0]);
	}

	/**
	 * Low level api to get the most relevant (highest-scoring) sections of the document.
	 * Walks the token stream, appending a marked-up copy of the text to newText,
	 * breaking it into fragments via the configured {@link Fragmenter} and scoring
	 * each fragment via the configured {@link Scorer}.
	 *
	 * @param tokenStream token stream for the text; always closed before returning
	 * @param text the original document text
	 * @param newText buffer that receives the marked-up copy of the text
	 * @param maxNumFragments maximum number of fragments to return
	 * @return fragments in descending score order
	 * @throws IOException if the token stream cannot be read
	 */
	private final TextFragment[] getBestDocFragments(
		TokenStream tokenStream,
		String text,
		StringBuffer newText,
		int maxNumFragments)
		throws IOException
	{
		ArrayList docFrags = new ArrayList();
		TextFragment currentFrag = new TextFragment(newText.length(), docFrags.size());
		fragmentScorer.startFragment(currentFrag);
		docFrags.add(currentFrag);

		FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
		try
		{
			org.apache.lucene.analysis.Token token;
			String tokenText;
			int startOffset;
			int endOffset;
			int lastEndOffset = 0;
			textFragmenter.start(text);

			while ((token = tokenStream.next()) != null)
			{
				startOffset = token.startOffset();
				endOffset = token.endOffset();
				//FIXME an issue was reported with CJKTokenizer that I couldnt reproduce
				// where the analyzer was producing overlapping tokens.
				// I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
				// but cant be sure so I'll just leave this comment in for now
				tokenText = text.substring(startOffset, endOffset);

				// append text between end of last token (or beginning of text) and start of current token
				if (startOffset > lastEndOffset)
				{
					newText.append(text.substring(lastEndOffset, startOffset));
				}

				// mark up the token; the scorer decides whether it counts as a query hit
				float score = fragmentScorer.getTokenScore(token);
				newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));

				if (textFragmenter.isNewFragment(token))
				{
					currentFrag.setScore(fragmentScorer.getFragmentScore());
					//record stats for a new fragment
					currentFrag.textEndPos = newText.length();
					currentFrag = new TextFragment(newText.length(), docFrags.size());
					fragmentScorer.startFragment(currentFrag);
					docFrags.add(currentFrag);
				}

				lastEndOffset = endOffset;
				//stop analysing once the configured byte limit has been passed
				if (lastEndOffset > maxDocBytesToAnalyze)
				{
					break;
				}
			}
			currentFrag.setScore(fragmentScorer.getFragmentScore());

			// append text after end of last token
			if (lastEndOffset < text.length())
			{
				newText.append(text.substring(lastEndOffset));
			}
			currentFrag.textEndPos = newText.length();

			//rank the fragments. NOTE: PriorityQueue.insert() requires a Lucene build
			//from 11th Sept 03 or later (Christoph Goller's PriorityQueue fix).
			for (Iterator i = docFrags.iterator(); i.hasNext();)
			{
				currentFrag = (TextFragment) i.next();
				fragQueue.insert(currentFrag);
			}

			//return the most relevant fragments, highest score first
			TextFragment frag[] = new TextFragment[fragQueue.size()];
			for (int i = frag.length - 1; i >= 0; i--)
			{
				frag[i] = (TextFragment) fragQueue.pop();
			}
			return frag;
		}
		finally
		{
			if (tokenStream != null)
			{
				try
				{
					tokenStream.close();
				}
				catch (Exception ignored)
				{
					//best-effort close: a close failure must not mask the real result or exception
				}
			}
		}
	}

	/** Improves readability of a score-sorted list of TextFragments by merging any fragments
	 * that were contiguous in the original text into one larger fragment with the correct order.
	 * This will leave a "null" in the array entry for the lesser scored fragment.
	 *
	 * @param frag An array of document fragments in descending score
	 */
	private void mergeContiguousFragments(TextFragment[] frag)
	{
		boolean mergingStillBeingDone;
		if (frag.length > 1)
			do
			{
				mergingStillBeingDone = false; //initialise loop control flag
				//for each fragment, scan other frags looking for contiguous blocks
				for (int i = 0; i < frag.length; i++)
				{
					if (frag[i] == null)
					{
						continue;
					}
					//merge any contiguous blocks
					for (int x = 0; x < frag.length; x++)
					{
						if (frag[x] == null)
						{
							continue;
						}
						if (frag[i] == null)
						{
							//frag[i] may have been nulled by an earlier merge in this inner scan
							break;
						}
						TextFragment frag1 = null;
						TextFragment frag2 = null;
						int frag1Num = 0;
						int frag2Num = 0;
						int bestScoringFragNum;
						int worstScoringFragNum;
						//if blocks are contiguous....
						if (frag[i].follows(frag[x]))
						{
							frag1 = frag[x];
							frag1Num = x;
							frag2 = frag[i];
							frag2Num = i;
						}
						else if (frag[x].follows(frag[i]))
						{
							frag1 = frag[i];
							frag1Num = i;
							frag2 = frag[x];
							frag2Num = x;
						}
						//merging required..
						if (frag1 != null)
						{
							if (frag1.getScore() > frag2.getScore())
							{
								bestScoringFragNum = frag1Num;
								worstScoringFragNum = frag2Num;
							}
							else
							{
								bestScoringFragNum = frag2Num;
								worstScoringFragNum = frag1Num;
							}
							//merge into frag1 (the earlier fragment) and keep it under the
							//better-scoring slot so ranking order is preserved
							frag1.merge(frag2);
							frag[worstScoringFragNum] = null;
							mergingStillBeingDone = true;
							frag[bestScoringFragNum] = frag1;
						}
					}
				}
			}
			while (mergingStillBeingDone);
	}

	/**
	 * Highlights terms in the text , extracting the most relevant sections
	 * and concatenating the chosen fragments with a separator (typically "...").
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragments with the highest scores
	 * are returned in order as "separator" delimited strings.
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments.
	 * @param separator the separator used to intersperse the document fragments (typically "...")
	 *
	 * @return highlighted text
	 * @throws IOException if the token stream cannot be read
	 */
	public final String getBestFragments(
		TokenStream tokenStream,
		String text,
		int maxNumFragments,
		String separator)
		throws IOException
	{
		String sections[] = getBestFragments(tokenStream, text, maxNumFragments);
		StringBuffer result = new StringBuffer();
		for (int i = 0; i < sections.length; i++)
		{
			if (i > 0)
			{
				result.append(separator);
			}
			result.append(sections[i]);
		}
		return result.toString();
	}

	/**
	 * @return the maximum number of bytes to be tokenized per doc
	 */
	public int getMaxDocBytesToAnalyze()
	{
		return maxDocBytesToAnalyze;
	}

	/**
	 * @param byteCount the maximum number of bytes to be tokenized per doc
	 * (This can improve performance with large documents)
	 */
	public void setMaxDocBytesToAnalyze(int byteCount)
	{
		maxDocBytesToAnalyze = byteCount;
	}

	/**
	 * @return the {@link Fragmenter} used to break the text into candidate fragments
	 */
	public Fragmenter getTextFragmenter()
	{
		return textFragmenter;
	}

	/**
	 * @param fragmenter the {@link Fragmenter} used to break the text into candidate fragments
	 */
	public void setTextFragmenter(Fragmenter fragmenter)
	{
		textFragmenter = fragmenter;
	}

	/**
	 * @return Object used to score each text fragment
	 */
	public Scorer getFragmentScorer()
	{
		return fragmentScorer;
	}

	/**
	 * @param scorer Object used to score each text fragment
	 */
	public void setFragmentScorer(Scorer scorer)
	{
		fragmentScorer = scorer;
	}
}
/**
 * Ranks TextFragments: a higher score wins; on equal scores the fragment that
 * occurs earlier in the document (lower fragNum) is considered "greater".
 */
class FragmentQueue extends PriorityQueue
{
	public FragmentQueue(int size)
	{
		initialize(size);
	}

	public final boolean lessThan(Object a, Object b)
	{
		TextFragment first = (TextFragment) a;
		TextFragment second = (TextFragment) b;
		if (first.getScore() == second.getScore())
		{
			//tie-break: later fragments rank lower
			return first.fragNum > second.fragNum;
		}
		return first.getScore() < second.getScore();
	}
}

View File

@ -0,0 +1,442 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
//import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.RAMDirectory;
/**
* JUnit Test for Highlighter class.
* @author mark@searcharea.co.uk
*/
public class HighlighterTest extends TestCase implements Formatter
{
	private IndexReader reader;
	private static final String FIELD_NAME = "contents";
	private Query query;
	RAMDirectory ramDir;
	public Searcher searcher = null;
	public Hits hits = null;
	// count of positively-scored highlightTerm() callbacks; the assertions below rely on it
	int numHighlights = 0;
	Analyzer analyzer = new StandardAnalyzer();

	String texts[] =
		{
			"Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
			"This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
			"JFK has been shot",
			"John Kennedy has been shot",
			"This text has a typo in referring to Keneddy" };

	/**
	 * Constructor for HighlightExtractorTest.
	 * @param arg0
	 */
	public HighlighterTest(String arg0)
	{
		super(arg0);
	}

	/** Smoke test using the default HTML formatter - only checks no exception is thrown. */
	public void testSimpleHighlighter() throws Exception
	{
		doSearching("Kennedy");
		Highlighter highlighter = new Highlighter(new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));
		int maxNumFragmentsRequired = 2;
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result =
				highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
			System.out.println("\t" + result);
		}
		//Not sure we can assert anything here - just running to check we dont throw any exceptions
	}

	public void testGetBestFragmentsSimpleQuery() throws Exception
	{
		doSearching("Kennedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	public void testGetFuzzyFragments() throws Exception
	{
		doSearching("Kinnedy~");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	public void testGetWildCardFragments() throws Exception
	{
		doSearching("K?nnedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	public void testGetMidWildCardFragments() throws Exception
	{
		doSearching("K*dy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetRangeFragments() throws Exception
	{
		doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug?needs lower case
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetBestFragmentsPhrase() throws Exception
	{
		doSearching("\"John Kennedy\"");
		doStandardHighlights();
		//Currently highlights "John" and "Kennedy" separately
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
	}

	public void testGetBestFragmentsMultiTerm() throws Exception
	{
		doSearching("John Kenn*");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetBestFragmentsWithOr() throws Exception
	{
		doSearching("JFK OR Kennedy");
		doStandardHighlights();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5);
	}

	public void testGetBestSingleFragment() throws Exception
	{
		doSearching("Kennedy");
		Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result = highlighter.getBestFragment(tokenStream, text);
			System.out.println("\t" + result);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	/** Checks that term weights steer fragment selection: the heavier term's fragment wins. */
	public void testGetBestSingleFragmentWithWeights() throws Exception
	{
		WeightedTerm[] wTerms = new WeightedTerm[2];
		wTerms[0] = new WeightedTerm(10f, "hello");
		wTerms[1] = new WeightedTerm(1f, "kennedy");
		Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));
		TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		highlighter.setTextFragmenter(new SimpleFragmenter(2));
		String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
		assertTrue("Failed to find best section using weighted terms. Found: " + result
			, "<B>Hello</B>".equals(result));

		//readjust weights so "kennedy" now dominates
		wTerms[1].setWeight(50f);
		tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		highlighter = new Highlighter(new QueryScorer(wTerms));
		highlighter.setTextFragmenter(new SimpleFragmenter(2));
		result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
		assertTrue("Failed to find best section using weighted terms. Found: " + result
			, "<B>kennedy</B>".equals(result));
	}

	public void testGetSimpleHighlight() throws Exception
	{
		doSearching("Kennedy");
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result = highlighter.getBestFragment(tokenStream, text);
			System.out.println("\t" + result);
		}
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
	}

	/** "meat" appears after byte 30, so capping analysis at 30 bytes must yield no highlights. */
	public void testMaxSizeHighlight() throws Exception
	{
		doSearching("meat");
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		highlighter.setMaxDocBytesToAnalyze(30);
		TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
		String result = highlighter.getBestFragment(tokenStream, texts[0]);
		assertTrue("Setting MaxDocBytesToAnalyze should have prevented " +
			"us from finding matches for this record" + numHighlights +
			" found", numHighlights == 0);
	}

	/** Shows that a multi-term query which is NOT rewritten produces zero highlights. */
	public void testUnRewrittenQuery() throws IOException, ParseException
	{
		//test to show how rewritten query can still be used
		searcher = new IndexSearcher(ramDir);
		Analyzer analyzer = new StandardAnalyzer();
		Query query = QueryParser.parse("JF? or Kenned*", FIELD_NAME, analyzer);
		System.out.println("Searching with primitive query");
		//forget to set this and...
		//query=query.rewrite(reader);
		Hits hits = searcher.search(query);

		//create an instance of the highlighter with the tags used to surround highlighted text
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(40));
		int maxNumFragmentsRequired = 3;
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String highlightedText = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
			System.out.println(highlightedText);
		}
		//We expect to have zero highlights if the query is multi-terms and is not rewritten!
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0);
	}

	/** A query matching nothing must make getBestFragment return null for every text. */
	public void testNoFragments() throws Exception
	{
		doSearching("AnInvalidQueryWhichShouldYieldNoResults");
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		for (int i = 0; i < texts.length; i++)
		{
			String text = texts[i];
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result = highlighter.getBestFragment(tokenStream, text);
			assertNull("The highlight result should be null for text with no query terms", result);
		}
	}

	/**
	 * Demonstrates highlighting hits from a MultiSearcher: the per-index rewritten
	 * queries must be combined before being handed to the highlighter.
	 */
	public void testMultiSearcher() throws Exception
	{
		//setup index 1
		RAMDirectory ramDir1 = new RAMDirectory();
		IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
		Document d = new Document();
		Field f = new Field(FIELD_NAME, "multiOne", true, true, true);
		d.add(f);
		writer1.addDocument(d);
		writer1.optimize();
		writer1.close();
		IndexReader reader1 = IndexReader.open(ramDir1);

		//setup index 2
		RAMDirectory ramDir2 = new RAMDirectory();
		IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
		d = new Document();
		f = new Field(FIELD_NAME, "multiTwo", true, true, true);
		d.add(f);
		writer2.addDocument(d);
		writer2.optimize();
		writer2.close();
		IndexReader reader2 = IndexReader.open(ramDir2);

		IndexSearcher searchers[] = new IndexSearcher[2];
		searchers[0] = new IndexSearcher(ramDir1);
		searchers[1] = new IndexSearcher(ramDir2);
		MultiSearcher multiSearcher = new MultiSearcher(searchers);
		query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer());
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		//at this point the multisearcher calls combine(query[])
		hits = multiSearcher.search(query);

		//rewrite the prefix query against each sub-index, then combine the expansions
		Query expandedQueries[] = new Query[2];
		expandedQueries[0] = query.rewrite(reader1);
		expandedQueries[1] = query.rewrite(reader2);
		query = query.combine(expandedQueries);

		//create an instance of the highlighter with the tags used to surround highlighted text
		Highlighter highlighter =
			new Highlighter(this, new QueryScorer(query));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String highlightedText = highlighter.getBestFragment(tokenStream, text);
			System.out.println(highlightedText);
		}
		//close readers opened above (they were only needed for query.rewrite)
		reader1.close();
		reader2.close();
		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
	}

	/*
	public void testBigramAnalyzer() throws IOException, ParseException
	{
		//test to ensure analyzers with none-consecutive start/end offsets
		//dont double-highlight text
		//setup index 1
		RAMDirectory ramDir = new RAMDirectory();
		Analyzer bigramAnalyzer=new CJKAnalyzer();
		IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true);
		Document d = new Document();
		Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
		d.add(f);
		writer.addDocument(d);
		writer.close();
		IndexReader reader = IndexReader.open(ramDir);

		IndexSearcher searcher=new IndexSearcher(reader);
		query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		hits = searcher.search(query);

		Highlighter highlighter =
			new Highlighter(this,new QueryFragmentScorer(query));

		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
			String highlightedText = highlighter.getBestFragment(tokenStream,text);
			System.out.println(highlightedText);
		}
	}
	*/

	/**
	 * {@link Formatter} callback: wraps positively-scored terms in &lt;b&gt; tags
	 * and counts them so the tests above can assert on hit counts.
	 */
	public String highlightTerm(String originalText, String weightedTerm, float score, int startOffset)
	{
		if (score <= 0)
		{
			return originalText;
		}
		numHighlights++; //update stats used in assertions
		return "<b>" + originalText + "</b>";
	}

	/** Parses queryString, rewrites it against the index and stores query/hits in fields. */
	public void doSearching(String queryString) throws Exception
	{
		searcher = new IndexSearcher(ramDir);
		query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
		//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
		query = query.rewrite(reader);
		System.out.println("Searching for: " + query.toString(FIELD_NAME));
		hits = searcher.search(query);
	}

	/** Highlights every hit with a 20-char fragmenter; side effect: updates numHighlights. */
	void doStandardHighlights() throws Exception
	{
		Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
		highlighter.setTextFragmenter(new SimpleFragmenter(20));
		for (int i = 0; i < hits.length(); i++)
		{
			String text = hits.doc(i).get(FIELD_NAME);
			int maxNumFragmentsRequired = 2;
			String fragmentSeparator = "...";
			TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
			String result =
				highlighter.getBestFragments(
					tokenStream,
					text,
					maxNumFragmentsRequired,
					fragmentSeparator);
			System.out.println("\t" + result);
		}
	}

	/*
	 * @see TestCase#setUp()
	 */
	protected void setUp() throws Exception
	{
		ramDir = new RAMDirectory();
		IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), true);
		for (int i = 0; i < texts.length; i++)
		{
			addDoc(writer, texts[i]);
		}
		writer.optimize();
		writer.close();
		reader = IndexReader.open(ramDir);
		numHighlights = 0;
	}

	/** Adds a single stored+indexed+tokenized document containing text to the index. */
	private void addDoc(IndexWriter writer, String text) throws IOException
	{
		Document d = new Document();
		Field f = new Field(FIELD_NAME, text, true, true, true);
		d.add(f);
		writer.addDocument(d);
	}

	/*
	 * @see TestCase#tearDown()
	 */
	protected void tearDown() throws Exception
	{
		super.tearDown();
	}
}

View File

@ -0,0 +1,113 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.HashSet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.Query;
/**
* {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
* This class uses the {@link QueryTermExtractor} class to process determine the query terms and
* their boosts to be used.
* @author mark@searcharea.co.uk
*/
//TODO: provide option to roll idf into the scoring equation by passing a IndexReader.
//TODO: provide option to boost score of fragments near beginning of document
// based on fragment.getFragNum()
public class QueryScorer implements Scorer
{
	TextFragment currentTextFragment = null;
	HashSet uniqueTermsInFragment;
	float totalScore = 0;
	private HashMap termsToFind;

	/**
	 *
	 * @param query a Lucene query (ideally rewritten using query.rewrite
	 * before being passed to this class and the searcher)
	 */
	public QueryScorer(Query query)
	{
		this(QueryTermExtractor.getTerms(query));
	}

	/**
	 * @param weightedTerms the terms (and their weights) to look for while scoring
	 */
	public QueryScorer(WeightedTerm[] weightedTerms)
	{
		termsToFind = new HashMap();
		for (int t = 0; t < weightedTerms.length; t++)
		{
			WeightedTerm wt = weightedTerms[t];
			termsToFind.put(wt.term, wt);
		}
	}

	/**
	 * Resets per-fragment state before the tokens of a new fragment are scored.
	 * @see org.apache.lucene.search.highlight.Scorer#startFragment
	 */
	public void startFragment(TextFragment newFragment)
	{
		uniqueTermsInFragment = new HashSet();
		currentTextFragment = newFragment;
		totalScore = 0;
	}

	/**
	 * Scores one token: returns the matching query term's weight, or zero when
	 * the token is not a query term. A term's weight is added to the fragment
	 * total only on its first occurrence within the current fragment.
	 * @see org.apache.lucene.search.highlight.Scorer#getTokenScore
	 */
	public float getTokenScore(Token token)
	{
		String termText = token.termText();
		WeightedTerm queryTerm = (WeightedTerm) termsToFind.get(termText);
		if (queryTerm == null)
		{
			//not a term from the query
			return 0;
		}
		//HashSet.add returns true only for the first occurrence in this fragment
		if (uniqueTermsInFragment.add(termText))
		{
			totalScore += queryTerm.getWeight();
		}
		return queryTerm.getWeight();
	}

	/**
	 * @return the accumulated score of the current fragment
	 * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore
	 */
	public float getFragmentScore()
	{
		return totalScore;
	}

	/**
	 * This implementation requires no work at end of processing.
	 * @see org.apache.lucene.search.highlight.Scorer#allFragmentsProcessed
	 */
	public void allFragmentsProcessed()
	{
	}
}

View File

@ -0,0 +1,115 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashSet;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
 * Utility class used to extract the terms used in a query, plus any weights.
 * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
 * so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of
 * expanded terms.
 */
public final class QueryTermExtractor
{
	/** All members are static - this utility class is not meant to be instantiated. */
	private QueryTermExtractor()
	{
	}

	/**
	 * Extracts all term texts of a given Query into an array of WeightedTerms,
	 * ignoring "prohibited" (NOT) clauses.
	 *
	 * @param query Query to extract term texts from
	 * @return an array of the terms used in a query, plus their weights.
	 */
	public static final WeightedTerm[] getTerms(Query query)
	{
		return getTerms(query, false);
	}

	/**
	 * Extracts all term texts of a given Query into an array of WeightedTerms.
	 *
	 * @param query Query to extract term texts from
	 * @param prohibited <code>true</code> to extract "prohibited" terms, too
	 * @return an array of the terms used in a query, plus their weights.
	 */
	public static final WeightedTerm[] getTerms(Query query, boolean prohibited)
	{
		HashSet terms = new HashSet();
		getTerms(query, terms, prohibited);
		return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);
	}

	/** Dispatches on the concrete query type; unrecognized query types contribute no terms. */
	private static final void getTerms(Query query, HashSet terms, boolean prohibited)
	{
		if (query instanceof BooleanQuery)
			getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited);
		else if (query instanceof PhraseQuery)
			getTermsFromPhraseQuery((PhraseQuery) query, terms);
		else if (query instanceof TermQuery)
			getTermsFromTermQuery((TermQuery) query, terms);
		// PrefixQuery, RangeQuery and MultiTermQuery are deliberately not handled here:
		// the client should call Query.rewrite() BEFORE calling the highlighter so that
		// such queries arrive already expanded into primitive Term/Boolean queries.
	}

	private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited)
	{
		BooleanClause[] queryClauses = query.getClauses();
		for (int i = 0; i < queryClauses.length; i++)
		{
			// skip NOT clauses unless the caller explicitly asked for them
			if (prohibited || !queryClauses[i].prohibited)
				getTerms(queryClauses[i].query, terms, prohibited);
		}
	}

	private static final void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms)
	{
		Term[] queryTerms = query.getTerms();
		for (int i = 0; i < queryTerms.length; i++)
		{
			terms.add(new WeightedTerm(query.getBoost(), queryTerms[i].text()));
		}
	}

	private static final void getTermsFromTermQuery(TermQuery query, HashSet terms)
	{
		terms.add(new WeightedTerm(query.getBoost(), query.getTerm().text()));
	}
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
 * Adds to the score for a fragment based on its tokens.
 * @author mark@searcharea.co.uk
 */
public interface Scorer
{
	/**
	 * Called when a new fragment is started for consideration.
	 * @param newFragment the fragment about to be scored
	 */
	public void startFragment(TextFragment newFragment);

	/**
	 * Called for each token in the current fragment.
	 * @param token the token to be scored
	 * @return a score which is passed to the highlighter to influence the mark-up of the text
	 * (this return value is NOT used to score the fragment)
	 */
	public float getTokenScore(Token token);

	/**
	 * Called when the highlighter has no more tokens for the current fragment.
	 * @return the total score for the fragment just processed (implementations
	 * typically also record it via setScore() on the fragment passed to startFragment)
	 */
	public float getFragmentScore();
}

View File

@ -0,0 +1,84 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
 * {@link Fragmenter} implementation which breaks text up into same-size
 * fragments with no concerns over spotting sentence boundaries.
 * @author mark@searcharea.co.uk
 */
public class SimpleFragmenter implements Fragmenter
{
	private static final int DEFAULT_FRAGMENT_SIZE = 100;

	private int fragmentSize;    // target length (in characters) of each fragment
	private int currentNumFrags; // count of fragments started so far in this text

	/** Creates a fragmenter using the default fragment size of 100. */
	public SimpleFragmenter()
	{
		this(DEFAULT_FRAGMENT_SIZE);
	}

	/**
	 * @param fragmentSize size in characters of each fragment
	 */
	public SimpleFragmenter(int fragmentSize)
	{
		this.fragmentSize = fragmentSize;
	}

	/**
	 * Resets the fragment counter for a new piece of text.
	 */
	public void start(String originalText)
	{
		currentNumFrags = 1;
	}

	/**
	 * A new fragment begins once a token's end offset crosses the next
	 * multiple of the fragment size.
	 */
	public boolean isNewFragment(Token token)
	{
		if (token.endOffset() < fragmentSize * currentNumFrags)
		{
			return false;
		}
		currentNumFrags++;
		return true;
	}

	/**
	 * @return size in characters of each fragment
	 */
	public int getFragmentSize()
	{
		return fragmentSize;
	}

	/**
	 * @param size size in characters of each fragment
	 */
	public void setFragmentSize(int size)
	{
		fragmentSize = size;
	}
}

View File

@ -0,0 +1,57 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Simple {@link Formatter} implementation to highlight terms with a pre and post tag.
 * @author MAHarwood
 */
public class SimpleHTMLFormatter implements Formatter
{
	private static final String DEFAULT_PRE_TAG = "<B>";
	private static final String DEFAULT_POST_TAG = "</B>";

	String preTag;
	String postTag;

	/**
	 * @param preTag text prepended to every highlighted term
	 * @param postTag text appended to every highlighted term
	 */
	public SimpleHTMLFormatter(String preTag, String postTag)
	{
		this.preTag = preTag;
		this.postTag = postTag;
	}

	/**
	 * Default constructor uses HTML: &lt;B&gt; tags to markup terms.
	 */
	public SimpleHTMLFormatter()
	{
		// delegate rather than duplicating the field assignments
		this(DEFAULT_PRE_TAG, DEFAULT_POST_TAG);
	}

	/**
	 * Wraps the term text in the pre/post tags; terms with a non-positive
	 * score (i.e. not query terms) are returned unchanged.
	 */
	public String highlightTerm(String originalText, String term, float score, int startOffset)
	{
		if (score <= 0)
		{
			return originalText;
		}
		// presize the buffer so the three appends never trigger a reallocation
		StringBuffer sb = new StringBuffer(preTag.length() + originalText.length() + postTag.length());
		sb.append(preTag);
		sb.append(originalText);
		sb.append(postTag);
		return sb.toString();
	}
}

View File

@ -0,0 +1,70 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Low-level class used to record information about a section of a document
 * with a score.
 * @author MAHarwood
 */
public class TextFragment
{
	int fragNum;      // sequence number of this fragment within the document
	int textStartPos; // start position of the fragment in the original text
	int textEndPos;   // end position of the fragment in the original text
	float score;      // score assigned to this fragment

	public TextFragment(int textStartPos, int fragNum)
	{
		this.fragNum = fragNum;
		this.textStartPos = textStartPos;
	}

	/** Records the score computed for this fragment. */
	void setScore(float score)
	{
		this.score = score;
	}

	/** @return the score recorded for this fragment */
	public float getScore()
	{
		return score;
	}

	/**
	 * Absorbs an adjacent fragment by extending this fragment's end position.
	 * @param frag2 Fragment to be merged into this one
	 */
	public void merge(TextFragment frag2)
	{
		textEndPos = frag2.textEndPos;
	}

	/**
	 * @param fragment candidate predecessor fragment
	 * @return true if this fragment starts exactly where the one passed ends
	 */
	public boolean follows(TextFragment fragment)
	{
		return fragment.textEndPos == textStartPos;
	}

	/**
	 * @return the fragment sequence number
	 */
	public int getFragNum()
	{
		return fragNum;
	}
}

View File

@ -0,0 +1,64 @@
package org.apache.lucene.search.highlight;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Lightweight class to hold term and a weight value used for scoring this term
 * @author Mark Harwood
 */
public class WeightedTerm
{
	float weight; // scoring multiplier for this term
	String term;  // term text (stemmed form)

	public WeightedTerm (float weight,String term)
	{
		this.term = term;
		this.weight = weight;
	}

	/**
	 * @return the term value (stemmed)
	 */
	public String getTerm()
	{
		return term;
	}

	/**
	 * @param term the term value (stemmed)
	 */
	public void setTerm(String term)
	{
		this.term = term;
	}

	/**
	 * @return the weight associated with this term
	 */
	public float getWeight()
	{
		return weight;
	}

	/**
	 * @param weight the weight associated with this term
	 */
	public void setWeight(float weight)
	{
		this.weight = weight;
	}
}

View File

@ -0,0 +1,28 @@
<html>
<body>
The highlight package contains classes to provide "keyword in context" features
typically used to highlight search terms in the text of results pages. <br>
The Highlighter class is the central component and can be used to extract the
most interesting sections of a piece of text and highlight them, with the help of
Fragmenter, Scorer and Formatter classes.
<h2>Example Usage</h2>
<pre>
IndexSearcher searcher = new IndexSearcher(ramDir);
Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
query=query.rewrite(reader); //required to expand search terms
Hits hits = searcher.search(query);
Highlighter highlighter =new Highlighter(this,new QueryScorer(query));
for (int i = 0; i < hits.length(); i++)
{
String text = hits.doc(i).get(FIELD_NAME);
TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
// Get 3 best fragments and separate with a "..."
String result = highlighter.getBestFragments(tokenStream,text,3,"...");
System.out.println(result);
}
</pre>
</body>
</html>