Added support for analyzers that produce overlapping tokens

PR: Obtained from: Submitted by: Mark Harwod Reviewed by: CVS: ---------------------------------------------------------------------- CVS: PR: CVS: If this change addresses a PR in the problem report tracking CVS: database, then enter the PR number(s) here. CVS: Obtained from: CVS: If this change has been taken from another system, such as NCSA, CVS: then name the system in this line, otherwise delete it. CVS: Submitted by: CVS: If this code has been contributed to Apache by someone else; i.e., CVS: they sent us a patch or a new module, then include their name/email CVS: address here. If this is your work then delete this line. CVS: Reviewed by: CVS: If we are doing pre-commit code reviews and someone else has CVS: reviewed your changes, include their name(s) here. CVS: If you have not had it reviewed then delete this line. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150990 13f79535-47bb-0310-9956-ffa450edef68
2004-07-26 20:39:47 +00:00 · 2004-07-26 20:39:47 +00:00 · 14f0da2aa2
parent cd10939321
commit 14f0da2aa2
5 changed files with 350 additions and 88 deletions
--- a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java
+++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java
@ -23,17 +23,11 @@ package org.apache.lucene.search.highlight;
 */
 public interface Formatter
 {
-  /**
-   * Highlights a search term. For example, an HTML Formatter could simply do:
-   *
-   * <p><dl><dt></dt><dd><code>return "&lt;b&gt;" + term + "&lt;/b&gt;";</code></dd></dl>
-   *
-   * @param originalTermText (unstemmed) term text to highlight
-   * @param stemmedTerm the stemmed form of the originalTermText
-   * @param score The score for this term returned by Scorer.getTokenScore - one use for this may be to set font weight in highlighted text 
-   * @param startOffset the position of the originalTermText in the text being highlighted  
-   *
-   * @return highlighted term text
-   */
-  String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
+	/**
+	 * @param originalText The section of text being considered for markup
+	 * @param tokenGroup contains one or several overlapping Tokens along with
+	 * their scores and positions.
+	 * @return
+	 */
+	String highlightTerm(String originalText, TokenGroup tokenGroup);
 }
--- a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@ -149,46 +149,60 @@ public class Highlighter
 			int endOffset;
 			int lastEndOffset = 0;
 			textFragmenter.start(text);
+		
+			TokenGroup tokenGroup=new TokenGroup();

 			while ((token = tokenStream.next()) != null)
 			{
-				
-				startOffset = token.startOffset();
-				endOffset = token.endOffset();		
-				//FIXME an issue was reported with CJKTokenizer that I couldnt reproduce
-				// where the analyzer was producing overlapping tokens.
-				// I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
-				// but cant be sure so I'll just leave this comment in for now
-				tokenText = text.substring(startOffset, endOffset);
-
-
-				// append text between end of last token (or beginning of text) and start of current token
-				if (startOffset > lastEndOffset)
-					newText.append(text.substring(lastEndOffset, startOffset));
-
-				// does query contain current token?
-				float score=fragmentScorer.getTokenScore(token);			
-				newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));
-				
-
-				if(textFragmenter.isNewFragment(token))
+				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
 				{
-					currentFrag.setScore(fragmentScorer.getFragmentScore());
-					//record stats for a new fragment
-					currentFrag.textEndPos = newText.length();
-					currentFrag =new TextFragment(newText.length(), docFrags.size());
-					fragmentScorer.startFragment(currentFrag);
-					docFrags.add(currentFrag);
-				}
+					//the current token is distinct from previous tokens - 
+					// markup the cached token group info
+					startOffset = tokenGroup.startOffset;
+					endOffset = tokenGroup.endOffset;		
+					tokenText = text.substring(startOffset, endOffset);
+					String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
+					//store any whitespace etc from between this and last group
+					if (startOffset > lastEndOffset)
+						newText.append(text.substring(lastEndOffset, startOffset));
+					newText.append(markedUpText);
+					lastEndOffset=endOffset;
+					tokenGroup.clear();

-				lastEndOffset = endOffset;
+					//check if current token marks the start of a new fragment						
+					if(textFragmenter.isNewFragment(token))
+					{
+						currentFrag.setScore(fragmentScorer.getFragmentScore());
+						//record stats for a new fragment
+						currentFrag.textEndPos = newText.length();
+						currentFrag =new TextFragment(newText.length(), docFrags.size());
+						fragmentScorer.startFragment(currentFrag);
+						docFrags.add(currentFrag);
+					}
+				}
+						
+				tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
+				
 				if(lastEndOffset>maxDocBytesToAnalyze)
 				{
 					break;
 				}
 			}
 			currentFrag.setScore(fragmentScorer.getFragmentScore());
-			
+	
+			if(tokenGroup.numTokens>0)
+			{
+				//flush the accumulated text (same code as in above loop)
+				startOffset = tokenGroup.startOffset;
+				endOffset = tokenGroup.endOffset;		
+				tokenText = text.substring(startOffset, endOffset);
+				String markedUpText=formatter.highlightTerm(tokenText, tokenGroup);
+				//store any whitespace etc from between this and last group
+				if (startOffset > lastEndOffset)
+					newText.append(text.substring(lastEndOffset, startOffset));
+				newText.append(markedUpText);
+				lastEndOffset=endOffset;						
+			}

 			// append text after end of last token
 			if (lastEndOffset < text.length())
--- a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java
+++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java
@ -24,6 +24,7 @@ public class SimpleHTMLFormatter implements Formatter
 {
 	String preTag;
 	String postTag;
+	

 	public SimpleHTMLFormatter(String preTag, String postTag)
 	{
@ -41,17 +42,20 @@ public class SimpleHTMLFormatter implements Formatter
 		this.postTag = "</B>";
 	}

-	public String highlightTerm(String originalText, String term, float score, int startOffset)
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.search.highlight.Formatter#highlightTerm(java.lang.String, org.apache.lucene.search.highlight.TokenGroup)
+	 */
+	public String highlightTerm(String originalText, TokenGroup tokenGroup)
 	{
-		if(score<=0)
+		StringBuffer returnBuffer;
+		if(tokenGroup.getTotalScore()>0)
 		{
-			return originalText;
+			returnBuffer=new StringBuffer();
+			returnBuffer.append(preTag);
+			returnBuffer.append(originalText);
+			returnBuffer.append(postTag);
+			return returnBuffer.toString();
 		}
-		StringBuffer sb = new StringBuffer();
-		sb.append(preTag);
-		sb.append(originalText);
-		sb.append(postTag);
-		return sb.toString();
+		return originalText;
 	}
-
 }
--- a/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
+++ b/sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
@ -0,0 +1,120 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.lucene.analysis.Token;
+
+/**
+ * One, or several overlapping tokens, along with the score(s) and the
+ * scope of the original text
+ * @author MAHarwood
+ */
+public class TokenGroup
+{
+	
+	private static final int MAX_NUM_TOKENS_PER_GROUP=50;
+	Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
+	float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP];
+	int numTokens=0;
+	int startOffset=0;
+	int endOffset=0;
+	
+
+	void addToken(Token token, float score)
+	{
+		if(numTokens==0)
+		{
+			startOffset=token.startOffset();		
+			endOffset=token.endOffset();		
+		}
+		else
+		{
+			startOffset=Math.min(startOffset,token.startOffset());		
+			endOffset=Math.max(endOffset,token.endOffset());		
+		}
+		tokens[numTokens]=token;
+		scores[numTokens]=score;
+		numTokens++;
+	}
+	
+	boolean isDistinct(Token token)
+	{
+		return token.startOffset()>endOffset;
+	}
+	
+	
+	void clear()
+	{
+		numTokens=0;
+	}
+	
+	/**
+	 * 
+	 * @param index a value between 0 and numTokens -1
+	 * @return the "n"th token
+	 */
+	public Token getToken(int index)
+	{
+		return tokens[index];
+	}
+
+	/**
+	 * 
+	 * @param index a value between 0 and numTokens -1
+	 * @return the "n"th score
+	 */
+	public float getScore(int index)
+	{
+		return scores[index];
+	}
+
+	/**
+	 * @return the end position in the original text
+	 */
+	public int getEndOffset()
+	{
+		return endOffset;
+	}
+
+	/**
+	 * @return the number of tokens in this group
+	 */
+	public int getNumTokens()
+	{
+		return numTokens;
+	}
+
+	/**
+	 * @return the start position in the original text
+	 */
+	public int getStartOffset()
+	{
+		return startOffset;
+	}
+
+	/**
+	 * @return
+	 */
+	public float getTotalScore()
+	{
+		float total=0;
+		for (int i = 0; i < numTokens; i++)
+		{
+			total+=scores[i];
+		}
+		return total;
+	}
+
+}
--- a/sandbox/contributions/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/sandbox/contributions/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@ -1,4 +1,5 @@
 package org.apache.lucene.search.highlight;
+
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
@ -16,13 +17,18 @@ package org.apache.lucene.search.highlight;
 */

 import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringTokenizer;

 import junit.framework.TestCase;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-//import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@ -35,6 +41,12 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.highlight.Formatter;
+import org.apache.lucene.search.highlight.Highlighter;
+import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleFragmenter;
+import org.apache.lucene.search.highlight.TokenGroup;
+import org.apache.lucene.search.highlight.WeightedTerm;
 import org.apache.lucene.store.RAMDirectory;

 /**
@ -60,23 +72,31 @@ public class HighlighterTest extends TestCase implements Formatter
 			"John Kennedy has been shot",
 			"This text has a typo in referring to Keneddy" };

+	/**
+	 * Constructor for HighlightExtractorTest.
+	 * @param arg0
+	 */
+	public HighlighterTest(String arg0)
+	{
+		super(arg0);
+	}

 	public void testSimpleHighlighter() throws Exception
 	{
 		doSearching("Kennedy");
 		Highlighter highlighter =	new Highlighter(new QueryScorer(query));
-		highlighter.setTextFragmenter(new SimpleFragmenter(40));			
+		highlighter.setTextFragmenter(new SimpleFragmenter(40));
 		int maxNumFragmentsRequired = 2;
 		for (int i = 0; i < hits.length(); i++)
 		{
 			String text = hits.doc(i).get(FIELD_NAME);
 			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
-			
+
 			String result =
 				highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired, "...");
 			System.out.println("\t" + result);
 		}
-		//Not sure we can assert anything here - just running to check we dont throw any exceptions 
+		//Not sure we can assert anything here - just running to check we dont throw any exceptions
 	}


@ -151,7 +171,7 @@ public class HighlighterTest extends TestCase implements Formatter
 		}
 		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4);
 	}
-	
+
 	public void testGetBestSingleFragmentWithWeights() throws Exception
 	{
 		WeightedTerm[]wTerms=new WeightedTerm[2];
@ -160,9 +180,9 @@ public class HighlighterTest extends TestCase implements Formatter
 		Highlighter highlighter =new Highlighter(new QueryScorer(wTerms));
 		TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));
 		highlighter.setTextFragmenter(new SimpleFragmenter(2));
-		
+
 		String result = highlighter.getBestFragment(tokenStream,texts[0]).trim();
-		assertTrue("Failed to find best section using weighted terms. Found: "+result
+		assertTrue("Failed to find best section using weighted terms. Found: ["+result+"]"
 			, "<B>Hello</B>".equals(result));

 		//readjust weights
@ -170,14 +190,36 @@ public class HighlighterTest extends TestCase implements Formatter
 		tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0]));
 		highlighter =new Highlighter(new QueryScorer(wTerms));
 		highlighter.setTextFragmenter(new SimpleFragmenter(2));
-		
+
 		result = highlighter.getBestFragment(tokenStream,texts[0]).trim();
 		assertTrue("Failed to find best section using weighted terms. Found: "+result
 			, "<B>kennedy</B>".equals(result));
 	}
 	
 	
-	
+	// tests a "complex" analyzer that produces multiple 
+	// overlapping tokens 
+	public void testOverlapAnalyzer() throws Exception
+	{
+		HashMap synonyms = new HashMap();
+		synonyms.put("football", "soccer,footie");
+		Analyzer analyzer = new SynonymAnalyzer(synonyms);
+		String srchkey = "football";
+
+		String s = "football-soccer in the euro 2004 footie competition";
+		Query query = QueryParser.parse(srchkey, "bookid", analyzer);
+
+		Highlighter highlighter = new Highlighter(new QueryScorer(query));
+		TokenStream tokenStream =
+			analyzer.tokenStream(null, new StringReader(s));
+		// Get 3 best fragments and seperate with a "..."
+		String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
+		String expectedResult="<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
+		assertTrue("overlapping analyzer should handle highlights OK",expectedResult.equals(result));
+	}
+
+
+
 	public void testGetSimpleHighlight() throws Exception
 	{
 		doSearching("Kennedy");
@ -188,7 +230,7 @@ public class HighlighterTest extends TestCase implements Formatter
 		{
 			String text = hits.doc(i).get(FIELD_NAME);
 			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
-			
+
 			String result = highlighter.getBestFragment(tokenStream,text);
 			System.out.println("\t" + result);
 		}
@ -209,7 +251,7 @@ public class HighlighterTest extends TestCase implements Formatter
 	}


-	
+
 	public void testUnRewrittenQuery() throws IOException, ParseException
 	{
 		//test to show how rewritten query can still be used
@ -226,7 +268,7 @@ public class HighlighterTest extends TestCase implements Formatter
 		Highlighter highlighter =
 			new Highlighter(this,new QueryScorer(query));

-		highlighter.setTextFragmenter(new SimpleFragmenter(40));		
+		highlighter.setTextFragmenter(new SimpleFragmenter(40));

 		int maxNumFragmentsRequired = 3;

@ -234,14 +276,14 @@ public class HighlighterTest extends TestCase implements Formatter
 		{
 			String text = hits.doc(i).get(FIELD_NAME);
 			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
-			
+
 			String highlightedText = highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired,"...");
 			System.out.println(highlightedText);
 		}
 		//We expect to have zero highlights if the query is multi-terms and is not rewritten!
 		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 0);
 	}
-	
+
 	public void testNoFragments() throws Exception
 	{
 		doSearching("AnInvalidQueryWhichShouldYieldNoResults");
@ -253,12 +295,12 @@ public class HighlighterTest extends TestCase implements Formatter
 		{
 			String text = texts[i];
 			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
-			
+
 			String result = highlighter.getBestFragment(tokenStream,text);
 			assertNull("The highlight result should be null for text with no query terms", result);
 		}
 	}
-	
+
 	public void testMultiSearcher() throws Exception
 	{
 		//setup index 1
@ -266,7 +308,7 @@ public class HighlighterTest extends TestCase implements Formatter
 		IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), true);
 		Document d = new Document();
 		Field f = new Field(FIELD_NAME, "multiOne", true, true, true);
-		d.add(f);		
+		d.add(f);
 		writer1.addDocument(d);
 		writer1.optimize();
 		writer1.close();
@ -277,15 +319,15 @@ public class HighlighterTest extends TestCase implements Formatter
 		IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), true);
 		d = new Document();
 		f = new Field(FIELD_NAME, "multiTwo", true, true, true);
-		d.add(f);		
+		d.add(f);
 		writer2.addDocument(d);
 		writer2.optimize();
 		writer2.close();
 		IndexReader reader2 = IndexReader.open(ramDir2);

-		

-		IndexSearcher searchers[]=new IndexSearcher[2]; 
+
+		IndexSearcher searchers[]=new IndexSearcher[2];
 		searchers[0] = new IndexSearcher(ramDir1);
 		searchers[1] = new IndexSearcher(ramDir2);
 		MultiSearcher multiSearcher=new MultiSearcher(searchers);
@ -299,8 +341,8 @@ public class HighlighterTest extends TestCase implements Formatter
 		expandedQueries[0]=query.rewrite(reader1);
 		expandedQueries[1]=query.rewrite(reader2);
 		query=query.combine(expandedQueries);
-		
-		
+
+
 		//create an instance of the highlighter with the tags used to surround highlighted text
 		Highlighter highlighter =
 			new Highlighter(this,new QueryScorer(query));
@ -312,13 +354,13 @@ public class HighlighterTest extends TestCase implements Formatter
 			String highlightedText = highlighter.getBestFragment(tokenStream,text);
 			System.out.println(highlightedText);
 		}
-		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);		
-		
-		
-		
+		assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 2);
+
+
+
 	}
-	
-/*	
+
+/*

 	public void testBigramAnalyzer() throws IOException, ParseException
 	{
@ -331,11 +373,11 @@ public class HighlighterTest extends TestCase implements Formatter
 		Document d = new Document();
 		Field f = new Field(FIELD_NAME, "java abc def", true, true, true);
 		d.add(f);
-		writer.addDocument(d);		
+		writer.addDocument(d);
 		writer.close();
 		IndexReader reader = IndexReader.open(ramDir);

-		IndexSearcher searcher=new IndexSearcher(reader); 
+		IndexSearcher searcher=new IndexSearcher(reader);
 		query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
 		System.out.println("Searching for: " + query.toString(FIELD_NAME));
 		hits = searcher.search(query);
@ -349,15 +391,15 @@ public class HighlighterTest extends TestCase implements Formatter
 			TokenStream tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
 			String highlightedText = highlighter.getBestFragment(tokenStream,text);
 			System.out.println(highlightedText);
-		}		
-		
+		}
+
 	}
-*/	
+*/


-	public String highlightTerm(String originalText , String weightedTerm, float score, int startOffset)
+	public String highlightTerm(String originalText , TokenGroup group)
 	{
-		if(score<=0)
+		if(group.getTotalScore()<=0)
 		{
 			return originalText;
 		}
@ -369,7 +411,7 @@ public class HighlighterTest extends TestCase implements Formatter
 	{
 		searcher = new IndexSearcher(ramDir);
 		query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
-		//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query! 
+		//for any multi-term queries to work (prefix, wildcard, range,fuzzy etc) you must use a rewritten query!
 		query=query.rewrite(reader);
 		System.out.println("Searching for: " + query.toString(FIELD_NAME));
 		hits = searcher.search(query);
@ -385,7 +427,7 @@ public class HighlighterTest extends TestCase implements Formatter
 			int maxNumFragmentsRequired = 2;
 			String fragmentSeparator = "...";
 			TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(text));
-			
+
 			String result =
 				highlighter.getBestFragments(
 					tokenStream,
@ -432,3 +474,91 @@ public class HighlighterTest extends TestCase implements Formatter
 	}

 }
+
+
+//===================================================================
+//========== BEGIN TEST SUPPORTING CLASSES
+//========== THESE LOOK LIKE, WITH SOME MORE EFFORT THESE COULD BE
+//========== MADE MORE GENERALLY USEFUL.
+// TODO - make synonyms all interchangeable with each other and produce
+// a version that does antonyms(?) - the "is a specialised type of ...."
+// so that car=audi, bmw and volkswagen but bmw != audi so different
+// behaviour to synonyms
+//===================================================================
+
+class SynonymAnalyzer extends Analyzer
+{
+	private Map synonyms;
+
+	public SynonymAnalyzer(Map synonyms)
+	{
+		this.synonyms = synonyms;
+	}
+
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
+	 */
+	public TokenStream tokenStream(String arg0, Reader arg1)
+	{
+
+		return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
+	}
+}
+
+/**
+ * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
+ * @author MAHarwood
+ */
+class SynonymTokenizer extends TokenStream
+{
+	private TokenStream realStream;
+	private Token currentRealToken = null;
+	private Map synonyms;
+	StringTokenizer st = null;
+	public SynonymTokenizer(TokenStream realStream, Map synonyms)
+	{
+		this.realStream = realStream;
+		this.synonyms = synonyms;
+	}
+	public Token next() throws IOException
+	{
+		if (currentRealToken == null)
+		{
+			Token nextRealToken = realStream.next();
+			if (nextRealToken == null)
+			{
+				return null;
+			}
+			String expansions = (String) synonyms.get(nextRealToken.termText());
+			if (expansions == null)
+			{
+				return nextRealToken;
+			}
+			st = new StringTokenizer(expansions, ",");
+			if (st.hasMoreTokens())
+			{
+				currentRealToken = nextRealToken;
+			}
+			return currentRealToken;
+		}
+		else
+		{
+			String nextExpandedValue = st.nextToken();
+			Token expandedToken =
+				new Token(
+					nextExpandedValue,
+					currentRealToken.startOffset(),
+					currentRealToken.endOffset());
+			expandedToken.setPositionIncrement(0);
+			if (!st.hasMoreTokens())
+			{
+				currentRealToken = null;
+				st = null;
+			}
+			return expandedToken;
+		}
+	}
+
+}
+
+