From 05d0335dcd9de5c55eae97cf4c555bdcbb817d3e Mon Sep 17 00:00:00 2001 From: Daniel Naber Date: Sat, 19 Feb 2005 19:08:52 +0000 Subject: [PATCH] offer additional methods that take analyzer + text instead of tokenstream; fix some unused imports and variables git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@154444 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/search/highlight/Highlighter.java | 45 ++++++++++++++++++- .../search/highlight/HighlighterTest.java | 29 ++++++++---- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java index fbb13563c11..ced1fe208c7 100644 --- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java @@ -16,9 +16,11 @@ package org.apache.lucene.search.highlight; */ import java.io.IOException; +import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.PriorityQueue; @@ -57,8 +59,24 @@ public class Highlighter this.fragmentScorer = fragmentScorer; } - - + /** + * Highlights chosen terms in a text, extracting the most relevant section. 
+ * This is a convenience method that calls + * {@link #getBestFragment(TokenStream, String)} + * + * @param analyzer the analyzer that will be used to split text + * into chunks + * @param text text to highlight terms in + * + * @return highlighted text fragment or null if no terms found + */ + public final String getBestFragment(Analyzer analyzer, String text) + throws IOException + { + TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text)); + return getBestFragment(tokenStream, text); + } + /** * Highlights chosen terms in a text, extracting the most relevant section. * The document text is analysed in chunks to record hit statistics @@ -84,6 +102,29 @@ public class Highlighter } return null; } + + /** + * Highlights chosen terms in a text, extracting the most relevant sections. + * This is a convenience method that builds a TokenStream from the text and calls + * {@link #getBestFragments(TokenStream, String, int)} with it. + * NOTE(review): the field name handed to the analyzer is hard-coded; an analyzer whose behaviour depends on the field name would presumably need the real one - confirm with callers. + * @param analyzer the analyzer that will be used to split text + * into chunks; its tokenStream method is always called with the fixed field name "field" + * @param text text to highlight terms in; it is also wrapped in a StringReader and used as the analyzer's input + * @param maxNumFragments the maximum number of fragments; passed unchanged to the delegate method. + * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) + * @throws IOException propagated to the caller; nothing is caught inside this method + */ + public final String[] getBestFragments( + Analyzer analyzer, + String text, + int maxNumFragments) + throws IOException + { + TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text)); + return getBestFragments(tokenStream, text, maxNumFragments); + } + /** * Highlights chosen terms in a text, extracting the most relevant sections. 
* The document text is analysed in chunks to record hit statistics diff --git a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index 6a7bca42945..5ea3ceb5d39 100644 --- a/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -17,7 +17,6 @@ package org.apache.lucene.search.highlight; */ import java.io.ByteArrayInputStream; -import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.StringReader; @@ -27,7 +26,6 @@ import java.util.StringTokenizer; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; import junit.framework.TestCase; @@ -50,7 +48,6 @@ import org.apache.lucene.search.Searcher; import org.apache.lucene.store.RAMDirectory; import org.w3c.dom.Element; import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; /** * JUnit Test for Highlighter class. 
@@ -157,7 +154,6 @@ public class HighlighterTest extends TestCase implements Formatter assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 5); } - public void testGetBestSingleFragment() throws Exception { doSearching("Kennedy"); @@ -172,6 +168,23 @@ public class HighlighterTest extends TestCase implements Formatter System.out.println("\t" + result); } assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + + numHighlights = 0; + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + highlighter.getBestFragment(analyzer, text); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + + numHighlights = 0; + for (int i = 0; i < hits.length(); i++) + { + String text = hits.doc(i).get(FIELD_NAME); + highlighter.getBestFragments(analyzer, text, 10); + } + assertTrue("Failed to find correct number of highlights " + numHighlights + " found", numHighlights == 4); + } public void testGetBestSingleFragmentWithWeights() throws Exception @@ -278,7 +291,7 @@ public class HighlighterTest extends TestCase implements Formatter TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new StringReader(texts[0])); String result = highlighter.getBestFragment(tokenStream,texts[0]); assertTrue("Setting MaxDocBytesToAnalyze should have prevented " + - "us from finding matches for this record" + numHighlights + + "us from finding matches for this record: " + numHighlights + " found", numHighlights == 0); } @@ -322,7 +335,6 @@ public class HighlighterTest extends TestCase implements Formatter Highlighter highlighter = new Highlighter(this,new QueryScorer(query)); - int highlightFragmentSizeInBytes = 40; for (int i = 0; i < texts.length; i++) { String text = texts[i]; @@ -568,8 +580,8 @@ public class HighlighterTest extends TestCase implements Formatter //========== THESE LOOK LIKE, WITH SOME MORE 
EFFORT THESE COULD BE //========== MADE MORE GENERALLY USEFUL. // TODO - make synonyms all interchangeable with each other and produce -// a version that does antonyms(?) - the "is a specialised type of ...." -// so that car=audi, bmw and volkswagen but bmw != audi so different +// a version that does hyponyms - the "is a specialised type of ...." +// so that car = audi, bmw and volkswagen but bmw != audi so different // behaviour to synonyms //=================================================================== @@ -587,7 +599,6 @@ class SynonymAnalyzer extends Analyzer */ public TokenStream tokenStream(String arg0, Reader arg1) { - return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); } }