From 9c0d22fe9b887b29aa55ed31dd110a2a3c5b51c4 Mon Sep 17 00:00:00 2001 From: Mike Klaas Date: Fri, 20 Jul 2007 05:34:23 +0000 Subject: [PATCH] SOLR-102: RegexFragmenter git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@557872 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + example/solr/conf/solrconfig.xml | 12 ++- .../solr/common/params/HighlightParams.java | 2 + .../solr/highlight/RegexFragmenter.java | 88 ++++++++++++++----- .../apache/solr/search/SolrIndexSearcher.java | 9 +- .../solr/highlight/HighlighterTest.java | 39 ++++++++ 6 files changed, 127 insertions(+), 26 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b3a1084801c..9869a510573 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -110,6 +110,9 @@ New Features 19. SOLR-305: analysis.jsp can be given a fieldtype instead of a field name. (hossman) +20. SOLR-102: Added RegexFragmenter, which splits text for highlighting + based on a given pattern. (klaas) + Changes in runtime behavior Optimizations diff --git a/example/solr/conf/solrconfig.xml b/example/solr/conf/solrconfig.xml index e58778fa3e3..f615fcefd46 100755 --- a/example/solr/conf/solrconfig.xml +++ b/example/solr/conf/solrconfig.xml @@ -453,18 +453,24 @@ - + 100 - + + - 70 + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} diff --git a/src/java/org/apache/solr/common/params/HighlightParams.java b/src/java/org/apache/solr/common/params/HighlightParams.java index ee8e8e7f213..22aaa0023e0 100644 --- a/src/java/org/apache/solr/common/params/HighlightParams.java +++ b/src/java/org/apache/solr/common/params/HighlightParams.java @@ -40,4 +40,6 @@ public interface HighlightParams { // Regex fragmenter public static final String REGEX = "regex"; public static final String SLOP = HIGHLIGHT+"."+REGEX+".slop"; + public static final String PATTERN = HIGHLIGHT+"."+REGEX+".pattern"; + public static final String MAX_RE_CHARS = HIGHLIGHT+"."+REGEX+".maxAnalyzedChars"; } diff --git a/src/java/org/apache/solr/highlight/RegexFragmenter.java b/src/java/org/apache/solr/highlight/RegexFragmenter.java index a94cb98f584..9177e2b22fb 100644 --- a/src/java/org/apache/solr/highlight/RegexFragmenter.java +++ b/src/java/org/apache/solr/highlight/RegexFragmenter.java @@ -27,26 +27,59 @@ import org.apache.lucene.search.highlight.NullFragmenter; import org.apache.solr.common.params.DefaultSolrParams; import org.apache.solr.common.params.HighlightParams; import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; + +/** + * Fragmenter that tries to produce snippets that "look" like a regular + * expression. + * + * solrconfig.xml parameters: + * + * + * NOTE: the default for maxAnalyzedChars is much lower for this + * fragmenter. After this limit is exhausted, fragments are produced in the + * same way as GapFragmenter + */ public class RegexFragmenter extends HighlightingPluginBase implements SolrFragmenter { + protected String defaultPatternRaw; + protected Pattern defaultPattern; + + public void init(NamedList args) { + super.init(args); + defaultPatternRaw = LuceneRegexFragmenter.DEFAULT_PATTERN_RAW; + if( defaults != null ) { + defaultPatternRaw = defaults.get(HighlightParams.PATTERN, LuceneRegexFragmenter.DEFAULT_PATTERN_RAW); + } + defaultPattern = Pattern.compile(defaultPatternRaw); + } + public Fragmenter getFragmenter(String fieldName, SolrParams params ) { - numRequests++; + numRequests++; if( defaults != null ) { params = new DefaultSolrParams( params, defaults ); } - int fragsize = params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE ); int increment = params.getFieldInt( fieldName, HighlightParams.INCREMENT, LuceneRegexFragmenter.DEFAULT_INCREMENT_GAP ); float slop = params.getFieldFloat( fieldName, HighlightParams.SLOP, LuceneRegexFragmenter.DEFAULT_SLOP ); - int maxchars = params.getFieldInt( fieldName, HighlightParams.MAX_CHARS, LuceneRegexFragmenter.DEFAULT_MAX_ANALYZED_CHARS ); - + int maxchars = params.getFieldInt( fieldName, HighlightParams.MAX_RE_CHARS, LuceneRegexFragmenter.DEFAULT_MAX_ANALYZED_CHARS ); + String rawpat = params.getFieldParam( fieldName, HighlightParams.PATTERN, LuceneRegexFragmenter.DEFAULT_PATTERN_RAW ); + + Pattern p = rawpat == defaultPatternRaw ? defaultPattern : Pattern.compile(rawpat); + if( fragsize <= 0 ) { return new NullFragmenter(); } - return new LuceneRegexFragmenter( fragsize, increment, slop, maxchars ); + return new LuceneRegexFragmenter( fragsize, increment, slop, maxchars, p ); } @@ -56,7 +89,7 @@ public class RegexFragmenter extends HighlightingPluginBase implements SolrFragm @Override public String getDescription() { - return "GapFragmenter"; + return "RegexFragmenter (" + defaultPatternRaw + ")"; } @Override @@ -77,15 +110,12 @@ public class RegexFragmenter extends HighlightingPluginBase implements SolrFragm /** - * Kind of cool but kind of slow compared to regular fragmenting + * Fragmenter that tries to produce snippets that "look" like a regular + * expression. * - * Interestingly, the slowdown comes almost entirely from the pre-analysis, - * and could be completely avoided by pre-computation. - * - * it is also possible that a hand-crafted state machine (switch statement) - * could be significantly faster. Could even build in custom tricks... - * perhaps JavaCC should be used? TODO - * + * NOTE: the default for maxAnalyzedChars is much lower for this + * fragmenter. After this limit is exhausted, fragments are produced in the + * same way as GapFragmenter */ class LuceneRegexFragmenter implements Fragmenter { @@ -93,7 +123,7 @@ class LuceneRegexFragmenter implements Fragmenter public static final int DEFAULT_FRAGMENT_SIZE = 70; public static final int DEFAULT_INCREMENT_GAP = 50; public static final float DEFAULT_SLOP = 0.6f; - public static final int DEFAULT_MAX_ANALYZED_CHARS = 3000; + public static final int DEFAULT_MAX_ANALYZED_CHARS = 10000; // ** settings @@ -106,6 +136,9 @@ class LuceneRegexFragmenter implements Fragmenter protected float slop; // analysis limit (ensures we don't waste too much time on long fields) protected int maxAnalyzedChars; + // default desirable pattern for text fragments. + protected Pattern textRE; + // ** state protected int currentNumFrags; @@ -116,10 +149,11 @@ class LuceneRegexFragmenter implements Fragmenter // ** other // note: could dynamically change size of sentences extracted to match // target frag size - protected static final Pattern textRE = Pattern.compile("[-\\w ,\"']{20,200}"); + public static final String + DEFAULT_PATTERN_RAW = "[-\\w ,\\n\"']{20,200}"; + public static final Pattern + DEFAULT_PATTERN = Pattern.compile(DEFAULT_PATTERN_RAW); - // twice as fast, but not terribly good. - //protected static final Pattern textRE = Pattern.compile("\\w{20,200}"); public LuceneRegexFragmenter() { this(DEFAULT_FRAGMENT_SIZE, @@ -135,13 +169,24 @@ class LuceneRegexFragmenter implements Fragmenter } public LuceneRegexFragmenter(int targetFragChars, - int incrementGapThreshold, - float slop, - int maxAnalyzedChars ) { + int incrementGapThreshold, + float slop, + int maxAnalyzedChars ) { + this(targetFragChars, incrementGapThreshold, slop, maxAnalyzedChars, + DEFAULT_PATTERN); + + } + + public LuceneRegexFragmenter(int targetFragChars, + int incrementGapThreshold, + float slop, + int maxAnalyzedChars, + Pattern targetPattern) { this.targetFragChars = targetFragChars; this.incrementGapThreshold = incrementGapThreshold; this.slop = slop; this.maxAnalyzedChars = maxAnalyzedChars; + this.textRE = targetPattern; } @@ -171,7 +216,6 @@ class LuceneRegexFragmenter implements Fragmenter cur = end; //System.out.println("Matched " + match.group()); } - //System.out.println("matches: " + temphs.size() + "\n\n"); hotspots = new int[temphs.size()]; for(int i = 0; i < temphs.size(); i++) { hotspots[i] = temphs.get(i); diff --git a/src/java/org/apache/solr/search/SolrIndexSearcher.java b/src/java/org/apache/solr/search/SolrIndexSearcher.java index 20a062f6256..96ef05fbe67 100644 --- a/src/java/org/apache/solr/search/SolrIndexSearcher.java +++ b/src/java/org/apache/solr/search/SolrIndexSearcher.java @@ -388,8 +388,15 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean { */ public void readDocs(Document[] docs, DocList ids, Set fields) throws IOException { DocIterator iter = ids.iterator(); + int[] idlist = new int[ids.size()]; + Map pos = new HashMap(); for (int i=0; i args = new HashMap(); + args.put("fl", "id score"); + args.put("hl", "true"); + args.put("hl.snippets", "10"); + args.put("hl.fl", "t_text"); + args.put("hl.fragmenter", "regex"); + args.put("hl.regex.pattern", "[-\\w ,\"']{20,200}"); + args.put("hl.regex.slop", ".9"); + TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory( + "standard", 0, 200, args); + + String t = "This is an example of a sentence. Another example \"sentence\" with " + + "special characters\nand a line-break! Miscellaneous character like ^ are " + + "unknowns and end up being bad example s of sentences? I wonder how " + + "slashes/other punctuation fare in these examples?"; + assertU(adoc("t_text", t, "id", "1")); + assertU(commit()); + assertU(optimize()); + assertQ("regex fragmenter", + sumLRF.makeRequest("t_text:example"), + "//lst[@name='highlighting']/lst[@name='1']", + "//arr/str[.='This is an example of a sentence']", + "//arr/str[.='. Another example \"sentence\" with special characters\nand a line-break']", + "//arr/str[.=' ^ are unknowns and end up being bad example s of sentences']", + "//arr/str[.='/other punctuation fare in these examples?']" + ); + // try with some punctuation included + args.put("hl.regex.pattern", "[-\\w ,^/\\n\"']{20,200}"); + sumLRF = h.getRequestFactory("standard", 0, 200, args); + assertQ("regex fragmenter 2", + sumLRF.makeRequest("t_text:example"), + "//lst[@name='highlighting']/lst[@name='1']", + "//arr/str[.='This is an example of a sentence']", + "//arr/str[.='. Another example \"sentence\" with special characters\nand a line-break']", + "//arr/str[.='! Miscellaneous character like ^ are unknowns and end up being bad example s of sentences']", + "//arr/str[.='? I wonder how slashes/other punctuation fare in these examples?']" + ); + } public void testVariableFragsize() { assertU(adoc("tv_text", "a long days night this should be a piece of text which is is is is is is is is is is is is is is is is is is is is is is is is isis is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is sufficiently lengthly to produce multiple fragments which are not concatenated at all", "id", "1"));