mirror of https://github.com/apache/lucene.git
SOLR-102: RegexFragmenter
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@557872 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
773ec01848
commit
9c0d22fe9b
|
@ -110,6 +110,9 @@ New Features
|
||||||
19. SOLR-305: analysis.jsp can be given a fieldtype instead of a field
|
19. SOLR-305: analysis.jsp can be given a fieldtype instead of a field
|
||||||
name. (hossman)
|
name. (hossman)
|
||||||
|
|
||||||
|
20. SOLR-102: Added RegexFragmenter, which splits text for highlighting
|
||||||
|
based on a given pattern. (klaas)
|
||||||
|
|
||||||
Changes in runtime behavior
|
Changes in runtime behavior
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
|
@ -453,18 +453,24 @@
|
||||||
</lst>
|
</lst>
|
||||||
</requestHandler>
|
</requestHandler>
|
||||||
|
|
||||||
<!-- This should mostlikely be commented out in the "default" case -->
|
|
||||||
<highlighting>
|
<highlighting>
|
||||||
<!-- Configure the standard fragmenter -->
|
<!-- Configure the standard fragmenter -->
|
||||||
|
<!-- This could most likely be commented out in the "default" case -->
|
||||||
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
|
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
|
||||||
<lst name="defaults">
|
<lst name="defaults">
|
||||||
<int name="hl.fragsize">100</int>
|
<int name="hl.fragsize">100</int>
|
||||||
</lst>
|
</lst>
|
||||||
</fragmenter>
|
</fragmenter>
|
||||||
|
|
||||||
|
<!-- A regular-expression-based fragmenter (f.i., for sentence extraction) -->
|
||||||
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
|
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
|
||||||
<lst name="defaults">
|
<lst name="defaults">
|
||||||
<int name="hl.fragsize">70</int>
|
<!-- slightly smaller fragsizes work better because of slop -->
|
||||||
|
<int name="hl.fragsize">70</int>
|
||||||
|
<!-- allow 50% slop on fragment sizes -->
|
||||||
|
<float name="hl.regex.slop">0.5</float>
|
||||||
|
<!-- a basic sentence pattern -->
|
||||||
|
<str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
|
||||||
</lst>
|
</lst>
|
||||||
</fragmenter>
|
</fragmenter>
|
||||||
|
|
||||||
|
|
|
@ -40,4 +40,6 @@ public interface HighlightParams {
|
||||||
// Regex fragmenter
|
// Regex fragmenter
|
||||||
public static final String REGEX = "regex";
|
public static final String REGEX = "regex";
|
||||||
public static final String SLOP = HIGHLIGHT+"."+REGEX+".slop";
|
public static final String SLOP = HIGHLIGHT+"."+REGEX+".slop";
|
||||||
|
public static final String PATTERN = HIGHLIGHT+"."+REGEX+".pattern";
|
||||||
|
public static final String MAX_RE_CHARS = HIGHLIGHT+"."+REGEX+".maxAnalyzedChars";
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,26 +27,59 @@ import org.apache.lucene.search.highlight.NullFragmenter;
|
||||||
import org.apache.solr.common.params.DefaultSolrParams;
|
import org.apache.solr.common.params.DefaultSolrParams;
|
||||||
import org.apache.solr.common.params.HighlightParams;
|
import org.apache.solr.common.params.HighlightParams;
|
||||||
import org.apache.solr.common.params.SolrParams;
|
import org.apache.solr.common.params.SolrParams;
|
||||||
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fragmenter that tries to produce snippets that "look" like a regular
|
||||||
|
* expression.
|
||||||
|
*
|
||||||
|
* <code>solrconfig.xml</code> parameters:
|
||||||
|
* <ul>
|
||||||
|
* <li><code>hl.regex.pattern</code>: regular expression corresponding to "nice" fragments.</li>
|
||||||
|
* <li><code>hl.regex.slop</code>: how far the fragmenter can stray from the ideal fragment size.
|
||||||
|
A slop of 0.2 means that the fragmenter can go over or under by 20%.</li>
|
||||||
|
* <li><code>hl.regex.maxAnalyzedChars</code>: how many characters to apply the
|
||||||
|
regular expression to (independent from the global highlighter setting).</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* NOTE: the default for <code>maxAnalyzedChars</code> is much lower for this
|
||||||
|
* fragmenter. After this limit is exhausted, fragments are produced in the
|
||||||
|
* same way as <code>GapFragmenter</code>
|
||||||
|
*/
|
||||||
|
|
||||||
public class RegexFragmenter extends HighlightingPluginBase implements SolrFragmenter
|
public class RegexFragmenter extends HighlightingPluginBase implements SolrFragmenter
|
||||||
{
|
{
|
||||||
|
protected String defaultPatternRaw;
|
||||||
|
protected Pattern defaultPattern;
|
||||||
|
|
||||||
|
public void init(NamedList args) {
|
||||||
|
super.init(args);
|
||||||
|
defaultPatternRaw = LuceneRegexFragmenter.DEFAULT_PATTERN_RAW;
|
||||||
|
if( defaults != null ) {
|
||||||
|
defaultPatternRaw = defaults.get(HighlightParams.PATTERN, LuceneRegexFragmenter.DEFAULT_PATTERN_RAW);
|
||||||
|
}
|
||||||
|
defaultPattern = Pattern.compile(defaultPatternRaw);
|
||||||
|
}
|
||||||
|
|
||||||
public Fragmenter getFragmenter(String fieldName, SolrParams params )
|
public Fragmenter getFragmenter(String fieldName, SolrParams params )
|
||||||
{
|
{
|
||||||
numRequests++;
|
numRequests++;
|
||||||
if( defaults != null ) {
|
if( defaults != null ) {
|
||||||
params = new DefaultSolrParams( params, defaults );
|
params = new DefaultSolrParams( params, defaults );
|
||||||
}
|
}
|
||||||
|
|
||||||
int fragsize = params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE );
|
int fragsize = params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE );
|
||||||
int increment = params.getFieldInt( fieldName, HighlightParams.INCREMENT, LuceneRegexFragmenter.DEFAULT_INCREMENT_GAP );
|
int increment = params.getFieldInt( fieldName, HighlightParams.INCREMENT, LuceneRegexFragmenter.DEFAULT_INCREMENT_GAP );
|
||||||
float slop = params.getFieldFloat( fieldName, HighlightParams.SLOP, LuceneRegexFragmenter.DEFAULT_SLOP );
|
float slop = params.getFieldFloat( fieldName, HighlightParams.SLOP, LuceneRegexFragmenter.DEFAULT_SLOP );
|
||||||
int maxchars = params.getFieldInt( fieldName, HighlightParams.MAX_CHARS, LuceneRegexFragmenter.DEFAULT_MAX_ANALYZED_CHARS );
|
int maxchars = params.getFieldInt( fieldName, HighlightParams.MAX_RE_CHARS, LuceneRegexFragmenter.DEFAULT_MAX_ANALYZED_CHARS );
|
||||||
|
String rawpat = params.getFieldParam( fieldName, HighlightParams.PATTERN, LuceneRegexFragmenter.DEFAULT_PATTERN_RAW );
|
||||||
|
|
||||||
|
Pattern p = rawpat == defaultPatternRaw ? defaultPattern : Pattern.compile(rawpat);
|
||||||
|
|
||||||
if( fragsize <= 0 ) {
|
if( fragsize <= 0 ) {
|
||||||
return new NullFragmenter();
|
return new NullFragmenter();
|
||||||
}
|
}
|
||||||
|
|
||||||
return new LuceneRegexFragmenter( fragsize, increment, slop, maxchars );
|
return new LuceneRegexFragmenter( fragsize, increment, slop, maxchars, p );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -56,7 +89,7 @@ public class RegexFragmenter extends HighlightingPluginBase implements SolrFragm
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getDescription() {
|
public String getDescription() {
|
||||||
return "GapFragmenter";
|
return "RegexFragmenter (" + defaultPatternRaw + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -77,15 +110,12 @@ public class RegexFragmenter extends HighlightingPluginBase implements SolrFragm
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Kind of cool but kind of slow compared to regular fragmenting
|
* Fragmenter that tries to produce snippets that "look" like a regular
|
||||||
*
|
* expression.
|
||||||
* Interestingly, the slowdown comes almost entirely from the pre-analysis,
|
|
||||||
* and could be completely avoided by pre-computation.
|
|
||||||
*
|
|
||||||
* it is also possible that a hand-crafted state machine (switch statement)
|
|
||||||
* could be significantly faster. Could even build in custom tricks...
|
|
||||||
* perhaps JavaCC should be used? TODO
|
|
||||||
*
|
*
|
||||||
|
* NOTE: the default for <code>maxAnalyzedChars</code> is much lower for this
|
||||||
|
* fragmenter. After this limit is exhausted, fragments are produced in the
|
||||||
|
* same way as <code>GapFragmenter</code>
|
||||||
*/
|
*/
|
||||||
class LuceneRegexFragmenter implements Fragmenter
|
class LuceneRegexFragmenter implements Fragmenter
|
||||||
{
|
{
|
||||||
|
@ -93,7 +123,7 @@ class LuceneRegexFragmenter implements Fragmenter
|
||||||
public static final int DEFAULT_FRAGMENT_SIZE = 70;
|
public static final int DEFAULT_FRAGMENT_SIZE = 70;
|
||||||
public static final int DEFAULT_INCREMENT_GAP = 50;
|
public static final int DEFAULT_INCREMENT_GAP = 50;
|
||||||
public static final float DEFAULT_SLOP = 0.6f;
|
public static final float DEFAULT_SLOP = 0.6f;
|
||||||
public static final int DEFAULT_MAX_ANALYZED_CHARS = 3000;
|
public static final int DEFAULT_MAX_ANALYZED_CHARS = 10000;
|
||||||
|
|
||||||
// ** settings
|
// ** settings
|
||||||
|
|
||||||
|
@ -106,6 +136,9 @@ class LuceneRegexFragmenter implements Fragmenter
|
||||||
protected float slop;
|
protected float slop;
|
||||||
// analysis limit (ensures we don't waste too much time on long fields)
|
// analysis limit (ensures we don't waste too much time on long fields)
|
||||||
protected int maxAnalyzedChars;
|
protected int maxAnalyzedChars;
|
||||||
|
// default desirable pattern for text fragments.
|
||||||
|
protected Pattern textRE;
|
||||||
|
|
||||||
|
|
||||||
// ** state
|
// ** state
|
||||||
protected int currentNumFrags;
|
protected int currentNumFrags;
|
||||||
|
@ -116,10 +149,11 @@ class LuceneRegexFragmenter implements Fragmenter
|
||||||
// ** other
|
// ** other
|
||||||
// note: could dynamically change size of sentences extracted to match
|
// note: could dynamically change size of sentences extracted to match
|
||||||
// target frag size
|
// target frag size
|
||||||
protected static final Pattern textRE = Pattern.compile("[-\\w ,\"']{20,200}");
|
public static final String
|
||||||
|
DEFAULT_PATTERN_RAW = "[-\\w ,\\n\"']{20,200}";
|
||||||
|
public static final Pattern
|
||||||
|
DEFAULT_PATTERN = Pattern.compile(DEFAULT_PATTERN_RAW);
|
||||||
|
|
||||||
// twice as fast, but not terribly good.
|
|
||||||
//protected static final Pattern textRE = Pattern.compile("\\w{20,200}");
|
|
||||||
|
|
||||||
public LuceneRegexFragmenter() {
|
public LuceneRegexFragmenter() {
|
||||||
this(DEFAULT_FRAGMENT_SIZE,
|
this(DEFAULT_FRAGMENT_SIZE,
|
||||||
|
@ -135,13 +169,24 @@ class LuceneRegexFragmenter implements Fragmenter
|
||||||
}
|
}
|
||||||
|
|
||||||
public LuceneRegexFragmenter(int targetFragChars,
|
public LuceneRegexFragmenter(int targetFragChars,
|
||||||
int incrementGapThreshold,
|
int incrementGapThreshold,
|
||||||
float slop,
|
float slop,
|
||||||
int maxAnalyzedChars ) {
|
int maxAnalyzedChars ) {
|
||||||
|
this(targetFragChars, incrementGapThreshold, slop, maxAnalyzedChars,
|
||||||
|
DEFAULT_PATTERN);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public LuceneRegexFragmenter(int targetFragChars,
|
||||||
|
int incrementGapThreshold,
|
||||||
|
float slop,
|
||||||
|
int maxAnalyzedChars,
|
||||||
|
Pattern targetPattern) {
|
||||||
this.targetFragChars = targetFragChars;
|
this.targetFragChars = targetFragChars;
|
||||||
this.incrementGapThreshold = incrementGapThreshold;
|
this.incrementGapThreshold = incrementGapThreshold;
|
||||||
this.slop = slop;
|
this.slop = slop;
|
||||||
this.maxAnalyzedChars = maxAnalyzedChars;
|
this.maxAnalyzedChars = maxAnalyzedChars;
|
||||||
|
this.textRE = targetPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -171,7 +216,6 @@ class LuceneRegexFragmenter implements Fragmenter
|
||||||
cur = end;
|
cur = end;
|
||||||
//System.out.println("Matched " + match.group());
|
//System.out.println("Matched " + match.group());
|
||||||
}
|
}
|
||||||
//System.out.println("matches: " + temphs.size() + "\n\n");
|
|
||||||
hotspots = new int[temphs.size()];
|
hotspots = new int[temphs.size()];
|
||||||
for(int i = 0; i < temphs.size(); i++) {
|
for(int i = 0; i < temphs.size(); i++) {
|
||||||
hotspots[i] = temphs.get(i);
|
hotspots[i] = temphs.get(i);
|
||||||
|
|
|
@ -388,8 +388,15 @@ public class SolrIndexSearcher extends Searcher implements SolrInfoMBean {
|
||||||
*/
|
*/
|
||||||
public void readDocs(Document[] docs, DocList ids, Set<String> fields) throws IOException {
|
public void readDocs(Document[] docs, DocList ids, Set<String> fields) throws IOException {
|
||||||
DocIterator iter = ids.iterator();
|
DocIterator iter = ids.iterator();
|
||||||
|
int[] idlist = new int[ids.size()];
|
||||||
|
Map<Integer, Integer> pos = new HashMap<Integer, Integer>();
|
||||||
for (int i=0; i<docs.length; i++) {
|
for (int i=0; i<docs.length; i++) {
|
||||||
docs[i] = doc(iter.nextDoc(), fields);
|
idlist[i] = iter.nextDoc();
|
||||||
|
pos.put(idlist[i], i);
|
||||||
|
}
|
||||||
|
Arrays.sort(idlist);
|
||||||
|
for(int docid: idlist) {
|
||||||
|
docs[pos.get(docid)] = doc(docid, fields);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -313,6 +313,45 @@ public class HighlighterTest extends AbstractSolrTestCase {
|
||||||
"//lst[@name='1'][not(*)]"
|
"//lst[@name='1'][not(*)]"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
public void testRegexFragmenter() {
|
||||||
|
HashMap<String,String> args = new HashMap<String,String>();
|
||||||
|
args.put("fl", "id score");
|
||||||
|
args.put("hl", "true");
|
||||||
|
args.put("hl.snippets", "10");
|
||||||
|
args.put("hl.fl", "t_text");
|
||||||
|
args.put("hl.fragmenter", "regex");
|
||||||
|
args.put("hl.regex.pattern", "[-\\w ,\"']{20,200}");
|
||||||
|
args.put("hl.regex.slop", ".9");
|
||||||
|
TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
|
||||||
|
"standard", 0, 200, args);
|
||||||
|
|
||||||
|
String t = "This is an example of a sentence. Another example \"sentence\" with " +
|
||||||
|
"special characters\nand a line-break! Miscellaneous character like ^ are " +
|
||||||
|
"unknowns and end up being bad example s of sentences? I wonder how " +
|
||||||
|
"slashes/other punctuation fare in these examples?";
|
||||||
|
assertU(adoc("t_text", t, "id", "1"));
|
||||||
|
assertU(commit());
|
||||||
|
assertU(optimize());
|
||||||
|
assertQ("regex fragmenter",
|
||||||
|
sumLRF.makeRequest("t_text:example"),
|
||||||
|
"//lst[@name='highlighting']/lst[@name='1']",
|
||||||
|
"//arr/str[.='This is an <em>example</em> of a sentence']",
|
||||||
|
"//arr/str[.='. Another <em>example</em> \"sentence\" with special characters\nand a line-break']",
|
||||||
|
"//arr/str[.=' ^ are unknowns and end up being bad <em>example</em> s of sentences']",
|
||||||
|
"//arr/str[.='/other punctuation fare in these <em>examples</em>?']"
|
||||||
|
);
|
||||||
|
// try with some punctuation included
|
||||||
|
args.put("hl.regex.pattern", "[-\\w ,^/\\n\"']{20,200}");
|
||||||
|
sumLRF = h.getRequestFactory("standard", 0, 200, args);
|
||||||
|
assertQ("regex fragmenter 2",
|
||||||
|
sumLRF.makeRequest("t_text:example"),
|
||||||
|
"//lst[@name='highlighting']/lst[@name='1']",
|
||||||
|
"//arr/str[.='This is an <em>example</em> of a sentence']",
|
||||||
|
"//arr/str[.='. Another <em>example</em> \"sentence\" with special characters\nand a line-break']",
|
||||||
|
"//arr/str[.='! Miscellaneous character like ^ are unknowns and end up being bad <em>example</em> s of sentences']",
|
||||||
|
"//arr/str[.='? I wonder how slashes/other punctuation fare in these <em>examples</em>?']"
|
||||||
|
);
|
||||||
|
}
|
||||||
public void testVariableFragsize() {
|
public void testVariableFragsize() {
|
||||||
assertU(adoc("tv_text", "a long days night this should be a piece of text which is is is is is is is is is is is is is is is is is is is is is is is is isis is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is sufficiently lengthly to produce multiple fragments which are not concatenated at all",
|
assertU(adoc("tv_text", "a long days night this should be a piece of text which is is is is is is is is is is is is is is is is is is is is is is is is isis is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is is sufficiently lengthly to produce multiple fragments which are not concatenated at all",
|
||||||
"id", "1"));
|
"id", "1"));
|
||||||
|
|
Loading…
Reference in New Issue