SOLR-4271: add support for PostingsHighlighter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1429413 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-01-05 22:51:30 +00:00
parent 8e49b54fde
commit 64e18dc0f6
15 changed files with 461 additions and 16 deletions

View File

@ -162,7 +162,7 @@ New Features
extractWikipedia.alg was changed to use this task, so now it creates two
files. (Doron Cohen)
* LUCENE-4290: Added PostingsHighlighter to the sandbox module. It uses
* LUCENE-4290: Added PostingsHighlighter to the highlighter module. It uses
offsets from the postings lists to highlight documents. (Robert Muir)
* LUCENE-4628: Added CommonTermsQuery that executes high-frequency terms

View File

@ -1,4 +1,4 @@
package org.apache.lucene.sandbox.postingshighlight;
package org.apache.lucene.search.postingshighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -1,4 +1,4 @@
package org.apache.lucene.sandbox.postingshighlight;
package org.apache.lucene.search.postingshighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -25,6 +25,32 @@ package org.apache.lucene.sandbox.postingshighlight;
* @lucene.experimental
*/
public class PassageFormatter {
private final String preTag;
private final String postTag;
private final String ellipsis;
/**
* Creates a new PassageFormatter with the default tags:
* {@code <b>} as the pre-tag, {@code </b>} as the post-tag
* and {@code "... "} as the ellipsis.
*/
public PassageFormatter() {
this("<b>", "</b>", "... ");
}
/**
 * Builds a PassageFormatter that uses caller-supplied markup.
 * @param preTag text placed before each highlighted term.
 * @param postTag text placed after each highlighted term.
 * @param ellipsis text used to join two passages that are not connected.
 * @throws NullPointerException if any of the three arguments is null.
 */
public PassageFormatter(String preTag, String postTag, String ellipsis) {
  // Reject nulls before touching any field, so a failed construction
  // never leaves a partially initialized formatter behind.
  if (preTag == null) {
    throw new NullPointerException();
  }
  if (postTag == null) {
    throw new NullPointerException();
  }
  if (ellipsis == null) {
    throw new NullPointerException();
  }
  this.ellipsis = ellipsis;
  this.postTag = postTag;
  this.preTag = preTag;
}
/**
* Formats the top <code>passages</code> from <code>content</code>
* into a human-readable text snippet.
@ -40,7 +66,7 @@ public class PassageFormatter {
for (Passage passage : passages) {
// don't add ellipsis if its the first one, or if its connected.
if (passage.startOffset > pos && pos > 0) {
sb.append("... ");
sb.append(ellipsis);
}
pos = passage.startOffset;
for (int i = 0; i < passage.numMatches; i++) {
@ -51,9 +77,9 @@ public class PassageFormatter {
sb.append(content.substring(pos, start));
}
if (end > pos) {
sb.append("<b>");
sb.append(preTag);
sb.append(content.substring(Math.max(pos, start), end));
sb.append("</b>");
sb.append(postTag);
pos = end;
}
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.sandbox.postingshighlight;
package org.apache.lucene.search.postingshighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -1,4 +1,4 @@
package org.apache.lucene.sandbox.postingshighlight;
package org.apache.lucene.search.postingshighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -62,8 +62,7 @@ import org.apache.lucene.util.UnicodeUtil;
* into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
* Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
* <p>
* <b>WARNING</b>: The code is very new and may still have some exciting bugs! This is why
* it's located under Lucene's sandbox module.
* <b>WARNING</b>: The code is very new and may still have some exciting bugs!
* <p>
* Example usage:
* <pre class="prettyprint">
@ -256,7 +255,7 @@ public final class PostingsHighlighter {
LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, maxLength);
String contents[][] = new String[fields.length][docids.length];
for (int i = 0; i < docids.length; i++) {
reader.document(docids[i], visitor);
searcher.doc(docids[i], visitor);
for (int j = 0; j < fields.length; j++) {
contents[j][i] = visitor.getValue(j).toString();
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.sandbox.postingshighlight;
package org.apache.lucene.search.postingshighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -1,4 +1,4 @@
package org.apache.lucene.sandbox.postingshighlight;
package org.apache.lucene.search.postingshighlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -189,6 +189,8 @@ New Features
rolling averages; median, 75th, 95th, 99th, 99.9th percentile request times
(Alan Woodward, Shawn Heisey, Adrien Grand, Uwe Schindler)
* SOLR-4271: Add support for PostingsHighlighter. (Robert Muir)
Optimizations
----------------------

View File

@ -24,6 +24,7 @@ import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.highlight.PostingsSolrHighlighter;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.highlight.DefaultSolrHighlighter;
import org.apache.solr.request.SolrQueryRequest;
@ -128,7 +129,7 @@ public class HighlightComponent extends SearchComponent implements PluginInfoIni
}
if(highlightQuery != null) {
boolean rewrite = !(Boolean.valueOf(params.get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true")) &&
boolean rewrite = (highlighter instanceof PostingsSolrHighlighter == false) && !(Boolean.valueOf(params.get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true")) &&
Boolean.valueOf(params.get(HighlightParams.HIGHLIGHT_MULTI_TERM, "true")));
highlightQuery = rewrite ? highlightQuery.rewrite(req.getSearcher().getIndexReader()) : highlightQuery;
}

View File

@ -0,0 +1,189 @@
package org.apache.solr.highlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.PassageFormatter;
import org.apache.lucene.search.postingshighlight.PassageScorer;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.PluginInfoInitialized;
/**
 * {@link SolrHighlighter} implementation that delegates to Lucene's {@link PostingsHighlighter}.
 * <p>
 * Example configuration:
 * <pre class="prettyprint">
 *   &lt;searchComponent class="solr.HighlightComponent" name="highlight"&gt;
 *     &lt;highlighting class="org.apache.solr.highlight.PostingsSolrHighlighter"
 *                   preTag="&amp;lt;em&amp;gt;"
 *                   postTag="&amp;lt;/em&amp;gt;"
 *                   ellipsis="... "
 *                   maxLength="10000"/&gt;
 *   &lt;/searchComponent&gt;
 * </pre>
 * <p>
 * Every attribute is optional; when omitted, <code>preTag</code> falls back to
 * <code>&lt;em&gt;</code>, <code>postTag</code> to <code>&lt;/em&gt;</code>,
 * <code>ellipsis</code> to <code>"... "</code> and <code>maxLength</code> to
 * {@link PostingsHighlighter#DEFAULT_MAX_LENGTH}.
 * <p>
 * Notes:
 * <ul>
 *   <li>fields to highlight must be configured with storeOffsetsWithPositions="true"
 *   <li>hl.fl specifies the field list.
 *   <li>hl.snippets specifies how many underlying sentence fragments form the resulting snippet.
 * </ul>
 *
 * @lucene.experimental
 */
public class PostingsSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {

  /** The wrapped Lucene highlighter; assigned in {@link #init(PluginInfo)}. */
  protected PostingsHighlighter highlighter;

  /** Intentionally empty: configuration is read from the plugin attributes in {@link #init(PluginInfo)}. */
  @Override
  public void initalize(SolrConfig config) {}

  /**
   * Builds the underlying {@link PostingsHighlighter} from the plugin attributes.
   * @param info plugin definition whose attributes may carry preTag, postTag, ellipsis and maxLength
   */
  @Override
  public void init(PluginInfo info) {
    Map<String,String> attrs = info.attributes;

    // formatter parameters: preTag/postTag/ellipsis
    PassageFormatter formatter = new PassageFormatter(
        attributeOrDefault(attrs, "preTag", "<em>"),
        attributeOrDefault(attrs, "postTag", "</em>"),
        attributeOrDefault(attrs, "ellipsis", "... "));

    // maximum content size to process
    int maxLength = attrs.containsKey("maxLength")
        ? Integer.parseInt(attrs.get("maxLength"))
        : PostingsHighlighter.DEFAULT_MAX_LENGTH;

    highlighter = new PostingsHighlighter(
        maxLength,
        BreakIterator.getSentenceInstance(Locale.ROOT),
        new PassageScorer(),
        formatter);
  }

  /** Returns the attribute stored under {@code key}, or {@code fallback} when the lookup yields null. */
  private static String attributeOrDefault(Map<String,String> attrs, String key, String fallback) {
    String configured = attrs.get(key);
    return configured == null ? fallback : configured;
  }

  /**
   * Highlights the requested fields for every document in {@code docs}.
   * @param docs the documents of the current result page
   * @param query the query whose terms are highlighted
   * @param req the current request; supplies the parameters and the searcher
   * @param defaultFields fields passed on to {@code getHighlightFields}
   * @return the encoded snippets, or null when highlighting is not enabled for this request
   */
  @Override
  public NamedList<Object> doHighlighting(DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
    SolrParams params = req.getParams();

    // if highlighting isn't enabled there is nothing to do
    // (and callers arguably should not have asked in the first place)
    if (!isHighlightingEnabled(params)) {
      return null;
    }

    SolrIndexSearcher searcher = req.getSearcher();
    TopDocs hits = toTopDocs(docs);

    // fetch the unique keys
    String[] uniqueKeys = getUniqueKeys(searcher, hits);

    // query-time parameters
    String[] fields = getHighlightFields(query, req, defaultFields);
    int snippetCount = params.getInt(HighlightParams.SNIPPETS, 1);

    Map<String,String[]> byField = highlighter.highlightFields(fields, query, searcher, hits, snippetCount);
    return encodeSnippets(uniqueKeys, fields, byField);
  }

  /**
   * Packs the highlighter output into the namedlist layout used for responses.
   * @param keys the document unique keys
   * @param fieldNames field names to highlight in the order
   * @param snippets map from field name to snippet array for the docs
   * @return encoded namedlist of summaries
   */
  protected NamedList<Object> encodeSnippets(String[] keys, String[] fieldNames, Map<String,String[]> snippets) {
    NamedList<Object> encoded = new SimpleOrderedMap<Object>();
    for (int doc = 0; doc < keys.length; doc++) {
      NamedList<Object> perDoc = new SimpleOrderedMap<Object>();
      for (String fieldName : fieldNames) {
        String text = snippets.get(fieldName)[doc];
        // wrap in an array to match the format of existing highlighters,
        // even though it holds at most one element.
        String[] boxed = (text == null) ? new String[0] : new String[] { text };
        perDoc.add(fieldName, boxed);
      }
      encoded.add(keys[doc], perDoc);
    }
    return encoded;
  }

  /** Converts solr's DocList to a lucene TopDocs; scores are filled with NaN. */
  protected TopDocs toTopDocs(DocList docs) {
    ScoreDoc[] hits = new ScoreDoc[docs.size()];
    DocIterator it = docs.iterator();
    int filled = 0;
    while (filled < hits.length) {
      // the iterator must yield exactly docs.size() entries
      if (!it.hasNext()) {
        throw new AssertionError();
      }
      hits[filled] = new ScoreDoc(it.nextDoc(), Float.NaN);
      filled++;
    }
    if (it.hasNext()) {
      throw new AssertionError();
    }
    return new TopDocs(docs.matches(), hits, Float.NaN);
  }

  /**
   * Retrieves the unique keys for the topdocs to key the results.
   * When the schema has no unique key field, every entry of the returned array is null.
   */
  protected String[] getUniqueKeys(SolrIndexSearcher searcher, TopDocs topDocs) throws IOException {
    IndexSchema schema = searcher.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    if (keyField == null) {
      return new String[topDocs.scoreDocs.length];
    }

    // only load the key field from each stored document
    Set<String> keyOnly = Collections.singleton(keyField.getName());
    String[] ids = new String[topDocs.scoreDocs.length];
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
      StoredDocument stored = searcher.doc(topDocs.scoreDocs[i].doc, keyOnly);
      ids[i] = schema.printableUniqueKey(stored);
    }
    return ids;
  }
}

View File

@ -109,6 +109,8 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
private final SolrCache[] cacheList;
private static final SolrCache[] noCaches = new SolrCache[0];
private final FieldInfos fieldInfos;
// TODO: do we need this separate set of field names? we can just use the fieldinfos?
private final Collection<String> fieldNames;
private Collection<String> storedHighlightFieldNames;
private DirectoryFactory directoryFactory;
@ -199,7 +201,8 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
optimizer = null;
fieldNames = new HashSet<String>();
for(FieldInfo fieldInfo : atomicReader.getFieldInfos()) {
fieldInfos = atomicReader.getFieldInfos();
for(FieldInfo fieldInfo : fieldInfos) {
fieldNames.add(fieldInfo.name);
}
@ -509,13 +512,56 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
}
/** Visit a document's fields using a {@link StoredFieldVisitor}
* This method does not currently use the Solr document cache.
* This method does not currently add to the Solr document cache.
*
* @see IndexReader#document(int, StoredFieldVisitor) */
@Override
public void doc(int n, StoredFieldVisitor visitor) throws IOException {
  // consult the document cache first, when one is configured
  StoredDocument hit = (documentCache == null) ? null : documentCache.get(n);
  if (hit == null) {
    // no cache or cache miss: read from the index; the result is not cached here
    getIndexReader().document(n, visitor);
  } else {
    visitFromCached(hit, visitor);
  }
}
/** Replays the fields of a document-cache hit through a stored field visitor */
private void visitFromCached(StoredDocument document, StoredFieldVisitor visitor) throws IOException {
  for (StorableField field : document) {
    FieldInfo info = fieldInfos.fieldInfo(field.name());
    switch (visitor.needsField(info)) {
      case STOP:
        // the visitor wants no further fields at all
        return;
      case NO:
        break;
      case YES:
        replayCachedField(field, info, visitor);
        break;
    }
  }
}

/** Hands one cached field to the visitor callback that matches its value type */
private void replayCachedField(StorableField field, FieldInfo info, StoredFieldVisitor visitor) throws IOException {
  BytesRef bytes = field.binaryValue();
  if (bytes != null) {
    // pass a private copy of exactly the referenced slice
    byte[] slice = new byte[bytes.length];
    System.arraycopy(bytes.bytes, bytes.offset, slice, 0, slice.length);
    visitor.binaryField(info, slice);
    return;
  }

  Number number = field.numericValue();
  if (number == null) {
    // neither binary nor numeric: treat as a string field
    visitor.stringField(info, field.stringValue());
    return;
  }

  if (number instanceof Integer) {
    visitor.intField(info, number.intValue());
  } else if (number instanceof Long) {
    visitor.longField(info, number.longValue());
  } else if (number instanceof Float) {
    visitor.floatField(info, number.floatValue());
  } else if (number instanceof Double) {
    visitor.doubleField(info, number.doubleValue());
  } else {
    throw new AssertionError();
  }
}
/**
* Retrieve the {@link Document} instance corresponding to the document id.

View File

@ -0,0 +1,49 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for PostingsHighlighter -->
<schema name="postingshighlight" version="1.0">
<types>
<!-- numeric type used for the unique key field -->
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field: no offsets! -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
</fieldtype>
<!-- text field with offsets -->
<fieldtype name="text_offsets" class="solr.TextField" storeOffsetsWithPositions="true">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
</fieldtype>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<!-- 'text' and 'text3' use the offsets-enabled type; 'text2' uses the plain 'text' type, which has no offsets -->
<field name="text" type="text_offsets" indexed="true" stored="true"/>
<field name="text2" type="text" indexed="true" stored="true"/>
<field name="text3" type="text_offsets" indexed="true" stored="true"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -0,0 +1,30 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- a basic solrconfig for postings highlighter -->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<dataDir>${solr.data.dir:}</dataDir>
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
<!-- plug PostingsSolrHighlighter into the highlight component; no attributes are set on it here -->
<searchComponent class="solr.HighlightComponent" name="highlight">
<highlighting class="org.apache.solr.highlight.PostingsSolrHighlighter"/>
</searchComponent>
</config>

View File

@ -0,0 +1,103 @@
package org.apache.solr.highlight;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.schema.IndexSchema;
import org.junit.BeforeClass;
/**
 * Simple tests for PostingsSolrHighlighter.
 * <p>
 * The index holds two documents (ids 101 and 102), each with three text fields:
 * <code>text</code> and <code>text3</code> are indexed with offsets, <code>text2</code> is not.
 */
// NOTE(review): presumably the suppressed mock codecs cannot be used with offsets in postings - confirm
@SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"})
public class TestPostingsSolrHighlighter extends SolrTestCaseJ4 {

  /** Starts the core, sanity-checks the test configuration and indexes the two test documents. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-postingshighlight.xml", "schema-postingshighlight.xml");

    // test our config is sane, just to be sure:

    // postingshighlighter should be used
    SolrHighlighter highlighter = HighlightComponent.getHighlighter(h.getCore());
    assertTrue("wrong highlighter: " + highlighter.getClass(), highlighter instanceof PostingsSolrHighlighter);

    // 'text' and 'text3' should have offsets, 'text2' should not
    IndexSchema schema = h.getCore().getSchema();
    assertTrue(schema.getField("text").storeOffsetsWithPositions());
    assertTrue(schema.getField("text3").storeOffsetsWithPositions());
    assertFalse(schema.getField("text2").storeOffsetsWithPositions());

    assertU(adoc("text", "document one", "text2", "document one", "text3", "crappy document", "id", "101"));
    assertU(adoc("text", "second document", "text2", "second document", "text3", "crappier document", "id", "102"));
    assertU(commit());
  }

  /** Both documents match and each gets one highlighted snippet for the default field. */
  public void testSimple() {
    assertQ("simplest test",
        req("q", "text:document", "sort", "id asc", "hl", "true"),
        "count(//lst[@name='highlighting']/*)=2",
        "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em> one'",
        "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'");
  }

  /** Only the document on the requested page (start=1, rows=1) is highlighted. */
  public void testPagination() {
    assertQ("pagination test",
        req("q", "text:document", "sort", "id asc", "hl", "true", "rows", "1", "start", "1"),
        "count(//lst[@name='highlighting']/*)=1",
        "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'");
  }

  /** A document that matches the query but has nothing to highlight yields an empty snippet array. */
  public void testEmptySnippet() {
    assertQ("null snippet test",
        req("q", "text:one OR *:*", "sort", "id asc", "hl", "true"),
        "count(//lst[@name='highlighting']/*)=2",
        "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='document <em>one</em>'",
        "count(//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/*)=0");
  }

  /** hl.fl selects a field other than the default one. */
  public void testDifferentField() {
    assertQ("highlighting text3",
        req("q", "text3:document", "sort", "id asc", "hl", "true", "hl.fl", "text3"),
        "count(//lst[@name='highlighting']/*)=2",
        "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text3']/str='crappy <em>document</em>'",
        "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier <em>document</em>'");
  }

  /** hl.fl may list several fields; each one is highlighted for each document. */
  public void testTwoFields() {
    assertQ("highlighting text and text3",
        req("q", "text:document text3:document", "sort", "id asc", "hl", "true", "hl.fl", "text,text3"),
        "count(//lst[@name='highlighting']/*)=2",
        "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text']/str='<em>document</em> one'",
        "//lst[@name='highlighting']/lst[@name='101']/arr[@name='text3']/str='crappy <em>document</em>'",
        "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text']/str='second <em>document</em>'",
        "//lst[@name='highlighting']/lst[@name='102']/arr[@name='text3']/str='crappier <em>document</em>'");
  }

  /** Highlighting a field that was indexed without offsets must fail. */
  public void testMisconfiguredField() {
    ignoreException("was indexed without offsets");
    try {
      assertQ("should fail, has no offsets",
          req("q", "text2:document", "sort", "id asc", "hl", "true", "hl.fl", "text2"));
      fail("highlighting a field indexed without offsets should have failed");
    } catch (Exception expected) {
      // expected
    } finally {
      // fail() raises an AssertionError, which the catch above does not handle;
      // restore the exception ignores here so they never leak into other tests.
      resetExceptionIgnores();
    }
  }
}