LUCENE-3668: if there's only 1 output for a synonym rule then set start/endOffset to match the full span of the input tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1228650 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-01-07 16:26:15 +00:00
parent 40b3b75a6e
commit ed9f0fd5ef
4 changed files with 123 additions and 11 deletions

CHANGES.txt

@@ -136,6 +136,14 @@ Bug Fixes
 * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
   to correctly handle minShouldMatch behaviour of previous versions.
   (Shay Banon, Uwe Schindler)
 
+* LUCENE-3668: For a multi-token synonym mapping to a single token,
+  SynonymFilter will now set the start offset of the synonym token to
+  the start offset of the first matched token, and the end offset of
+  the synonym token to the end offset of the last matched token.
+  This way if the synonym token is used for highlighting, it will
+  cover all tokens it had matched.  (Koji Sekiguchi, Robert Muir,
+  Mike McCandless)
+
 Documentation

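Note (illustration, not part of this commit): the behavior described in the CHANGES entry can be seen with a small driver against the same SynonymMap/SynonymFilter APIs the tests below use. The class name SynonymOffsetDemo is hypothetical, and joining a multi-token rule input with SynonymMap.WORD_SEPARATOR is my assumption about the Builder API of this era; the expected offsets mirror the testMultiwordOffsets test added by this commit.

import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.CharsRef;

// Hypothetical demo class, not part of the commit:
public class SynonymOffsetDemo {
  public static void main(String[] args) throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    // Multi-token input mapped to a single-token output, keepOrig=true;
    // words in a multi-token entry are separated by WORD_SEPARATOR:
    String sep = Character.toString(SynonymMap.WORD_SEPARATOR);
    builder.add(new CharsRef("national" + sep + "hockey" + sep + "league"),
                new CharsRef("nhl"), true);
    TokenStream ts = new SynonymFilter(
        new MockTokenizer(new StringReader("national hockey league"),
                          MockTokenizer.WHITESPACE, false),
        builder.build(), true);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // With this fix, "nhl" reports 0..22 (the full matched span)
      // instead of 0..8 (just "national"); the originals keep their
      // own offsets: national 0..8, hockey 9..15, league 16..22.
      System.out.println(termAtt.toString() + " "
          + offsetAtt.startOffset() + ".." + offsetAtt.endOffset());
    }
    ts.end();
    ts.close();
  }
}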
SynonymFilter.java

@@ -153,12 +153,15 @@ public final class SynonymFilter extends TokenFilter {
   // Holds pending output synonyms for one future position:
   private static class PendingOutputs {
     CharsRef[] outputs;
+    int[] endOffsets;
     int upto;
     int count;
     int posIncr = 1;
+    int lastEndOffset;
 
     public PendingOutputs() {
       outputs = new CharsRef[1];
+      endOffsets = new int[1];
     }
 
     public void reset() {
@@ -168,6 +171,7 @@ public final class SynonymFilter extends TokenFilter {
 
     public CharsRef pullNext() {
       assert upto < count;
+      lastEndOffset = endOffsets[upto];
       final CharsRef result = outputs[upto++];
       posIncr = 0;
       if (upto == count) {
@@ -176,16 +180,29 @@ public final class SynonymFilter extends TokenFilter {
       return result;
     }
 
-    public void add(char[] output, int offset, int len) {
+    public int getLastEndOffset() {
+      return lastEndOffset;
+    }
+
+    public void add(char[] output, int offset, int len, int endOffset) {
       if (count == outputs.length) {
         final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         System.arraycopy(outputs, 0, next, 0, count);
         outputs = next;
       }
+      if (count == endOffsets.length) {
+        final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
+        System.arraycopy(endOffsets, 0, next, 0, count);
+        endOffsets = next;
+      }
       if (outputs[count] == null) {
         outputs[count] = new CharsRef();
       }
       outputs[count].copyChars(output, offset, len);
+      // endOffset can be -1, in which case we should simply
+      // use the endOffset of the input token, or X >= 0, in
+      // which case we use X as the endOffset for this output
+      endOffsets[count] = endOffset;
       count++;
     }
   };
@@ -281,6 +298,7 @@ public final class SynonymFilter extends TokenFilter {
     // Holds the longest match we've seen so far:
     BytesRef matchOutput = null;
     int matchInputLength = 0;
+    int matchEndOffset = -1;
 
     BytesRef pendingOutput = fst.outputs.getNoOutput();
     fst.getFirstArc(scratchArc);
@@ -297,6 +315,8 @@ public final class SynonymFilter extends TokenFilter {
       final int bufferLen;
       //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
 
+      int inputEndOffset = 0;
+
       if (curNextRead == nextWrite) {
         // We used up our lookahead buffer of input tokens
@@ -317,6 +337,7 @@ public final class SynonymFilter extends TokenFilter {
           final PendingInput input = futureInputs[nextWrite];
           input.startOffset = offsetAtt.startOffset();
           input.endOffset = offsetAtt.endOffset();
+          inputEndOffset = input.endOffset;
           //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
           if (nextRead != nextWrite) {
             capture();
@@ -335,6 +356,7 @@ public final class SynonymFilter extends TokenFilter {
         // Still in our lookahead
         buffer = futureInputs[curNextRead].term.chars;
         bufferLen = futureInputs[curNextRead].term.length;
+        inputEndOffset = futureInputs[curNextRead].endOffset;
         //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
       }
@@ -360,6 +382,7 @@ public final class SynonymFilter extends TokenFilter {
         if (scratchArc.isFinal()) {
           matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
           matchInputLength = tokenCount;
+          matchEndOffset = inputEndOffset;
           //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
         }
@@ -390,7 +413,7 @@ public final class SynonymFilter extends TokenFilter {
     if (matchOutput != null) {
       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
       inputSkipCount = matchInputLength;
-      addOutput(matchOutput, matchInputLength);
+      addOutput(matchOutput, matchInputLength, matchEndOffset);
     } else if (nextRead != nextWrite) {
       // Even though we had no match here, we set to 1
       // because we need to skip current input token before
@@ -404,7 +427,7 @@ public final class SynonymFilter extends TokenFilter {
   }
 
   // Interleaves all output tokens onto the futureOutputs:
-  private void addOutput(BytesRef bytes, int matchInputLength) {
+  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();
@@ -425,7 +448,21 @@ public final class SynonymFilter extends TokenFilter {
           // Caller is not allowed to have empty string in
           // the output:
           assert outputLen > 0: "output contains empty string: " + scratchChars;
-          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
+          final int endOffset;
+          if (chIDX == chEnd && lastStart == scratchChars.offset) {
+            // This rule had a single output token, so, we set
+            // this output's endOffset to the current
+            // endOffset (ie, endOffset of the last input
+            // token it matched):
+            endOffset = matchEndOffset;
+          } else {
+            // This rule has more than one output token; we
+            // can't pick any particular endOffset for this
+            // case, so, we inherit the endOffset for the
+            // input token which this output overlaps:
+            endOffset = -1;
+          }
+          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
           //System.out.println("    " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
@@ -507,7 +544,11 @@ public final class SynonymFilter extends TokenFilter {
           clearAttributes();
           termAtt.copyBuffer(output.chars, output.offset, output.length);
           typeAtt.setType(TYPE_SYNONYM);
-          offsetAtt.setOffset(input.startOffset, input.endOffset);
+          int endOffset = outputs.getLastEndOffset();
+          if (endOffset == -1) {
+            endOffset = input.endOffset;
+          }
+          offsetAtt.setOffset(input.startOffset, endOffset);
           posIncrAtt.setPositionIncrement(posIncr);
           if (outputs.count == 0) {
             // Done with the buffered input and all outputs at

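Note (illustrative restatement, not part of this commit): the whole SynonymFilter change rides on one sentinel convention. Each pending output records either a concrete end offset (a rule with a single output token gets the end offset of the last input token it matched) or -1 (a rule with multiple output tokens inherits the end offset of whichever input token each output overlaps). The helper below, with a hypothetical name, states that rule in isolation; it is exactly the fallback added in the last hunk above, where getLastEndOffset() == -1 falls back to input.endOffset.

// Hypothetical helper restating the sentinel rule used by
// PendingOutputs.add()/getLastEndOffset() above:
final class EndOffsetRule {
  static int resolve(int storedEndOffset, int inputTokenEndOffset) {
    // -1 means "no explicit end offset was recorded for this output";
    // fall back to the end offset of the overlapped input token:
    return storedEndOffset == -1 ? inputTokenEndOffset : storedEndOffset;
  }
}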
TestSynonymMapFilter.java

@@ -59,7 +59,12 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     }
   }
 
-  // todo: we should probably refactor this guy to use/take analyzer,
+  // For the output string: separate positions with a space,
+  // and separate multiple tokens at each position with a
+  // /.  If a token should have end offset != the input
+  // token's end offset then add :X to it:
+
+  // TODO: we should probably refactor this guy to use/take analyzer,
   // the tests are a little messy
   private void verify(String input, String output) throws Exception {
     if (VERBOSE) {
@@ -73,7 +78,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
 
     while(tokensOut.incrementToken()) {
       if (VERBOSE) {
-        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
       }
 
       assertTrue(expectedUpto < expected.length);
@@ -85,16 +90,26 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         if (atPos > 0) {
           assertTrue(tokensOut.incrementToken());
           if (VERBOSE) {
-            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
           }
         }
-        assertEquals(termAtt, expectedAtPos[atPos]);
+        final int colonIndex = expectedAtPos[atPos].indexOf(':');
+        final String expectedToken;
+        final int expectedEndOffset;
+        if (colonIndex != -1) {
+          expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
+          expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+        } else {
+          expectedToken = expectedAtPos[atPos];
+          expectedEndOffset = endOffset;
+        }
+        assertEquals(expectedToken, termAtt.toString());
         assertEquals(atPos == 0 ? 1 : 0,
                      posIncrAtt.getPositionIncrement());
         // start/end offset of all tokens at same pos should
         // be the same:
         assertEquals(startOffset, offsetAtt.startOffset());
-        assertEquals(endOffset, offsetAtt.endOffset());
+        assertEquals(expectedEndOffset, offsetAtt.endOffset());
       }
     }
 
     tokensOut.end();
@@ -112,6 +127,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     add("b c", "dog collar", true);
     add("c d", "dog harness holder extras", true);
     add("m c e", "dog barks loudly", false);
+    add("i j k", "feep", true);
 
     add("e f", "foo bar", false);
     add("e f", "baz bee", false);
@@ -148,6 +164,9 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     // two outputs for same input
     verify("e f", "foo/baz bar/bee");
 
+    // verify multi-word / single-output offsets:
+    verify("g i j k g", "g i/feep:7 j k g");
+
     // mixed keepOrig true/false:
     verify("a m c e x", "a/foo dog barks loudly x");
     verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
@@ -241,6 +260,10 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
           } else {
             outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
           }
+          if (synOutputs.length == 1) {
+            // Add endOffset
+            outputs[matchIDX] = outputs[matchIDX] + ":" + ((inputIDX*2) + syn.in.length());
+          }
         }
       }
     }
@@ -663,4 +686,24 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
       new int[] { 1, 0, 1, 1, 1, 0, 1 });
   }
+
+  public void testMultiwordOffsets() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("national hockey league", "nhl", keepOrig);
+    final SynonymMap map = b.build();
+
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    assertAnalyzesTo(a, "national hockey league",
+                     new String[] { "national", "nhl", "hockey", "league" },
+                     new int[] { 0, 0, 9, 16 },
+                     new int[] { 8, 22, 15, 22 },
+                     new int[] { 1, 0, 1, 1 });
+  }
 }

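Note (explanatory, not part of this commit): in verify()'s expectation mini-syntax, the new assertion verify("g i j k g", "g i/feep:7 j k g") reads as five positions; position 1 holds both the original token i and the synonym feep, and the :7 suffix pins feep's end offset to 7, the end offset of k, the last input token matched by the rule "i j k => feep". A standalone sketch of the same ":endOffset" parsing follows; the class and method names are mine, not from the commit.

// Hypothetical standalone version of the expectation parsing the
// commit adds to verify() above ("term" or "term:endOffset"):
public class ExpectedTokenSyntax {
  public static void main(String[] args) {
    for (String expected : new String[] { "feep:7", "i" }) {
      final int colonIndex = expected.indexOf(':');
      final String token = colonIndex == -1 ? expected : expected.substring(0, colonIndex);
      // No ":X" suffix means the token is expected to keep the end
      // offset of the input token at its position:
      final String endOffset = colonIndex == -1 ? "(input token's)" : expected.substring(1 + colonIndex);
      // Prints: feep -> end offset 7, then: i -> end offset (input token's)
      System.out.println(token + " -> end offset " + endOffset);
    }
  }
}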
TestSynonymFilterFactory.java

@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -65,6 +66,25 @@ public class TestSynonymFilterFactory extends BaseTokenTestCase {
       new int[] { 1, 0, 0, 0 });
   }
 
+  /** test multiword offsets with the old impl
+   * @deprecated Remove this test in Lucene 5.0 */
+  @Deprecated
+  public void testMultiwordOffsetsOld() throws Exception {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader("national hockey league, nhl"));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false));
+    // WTF?
+    assertTokenStreamContents(ts,
+      new String[] { "national", "nhl", "hockey", "league" },
+      new int[] { 0, 0, 0, 0 },
+      new int[] { 22, 22, 22, 22 },
+      new int[] { 1, 0, 1, 1 });
+  }
+
   /** if the synonyms are completely empty, test that we still analyze correctly */
   public void testEmptySynonyms() throws Exception {
     SynonymFilterFactory factory = new SynonymFilterFactory();
@@ -85,7 +105,7 @@ public class TestSynonymFilterFactory extends BaseTokenTestCase {
     }
 
     public List<String> getLines(String resource) throws IOException {
-      return null;
+      return Arrays.asList(text.split("\n"));
     }
 
     public Object newInstance(String cname, String... subpackages) {