mirror of https://github.com/apache/lucene.git
LUCENE-3668: if there's only 1 output for a synonym rule then set start/endOffset to match the full span of the input tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1228650 13f79535-47bb-0310-9956-ffa450edef68
parent 40b3b75a6e
commit ed9f0fd5ef
CHANGES.txt
@@ -136,6 +136,14 @@ Bug Fixes
 * LUCENE-3609: Fix regression in BooleanFilter, introduced in Lucene 3.5,
   to correctly handle minShouldMatch behaviour of previous versions.
   (Shay Banon, Uwe Schindler)
 
+* LUCENE-3668: For a multi-token synonym mapping to a single token,
+  SynonymFilter will now set the start offset of the synonym token to
+  the start offset of the first matched token, and the end offset of
+  the synonym token to the end offset of the last matched token.
+  This way if the synonym token is used for highlighting, it will
+  cover all tokens it had matched. (Koji Sekiguchi, Robert Muir,
+  Mike McCandless)
+
 Documentation
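Concretely (taking keepOrig=true and the rule from the new testMultiwordOffsets case added further down), only the injected synonym token's offsets change; a worked sketch of the before/after, with offsets computed from the whitespace-tokenized input:

    // Input "national hockey league": national=0..8, hockey=9..15, league=16..22.
    // Rule: "national hockey league" => "nhl", keepOrig=true.
    //
    //   token     posIncr  startOffset  endOffset
    //   national     1          0           8
    //   nhl          0          0          22   // was 0..8 before this fix: it
    //   hockey       1          9          15   // inherited the offsets of the
    //   league       1         16          22   // input token it stacked on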
SynonymFilter.java
@@ -153,12 +153,15 @@ public final class SynonymFilter extends TokenFilter {
   // Holds pending output synonyms for one future position:
   private static class PendingOutputs {
     CharsRef[] outputs;
+    int[] endOffsets;
     int upto;
     int count;
     int posIncr = 1;
+    int lastEndOffset;
 
     public PendingOutputs() {
       outputs = new CharsRef[1];
+      endOffsets = new int[1];
     }
 
     public void reset() {
@@ -168,6 +171,7 @@ public final class SynonymFilter extends TokenFilter {
 
     public CharsRef pullNext() {
       assert upto < count;
+      lastEndOffset = endOffsets[upto];
       final CharsRef result = outputs[upto++];
       posIncr = 0;
       if (upto == count) {
@@ -176,16 +180,29 @@ public final class SynonymFilter extends TokenFilter {
       return result;
     }
 
-    public void add(char[] output, int offset, int len) {
+    public int getLastEndOffset() {
+      return lastEndOffset;
+    }
+
+    public void add(char[] output, int offset, int len, int endOffset) {
       if (count == outputs.length) {
         final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         System.arraycopy(outputs, 0, next, 0, count);
         outputs = next;
       }
+      if (count == endOffsets.length) {
+        final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
+        System.arraycopy(endOffsets, 0, next, 0, count);
+        endOffsets = next;
+      }
       if (outputs[count] == null) {
         outputs[count] = new CharsRef();
       }
       outputs[count].copyChars(output, offset, len);
+      // endOffset can be -1, in which case we should simply
+      // use the endOffset of the input token, or X >= 0, in
+      // which case we use X as the endOffset for this output
+      endOffsets[count] = endOffset;
       count++;
     }
   };
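The -1 sentinel keeps PendingOutputs compact: instead of a nullable per-output box, a negative stored endOffset means "defer to the overlapped input token". A minimal sketch of that contract in isolation (resolveEndOffset is a hypothetical helper, not part of this patch):

    // Hypothetical helper illustrating the sentinel used above: -1 means
    // "inherit the input token's endOffset"; any value >= 0 is used verbatim.
    static int resolveEndOffset(int storedEndOffset, int inputEndOffset) {
      return storedEndOffset == -1 ? inputEndOffset : storedEndOffset;
    }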
@@ -281,6 +298,7 @@ public final class SynonymFilter extends TokenFilter {
     // Holds the longest match we've seen so far:
     BytesRef matchOutput = null;
     int matchInputLength = 0;
+    int matchEndOffset = -1;
 
     BytesRef pendingOutput = fst.outputs.getNoOutput();
     fst.getFirstArc(scratchArc);
@@ -297,6 +315,8 @@ public final class SynonymFilter extends TokenFilter {
       final int bufferLen;
       //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
 
+      int inputEndOffset = 0;
+
       if (curNextRead == nextWrite) {
 
         // We used up our lookahead buffer of input tokens
@@ -317,6 +337,7 @@ public final class SynonymFilter extends TokenFilter {
           final PendingInput input = futureInputs[nextWrite];
           input.startOffset = offsetAtt.startOffset();
           input.endOffset = offsetAtt.endOffset();
+          inputEndOffset = input.endOffset;
           //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
           if (nextRead != nextWrite) {
             capture();
@@ -335,6 +356,7 @@ public final class SynonymFilter extends TokenFilter {
         // Still in our lookahead
         buffer = futureInputs[curNextRead].term.chars;
         bufferLen = futureInputs[curNextRead].term.length;
+        inputEndOffset = futureInputs[curNextRead].endOffset;
         //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
       }
@@ -360,6 +382,7 @@ public final class SynonymFilter extends TokenFilter {
         if (scratchArc.isFinal()) {
           matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
           matchInputLength = tokenCount;
+          matchEndOffset = inputEndOffset;
          //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
         }
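Because every final FST state overwrites matchOutput/matchInputLength/matchEndOffset, the last snapshot taken is the one for the longest rule matched, so matchEndOffset ends up holding the end offset of that rule's last input token. A standalone sketch of this overwrite-on-match pattern (the Set-based matcher and all names here are illustrative, not the filter's actual FST walk):

    import java.util.Set;

    // Illustrative only: "longest match wins" by overwriting the match state
    // every time a progressively longer candidate phrase is a known rule.
    final class LongestMatchSketch {
      // Returns {matchedTokenCount, matchEndOffset}, or {0, -1} if no match.
      static int[] longestMatch(String[] tokens, Set<String> rules) {
        StringBuilder phrase = new StringBuilder();
        int matchLength = 0, matchEndOffset = -1, end = 0;
        for (int i = 0; i < tokens.length; i++) {
          if (i > 0) {
            phrase.append(' ');
            end++; // account for the separating space
          }
          phrase.append(tokens[i]);
          end += tokens[i].length();
          if (rules.contains(phrase.toString())) {
            matchLength = i + 1;   // overwrite: a longer match supersedes
            matchEndOffset = end;  // end offset of the last matched token
          }
        }
        return new int[] { matchLength, matchEndOffset };
      }
    }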
@@ -390,7 +413,7 @@ public final class SynonymFilter extends TokenFilter {
     if (matchOutput != null) {
       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
       inputSkipCount = matchInputLength;
-      addOutput(matchOutput, matchInputLength);
+      addOutput(matchOutput, matchInputLength, matchEndOffset);
     } else if (nextRead != nextWrite) {
       // Even though we had no match here, we set to 1
       // because we need to skip current input token before
@@ -404,7 +427,7 @@ public final class SynonymFilter extends TokenFilter {
   }
 
   // Interleaves all output tokens onto the futureOutputs:
-  private void addOutput(BytesRef bytes, int matchInputLength) {
+  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 
     final int code = bytesReader.readVInt();
@@ -425,7 +448,21 @@ public final class SynonymFilter extends TokenFilter {
           // Caller is not allowed to have empty string in
           // the output:
           assert outputLen > 0: "output contains empty string: " + scratchChars;
-          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
+          final int endOffset;
+          if (chIDX == chEnd && lastStart == scratchChars.offset) {
+            // This rule had a single output token, so, we set
+            // this output's endOffset to the current
+            // endOffset (ie, endOffset of the last input
+            // token it matched):
+            endOffset = matchEndOffset;
+          } else {
+            // This rule has more than one output token; we
+            // can't pick any particular endOffset for this
+            // case, so, we inherit the endOffset for the
+            // input token which this output overlaps:
+            endOffset = -1;
+          }
+          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
           //System.out.println("    " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
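The `chIDX == chEnd && lastStart == scratchChars.offset` test works because SynonymMap stores a rule's output words as one char sequence joined by SynonymMap.WORD_SEPARATOR: if the scan has reached the end of that sequence and the current word still starts at the very beginning, no separator was ever crossed, i.e. the rule has exactly one output token. A hedged sketch of the same check in isolation (assuming the separator is the '\u0000' char SynonymMap uses):

    // Sketch: a rule's output is single-token iff its joined output chars
    // contain no word separator (assumed here to be '\u0000', per SynonymMap).
    static boolean isSingleOutputToken(char[] chars, int offset, int length) {
      for (int i = offset; i < offset + length; i++) {
        if (chars[i] == '\u0000') {
          return false; // separator found: more than one output token
        }
      }
      return true;
    }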
@@ -507,7 +544,11 @@ public final class SynonymFilter extends TokenFilter {
           clearAttributes();
           termAtt.copyBuffer(output.chars, output.offset, output.length);
           typeAtt.setType(TYPE_SYNONYM);
-          offsetAtt.setOffset(input.startOffset, input.endOffset);
+          int endOffset = outputs.getLastEndOffset();
+          if (endOffset == -1) {
+            endOffset = input.endOffset;
+          }
+          offsetAtt.setOffset(input.startOffset, endOffset);
           posIncrAtt.setPositionIncrement(posIncr);
           if (outputs.count == 0) {
             // Done with the buffered input and all outputs at
TestSynonymMapFilter.java
@@ -59,7 +59,12 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     }
   }
 
-  // todo: we should probably refactor this guy to use/take analyzer,
+  // For the output string: separate positions with a space,
+  // and separate multiple tokens at each position with a
+  // /.  If a token should have end offset != the input
+  // token's end offset then add :X to it:
+
+  // TODO: we should probably refactor this guy to use/take analyzer,
   // the tests are a little messy
   private void verify(String input, String output) throws Exception {
     if (VERBOSE) {
@@ -73,7 +78,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     while(tokensOut.incrementToken()) {
 
       if (VERBOSE) {
-        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+        System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
       }
 
       assertTrue(expectedUpto < expected.length);
@@ -85,16 +90,26 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         if (atPos > 0) {
           assertTrue(tokensOut.incrementToken());
           if (VERBOSE) {
-            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement());
+            System.out.println("  incr token=" + termAtt.toString() + " posIncr=" + posIncrAtt.getPositionIncrement() + " startOff=" + offsetAtt.startOffset() + " endOff=" + offsetAtt.endOffset());
           }
         }
-        assertEquals(termAtt, expectedAtPos[atPos]);
+        final int colonIndex = expectedAtPos[atPos].indexOf(':');
+        final String expectedToken;
+        final int expectedEndOffset;
+        if (colonIndex != -1) {
+          expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
+          expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+        } else {
+          expectedToken = expectedAtPos[atPos];
+          expectedEndOffset = endOffset;
+        }
+        assertEquals(expectedToken, termAtt.toString());
         assertEquals(atPos == 0 ? 1 : 0,
                      posIncrAtt.getPositionIncrement());
         // start/end offset of all tokens at same pos should
         // be the same:
         assertEquals(startOffset, offsetAtt.startOffset());
-        assertEquals(endOffset, offsetAtt.endOffset());
+        assertEquals(expectedEndOffset, offsetAtt.endOffset());
       }
     }
     tokensOut.end();
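With this change, verify's expected-output mini-language gains an optional `:X` suffix per token: positions are space-separated, stacked tokens at one position are /-separated, and `token:X` asserts that token's end offset is X instead of the input token's. Worked out for the exact case this commit adds below:

    // "g i j k g" tokenizes to g(0..1) i(2..3) j(4..5) k(6..7) g(8..9).
    // "i j k" => "feep" (keepOrig=true) stacks "feep" on "i", but "feep"
    // must end where "k" ends, hence the ":7" suffix:
    verify("g i j k g", "g i/feep:7 j k g");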
@@ -112,6 +127,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     add("b c", "dog collar", true);
     add("c d", "dog harness holder extras", true);
     add("m c e", "dog barks loudly", false);
+    add("i j k", "feep", true);
 
     add("e f", "foo bar", false);
     add("e f", "baz bee", false);
@@ -148,6 +164,9 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     // two outputs for same input
     verify("e f", "foo/baz bar/bee");
 
+    // verify multi-word / single-output offsets:
+    verify("g i j k g", "g i/feep:7 j k g");
+
     // mixed keepOrig true/false:
     verify("a m c e x", "a/foo dog barks loudly x");
     verify("c d m c e x", "c/dog d/harness holder/dog extras/barks loudly x");
@@ -241,6 +260,10 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         } else {
           outputs[matchIDX] = outputs[matchIDX] + "/" + synOutputs[synUpto++];
         }
+        if (synOutputs.length == 1) {
+          // Add endOffset
+          outputs[matchIDX] = outputs[matchIDX] + ":" + ((inputIDX*2) + syn.in.length());
+        }
       }
     }
   }
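The random test's expected end offset falls out of its token geometry: every input token is a single character, so token inputIDX starts at character 2*inputIDX (one char plus one space each), and a rule whose input string syn.in starts there ends at 2*inputIDX + syn.in.length(). Checking that arithmetic against the hand-written case above (the values are from this patch; the helper wrapper is illustrative):

    // For input "g i j k g" and rule input "i j k" matched at token index 1:
    // start char = 1*2 = 2, end offset = 2 + "i j k".length() = 2 + 5 = 7,
    // which is exactly the ":7" expected for "feep" above.
    static int expectedEndOffset(int inputIDX, String synIn) {
      return (inputIDX * 2) + synIn.length();
    }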
@@ -663,4 +686,24 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
                      new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
                      new int[] { 1, 0, 1, 1, 1, 0, 1 });
   }
+
+  public void testMultiwordOffsets() throws Exception {
+    b = new SynonymMap.Builder(true);
+    final boolean keepOrig = true;
+    add("national hockey league", "nhl", keepOrig);
+    final SynonymMap map = b.build();
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    assertAnalyzesTo(a, "national hockey league",
+      new String[] { "national", "nhl", "hockey", "league" },
+      new int[] { 0, 0, 9, 16 },
+      new int[] { 8, 22, 15, 22 },
+      new int[] { 1, 0, 1, 1 });
+  }
 }
TestSynonymFilterFactory.java
@@ -21,6 +21,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -65,6 +66,25 @@ public class TestSynonymFilterFactory extends BaseTokenTestCase {
         new int[] { 1, 0, 0, 0 });
   }
 
+  /** test multiword offsets with the old impl
+   * @deprecated Remove this test in Lucene 5.0 */
+  @Deprecated
+  public void testMultiwordOffsetsOld() throws Exception {
+    SynonymFilterFactory factory = new SynonymFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("luceneMatchVersion", Version.LUCENE_33.toString());
+    args.put("synonyms", "synonyms.txt");
+    factory.init(args);
+    factory.inform(new StringMockSolrResourceLoader("national hockey league, nhl"));
+    TokenStream ts = factory.create(new MockTokenizer(new StringReader("national hockey league"), MockTokenizer.WHITESPACE, false));
+    // WTF?
+    assertTokenStreamContents(ts,
+      new String[] { "national", "nhl", "hockey", "league" },
+      new int[] { 0, 0, 0, 0 },
+      new int[] { 22, 22, 22, 22 },
+      new int[] { 1, 0, 1, 1 });
+  }
+
   /** if the synonyms are completely empty, test that we still analyze correctly */
   public void testEmptySynonyms() throws Exception {
     SynonymFilterFactory factory = new SynonymFilterFactory();
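The deprecated test (and its in-line "WTF?") pins down the legacy behavior that luceneMatchVersion <= 3.3 users keep: the old implementation gave every token of the match, original and injected alike, the offsets of the whole matched span. Contrasting the two asserted expectations side by side:

    // Old impl (Version.LUCENE_33, this test): every token spans the match.
    //   startOffsets: { 0, 0, 0, 0 }     endOffsets: { 22, 22, 22, 22 }
    // New impl (this commit, testMultiwordOffsets above): per-token offsets,
    // with only the single-token synonym "nhl" widened to the full match.
    //   startOffsets: { 0, 0, 9, 16 }    endOffsets: { 8, 22, 15, 22 }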
@@ -85,7 +105,7 @@ public class TestSynonymFilterFactory extends BaseTokenTestCase {
     }
 
     public List<String> getLines(String resource) throws IOException {
-      return null;
+      return Arrays.asList(text.split("\n"));
     }
 
     public Object newInstance(String cname, String... subpackages) {
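The one-line resource-loader fix matters because the factory loads its synonyms resource through the loader's getLines; returning null meant the mock silently supplied no rules, so `new StringMockSolrResourceLoader("national hockey league, nhl")` fed the parser nothing. A self-contained demo of just the split the fix performs (the demo class is hypothetical; the string is the test's):

    import java.util.Arrays;
    import java.util.List;

    public class GetLinesDemo {
      public static void main(String[] args) {
        String text = "national hockey league, nhl";
        // What the fixed getLines(...) now returns for the mock loader:
        List<String> lines = Arrays.asList(text.split("\n"));
        System.out.println(lines); // [national hockey league, nhl] -- one rule line
      }
    }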