LUCENE-3942: SynonymFilter sets the position length (PositionLengthAttribute) on its output tokens when possible

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311100 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-04-08 20:55:32 +00:00
parent b5103519f0
commit c63f95911a
2 changed files with 45 additions and 6 deletions

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
@ -99,7 +100,8 @@ import org.apache.lucene.util.fst.FST;
// practice, but it's possible on some set of synonyms it
// will. We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches.
// finds all matches. This really amounts to adding a .*
// closure to the FST and then determinizing it.
public final class SynonymFilter extends TokenFilter {
@ -116,6 +118,7 @@ public final class SynonymFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -156,14 +159,17 @@ public final class SynonymFilter extends TokenFilter {
private static class PendingOutputs {
CharsRef[] outputs;
int[] endOffsets;
int[] posLengths;
int upto;
int count;
int posIncr = 1;
int lastEndOffset;
int lastPosLength;
// Start each of the three parallel arrays (output text, end offset,
// position length) at length 1; add() grows them as outputs are appended.
public PendingOutputs() {
outputs = new CharsRef[1];
endOffsets = new int[1];
posLengths = new int[1];
}
public void reset() {
@ -174,6 +180,7 @@ public final class SynonymFilter extends TokenFilter {
public CharsRef pullNext() {
assert upto < count;
lastEndOffset = endOffsets[upto];
lastPosLength = posLengths[upto];
final CharsRef result = outputs[upto++];
posIncr = 0;
if (upto == count) {
@ -186,7 +193,11 @@ public final class SynonymFilter extends TokenFilter {
return lastEndOffset;
}
public void add(char[] output, int offset, int len, int endOffset) {
// Returns the position length recorded for the output most recently
// handed out by pullNext().
public int getLastPosLength() {
return lastPosLength;
}
public void add(char[] output, int offset, int len, int endOffset, int posLength) {
if (count == outputs.length) {
final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(outputs, 0, next, 0, count);
@ -197,6 +208,11 @@ public final class SynonymFilter extends TokenFilter {
System.arraycopy(endOffsets, 0, next, 0, count);
endOffsets = next;
}
if (count == posLengths.length) {
final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
System.arraycopy(posLengths, 0, next, 0, count);
posLengths = next;
}
if (outputs[count] == null) {
outputs[count] = new CharsRef();
}
@ -205,6 +221,7 @@ public final class SynonymFilter extends TokenFilter {
// use the endOffset of the input token, or X >= 0, in
// which case we use X as the endOffset for this output
endOffsets[count] = endOffset;
posLengths[count] = posLength;
count++;
}
};
@ -456,20 +473,23 @@ public final class SynonymFilter extends TokenFilter {
// the output:
assert outputLen > 0: "output contains empty string: " + scratchChars;
final int endOffset;
final int posLen;
if (chIDX == chEnd && lastStart == scratchChars.offset) {
// This rule had a single output token, so, we set
// this output's endOffset to the current
// endOffset (ie, endOffset of the last input
// token it matched):
endOffset = matchEndOffset;
posLen = matchInputLength;
} else {
// This rule has more than one output token; we
// can't pick any particular endOffset for this
// case, so, we inherit the endOffset for the
// input token which this output overlaps:
endOffset = -1;
posLen = 1;
}
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset, posLen);
//System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
lastStart = 1+chIDX;
//System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
@ -557,6 +577,7 @@ public final class SynonymFilter extends TokenFilter {
}
offsetAtt.setOffset(input.startOffset, endOffset);
posIncrAtt.setPositionIncrement(posIncr);
posLenAtt.setPositionLength(outputs.getLastPosLength());
if (outputs.count == 0) {
// Done with the buffered input and all outputs at
// this position

View File

@ -47,6 +47,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
private SynonymFilter tokensOut;
private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private PositionLengthAttribute posLenAtt;
private OffsetAttribute offsetAtt;
private void add(String input, String output, boolean keepOrig) {
@ -98,14 +99,23 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
}
}
final int colonIndex = expectedAtPos[atPos].indexOf(':');
final int underbarIndex = expectedAtPos[atPos].indexOf('_');
final String expectedToken;
final int expectedEndOffset;
final int expectedPosLen;
if (colonIndex != -1) {
expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
if (underbarIndex != -1) {
expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex, underbarIndex));
expectedPosLen = Integer.parseInt(expectedAtPos[atPos].substring(1+underbarIndex));
} else {
expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
expectedPosLen = 1;
}
} else {
expectedToken = expectedAtPos[atPos];
expectedEndOffset = endOffset;
expectedPosLen = 1;
}
assertEquals(expectedToken, termAtt.toString());
assertEquals(atPos == 0 ? 1 : 0,
@ -114,6 +124,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
// be the same:
assertEquals(startOffset, offsetAtt.startOffset());
assertEquals(expectedEndOffset, offsetAtt.endOffset());
assertEquals(expectedPosLen, posLenAtt.getPositionLength());
}
}
tokensOut.end();
@ -153,6 +164,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
true);
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
verify("a b c", "a/bar b/fee c");
@ -169,7 +181,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
verify("e f", "foo/baz bar/bee");
// verify multi-word / single-output offsets:
verify("g i j k g", "g i/feep:7 j k g");
verify("g i j k g", "g i/feep:7_3 j k g");
// mixed keepOrig true/false:
verify("a m c e x", "a/foo dog barks loudly x");
@ -266,14 +278,17 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
}
final int endOffset;
if (matchIDX < numInputs) {
final int posLen;
if (synOutputs.length == 1) {
// Add full endOffset
endOffset = (inputIDX*2) + syn.in.length();
posLen = (1+syn.in.length())/2;
} else {
// Add endOffset matching input token's
endOffset = (matchIDX*2) + 1;
posLen = 1;
}
outputs[matchIDX] = outputs[matchIDX] + ":" + endOffset;
outputs[matchIDX] = outputs[matchIDX] + ":" + endOffset + "_" + posLen;
}
}
}
@ -365,6 +380,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
true);
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
if (dedup) {
@ -578,6 +594,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
true);
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
if (keepOrig) {
@ -730,6 +747,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
termAtt = tokensOut.addAttribute(CharTermAttribute.class);
posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
// Make sure endOffset inherits from previous input token:
verify("a", "a b:1");