mirror of https://github.com/apache/lucene.git
LUCENE-3942: syn filter sets posLen when possible
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311100 13f79535-47bb-0310-9956-ffa450edef68
parent b5103519f0
commit c63f95911a
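The change below makes SynonymFilter record a position length (posLen) on each output token, so multi-token synonym matches keep their span information in the token graph. For orientation only, here is a minimal sketch of how a consumer might read the attribute; it is not part of this commit, and the `DumpTokenGraph` class and `dump()` helper are hypothetical names.

```java
// Illustrative sketch, not part of this commit: reading the position length
// that SynonymFilter now sets, next to the increment and offsets.
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class DumpTokenGraph {
  // Hypothetical helper: walks any TokenStream and prints one line per token.
  public static void dump(TokenStream ts) throws IOException {
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
      pos += posIncAtt.getPositionIncrement();
      System.out.println(termAtt + " pos=" + pos
          + " posLen=" + posLenAtt.getPositionLength()
          + " offsets=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
    }
    ts.end();
    ts.close();
  }
}
```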
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.ArrayUtil;
@@ -99,7 +100,8 @@ import org.apache.lucene.util.fst.FST;
 // practice, but it's possible on some set of synonyms it
 // will. We'd have to modify Aho/Corasick to enforce our
 // conflict resolving (eg greedy matching) because that algo
-// finds all matches.
+// finds all matches. This really amounts to adding a .*
+// closure to the FST and then determinizing it.

 public final class SynonymFilter extends TokenFilter {

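The comment above contrasts the filter's greedy conflict resolution with Aho/Corasick, which reports every overlapping match. As a rough illustration only, here is a toy greedy pass over plain token arrays, not the FST-based implementation in this file; the `GreedySynonymSketch` class and its `rewrite()` method are hypothetical.

```java
// Toy sketch of greedy longest-match resolution over plain token arrays.
// Rule inputs and the input text are token lists; outputs are single strings.
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class GreedySynonymSketch {
  public static List<String> rewrite(String[] tokens, Map<List<String>, String> rules) {
    final List<String> out = new ArrayList<String>();
    int i = 0;
    while (i < tokens.length) {
      int bestLen = 0;
      String bestOutput = null;
      for (Map.Entry<List<String>, String> rule : rules.entrySet()) {
        final List<String> in = rule.getKey();
        if (in.size() > bestLen && i + in.size() <= tokens.length) {
          boolean matches = true;
          for (int j = 0; j < in.size(); j++) {
            if (!in.get(j).equals(tokens[i + j])) {
              matches = false;
              break;
            }
          }
          if (matches) {
            bestLen = in.size();
            bestOutput = rule.getValue();
          }
        }
      }
      if (bestOutput != null) {
        out.add(bestOutput);    // greedy: consume the whole matched phrase
        i += bestLen;
      } else {
        out.add(tokens[i++]);   // no rule starts here; keep the original token
      }
    }
    return out;
  }
}
```

For example, with rules {"a b" -> "x"} and {"b c" -> "y"} and input "a b c", this sketch emits "x c" and never reports the overlapping "b c" match, which is the conflict-resolution behavior the comment describes.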
@@ -116,6 +118,7 @@ public final class SynonymFilter extends TokenFilter {

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

@@ -156,14 +159,17 @@ public final class SynonymFilter extends TokenFilter {
   private static class PendingOutputs {
     CharsRef[] outputs;
     int[] endOffsets;
+    int[] posLengths;
     int upto;
     int count;
     int posIncr = 1;
     int lastEndOffset;
+    int lastPosLength;

     public PendingOutputs() {
       outputs = new CharsRef[1];
       endOffsets = new int[1];
+      posLengths = new int[1];
     }

     public void reset() {
@@ -174,6 +180,7 @@ public final class SynonymFilter extends TokenFilter {
     public CharsRef pullNext() {
       assert upto < count;
       lastEndOffset = endOffsets[upto];
+      lastPosLength = posLengths[upto];
       final CharsRef result = outputs[upto++];
       posIncr = 0;
       if (upto == count) {
@@ -186,7 +193,11 @@ public final class SynonymFilter extends TokenFilter {
       return lastEndOffset;
     }

-    public void add(char[] output, int offset, int len, int endOffset) {
+    public int getLastPosLength() {
+      return lastPosLength;
+    }
+
+    public void add(char[] output, int offset, int len, int endOffset, int posLength) {
       if (count == outputs.length) {
         final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
         System.arraycopy(outputs, 0, next, 0, count);
@@ -197,6 +208,11 @@ public final class SynonymFilter extends TokenFilter {
         System.arraycopy(endOffsets, 0, next, 0, count);
         endOffsets = next;
       }
+      if (count == posLengths.length) {
+        final int[] next = new int[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_INT)];
+        System.arraycopy(posLengths, 0, next, 0, count);
+        posLengths = next;
+      }
       if (outputs[count] == null) {
         outputs[count] = new CharsRef();
       }
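The new posLengths array above reuses the same grow-on-demand idiom as outputs and endOffsets: ArrayUtil.oversize over-allocates so repeated appends stay amortized O(1), and RamUsageEstimator.NUM_BYTES_INT tells it the element width. Isolated as a sketch (the `GrowableIntArray` class here is hypothetical, not code in this file):

```java
// Illustrative sketch of the grow-on-demand idiom used by PendingOutputs.
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

class GrowableIntArray {
  int[] values = new int[1];
  int count;

  void append(int value) {
    if (count == values.length) {
      // Over-allocate, then copy the existing elements across.
      final int[] next = new int[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_INT)];
      System.arraycopy(values, 0, next, 0, count);
      values = next;
    }
    values[count++] = value;
  }
}
```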
@@ -205,6 +221,7 @@ public final class SynonymFilter extends TokenFilter {
       // use the endOffset of the input token, or X >= 0, in
       // which case we use X as the endOffset for this output
       endOffsets[count] = endOffset;
+      posLengths[count] = posLength;
       count++;
     }
   };
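Per the comment above, a recorded endOffset of -1 means "no explicit end offset for this output; fall back to the end offset of the input token it overlaps". A minimal sketch of that convention (`resolveEndOffset` is a hypothetical helper, not code in this file):

```java
// Hypothetical helper illustrating the endOffset convention described above:
// -1 falls back to the end offset of the overlapped input token.
static int resolveEndOffset(int recordedEndOffset, int inputTokenEndOffset) {
  return recordedEndOffset == -1 ? inputTokenEndOffset : recordedEndOffset;
}
```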
@@ -456,20 +473,23 @@ public final class SynonymFilter extends TokenFilter {
           // the output:
           assert outputLen > 0: "output contains empty string: " + scratchChars;
           final int endOffset;
+          final int posLen;
           if (chIDX == chEnd && lastStart == scratchChars.offset) {
             // This rule had a single output token, so, we set
             // this output's endOffset to the current
             // endOffset (ie, endOffset of the last input
             // token it matched):
             endOffset = matchEndOffset;
+            posLen = matchInputLength;
           } else {
             // This rule has more than one output token; we
             // can't pick any particular endOffset for this
             // case, so, we inherit the endOffset for the
             // input token which this output overlaps:
             endOffset = -1;
+            posLen = 1;
           }
-          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset);
+          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen, endOffset, posLen);
           //System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
           lastStart = 1+chIDX;
           //System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
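This hunk is the heart of the change: when a rule produces a single output token, that token spans all matched input positions (matchInputLength); when a rule produces several output tokens, each one is conservatively given a length of one position. A condensed sketch of that decision (`positionLengthFor` is a hypothetical helper that approximates the branch above):

```java
// Hypothetical helper condensing the posLen decision above.
static int positionLengthFor(int numOutputTokens, int matchInputLength) {
  return numOutputTokens == 1 ? matchInputLength : 1;
}
// e.g. rule "domain name service -> dns":  positionLengthFor(1, 3) == 3
// e.g. rule "dns -> domain name service":  each output token gets positionLengthFor(3, 1) == 1
```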
@@ -557,6 +577,7 @@ public final class SynonymFilter extends TokenFilter {
         }
         offsetAtt.setOffset(input.startOffset, endOffset);
         posIncrAtt.setPositionIncrement(posIncr);
+        posLenAtt.setPositionLength(outputs.getLastPosLength());
         if (outputs.count == 0) {
           // Done with the buffered input and all outputs at
           // this position
@@ -47,6 +47,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
   private SynonymFilter tokensOut;
   private CharTermAttribute termAtt;
   private PositionIncrementAttribute posIncrAtt;
+  private PositionLengthAttribute posLenAtt;
   private OffsetAttribute offsetAtt;

   private void add(String input, String output, boolean keepOrig) {
@@ -98,14 +99,23 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         }
       }
       final int colonIndex = expectedAtPos[atPos].indexOf(':');
+      final int underbarIndex = expectedAtPos[atPos].indexOf('_');
       final String expectedToken;
       final int expectedEndOffset;
+      final int expectedPosLen;
       if (colonIndex != -1) {
         expectedToken = expectedAtPos[atPos].substring(0, colonIndex);
-        expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+        if (underbarIndex != -1) {
+          expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex, underbarIndex));
+          expectedPosLen = Integer.parseInt(expectedAtPos[atPos].substring(1+underbarIndex));
+        } else {
+          expectedEndOffset = Integer.parseInt(expectedAtPos[atPos].substring(1+colonIndex));
+          expectedPosLen = 1;
+        }
       } else {
         expectedToken = expectedAtPos[atPos];
         expectedEndOffset = endOffset;
+        expectedPosLen = 1;
       }
       assertEquals(expectedToken, termAtt.toString());
       assertEquals(atPos == 0 ? 1 : 0,
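The test's expected-token syntax is extended above from "term" and "term:endOffset" to "term:endOffset_posLen". A standalone sketch of the same parsing (`ExpectedToken` is a hypothetical class, not part of the test):

```java
// Illustrative parser for the expected-token syntax used by verify():
// "feep:7_3" means term "feep", end offset 7, position length 3; both
// suffixes are optional and default to the input token's end offset and 1.
class ExpectedToken {
  final String term;
  final int endOffset;   // -1 here stands for "use the input token's end offset"
  final int posLen;

  ExpectedToken(String spec) {
    final int colonIndex = spec.indexOf(':');
    final int underbarIndex = spec.indexOf('_');
    if (colonIndex == -1) {
      term = spec;
      endOffset = -1;
      posLen = 1;
    } else {
      term = spec.substring(0, colonIndex);
      if (underbarIndex == -1) {
        endOffset = Integer.parseInt(spec.substring(1 + colonIndex));
        posLen = 1;
      } else {
        endOffset = Integer.parseInt(spec.substring(1 + colonIndex, underbarIndex));
        posLen = Integer.parseInt(spec.substring(1 + underbarIndex));
      }
    }
  }
}
```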
@@ -114,6 +124,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
       // be the same:
       assertEquals(startOffset, offsetAtt.startOffset());
       assertEquals(expectedEndOffset, offsetAtt.endOffset());
+      assertEquals(expectedPosLen, posLenAtt.getPositionLength());
       }
     }
     tokensOut.end();
@@ -153,6 +164,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
                                     true);
     termAtt = tokensOut.addAttribute(CharTermAttribute.class);
     posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
     offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

     verify("a b c", "a/bar b/fee c");
@@ -169,7 +181,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     verify("e f", "foo/baz bar/bee");

     // verify multi-word / single-output offsets:
-    verify("g i j k g", "g i/feep:7 j k g");
+    verify("g i j k g", "g i/feep:7_3 j k g");

     // mixed keepOrig true/false:
     verify("a m c e x", "a/foo dog barks loudly x");
@@ -266,14 +278,17 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
           }
           final int endOffset;
           if (matchIDX < numInputs) {
+            final int posLen;
             if (synOutputs.length == 1) {
               // Add full endOffset
               endOffset = (inputIDX*2) + syn.in.length();
+              posLen = (1+syn.in.length())/2;
             } else {
               // Add endOffset matching input token's
               endOffset = (matchIDX*2) + 1;
+              posLen = 1;
             }
-            outputs[matchIDX] = outputs[matchIDX] + ":" + endOffset;
+            outputs[matchIDX] = outputs[matchIDX] + ":" + endOffset + "_" + posLen;
           }
         }
       }
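The "(1+syn.in.length())/2" above relies on how this random test builds its rules: assuming single-character tokens separated by single spaces (consistent with the "(inputIDX*2)" offset arithmetic in the same hunk), an input of n tokens has string length 2n-1, so the expression recovers n, the number of matched positions. A tiny sketch of that arithmetic (`numTokens` is a hypothetical helper):

```java
// Hypothetical helper: number of single-char, space-separated tokens.
// "a b c".length() == 5  ->  (1 + 5) / 2 == 3
static int numTokens(String singleCharTokens) {
  return (1 + singleCharTokens.length()) / 2;
}
```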
@@ -365,6 +380,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
                                     true);
     termAtt = tokensOut.addAttribute(CharTermAttribute.class);
     posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
     offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

     if (dedup) {
@@ -578,6 +594,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
                                     true);
     termAtt = tokensOut.addAttribute(CharTermAttribute.class);
     posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
+    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
     offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);

     if (keepOrig) {
@@ -730,6 +747,7 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
     termAtt = tokensOut.addAttribute(CharTermAttribute.class);
     posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
     offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
+    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);

     // Make sure endOffset inherits from previous input token:
     verify("a", "a b:1");