LUCENE-6664: add SynonymGraphFilter for correct multi-token synonym handling

This commit is contained in:
Mike McCandless 2016-12-22 15:39:17 -05:00
parent 6c1f085073
commit 68db033408
18 changed files with 3594 additions and 36 deletions

View File

@ -23,6 +23,14 @@ New features
make it simpler to execute drill down when drill sideways counts are make it simpler to execute drill down when drill sideways counts are
not needed (Emmanuel Keller via Mike McCandless) not needed (Emmanuel Keller via Mike McCandless)
* LUCENE-6664: A new SynonymGraphFilter outputs a correct graph
structure for multi-token synonyms, separating out a
FlattenGraphFilter that is hardwired into the current
SynonymFilter. This finally makes it possible to implement
correct multi-token synonyms at search time. See
http://blog.mikemccandless.com/2012/04/lucenes-tokenstreams-are-actually.html
for details. (Mike McCandless)
Bug Fixes Bug Fixes
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the * LUCENE-7547: JapaneseTokenizerFactory was failing to close the

View File

@ -0,0 +1,424 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
/**
* This filter "casts" token graphs down into a "flat" form,
* for indexing. This is an inherently lossy process: nodes (positions)
* along side paths are forcefully merged.
*
* <p>In general this means the output graph will accept token sequences
* that the input graph did not accept, and will also fail to accept
* token sequences that the input graph did accept.
*
* <p>This is only necessary at indexing time because Lucene cannot yet index
* an arbitrary token graph. At search time there are better options, e.g.
* the experimental <code>TermAutomatonQuery</code> in sandbox.
*
* @lucene.experimental
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;
/**
* Converts an incoming graph token stream, such as one from
* {@link SynonymGraphFilter}, into a flat form so that
* all nodes form a single linear chain with no side paths. Every
* path through the graph touches every node.
*
* <p>If the graph was not already flat to start, this
* is likely a lossy process, i.e. it will often cause the
* graph to accept token sequences it should not, and to
* reject token sequences it should not.
*
* <p>However, when applying synonyms during indexing, this
* is necessary because Lucene already does not index a graph
* and so the indexing process is already lossy
* (it ignores the {@link PositionLengthAttribute}).
*
* @lucene.experimental
*/
public final class FlattenGraphFilter extends TokenFilter {
/** Holds all tokens leaving a given input position. */
private final static class InputNode implements RollingBuffer.Resettable {
private final List<AttributeSource.State> tokens = new ArrayList<>();
/** Our input node, or -1 if we haven't been assigned yet */
int node = -1;
/** Maximum to input node for all tokens leaving here; we use this
* to know when we can freeze. */
int maxToNode = -1;
/** Where we currently map to; this changes (can only
* increase as we see more input tokens), until we are finished
* with this position. */
int outputNode = -1;
/** Which token (index into {@link #tokens}) we will next output. */
int nextOut;
@Override
public void reset() {
tokens.clear();
node = -1;
outputNode = -1;
maxToNode = -1;
nextOut = 0;
}
}
/** Gathers up merged input positions into a single output position,
* only for the current "frontier" of nodes we've seen but can't yet
* output because they are not frozen. */
private final static class OutputNode implements RollingBuffer.Resettable {
private final List<Integer> inputNodes = new ArrayList<>();
/** Node ID for this output, or -1 if we haven't been assigned yet. */
int node = -1;
/** Which input node (index into {@link #inputNodes}) we will next output. */
int nextOut;
/** Start offset of tokens leaving this node. */
int startOffset = -1;
/** End offset of tokens arriving to this node. */
int endOffset = -1;
@Override
public void reset() {
inputNodes.clear();
node = -1;
nextOut = 0;
startOffset = -1;
endOffset = -1;
}
}
private final RollingBuffer<InputNode> inputNodes = new RollingBuffer<InputNode>() {
@Override
protected InputNode newInstance() {
return new InputNode();
}
};
private final RollingBuffer<OutputNode> outputNodes = new RollingBuffer<OutputNode>() {
@Override
protected OutputNode newInstance() {
return new OutputNode();
}
};
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/** Which input node the last seen token leaves from */
private int inputFrom;
/** We are currently releasing tokens leaving from this output node */
private int outputFrom;
// for debugging:
//private int retOutputFrom;
private boolean done;
private int lastOutputFrom;
private int finalOffset;
private int finalPosInc;
private int maxLookaheadUsed;
private int lastStartOffset;
public FlattenGraphFilter(TokenStream in) {
super(in);
}
private boolean releaseBufferedToken() {
// We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving):
while (outputFrom < outputNodes.getMaxPos()) {
OutputNode output = outputNodes.get(outputFrom);
if (output.inputNodes.isEmpty()) {
// No tokens arrived to this node, which happens for the first node
// after a hole:
//System.out.println(" skip empty outputFrom=" + outputFrom);
outputFrom++;
continue;
}
int maxToNode = -1;
for(int inputNodeID : output.inputNodes) {
InputNode inputNode = inputNodes.get(inputNodeID);
assert inputNode.outputNode == outputFrom;
maxToNode = Math.max(maxToNode, inputNode.maxToNode);
}
//System.out.println(" release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);
// TODO: we could shrink the frontier here somewhat if we
// always output posLen=1 as part of our "sausagizing":
if (maxToNode <= inputFrom || done) {
//System.out.println(" output node merged these inputs: " + output.inputNodes);
// These tokens are now frozen
assert output.nextOut < output.inputNodes.size(): "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size();
InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
return false;
}
if (inputNode.tokens.size() == 0) {
assert inputNode.nextOut == 0;
assert output.nextOut == 0;
// Hole dest nodes should never be merged since 1) we always
// assign them to a new output position, and 2) since they never
// have arriving tokens they cannot be pushed:
assert output.inputNodes.size() == 1: output.inputNodes.size();
outputFrom++;
inputNodes.freeBefore(output.inputNodes.get(0));
outputNodes.freeBefore(outputFrom);
continue;
}
assert inputNode.nextOut < inputNode.tokens.size();
restoreState(inputNode.tokens.get(inputNode.nextOut));
// Correct posInc
assert outputFrom >= lastOutputFrom;
posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
InputNode toInputNode = inputNodes.get(toInputNodeID);
// Correct posLen
assert toInputNode.outputNode > outputFrom;
posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
lastOutputFrom = outputFrom;
inputNode.nextOut++;
//System.out.println(" ret " + this);
OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);
// Correct offsets
// This is a bit messy; we must do this so offset don't go backwards,
// which would otherwise happen if the replacement has more tokens
// than the input:
int startOffset = Math.max(lastStartOffset, output.startOffset);
offsetAtt.setOffset(startOffset, outputEndNode.endOffset);
lastStartOffset = startOffset;
if (inputNode.nextOut == inputNode.tokens.size()) {
output.nextOut++;
if (output.nextOut == output.inputNodes.size()) {
outputFrom++;
inputNodes.freeBefore(output.inputNodes.get(0));
outputNodes.freeBefore(outputFrom);
}
}
return true;
} else {
return false;
}
}
//System.out.println(" break false");
return false;
}
@Override
public boolean incrementToken() throws IOException {
//System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);
while (true) {
if (releaseBufferedToken()) {
//retOutputFrom += posIncAtt.getPositionIncrement();
//System.out.println(" return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength()));
//printStates();
return true;
} else if (done) {
//System.out.println(" done, return false");
return false;
}
if (input.incrementToken()) {
// Input node this token leaves from:
inputFrom += posIncAtt.getPositionIncrement();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
// Input node this token goes to:
int inputTo = inputFrom + posLenAtt.getPositionLength();
//System.out.println(" input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);
InputNode src = inputNodes.get(inputFrom);
if (src.node == -1) {
// This means the "from" node of this token was never seen as a "to" node,
// which should only happen if we just crossed a hole. This is a challenging
// case for us because we normally rely on the full dependencies expressed
// by the arcs to assign outgoing node IDs. It would be better if tokens
// were never dropped but instead just marked deleted with a new
// TermDeletedAttribute (boolean valued) ... but until that future, we have
// a hack here to forcefully jump the output node ID:
assert src.outputNode == -1;
src.node = inputFrom;
src.outputNode = outputNodes.getMaxPos() + 1;
//System.out.println(" hole: force to outputNode=" + src.outputNode);
OutputNode outSrc = outputNodes.get(src.outputNode);
// Not assigned yet:
assert outSrc.node == -1;
outSrc.node = src.outputNode;
outSrc.inputNodes.add(inputFrom);
outSrc.startOffset = startOffset;
} else {
OutputNode outSrc = outputNodes.get(src.outputNode);
if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
// "shrink wrap" the offsets so the original tokens (with most
// restrictive offsets) win:
outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
}
}
// Buffer this token:
src.tokens.add(captureState());
src.maxToNode = Math.max(src.maxToNode, inputTo);
maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());
InputNode dest = inputNodes.get(inputTo);
if (dest.node == -1) {
// Common case: first time a token is arriving to this input position:
dest.node = inputTo;
}
// Always number output nodes sequentially:
int outputEndNode = src.outputNode + 1;
if (outputEndNode > dest.outputNode) {
if (dest.outputNode != -1) {
boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
assert removed;
}
//System.out.println(" increase output node: " + dest.outputNode + " vs " + outputEndNode);
outputNodes.get(outputEndNode).inputNodes.add(inputTo);
dest.outputNode = outputEndNode;
// Since all we ever do is merge incoming nodes together, and then renumber
// the merged nodes sequentially, we should only ever assign smaller node
// numbers:
assert outputEndNode <= inputTo: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
}
OutputNode outDest = outputNodes.get(dest.outputNode);
// "shrink wrap" the offsets so the original tokens (with most
// restrictive offsets) win:
if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
outDest.endOffset = endOffset;
}
} else {
//System.out.println(" got false from input");
input.end();
finalPosInc = posIncAtt.getPositionIncrement();
finalOffset = offsetAtt.endOffset();
done = true;
// Don't return false here: we need to force release any buffered tokens now
}
}
}
// Only for debugging:
/*
private void printStates() {
System.out.println("states:");
for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
OutputNode outputNode = outputNodes.get(i);
System.out.println(" output " + i + ": inputs " + outputNode.inputNodes);
for(int inputNodeID : outputNode.inputNodes) {
InputNode inputNode = inputNodes.get(inputNodeID);
assert inputNode.outputNode == i;
}
}
}
*/
@Override
public void end() throws IOException {
if (done == false) {
super.end();
} else {
// NOTE, shady: don't call super.end, because we did already from incrementToken
}
clearAttributes();
if (done) {
// On exc, done is false, and we will not have set these:
posIncAtt.setPositionIncrement(finalPosInc);
offsetAtt.setOffset(finalOffset, finalOffset);
} else {
super.end();
}
}
@Override
public void reset() throws IOException {
//System.out.println("F: reset");
super.reset();
inputFrom = -1;
inputNodes.reset();
InputNode in = inputNodes.get(0);
in.node = 0;
in.outputNode = 0;
outputNodes.reset();
OutputNode out = outputNodes.get(0);
out.node = 0;
out.inputNodes.add(0);
out.startOffset = 0;
outputFrom = 0;
//retOutputFrom = -1;
lastOutputFrom = -1;
done = false;
finalPosInc = -1;
finalOffset = -1;
lastStartOffset = 0;
maxLookaheadUsed = 0;
}
// for testing
int getMaxLookaheadUsed() {
return maxLookaheadUsed;
}
}

View File

@ -0,0 +1,44 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link FlattenGraphFilter}.
*
* @lucene.experimental
*/
public class FlattenGraphFilterFactory extends TokenFilterFactory {
/** Creates a new FlattenGraphFilterFactory */
public FlattenGraphFilterFactory(Map<String,String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public TokenStream create(TokenStream input) {
return new FlattenGraphFilter(input);
}
}

View File

@ -81,6 +81,9 @@ import org.apache.lucene.util.fst.FST;
* used for parsing. Subsequent tokens simply pass through * used for parsing. Subsequent tokens simply pass through
* and are not parsed. A future improvement would be to * and are not parsed. A future improvement would be to
* allow these tokens to also be matched.</p> * allow these tokens to also be matched.</p>
*
* @deprecated Use {@link SynonymGraphFilter} instead, but be sure to also
* use {@link FlattenGraphFilter} at index time (not at search time) as well.
*/ */
// TODO: maybe we should resolve token -> wordID then run // TODO: maybe we should resolve token -> wordID then run
@ -105,6 +108,7 @@ import org.apache.lucene.util.fst.FST;
// //
// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps // Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
@Deprecated
public final class SynonymFilter extends TokenFilter { public final class SynonymFilter extends TokenFilter {
public static final String TYPE_SYNONYM = "SYNONYM"; public static final String TYPE_SYNONYM = "SYNONYM";

View File

@ -72,7 +72,11 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
* <li><code>{@link Analyzer} analyzer</code> - an analyzer used for each raw synonym</li> * <li><code>{@link Analyzer} analyzer</code> - an analyzer used for each raw synonym</li>
* </ul> * </ul>
* @see SolrSynonymParser SolrSynonymParser: default format * @see SolrSynonymParser SolrSynonymParser: default format
*
* @deprecated Use {@link SynonymGraphFilterFactory} instead, but be sure to also
* use {@link FlattenGraphFilterFactory} at index time (not at search time) as well.
*/ */
@Deprecated
public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private final boolean ignoreCase; private final boolean ignoreCase;
private final String tokenizerFactory; private final String tokenizerFactory;

View File

@ -0,0 +1,586 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.fst.FST;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?
// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token. For example if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you got to x,
// rather than starting over again your really should
// immediately recognize that "b c d" matches at the next
// input. I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will. We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches. This really amounts to adding a .*
// closure to the FST and then determinizing it.
//
// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
/** Applies single- or multi-token synonyms from a {@link SynonymMap}
* to an incoming {@link TokenStream}, producing a fully correct graph
* output. This is a replacement for {@link SynonymFilter}, which produces
* incorrect graphs for multi-token synonyms.
*
* <p>However, if you use this during indexing, you must follow it with
* {@link FlattenGraphFilter} to squash tokens on top of one another
* like {@link SynonymFilter}, because the indexer can't directly
* consume a graph. To get fully correct positional queries when your
* synonym replacements are multiple tokens, you should instead apply
* synonyms using this {@code TokenFilter} at query time and translate
* the resulting graph to a {@code TermAutomatonQuery} e.g. using
* {@code TokenStreamToTermAutomatonQuery}.
*
* <p><b>NOTE</b>: this cannot consume an incoming graph; results will
* be undefined.
*
* @lucene.experimental */
public final class SynonymGraphFilter extends TokenFilter {
public static final String TYPE_SYNONYM = "SYNONYM";
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final SynonymMap synonyms;
private final boolean ignoreCase;
private final FST<BytesRef> fst;
private final FST.BytesReader fstReader;
private final FST.Arc<BytesRef> scratchArc;
private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
private final BytesRef scratchBytes = new BytesRef();
private final CharsRefBuilder scratchChars = new CharsRefBuilder();
private final LinkedList<BufferedOutputToken> outputBuffer = new LinkedList<>();
private int nextNodeOut;
private int lastNodeOut;
private int maxLookaheadUsed;
// For testing:
private int captureCount;
private boolean liveToken;
// Start/end offset of the current match:
private int matchStartOffset;
private int matchEndOffset;
// True once the input TokenStream is exhausted:
private boolean finished;
private int lookaheadNextRead;
private int lookaheadNextWrite;
private RollingBuffer<BufferedInputToken> lookahead = new RollingBuffer<BufferedInputToken>() {
@Override
protected BufferedInputToken newInstance() {
return new BufferedInputToken();
}
};
static class BufferedInputToken implements RollingBuffer.Resettable {
final CharsRefBuilder term = new CharsRefBuilder();
AttributeSource.State state;
int startOffset = -1;
int endOffset = -1;
@Override
public void reset() {
state = null;
term.clear();
// Intentionally invalid to ferret out bugs:
startOffset = -1;
endOffset = -1;
}
}
static class BufferedOutputToken {
final String term;
// Non-null if this was an incoming token:
final State state;
final int startNode;
final int endNode;
public BufferedOutputToken(State state, String term, int startNode, int endNode) {
this.state = state;
this.term = term;
this.startNode = startNode;
this.endNode = endNode;
}
}
public SynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
super(input);
this.synonyms = synonyms;
this.fst = synonyms.fst;
if (fst == null) {
throw new IllegalArgumentException("fst must be non-null");
}
this.fstReader = fst.getBytesReader();
scratchArc = new FST.Arc<>();
this.ignoreCase = ignoreCase;
}
@Override
public boolean incrementToken() throws IOException {
//System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut);
assert lastNodeOut <= nextNodeOut;
if (outputBuffer.isEmpty() == false) {
// We still have pending outputs from a prior synonym match:
releaseBufferedToken();
//System.out.println(" syn: ret buffered=" + this);
assert liveToken == false;
return true;
}
// Try to parse a new synonym match at the current token:
if (parse()) {
// A new match was found:
releaseBufferedToken();
//System.out.println(" syn: after parse, ret buffered=" + this);
assert liveToken == false;
return true;
}
if (lookaheadNextRead == lookaheadNextWrite) {
// Fast path: parse pulled one token, but it didn't match
// the start for any synonym, so we now return it "live" w/o having
// cloned all of its atts:
if (finished) {
//System.out.println(" syn: ret END");
return false;
}
assert liveToken;
liveToken = false;
// NOTE: no need to change posInc since it's relative, i.e. whatever
// node our output is upto will just increase by the incoming posInc.
// We also don't need to change posLen, but only because we cannot
// consume a graph, so the incoming token can never span a future
// synonym match.
} else {
// We still have buffered lookahead tokens from a previous
// parse attempt that required lookahead; just replay them now:
//System.out.println(" restore buffer");
assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite;
BufferedInputToken token = lookahead.get(lookaheadNextRead);
lookaheadNextRead++;
restoreState(token.state);
lookahead.freeBefore(lookaheadNextRead);
//System.out.println(" after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
assert liveToken == false;
}
lastNodeOut += posIncrAtt.getPositionIncrement();
nextNodeOut = lastNodeOut + posLenAtt.getPositionLength();
//System.out.println(" syn: ret lookahead=" + this);
return true;
}
private void releaseBufferedToken() throws IOException {
//System.out.println(" releaseBufferedToken");
BufferedOutputToken token = outputBuffer.pollFirst();
if (token.state != null) {
// This is an original input token (keepOrig=true case):
//System.out.println(" hasState");
restoreState(token.state);
//System.out.println(" startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset());
} else {
clearAttributes();
//System.out.println(" no state");
termAtt.append(token.term);
// We better have a match already:
assert matchStartOffset != -1;
offsetAtt.setOffset(matchStartOffset, matchEndOffset);
//System.out.println(" startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset);
typeAtt.setType(TYPE_SYNONYM);
}
//System.out.println(" lastNodeOut=" + lastNodeOut);
//System.out.println(" term=" + termAtt);
posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut);
lastNodeOut = token.startNode;
posLenAtt.setPositionLength(token.endNode - token.startNode);
}
/** Scans the next input token(s) to see if a synonym matches. Returns true
* if a match was found. */
private boolean parse() throws IOException {
// System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this));
// Holds the longest match we've seen so far:
BytesRef matchOutput = null;
int matchInputLength = 0;
BytesRef pendingOutput = fst.outputs.getNoOutput();
fst.getFirstArc(scratchArc);
assert scratchArc.output == fst.outputs.getNoOutput();
// How many tokens in the current match
int matchLength = 0;
boolean doFinalCapture = false;
int lookaheadUpto = lookaheadNextRead;
matchStartOffset = -1;
byToken:
while (true) {
//System.out.println(" cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos());
// Pull next token's chars:
final char[] buffer;
final int bufferLen;
final int inputEndOffset;
if (lookaheadUpto <= lookahead.getMaxPos()) {
// Still in our lookahead buffer
BufferedInputToken token = lookahead.get(lookaheadUpto);
lookaheadUpto++;
buffer = token.term.chars();
bufferLen = token.term.length();
inputEndOffset = token.endOffset;
//System.out.println(" use buffer now max=" + lookahead.getMaxPos());
if (matchStartOffset == -1) {
matchStartOffset = token.startOffset;
}
} else {
// We used up our lookahead buffer of input tokens
// -- pull next real input token:
assert finished || liveToken == false;
if (finished) {
//System.out.println(" break: finished");
break;
} else if (input.incrementToken()) {
//System.out.println(" input.incrToken");
liveToken = true;
buffer = termAtt.buffer();
bufferLen = termAtt.length();
if (matchStartOffset == -1) {
matchStartOffset = offsetAtt.startOffset();
}
inputEndOffset = offsetAtt.endOffset();
lookaheadUpto++;
} else {
// No more input tokens
finished = true;
//System.out.println(" break: now set finished");
break;
}
}
matchLength++;
//System.out.println(" cycle term=" + new String(buffer, 0, bufferLen));
// Run each char in this token through the FST:
int bufUpto = 0;
while (bufUpto < bufferLen) {
final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
break byToken;
}
// Accum the output
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
bufUpto += Character.charCount(codePoint);
}
assert bufUpto == bufferLen;
// OK, entire token matched; now see if this is a final
// state in the FST (a match):
if (scratchArc.isFinal()) {
matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
matchInputLength = matchLength;
matchEndOffset = inputEndOffset;
//System.out.println(" ** match");
}
// See if the FST can continue matching (ie, needs to
// see the next input token):
if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
// No further rules can match here; we're done
// searching for matching rules starting at the
// current input position.
break;
} else {
// More matching is possible -- accum the output (if
// any) of the WORD_SEP arc:
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
doFinalCapture = true;
if (liveToken) {
capture();
}
}
}
if (doFinalCapture && liveToken && finished == false) {
// Must capture the final token if we captured any prior tokens:
capture();
}
if (matchOutput != null) {
if (liveToken) {
// Single input token synonym; we must buffer it now:
capture();
}
// There is a match!
bufferOutputTokens(matchOutput, matchInputLength);
lookaheadNextRead += matchInputLength;
//System.out.println(" precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
lookahead.freeBefore(lookaheadNextRead);
//System.out.println(" match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
return true;
} else {
//System.out.println(" no match; lookaheadNextRead=" + lookaheadNextRead);
return false;
}
//System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
}
/** Expands the output graph into the necessary tokens, adding
* synonyms as side paths parallel to the input tokens, and
* buffers them in the output token buffer. */
private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
final int code = bytesReader.readVInt();
final boolean keepOrig = (code & 0x1) == 0;
//System.out.println(" buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength);
// How many nodes along all paths; we need this to assign the
// node ID for the final end node where all paths merge back:
int totalPathNodes;
if (keepOrig) {
assert matchInputLength > 0;
totalPathNodes = matchInputLength - 1;
} else {
totalPathNodes = 0;
}
// How many synonyms we will insert over this match:
final int count = code >>> 1;
// TODO: we could encode this instead into the FST:
// 1st pass: count how many new nodes we need
List<List<String>> paths = new ArrayList<>();
for(int outputIDX=0;outputIDX<count;outputIDX++) {
int wordID = bytesReader.readVInt();
synonyms.words.get(wordID, scratchBytes);
scratchChars.copyUTF8Bytes(scratchBytes);
int lastStart = 0;
List<String> path = new ArrayList<>();
paths.add(path);
int chEnd = scratchChars.length();
for(int chUpto=0; chUpto<=chEnd; chUpto++) {
if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) {
path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart));
lastStart = 1 + chUpto;
}
}
assert path.size() > 0;
totalPathNodes += path.size() - 1;
}
//System.out.println(" totalPathNodes=" + totalPathNodes);
// 2nd pass: buffer tokens for the graph fragment
// NOTE: totalPathNodes will be 0 in the case where the matched
// input is a single token and all outputs are also a single token
// We "spawn" a side-path for each of the outputs for this matched
// synonym, all ending back at this end node:
int startNode = nextNodeOut;
int endNode = startNode + totalPathNodes + 1;
//System.out.println(" " + paths.size() + " new side-paths");
// First, fanout all tokens departing start node for these new side paths:
int newNodeCount = 0;
for(List<String> path : paths) {
int pathEndNode;
//System.out.println(" path size=" + path.size());
if (path.size() == 1) {
// Single token output, so there are no intermediate nodes:
pathEndNode = endNode;
} else {
pathEndNode = nextNodeOut + newNodeCount + 1;
newNodeCount += path.size() - 1;
}
outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode));
}
// We must do the original tokens last, else the offsets "go backwards":
if (keepOrig) {
BufferedInputToken token = lookahead.get(lookaheadNextRead);
int inputEndNode;
if (matchInputLength == 1) {
// Single token matched input, so there are no intermediate nodes:
inputEndNode = endNode;
} else {
inputEndNode = nextNodeOut + newNodeCount + 1;
}
//System.out.println(" keepOrig first token: " + token.term);
outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode));
}
nextNodeOut = endNode;
// Do full side-path for each syn output:
for(int pathID=0;pathID<paths.size();pathID++) {
List<String> path = paths.get(pathID);
if (path.size() > 1) {
int lastNode = outputBuffer.get(pathID).endNode;
for(int i=1;i<path.size()-1;i++) {
outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode+1));
lastNode++;
}
outputBuffer.add(new BufferedOutputToken(null, path.get(path.size()-1), lastNode, endNode));
}
}
if (keepOrig && matchInputLength > 1) {
// Do full "side path" with the original tokens:
int lastNode = outputBuffer.get(paths.size()).endNode;
for(int i=1;i<matchInputLength-1;i++) {
BufferedInputToken token = lookahead.get(lookaheadNextRead + i);
outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, lastNode+1));
lastNode++;
}
BufferedInputToken token = lookahead.get(lookaheadNextRead + matchInputLength - 1);
outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, endNode));
}
/*
System.out.println(" after buffer: " + outputBuffer.size() + " tokens:");
for(BufferedOutputToken token : outputBuffer) {
System.out.println(" tok: " + token.term + " startNode=" + token.startNode + " endNode=" + token.endNode);
}
*/
}
/** Buffers the current input token into lookahead buffer. */
private void capture() {
assert liveToken;
liveToken = false;
BufferedInputToken token = lookahead.get(lookaheadNextWrite);
lookaheadNextWrite++;
token.state = captureState();
token.startOffset = offsetAtt.startOffset();
token.endOffset = offsetAtt.endOffset();
assert token.term.length() == 0;
token.term.append(termAtt);
captureCount++;
maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
//System.out.println(" maxLookaheadUsed=" + maxLookaheadUsed);
}
@Override
public void reset() throws IOException {
super.reset();
lookahead.reset();
lookaheadNextWrite = 0;
lookaheadNextRead = 0;
captureCount = 0;
lastNodeOut = -1;
nextNodeOut = 0;
matchStartOffset = -1;
matchEndOffset = -1;
finished = false;
liveToken = false;
outputBuffer.clear();
maxLookaheadUsed = 0;
//System.out.println("S: reset");
}
// for testing
int getCaptureCount() {
return captureCount;
}
// for testing
int getMaxLookaheadUsed() {
return maxLookaheadUsed;
}
}

View File

@ -0,0 +1,204 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
/**
* Factory for {@link SynonymGraphFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt"
* format="solr" ignoreCase="false" expand="true"
* tokenizerFactory="solr.WhitespaceTokenizerFactory"
* [optional tokenizer factory parameters]/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* <p>
* An optional param name prefix of "tokenizerFactory." may be used for any
* init params that the SynonymGraphFilterFactory needs to pass to the specified
* TokenizerFactory. If the TokenizerFactory expects an init parameters with
* the same name as an init param used by the SynonymGraphFilterFactory, the prefix
* is mandatory.
* </p>
*
* <p>
* The optional {@code format} parameter controls how the synonyms will be parsed:
* It supports the short names of {@code solr} for {@link SolrSynonymParser}
* and {@code wordnet} for and {@link WordnetSynonymParser}, or your own
* {@code SynonymMap.Parser} class name. The default is {@code solr}.
* A custom {@link SynonymMap.Parser} is expected to have a constructor taking:
* <ul>
* <li><code>boolean dedup</code> - true if duplicates should be ignored, false otherwise</li>
* <li><code>boolean expand</code> - true if conflation groups should be expanded, false if they are one-directional</li>
* <li><code>{@link Analyzer} analyzer</code> - an analyzer used for each raw synonym</li>
* </ul>
* @see SolrSynonymParser SolrSynonymParser: default format
*
* @lucene.experimental
*/
public class SynonymGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private final boolean ignoreCase;
private final String tokenizerFactory;
private final String synonyms;
private final String format;
private final boolean expand;
private final String analyzerName;
private final Map<String, String> tokArgs = new HashMap<>();
private SynonymMap map;
public SynonymGraphFilterFactory(Map<String,String> args) {
super(args);
ignoreCase = getBoolean(args, "ignoreCase", false);
synonyms = require(args, "synonyms");
format = get(args, "format");
expand = getBoolean(args, "expand", true);
analyzerName = get(args, "analyzer");
tokenizerFactory = get(args, "tokenizerFactory");
if (analyzerName != null && tokenizerFactory != null) {
throw new IllegalArgumentException("Analyzer and TokenizerFactory can't be specified both: " +
analyzerName + " and " + tokenizerFactory);
}
if (tokenizerFactory != null) {
tokArgs.put("luceneMatchVersion", getLuceneMatchVersion().toString());
for (Iterator<String> itr = args.keySet().iterator(); itr.hasNext();) {
String key = itr.next();
tokArgs.put(key.replaceAll("^tokenizerFactory\\.",""), args.get(key));
itr.remove();
}
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public TokenStream create(TokenStream input) {
// if the fst is null, it means there's actually no synonyms... just return the original stream
// as there is nothing to do here.
return map.fst == null ? input : new SynonymGraphFilter(input, map, ignoreCase);
}
@Override
public void inform(ResourceLoader loader) throws IOException {
final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
Analyzer analyzer;
if (analyzerName != null) {
analyzer = loadAnalyzer(loader, analyzerName);
} else {
analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
}
try (Analyzer a = analyzer) {
String formatClass = format;
if (format == null || format.equals("solr")) {
formatClass = SolrSynonymParser.class.getName();
} else if (format.equals("wordnet")) {
formatClass = WordnetSynonymParser.class.getName();
}
// TODO: expose dedup as a parameter?
map = loadSynonyms(loader, formatClass, true, a);
} catch (ParseException e) {
throw new IOException("Error parsing synonyms file:", e);
}
}
/**
* Load synonyms with the given {@link SynonymMap.Parser} class.
*/
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
SynonymMap.Parser parser;
Class<? extends SynonymMap.Parser> clazz = loader.findClass(cname, SynonymMap.Parser.class);
try {
parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class).newInstance(dedup, expand, analyzer);
} catch (Exception e) {
throw new RuntimeException(e);
}
List<String> files = splitFileNames(synonyms);
for (String file : files) {
decoder.reset();
parser.parse(new InputStreamReader(loader.openResource(file), decoder));
}
return parser.build();
}
// (there are no tests for this functionality)
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class);
try {
TokenizerFactory tokFactory = clazz.getConstructor(Map.class).newInstance(tokArgs);
if (tokFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware) tokFactory).inform(loader);
}
return tokFactory;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
try {
Analyzer analyzer = clazz.getConstructor().newInstance();
if (analyzer instanceof ResourceLoaderAware) {
((ResourceLoaderAware) analyzer).inform(loader);
}
return analyzer;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}

View File

@ -74,6 +74,11 @@ public class SynonymMap {
private int maxHorizontalContext; private int maxHorizontalContext;
private final boolean dedup; private final boolean dedup;
/** Default constructor, passes {@code dedup=true}. */
public Builder() {
this(true);
}
/** If dedup is true then identical rules (same input, /** If dedup is true then identical rules (same input,
* same output) will be added only once. */ * same output) will be added only once. */
public Builder(boolean dedup) { public Builder(boolean dedup) {
@ -109,8 +114,6 @@ public class SynonymMap {
reuse.setLength(upto); reuse.setLength(upto);
return reuse.get(); return reuse.get();
} }
/** only used for asserting! */ /** only used for asserting! */
private boolean hasHoles(CharsRef chars) { private boolean hasHoles(CharsRef chars) {

View File

@ -256,10 +256,12 @@ public abstract class CharTokenizer extends Tokenizer {
} }
end += charCount; end += charCount;
length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test if (length >= MAX_WORD_LEN) { // buffer overflow! make sure to check for >= surrogate pair could break == test
break; break;
} else if (length > 0) // at non-Letter w/ chars }
} else if (length > 0) { // at non-Letter w/ chars
break; // return 'em break; // return 'em
}
} }
termAtt.setLength(length); termAtt.setLength(length);

View File

@ -101,5 +101,7 @@ org.apache.lucene.analysis.standard.ClassicFilterFactory
org.apache.lucene.analysis.standard.StandardFilterFactory org.apache.lucene.analysis.standard.StandardFilterFactory
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
org.apache.lucene.analysis.synonym.SynonymFilterFactory org.apache.lucene.analysis.synonym.SynonymFilterFactory
org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory
org.apache.lucene.analysis.synonym.FlattenGraphFilterFactory
org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
org.apache.lucene.analysis.util.ElisionFilterFactory org.apache.lucene.analysis.util.ElisionFilterFactory

View File

@ -224,18 +224,27 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 }, new int[] { 0, 9 },
new int[] { 6, 13 }, new int[] { 6, 13 },
new int[] { 1, 1 }); null,
new int[] { 1, 1 },
null,
false);
/* only in this case, posInc of 2 ?! */ /* only in this case, posInc of 2 ?! */
assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
new int[] { 0, 9, 9, 12 }, new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 }, new int[] { 6, 12, 13, 13 },
new int[] { 1, 1, 0, 1 }); null,
new int[] { 1, 1, 0, 1 },
null,
false);
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 }, new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 }, new int[] { 6, 14, 19 },
new int[] { 1, 1, 1 }); null,
new int[] { 1, 1, 1 },
null,
false);
/* analyzer that will consume tokens with large position increments */ /* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() { Analyzer a2 = new Analyzer() {
@ -252,24 +261,36 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" },
new int[] { 0, 7, 16 }, new int[] { 0, 7, 16 },
new int[] { 6, 15, 20 }, new int[] { 6, 15, 20 },
new int[] { 1, 10, 1 }); null,
new int[] { 1, 10, 1 },
null,
false);
/* the "/" had a position increment of 10, where did it go?!?!! */ /* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" },
new int[] { 0, 9 }, new int[] { 0, 9 },
new int[] { 6, 13 }, new int[] { 6, 13 },
new int[] { 1, 11 }); null,
new int[] { 1, 11 },
null,
false);
/* in this case, the increment of 10 from the "/" is carried over */ /* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" },
new int[] { 0, 9, 9, 12 }, new int[] { 0, 9, 9, 12 },
new int[] { 6, 12, 13, 13 }, new int[] { 6, 12, 13, 13 },
new int[] { 1, 11, 0, 1 }); null,
new int[] { 1, 11, 0, 1 },
null,
false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" },
new int[] { 0, 9, 15 }, new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 }, new int[] { 6, 14, 19 },
new int[] { 1, 11, 1 }); null,
new int[] { 1, 11, 1 },
null,
false);
Analyzer a3 = new Analyzer() { Analyzer a3 = new Analyzer() {
@Override @Override
@ -284,14 +305,21 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new String[] { "lucene", "lucenesolr", "solr" }, new String[] { "lucene", "lucenesolr", "solr" },
new int[] { 0, 0, 7 }, new int[] { 0, 0, 7 },
new int[] { 6, 11, 11 }, new int[] { 6, 11, 11 },
new int[] { 1, 0, 1 }); null,
new int[] { 1, 0, 1 },
null,
false);
/* the stopword should add a gap here */ /* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr", assertAnalyzesTo(a3, "the lucene.solr",
new String[] { "lucene", "lucenesolr", "solr" }, new String[] { "lucene", "lucenesolr", "solr" },
new int[] { 4, 4, 11 }, new int[] { 4, 4, 11 },
new int[] { 10, 15, 15 }, new int[] { 10, 15, 15 },
new int[] { 2, 0, 1 }); null,
new int[] { 2, 0, 1 },
null,
false);
IOUtils.close(a, a2, a3); IOUtils.close(a, a2, a3);
} }

View File

@ -0,0 +1,284 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.synonym;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
public class TestFlattenGraphFilter extends BaseTokenStreamTestCase {
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
final Token t = new Token(term, startOffset, endOffset);
t.setPositionIncrement(posInc);
t.setPositionLength(posLength);
return t;
}
public void testSimpleMock() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
TokenStream ts = new FlattenGraphFilter(tokenizer);
return new TokenStreamComponents(tokenizer, ts);
}
};
assertAnalyzesTo(a, "wtf happened",
new String[] {"wtf", "happened"},
new int[] { 0, 4},
new int[] { 3, 12},
null,
new int[] { 1, 1},
new int[] { 1, 1},
true);
}
// Make sure graph is unchanged if it's already flat
public void testAlreadyFlatten() throws Exception {
TokenStream in = new CannedTokenStream(0, 12, new Token[] {
token("wtf", 1, 1, 0, 3),
token("what", 0, 1, 0, 3),
token("wow", 0, 1, 0, 3),
token("the", 1, 1, 0, 3),
token("that's", 0, 1, 0, 3),
token("fudge", 1, 1, 0, 3),
token("funny", 0, 1, 0, 3),
token("happened", 1, 1, 4, 12)
});
TokenStream out = new FlattenGraphFilter(in);
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
assertTokenStreamContents(out,
new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
new int[] {0, 0, 0, 0, 0, 0, 0, 4},
new int[] {3, 3, 3, 3, 3, 3, 3, 12},
new int[] {1, 0, 0, 1, 0, 1, 0, 1},
new int[] {1, 1, 1, 1, 1, 1, 1, 1},
12);
}
public void testWTF1() throws Exception {
// "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
TokenStream in = new CannedTokenStream(0, 12, new Token[] {
token("wtf", 1, 5, 0, 3),
token("what", 0, 1, 0, 3),
token("wow", 0, 3, 0, 3),
token("the", 1, 1, 0, 3),
token("fudge", 1, 3, 0, 3),
token("that's", 1, 1, 0, 3),
token("funny", 1, 1, 0, 3),
token("happened", 1, 1, 4, 12)
});
TokenStream out = new FlattenGraphFilter(in);
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
assertTokenStreamContents(out,
new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
new int[] {0, 0, 0, 0, 0, 0, 0, 4},
new int[] {3, 3, 3, 3, 3, 3, 3, 12},
new int[] {1, 0, 0, 1, 0, 1, 0, 1},
new int[] {3, 1, 1, 1, 1, 1, 1, 1},
12);
}
/** Same as testWTF1 except the "wtf" token comes out later */
public void testWTF2() throws Exception {
// "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
TokenStream in = new CannedTokenStream(0, 12, new Token[] {
token("what", 1, 1, 0, 3),
token("wow", 0, 3, 0, 3),
token("wtf", 0, 5, 0, 3),
token("the", 1, 1, 0, 3),
token("fudge", 1, 3, 0, 3),
token("that's", 1, 1, 0, 3),
token("funny", 1, 1, 0, 3),
token("happened", 1, 1, 4, 12)
});
TokenStream out = new FlattenGraphFilter(in);
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
assertTokenStreamContents(out,
new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"},
new int[] {0, 0, 0, 0, 0, 0, 0, 4},
new int[] {3, 3, 3, 3, 3, 3, 3, 12},
new int[] {1, 0, 0, 1, 0, 1, 0, 1},
new int[] {1, 1, 3, 1, 1, 1, 1, 1},
12);
}
public void testNonGreedySynonyms() throws Exception {
// This is just "hypothetical" for Lucene today, because SynFilter is
// greedy: when two syn rules match on overlapping tokens, only one
// (greedily) wins. This test pretends all syn matches could match:
TokenStream in = new CannedTokenStream(0, 20, new Token[] {
token("wizard", 1, 1, 0, 6),
token("wizard_of_oz", 0, 3, 0, 12),
token("of", 1, 1, 7, 9),
token("oz", 1, 1, 10, 12),
token("oz_screams", 0, 2, 10, 20),
token("screams", 1, 1, 13, 20),
});
TokenStream out = new FlattenGraphFilter(in);
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
assertTokenStreamContents(out,
new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
new int[] {0, 0, 7, 10, 10, 13},
new int[] {6, 12, 9, 12, 20, 20},
new int[] {1, 0, 1, 1, 0, 1},
new int[] {1, 3, 1, 1, 2, 1},
20);
}
public void testNonGraph() throws Exception {
TokenStream in = new CannedTokenStream(0, 22, new Token[] {
token("hello", 1, 1, 0, 5),
token("pseudo", 1, 1, 6, 12),
token("world", 1, 1, 13, 18),
token("fun", 1, 1, 19, 22),
});
TokenStream out = new FlattenGraphFilter(in);
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
assertTokenStreamContents(out,
new String[] {"hello", "pseudo", "world", "fun"},
new int[] {0, 6, 13, 19},
new int[] {5, 12, 18, 22},
new int[] {1, 1, 1, 1},
new int[] {1, 1, 1, 1},
22);
}
public void testSimpleHole() throws Exception {
TokenStream in = new CannedTokenStream(0, 13, new Token[] {
token("hello", 1, 1, 0, 5),
token("hole", 2, 1, 6, 10),
token("fun", 1, 1, 11, 13),
});
TokenStream out = new FlattenGraphFilter(in);
// ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
assertTokenStreamContents(out,
new String[] {"hello", "hole", "fun"},
new int[] {0, 6, 11},
new int[] {5, 10, 13},
new int[] {1, 2, 1},
new int[] {1, 1, 1},
13);
}
public void testHoleUnderSyn() throws Exception {
// Tests a StopFilter after SynFilter where a stopword in a syn is removed
//
// wizard of oz -> woz syn, but then "of" becomes a hole
TokenStream in = new CannedTokenStream(0, 12, new Token[] {
token("wizard", 1, 1, 0, 6),
token("woz", 0, 3, 0, 12),
token("oz", 2, 1, 10, 12),
});
TokenStream out = new FlattenGraphFilter(in);
assertTokenStreamContents(out,
new String[] {"wizard", "woz", "oz"},
new int[] {0, 0, 10},
new int[] {6, 12, 12},
new int[] {1, 0, 2},
new int[] {1, 3, 1},
12);
}
public void testStrangelyNumberedNodes() throws Exception {
// Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
TokenStream in = new CannedTokenStream(0, 27, new Token[] {
token("dog", 1, 3, 0, 5),
token("puppy", 0, 3, 0, 5),
token("flies", 3, 1, 6, 11),
});
TokenStream out = new FlattenGraphFilter(in);
assertTokenStreamContents(out,
new String[] {"dog", "puppy", "flies"},
new int[] {0, 0, 6},
new int[] {5, 5, 11},
new int[] {1, 0, 1},
new int[] {1, 1, 1},
27);
}
public void testTwoLongParallelPaths() throws Exception {
// "a a a a a a" in parallel with "b b b b b b"
TokenStream in = new CannedTokenStream(0, 11, new Token[] {
token("a", 1, 1, 0, 1),
token("b", 0, 2, 0, 1),
token("a", 1, 2, 2, 3),
token("b", 1, 2, 2, 3),
token("a", 1, 2, 4, 5),
token("b", 1, 2, 4, 5),
token("a", 1, 2, 6, 7),
token("b", 1, 2, 6, 7),
token("a", 1, 2, 8, 9),
token("b", 1, 2, 8, 9),
token("a", 1, 2, 10, 11),
token("b", 1, 2, 10, 11),
});
TokenStream out = new FlattenGraphFilter(in);
// ... becomes flattened to a single path with overlapping a/b token between each node:
assertTokenStreamContents(out,
new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"},
new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10},
new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11},
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
11);
}
// NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter
}

View File

@ -579,14 +579,15 @@ public class Automaton implements Accountable {
/** Returns the dot (graphviz) representation of this automaton. /** Returns the dot (graphviz) representation of this automaton.
* This is extremely useful for visualizing the automaton. */ * This is extremely useful for visualizing the automaton. */
public String toDot() { public String toDot() {
// TODO: breadth first search so we can see get layered output... // TODO: breadth first search so we can get layered output...
StringBuilder b = new StringBuilder(); StringBuilder b = new StringBuilder();
b.append("digraph Automaton {\n"); b.append("digraph Automaton {\n");
b.append(" rankdir = LR\n"); b.append(" rankdir = LR\n");
b.append(" node [width=0.2, height=0.2, fontsize=8]\n");
final int numStates = getNumStates(); final int numStates = getNumStates();
if (numStates > 0) { if (numStates > 0) {
b.append(" initial [shape=plaintext,label=\"0\"]\n"); b.append(" initial [shape=plaintext,label=\"\"]\n");
b.append(" initial -> 0\n"); b.append(" initial -> 0\n");
} }

View File

@ -370,10 +370,8 @@ final public class Operations {
} }
/** Returns true if these two automata accept exactly the /** Returns true if these two automata accept exactly the
* same language. This is a costly computation! Note * same language. This is a costly computation! Both automata
* also that a1 and a2 will be determinized as a side * must be determinized and have no dead states! */
* effect. Both automata must be determinized and have
* no dead states! */
public static boolean sameLanguage(Automaton a1, Automaton a2) { public static boolean sameLanguage(Automaton a1, Automaton a2) {
if (a1 == a2) { if (a1 == a2) {
return true; return true;

View File

@ -79,7 +79,9 @@ public class StatePair {
*/ */
@Override @Override
public int hashCode() { public int hashCode() {
return s1 ^ s2; // Don't use s1 ^ s2 since it's vulnerable to the case where s1 == s2 always --> hashCode = 0, e.g. if you call Operations.sameLanguage,
// passing the same automaton against itself:
return s1 * 31 + s2;
} }
@Override @Override

View File

@ -184,22 +184,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
assertEquals("term "+i, output[i], termAtt.toString()); assertEquals("term "+i, output[i], termAtt.toString());
if (startOffsets != null) { if (startOffsets != null) {
assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset()); assertEquals("startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset());
} }
if (endOffsets != null) { if (endOffsets != null) {
assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset()); assertEquals("endOffset " + i + " term=" + termAtt, endOffsets[i], offsetAtt.endOffset());
} }
if (types != null) { if (types != null) {
assertEquals("type "+i, types[i], typeAtt.type()); assertEquals("type " + i + " term=" + termAtt, types[i], typeAtt.type());
} }
if (posIncrements != null) { if (posIncrements != null) {
assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement()); assertEquals("posIncrement " + i + " term=" + termAtt, posIncrements[i], posIncrAtt.getPositionIncrement());
} }
if (posLengths != null) { if (posLengths != null) {
assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength()); assertEquals("posLength " + i + " term=" + termAtt, posLengths[i], posLengthAtt.getPositionLength());
} }
if (keywordAtts != null) { if (keywordAtts != null) {
assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword()); assertEquals("keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword());
} }
// we can enforce some basic things about a few attributes even if the caller doesn't check: // we can enforce some basic things about a few attributes even if the caller doesn't check:
@ -207,13 +207,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
final int startOffset = offsetAtt.startOffset(); final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset(); final int endOffset = offsetAtt.endOffset();
if (finalOffset != null) { if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue()); assertTrue("startOffset (= " + startOffset + ") must be <= finalOffset (= " + finalOffset + ") term=" + termAtt, startOffset <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(), assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue() + " term=" + termAtt,
endOffset <= finalOffset.intValue()); endOffset <= finalOffset.intValue());
} }
if (offsetsAreCorrect) { if (offsetsAreCorrect) {
assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset); assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset);
lastStartOffset = offsetAtt.startOffset(); lastStartOffset = offsetAtt.startOffset();
} }
@ -235,7 +235,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// We've seen a token leaving from this position // We've seen a token leaving from this position
// before; verify the startOffset is the same: // before; verify the startOffset is the same:
//System.out.println(" + vs " + pos + " -> " + startOffset); //System.out.println(" + vs " + pos + " -> " + startOffset);
assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset); assertEquals(i + " inconsistent startOffset: pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
} }
final int endPos = pos + posLength; final int endPos = pos + posLength;
@ -248,7 +248,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// We've seen a token arriving to this position // We've seen a token arriving to this position
// before; verify the endOffset is the same: // before; verify the endOffset is the same:
//System.out.println(" + ve " + endPos + " -> " + endOffset); //System.out.println(" + ve " + endPos + " -> " + endOffset);
assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset); assertEquals("inconsistent endOffset " + i + " pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
} }
} }
} }
@ -350,16 +350,19 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException { public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
checkResetException(a, input); checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length()); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
} }
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException { public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
checkResetException(a, input); checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length()); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
} }
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException { public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
checkResetException(a, input); checkResetException(a, input);
checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect); assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
} }
@ -378,6 +381,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException { public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths); assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
} }
public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths);
}
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException { public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null); assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
@ -598,7 +605,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
try { try {
for (int i = 0; i < iterations; i++) { for (int i = 0; i < iterations; i++) {
String text; String text;
if (random.nextInt(10) == 7) { if (random.nextInt(10) == 7) {
// real data from linedocs // real data from linedocs
text = docs.nextDoc().get("body"); text = docs.nextDoc().get("body");
@ -622,7 +629,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// synthetic // synthetic
text = TestUtil.randomAnalysisString(random, maxWordLength, simple); text = TestUtil.randomAnalysisString(random, maxWordLength, simple);
} }
try { try {
checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField); checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
if (iw != null) { if (iw != null) {
@ -768,7 +775,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
} catch (IllegalStateException ise) { } catch (IllegalStateException ise) {
// Catch & ignore MockTokenizer's // Catch & ignore MockTokenizer's
// anger... // anger...
if ("end() called before incrementToken() returned false!".equals(ise.getMessage())) { if (ise.getMessage().contains("end() called in wrong state=")) {
// OK // OK
} else { } else {
throw ise; throw ise;
@ -793,7 +800,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
} catch (IllegalStateException ise) { } catch (IllegalStateException ise) {
// Catch & ignore MockTokenizer's // Catch & ignore MockTokenizer's
// anger... // anger...
if ("end() called before incrementToken() returned false!".equals(ise.getMessage())) { if (ise.getMessage().contains("end() called in wrong state=")) {
// OK // OK
} else { } else {
throw ise; throw ise;

View File

@ -103,6 +103,7 @@ public class MockTokenizer extends Tokenizer {
public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) { public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH); this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
} }
/** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */ /** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */
public MockTokenizer() { public MockTokenizer() {
this(WHITESPACE, true); this(WHITESPACE, true);
@ -316,7 +317,7 @@ public class MockTokenizer extends Tokenizer {
// some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false. // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
// these tests should disable this check (in general you should consume the entire stream) // these tests should disable this check (in general you should consume the entire stream)
if (streamState != State.INCREMENT_FALSE) { if (streamState != State.INCREMENT_FALSE) {
fail("end() called before incrementToken() returned false!"); fail("end() called in wrong state=" + streamState + "!");
} }
} finally { } finally {
streamState = State.END; streamState = State.END;