diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2c2908de3b4..e9a0b8555a4 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -23,6 +23,14 @@ New features make it simpler to execute drill down when drill sideways counts are not needed (Emmanuel Keller via Mike McCandless) +* LUCENE-6664: A new SynonymGraphFilter outputs a correct graph + structure for multi-token synonyms, separating out a + FlattenGraphFilter that is hardwired into the current + SynonymFilter. This finally makes it possible to implement + correct multi-token synonyms at search time. See + http://blog.mikemccandless.com/2012/04/lucenes-tokenstreams-are-actually.html + for details. (Mike McCandless) + Bug Fixes * LUCENE-7547: JapaneseTokenizerFactory was failing to close the diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java new file mode 100644 index 00000000000..7ede190b61d --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java @@ -0,0 +1,424 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +/** + * This filter "casts" token graphs down into a "flat" form, + * for indexing. This is an inherently lossy process: nodes (positions) + * along side paths are forcefully merged. + * + *
<p>
In general this means the output graph will accept token sequences + * that the input graph did not accept, and will also fail to accept + * token sequences that the input graph did accept. + * + *
<p>
This is only necessary at indexing time because Lucene cannot yet index + * an arbitrary token graph. At search time there are better options, e.g. + * the experimental TermAutomatonQuery in sandbox. + * + * @lucene.experimental + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.RollingBuffer; + +/** + * Converts an incoming graph token stream, such as one from + * {@link SynonymGraphFilter}, into a flat form so that + * all nodes form a single linear chain with no side paths. Every + * path through the graph touches every node. + * + *
<p>
If the graph was not already flat to start, this + * is likely a lossy process, i.e. it will often cause the + * graph to accept token sequences it should not, and to + * reject token sequences it should not. + * + *
<p>
However, when applying synonyms during indexing, this + * is necessary because Lucene already does not index a graph + * and so the indexing process is already lossy + * (it ignores the {@link PositionLengthAttribute}). + * + * @lucene.experimental + */ +public final class FlattenGraphFilter extends TokenFilter { + + /** Holds all tokens leaving a given input position. */ + private final static class InputNode implements RollingBuffer.Resettable { + private final List tokens = new ArrayList<>(); + + /** Our input node, or -1 if we haven't been assigned yet */ + int node = -1; + + /** Maximum to input node for all tokens leaving here; we use this + * to know when we can freeze. */ + int maxToNode = -1; + + /** Where we currently map to; this changes (can only + * increase as we see more input tokens), until we are finished + * with this position. */ + int outputNode = -1; + + /** Which token (index into {@link #tokens}) we will next output. */ + int nextOut; + + @Override + public void reset() { + tokens.clear(); + node = -1; + outputNode = -1; + maxToNode = -1; + nextOut = 0; + } + } + + /** Gathers up merged input positions into a single output position, + * only for the current "frontier" of nodes we've seen but can't yet + * output because they are not frozen. */ + private final static class OutputNode implements RollingBuffer.Resettable { + private final List inputNodes = new ArrayList<>(); + + /** Node ID for this output, or -1 if we haven't been assigned yet. */ + int node = -1; + + /** Which input node (index into {@link #inputNodes}) we will next output. */ + int nextOut; + + /** Start offset of tokens leaving this node. */ + int startOffset = -1; + + /** End offset of tokens arriving to this node. */ + int endOffset = -1; + + @Override + public void reset() { + inputNodes.clear(); + node = -1; + nextOut = 0; + startOffset = -1; + endOffset = -1; + } + } + + private final RollingBuffer inputNodes = new RollingBuffer() { + @Override + protected InputNode newInstance() { + return new InputNode(); + } + }; + + private final RollingBuffer outputNodes = new RollingBuffer() { + @Override + protected OutputNode newInstance() { + return new OutputNode(); + } + }; + + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + /** Which input node the last seen token leaves from */ + private int inputFrom; + + /** We are currently releasing tokens leaving from this output node */ + private int outputFrom; + + // for debugging: + //private int retOutputFrom; + + private boolean done; + + private int lastOutputFrom; + + private int finalOffset; + + private int finalPosInc; + + private int maxLookaheadUsed; + + private int lastStartOffset; + + public FlattenGraphFilter(TokenStream in) { + super(in); + } + + private boolean releaseBufferedToken() { + + // We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving): + while (outputFrom < outputNodes.getMaxPos()) { + OutputNode output = outputNodes.get(outputFrom); + if (output.inputNodes.isEmpty()) { + // No tokens arrived to this node, which happens for the first node + // after a hole: + //System.out.println(" skip empty outputFrom=" + outputFrom); + outputFrom++; + continue; + } + + int maxToNode = -1; + for(int inputNodeID : output.inputNodes) { + InputNode inputNode = 
inputNodes.get(inputNodeID); + assert inputNode.outputNode == outputFrom; + maxToNode = Math.max(maxToNode, inputNode.maxToNode); + } + //System.out.println(" release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom); + + // TODO: we could shrink the frontier here somewhat if we + // always output posLen=1 as part of our "sausagizing": + if (maxToNode <= inputFrom || done) { + //System.out.println(" output node merged these inputs: " + output.inputNodes); + // These tokens are now frozen + assert output.nextOut < output.inputNodes.size(): "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size(); + InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut)); + if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) { + return false; + } + if (inputNode.tokens.size() == 0) { + assert inputNode.nextOut == 0; + assert output.nextOut == 0; + // Hole dest nodes should never be merged since 1) we always + // assign them to a new output position, and 2) since they never + // have arriving tokens they cannot be pushed: + assert output.inputNodes.size() == 1: output.inputNodes.size(); + outputFrom++; + inputNodes.freeBefore(output.inputNodes.get(0)); + outputNodes.freeBefore(outputFrom); + continue; + } + + assert inputNode.nextOut < inputNode.tokens.size(); + + restoreState(inputNode.tokens.get(inputNode.nextOut)); + + // Correct posInc + assert outputFrom >= lastOutputFrom; + posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom); + int toInputNodeID = inputNode.node + posLenAtt.getPositionLength(); + InputNode toInputNode = inputNodes.get(toInputNodeID); + + // Correct posLen + assert toInputNode.outputNode > outputFrom; + posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom); + lastOutputFrom = outputFrom; + inputNode.nextOut++; + //System.out.println(" ret " + this); + + OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode); + + // Correct offsets + + // This is a bit messy; we must do this so offset don't go backwards, + // which would otherwise happen if the replacement has more tokens + // than the input: + int startOffset = Math.max(lastStartOffset, output.startOffset); + offsetAtt.setOffset(startOffset, outputEndNode.endOffset); + lastStartOffset = startOffset; + + if (inputNode.nextOut == inputNode.tokens.size()) { + output.nextOut++; + if (output.nextOut == output.inputNodes.size()) { + outputFrom++; + inputNodes.freeBefore(output.inputNodes.get(0)); + outputNodes.freeBefore(outputFrom); + } + } + + return true; + } else { + return false; + } + } + + //System.out.println(" break false"); + return false; + } + + @Override + public boolean incrementToken() throws IOException { + //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom); + + while (true) { + if (releaseBufferedToken()) { + //retOutputFrom += posIncAtt.getPositionIncrement(); + //System.out.println(" return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength())); + //printStates(); + return true; + } else if (done) { + //System.out.println(" done, return false"); + return false; + } + + if (input.incrementToken()) { + // Input node this token leaves from: + inputFrom += posIncAtt.getPositionIncrement(); + + int startOffset = offsetAtt.startOffset(); + int endOffset = offsetAtt.endOffset(); + + // Input node this token goes to: + int inputTo = inputFrom + posLenAtt.getPositionLength(); + //System.out.println(" input.inc " + termAtt + 
": " + inputFrom + "-" + inputTo); + + InputNode src = inputNodes.get(inputFrom); + if (src.node == -1) { + // This means the "from" node of this token was never seen as a "to" node, + // which should only happen if we just crossed a hole. This is a challenging + // case for us because we normally rely on the full dependencies expressed + // by the arcs to assign outgoing node IDs. It would be better if tokens + // were never dropped but instead just marked deleted with a new + // TermDeletedAttribute (boolean valued) ... but until that future, we have + // a hack here to forcefully jump the output node ID: + assert src.outputNode == -1; + src.node = inputFrom; + + src.outputNode = outputNodes.getMaxPos() + 1; + //System.out.println(" hole: force to outputNode=" + src.outputNode); + OutputNode outSrc = outputNodes.get(src.outputNode); + + // Not assigned yet: + assert outSrc.node == -1; + outSrc.node = src.outputNode; + outSrc.inputNodes.add(inputFrom); + outSrc.startOffset = startOffset; + } else { + OutputNode outSrc = outputNodes.get(src.outputNode); + if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) { + // "shrink wrap" the offsets so the original tokens (with most + // restrictive offsets) win: + outSrc.startOffset = Math.max(startOffset, outSrc.startOffset); + } + } + + // Buffer this token: + src.tokens.add(captureState()); + src.maxToNode = Math.max(src.maxToNode, inputTo); + maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize()); + + InputNode dest = inputNodes.get(inputTo); + if (dest.node == -1) { + // Common case: first time a token is arriving to this input position: + dest.node = inputTo; + } + + // Always number output nodes sequentially: + int outputEndNode = src.outputNode + 1; + + if (outputEndNode > dest.outputNode) { + if (dest.outputNode != -1) { + boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo)); + assert removed; + } + //System.out.println(" increase output node: " + dest.outputNode + " vs " + outputEndNode); + outputNodes.get(outputEndNode).inputNodes.add(inputTo); + dest.outputNode = outputEndNode; + + // Since all we ever do is merge incoming nodes together, and then renumber + // the merged nodes sequentially, we should only ever assign smaller node + // numbers: + assert outputEndNode <= inputTo: "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo; + } + + OutputNode outDest = outputNodes.get(dest.outputNode); + // "shrink wrap" the offsets so the original tokens (with most + // restrictive offsets) win: + if (outDest.endOffset == -1 || endOffset < outDest.endOffset) { + outDest.endOffset = endOffset; + } + + } else { + //System.out.println(" got false from input"); + input.end(); + finalPosInc = posIncAtt.getPositionIncrement(); + finalOffset = offsetAtt.endOffset(); + done = true; + // Don't return false here: we need to force release any buffered tokens now + } + } + } + + // Only for debugging: + /* + private void printStates() { + System.out.println("states:"); + for(int i=outputFrom;i args) { + super(args); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new FlattenGraphFilter(input); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 6a72920d2f6..29f6e1c860f 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -81,6 +81,9 @@ import org.apache.lucene.util.fst.FST; * used for parsing. Subsequent tokens simply pass through * and are not parsed. A future improvement would be to * allow these tokens to also be matched.

+ * + * @deprecated Use {@link SynonymGraphFilter} instead, but be sure to also + * use {@link FlattenGraphFilter} at index time (not at search time) as well. */ // TODO: maybe we should resolve token -> wordID then run @@ -105,6 +108,7 @@ import org.apache.lucene.util.fst.FST; // // Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps +@Deprecated public final class SynonymFilter extends TokenFilter { public static final String TYPE_SYNONYM = "SYNONYM"; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java index 8bab9a7eaf6..df10e9b75a9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java @@ -72,7 +72,11 @@ import org.apache.lucene.analysis.util.TokenizerFactory; *
  • {@link Analyzer} analyzer - an analyzer used for each raw synonym
  • * * @see SolrSynonymParser SolrSynonymParser: default format + * + * @deprecated Use {@link SynonymGraphFilterFactory} instead, but be sure to also + * use {@link FlattenGraphFilterFactory} at index time (not at search time) as well. */ +@Deprecated public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { private final boolean ignoreCase; private final String tokenizerFactory; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java new file mode 100644 index 00000000000..3d50e08de64 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymGraphFilter.java @@ -0,0 +1,586 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.RollingBuffer; +import org.apache.lucene.util.fst.FST; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +// TODO: maybe we should resolve token -> wordID then run +// FST on wordIDs, for better perf? + +// TODO: a more efficient approach would be Aho/Corasick's +// algorithm +// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm +// It improves over the current approach here +// because it does not fully re-start matching at every +// token. For example if one pattern is "a b c x" +// and another is "b c d" and the input is "a b c d", on +// trying to parse "a b c x" but failing when you got to x, +// rather than starting over again your really should +// immediately recognize that "b c d" matches at the next +// input. I suspect this won't matter that much in +// practice, but it's possible on some set of synonyms it +// will. We'd have to modify Aho/Corasick to enforce our +// conflict resolving (eg greedy matching) because that algo +// finds all matches. This really amounts to adding a .* +// closure to the FST and then determinizing it. 
+// +// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps + +/** Applies single- or multi-token synonyms from a {@link SynonymMap} + * to an incoming {@link TokenStream}, producing a fully correct graph + * output. This is a replacement for {@link SynonymFilter}, which produces + * incorrect graphs for multi-token synonyms. + * + *
<p>
    However, if you use this during indexing, you must follow it with + * {@link FlattenGraphFilter} to squash tokens on top of one another + * like {@link SynonymFilter}, because the indexer can't directly + * consume a graph. To get fully correct positional queries when your + * synonym replacements are multiple tokens, you should instead apply + * synonyms using this {@code TokenFilter} at query time and translate + * the resulting graph to a {@code TermAutomatonQuery} e.g. using + * {@code TokenStreamToTermAutomatonQuery}. + * + *
<p>
    NOTE: this cannot consume an incoming graph; results will + * be undefined. + * + * @lucene.experimental */ + +public final class SynonymGraphFilter extends TokenFilter { + + public static final String TYPE_SYNONYM = "SYNONYM"; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + + private final SynonymMap synonyms; + private final boolean ignoreCase; + + private final FST fst; + + private final FST.BytesReader fstReader; + private final FST.Arc scratchArc; + private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + private final BytesRef scratchBytes = new BytesRef(); + private final CharsRefBuilder scratchChars = new CharsRefBuilder(); + private final LinkedList outputBuffer = new LinkedList<>(); + + private int nextNodeOut; + private int lastNodeOut; + private int maxLookaheadUsed; + + // For testing: + private int captureCount; + + private boolean liveToken; + + // Start/end offset of the current match: + private int matchStartOffset; + private int matchEndOffset; + + // True once the input TokenStream is exhausted: + private boolean finished; + + private int lookaheadNextRead; + private int lookaheadNextWrite; + + private RollingBuffer lookahead = new RollingBuffer() { + @Override + protected BufferedInputToken newInstance() { + return new BufferedInputToken(); + } + }; + + static class BufferedInputToken implements RollingBuffer.Resettable { + final CharsRefBuilder term = new CharsRefBuilder(); + AttributeSource.State state; + int startOffset = -1; + int endOffset = -1; + + @Override + public void reset() { + state = null; + term.clear(); + + // Intentionally invalid to ferret out bugs: + startOffset = -1; + endOffset = -1; + } + } + + static class BufferedOutputToken { + final String term; + + // Non-null if this was an incoming token: + final State state; + + final int startNode; + final int endNode; + + public BufferedOutputToken(State state, String term, int startNode, int endNode) { + this.state = state; + this.term = term; + this.startNode = startNode; + this.endNode = endNode; + } + } + + public SynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) { + super(input); + this.synonyms = synonyms; + this.fst = synonyms.fst; + if (fst == null) { + throw new IllegalArgumentException("fst must be non-null"); + } + this.fstReader = fst.getBytesReader(); + scratchArc = new FST.Arc<>(); + this.ignoreCase = ignoreCase; + } + + @Override + public boolean incrementToken() throws IOException { + //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut); + + assert lastNodeOut <= nextNodeOut; + + if (outputBuffer.isEmpty() == false) { + // We still have pending outputs from a prior synonym match: + releaseBufferedToken(); + //System.out.println(" syn: ret buffered=" + this); + assert liveToken == false; + return true; + } + + // Try to parse a new synonym match at the current token: + + if (parse()) { + // A new match was found: + releaseBufferedToken(); + //System.out.println(" syn: after parse, ret buffered=" + this); + assert liveToken == false; + return true; + } + + if (lookaheadNextRead == lookaheadNextWrite) { + + // 
Fast path: parse pulled one token, but it didn't match + // the start for any synonym, so we now return it "live" w/o having + // cloned all of its atts: + if (finished) { + //System.out.println(" syn: ret END"); + return false; + } + + assert liveToken; + liveToken = false; + + // NOTE: no need to change posInc since it's relative, i.e. whatever + // node our output is upto will just increase by the incoming posInc. + // We also don't need to change posLen, but only because we cannot + // consume a graph, so the incoming token can never span a future + // synonym match. + + } else { + // We still have buffered lookahead tokens from a previous + // parse attempt that required lookahead; just replay them now: + //System.out.println(" restore buffer"); + assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite; + BufferedInputToken token = lookahead.get(lookaheadNextRead); + lookaheadNextRead++; + + restoreState(token.state); + + lookahead.freeBefore(lookaheadNextRead); + + //System.out.println(" after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset()); + assert liveToken == false; + } + + lastNodeOut += posIncrAtt.getPositionIncrement(); + nextNodeOut = lastNodeOut + posLenAtt.getPositionLength(); + + //System.out.println(" syn: ret lookahead=" + this); + + return true; + } + + private void releaseBufferedToken() throws IOException { + //System.out.println(" releaseBufferedToken"); + + BufferedOutputToken token = outputBuffer.pollFirst(); + + if (token.state != null) { + // This is an original input token (keepOrig=true case): + //System.out.println(" hasState"); + restoreState(token.state); + //System.out.println(" startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset()); + } else { + clearAttributes(); + //System.out.println(" no state"); + termAtt.append(token.term); + + // We better have a match already: + assert matchStartOffset != -1; + + offsetAtt.setOffset(matchStartOffset, matchEndOffset); + //System.out.println(" startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset); + typeAtt.setType(TYPE_SYNONYM); + } + + //System.out.println(" lastNodeOut=" + lastNodeOut); + //System.out.println(" term=" + termAtt); + + posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut); + lastNodeOut = token.startNode; + posLenAtt.setPositionLength(token.endNode - token.startNode); + } + + /** Scans the next input token(s) to see if a synonym matches. Returns true + * if a match was found. 
*/ + private boolean parse() throws IOException { + // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this)); + + // Holds the longest match we've seen so far: + BytesRef matchOutput = null; + int matchInputLength = 0; + + BytesRef pendingOutput = fst.outputs.getNoOutput(); + fst.getFirstArc(scratchArc); + + assert scratchArc.output == fst.outputs.getNoOutput(); + + // How many tokens in the current match + int matchLength = 0; + boolean doFinalCapture = false; + + int lookaheadUpto = lookaheadNextRead; + matchStartOffset = -1; + + byToken: + while (true) { + //System.out.println(" cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos()); + + // Pull next token's chars: + final char[] buffer; + final int bufferLen; + final int inputEndOffset; + + if (lookaheadUpto <= lookahead.getMaxPos()) { + // Still in our lookahead buffer + BufferedInputToken token = lookahead.get(lookaheadUpto); + lookaheadUpto++; + buffer = token.term.chars(); + bufferLen = token.term.length(); + inputEndOffset = token.endOffset; + //System.out.println(" use buffer now max=" + lookahead.getMaxPos()); + if (matchStartOffset == -1) { + matchStartOffset = token.startOffset; + } + } else { + + // We used up our lookahead buffer of input tokens + // -- pull next real input token: + + assert finished || liveToken == false; + + if (finished) { + //System.out.println(" break: finished"); + break; + } else if (input.incrementToken()) { + //System.out.println(" input.incrToken"); + liveToken = true; + buffer = termAtt.buffer(); + bufferLen = termAtt.length(); + if (matchStartOffset == -1) { + matchStartOffset = offsetAtt.startOffset(); + } + inputEndOffset = offsetAtt.endOffset(); + + lookaheadUpto++; + } else { + // No more input tokens + finished = true; + //System.out.println(" break: now set finished"); + break; + } + } + + matchLength++; + //System.out.println(" cycle term=" + new String(buffer, 0, bufferLen)); + + // Run each char in this token through the FST: + int bufUpto = 0; + while (bufUpto < bufferLen) { + final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); + if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { + break byToken; + } + + // Accum the output + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + bufUpto += Character.charCount(codePoint); + } + + assert bufUpto == bufferLen; + + // OK, entire token matched; now see if this is a final + // state in the FST (a match): + if (scratchArc.isFinal()) { + matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); + matchInputLength = matchLength; + matchEndOffset = inputEndOffset; + //System.out.println(" ** match"); + } + + // See if the FST can continue matching (ie, needs to + // see the next input token): + if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) { + // No further rules can match here; we're done + // searching for matching rules starting at the + // current input position. 
+ break; + } else { + // More matching is possible -- accum the output (if + // any) of the WORD_SEP arc: + pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); + doFinalCapture = true; + if (liveToken) { + capture(); + } + } + } + + if (doFinalCapture && liveToken && finished == false) { + // Must capture the final token if we captured any prior tokens: + capture(); + } + + if (matchOutput != null) { + + if (liveToken) { + // Single input token synonym; we must buffer it now: + capture(); + } + + // There is a match! + bufferOutputTokens(matchOutput, matchInputLength); + lookaheadNextRead += matchInputLength; + //System.out.println(" precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos()); + lookahead.freeBefore(lookaheadNextRead); + //System.out.println(" match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos()); + return true; + } else { + //System.out.println(" no match; lookaheadNextRead=" + lookaheadNextRead); + return false; + } + + //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite); + } + + /** Expands the output graph into the necessary tokens, adding + * synonyms as side paths parallel to the input tokens, and + * buffers them in the output token buffer. */ + private void bufferOutputTokens(BytesRef bytes, int matchInputLength) { + bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); + + final int code = bytesReader.readVInt(); + final boolean keepOrig = (code & 0x1) == 0; + //System.out.println(" buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength); + + // How many nodes along all paths; we need this to assign the + // node ID for the final end node where all paths merge back: + int totalPathNodes; + if (keepOrig) { + assert matchInputLength > 0; + totalPathNodes = matchInputLength - 1; + } else { + totalPathNodes = 0; + } + + // How many synonyms we will insert over this match: + final int count = code >>> 1; + + // TODO: we could encode this instead into the FST: + + // 1st pass: count how many new nodes we need + List> paths = new ArrayList<>(); + for(int outputIDX=0;outputIDX path = new ArrayList<>(); + paths.add(path); + int chEnd = scratchChars.length(); + for(int chUpto=0; chUpto<=chEnd; chUpto++) { + if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) { + path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart)); + lastStart = 1 + chUpto; + } + } + + assert path.size() > 0; + totalPathNodes += path.size() - 1; + } + //System.out.println(" totalPathNodes=" + totalPathNodes); + + // 2nd pass: buffer tokens for the graph fragment + + // NOTE: totalPathNodes will be 0 in the case where the matched + // input is a single token and all outputs are also a single token + + // We "spawn" a side-path for each of the outputs for this matched + // synonym, all ending back at this end node: + + int startNode = nextNodeOut; + + int endNode = startNode + totalPathNodes + 1; + //System.out.println(" " + paths.size() + " new side-paths"); + + // First, fanout all tokens departing start node for these new side paths: + int newNodeCount = 0; + for(List path : paths) { + int pathEndNode; + //System.out.println(" path size=" + path.size()); + if (path.size() == 1) { + // Single token output, so there are no intermediate nodes: + pathEndNode = endNode; + } else { + pathEndNode = nextNodeOut + newNodeCount + 1; + newNodeCount += path.size() - 1; + } + 
outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode)); + } + + // We must do the original tokens last, else the offsets "go backwards": + if (keepOrig) { + BufferedInputToken token = lookahead.get(lookaheadNextRead); + int inputEndNode; + if (matchInputLength == 1) { + // Single token matched input, so there are no intermediate nodes: + inputEndNode = endNode; + } else { + inputEndNode = nextNodeOut + newNodeCount + 1; + } + + //System.out.println(" keepOrig first token: " + token.term); + + outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode)); + } + + nextNodeOut = endNode; + + // Do full side-path for each syn output: + for(int pathID=0;pathID path = paths.get(pathID); + if (path.size() > 1) { + int lastNode = outputBuffer.get(pathID).endNode; + for(int i=1;i 1) { + // Do full "side path" with the original tokens: + int lastNode = outputBuffer.get(paths.size()).endNode; + for(int i=1;i + * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100"> + * <analyzer> + * <tokenizer class="solr.WhitespaceTokenizerFactory"/> + * <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" + * format="solr" ignoreCase="false" expand="true" + * tokenizerFactory="solr.WhitespaceTokenizerFactory" + * [optional tokenizer factory parameters]/> + * </analyzer> + * </fieldType> + * + *
<p>
+ * An optional param name prefix of "tokenizerFactory." may be used for any + * init params that the SynonymGraphFilterFactory needs to pass to the specified + * TokenizerFactory. If the TokenizerFactory expects an init parameter with + * the same name as an init param used by the SynonymGraphFilterFactory, the prefix + * is mandatory. + *
</p>
    + * + *
<p>
+ * The optional {@code format} parameter controls how the synonyms will be parsed: + * It supports the short names of {@code solr} for {@link SolrSynonymParser} + * and {@code wordnet} for {@link WordnetSynonymParser}, or your own + * {@code SynonymMap.Parser} class name. The default is {@code solr}. + * A custom {@link SynonymMap.Parser} is expected to have a constructor taking: + *

    + * @see SolrSynonymParser SolrSynonymParser: default format + * + * @lucene.experimental + */ +public class SynonymGraphFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private final boolean ignoreCase; + private final String tokenizerFactory; + private final String synonyms; + private final String format; + private final boolean expand; + private final String analyzerName; + private final Map tokArgs = new HashMap<>(); + + private SynonymMap map; + + public SynonymGraphFilterFactory(Map args) { + super(args); + ignoreCase = getBoolean(args, "ignoreCase", false); + synonyms = require(args, "synonyms"); + format = get(args, "format"); + expand = getBoolean(args, "expand", true); + + analyzerName = get(args, "analyzer"); + tokenizerFactory = get(args, "tokenizerFactory"); + if (analyzerName != null && tokenizerFactory != null) { + throw new IllegalArgumentException("Analyzer and TokenizerFactory can't be specified both: " + + analyzerName + " and " + tokenizerFactory); + } + + if (tokenizerFactory != null) { + tokArgs.put("luceneMatchVersion", getLuceneMatchVersion().toString()); + for (Iterator itr = args.keySet().iterator(); itr.hasNext();) { + String key = itr.next(); + tokArgs.put(key.replaceAll("^tokenizerFactory\\.",""), args.get(key)); + itr.remove(); + } + } + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public TokenStream create(TokenStream input) { + // if the fst is null, it means there's actually no synonyms... just return the original stream + // as there is nothing to do here. + return map.fst == null ? input : new SynonymGraphFilter(input, map, ignoreCase); + } + + @Override + public void inform(ResourceLoader loader) throws IOException { + final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory); + Analyzer analyzer; + + if (analyzerName != null) { + analyzer = loadAnalyzer(loader, analyzerName); + } else { + analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create(); + TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; + } + + try (Analyzer a = analyzer) { + String formatClass = format; + if (format == null || format.equals("solr")) { + formatClass = SolrSynonymParser.class.getName(); + } else if (format.equals("wordnet")) { + formatClass = WordnetSynonymParser.class.getName(); + } + // TODO: expose dedup as a parameter? + map = loadSynonyms(loader, formatClass, true, a); + } catch (ParseException e) { + throw new IOException("Error parsing synonyms file:", e); + } + } + + /** + * Load synonyms with the given {@link SynonymMap.Parser} class. 
+ */ + protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + SynonymMap.Parser parser; + Class clazz = loader.findClass(cname, SynonymMap.Parser.class); + try { + parser = clazz.getConstructor(boolean.class, boolean.class, Analyzer.class).newInstance(dedup, expand, analyzer); + } catch (Exception e) { + throw new RuntimeException(e); + } + + List files = splitFileNames(synonyms); + for (String file : files) { + decoder.reset(); + parser.parse(new InputStreamReader(loader.openResource(file), decoder)); + } + return parser.build(); + } + + // (there are no tests for this functionality) + private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException { + Class clazz = loader.findClass(cname, TokenizerFactory.class); + try { + TokenizerFactory tokFactory = clazz.getConstructor(Map.class).newInstance(tokArgs); + if (tokFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) tokFactory).inform(loader); + } + return tokFactory; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException { + Class clazz = loader.findClass(cname, Analyzer.class); + try { + Analyzer analyzer = clazz.getConstructor().newInstance(); + if (analyzer instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) analyzer).inform(loader); + } + return analyzer; + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java index fc8703f57b2..7371e235085 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java @@ -74,6 +74,11 @@ public class SynonymMap { private int maxHorizontalContext; private final boolean dedup; + /** Default constructor, passes {@code dedup=true}. */ + public Builder() { + this(true); + } + /** If dedup is true then identical rules (same input, * same output) will be added only once. */ public Builder(boolean dedup) { @@ -109,8 +114,6 @@ public class SynonymMap { reuse.setLength(upto); return reuse.get(); } - - /** only used for asserting! */ private boolean hasHoles(CharsRef chars) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java index 9100345251f..13289bee1bd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java @@ -256,10 +256,12 @@ public abstract class CharTokenizer extends Tokenizer { } end += charCount; length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized - if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test + if (length >= MAX_WORD_LEN) { // buffer overflow! 
make sure to check for >= surrogate pair could break == test break; - } else if (length > 0) // at non-Letter w/ chars + } + } else if (length > 0) { // at non-Letter w/ chars break; // return 'em + } } termAtt.setLength(length); diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 70120c5221b..73986d73fec 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -101,5 +101,7 @@ org.apache.lucene.analysis.standard.ClassicFilterFactory org.apache.lucene.analysis.standard.StandardFilterFactory org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory org.apache.lucene.analysis.synonym.SynonymFilterFactory +org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory +org.apache.lucene.analysis.synonym.FlattenGraphFilterFactory org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory org.apache.lucene.analysis.util.ElisionFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java index a22d9c905d7..580b17e205f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java @@ -224,18 +224,27 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, - new int[] { 1, 1 }); + null, + new int[] { 1, 1 }, + null, + false); /* only in this case, posInc of 2 ?! */ assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 12, 13, 13 }, - new int[] { 1, 1, 0, 1 }); + null, + new int[] { 1, 1, 0, 1 }, + null, + false); assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, - new int[] { 1, 1, 1 }); + null, + new int[] { 1, 1, 1 }, + null, + false); /* analyzer that will consume tokens with large position increments */ Analyzer a2 = new Analyzer() { @@ -252,24 +261,36 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, - new int[] { 1, 10, 1 }); + null, + new int[] { 1, 10, 1 }, + null, + false); /* the "/" had a position increment of 10, where did it go?!?!! 
*/ assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, - new int[] { 1, 11 }); + null, + new int[] { 1, 11 }, + null, + false); /* in this case, the increment of 10 from the "/" is carried over */ assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 12, 13, 13 }, - new int[] { 1, 11, 0, 1 }); + null, + new int[] { 1, 11, 0, 1 }, + null, + false); assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, - new int[] { 1, 11, 1 }); + null, + new int[] { 1, 11, 1 }, + null, + false); Analyzer a3 = new Analyzer() { @Override @@ -284,14 +305,21 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase { new String[] { "lucene", "lucenesolr", "solr" }, new int[] { 0, 0, 7 }, new int[] { 6, 11, 11 }, - new int[] { 1, 0, 1 }); + null, + new int[] { 1, 0, 1 }, + null, + false); /* the stopword should add a gap here */ assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "lucenesolr", "solr" }, new int[] { 4, 4, 11 }, new int[] { 10, 15, 15 }, - new int[] { 2, 0, 1 }); + null, + new int[] { 2, 0, 1 }, + null, + false); + IOUtils.close(a, a2, a3); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java new file mode 100644 index 00000000000..d61fa96669f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestFlattenGraphFilter.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.synonym; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +public class TestFlattenGraphFilter extends BaseTokenStreamTestCase { + + private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { + final Token t = new Token(term, startOffset, endOffset); + t.setPositionIncrement(posInc); + t.setPositionLength(posLength); + return t; + } + + public void testSimpleMock() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); + TokenStream ts = new FlattenGraphFilter(tokenizer); + return new TokenStreamComponents(tokenizer, ts); + } + }; + + assertAnalyzesTo(a, "wtf happened", + new String[] {"wtf", "happened"}, + new int[] { 0, 4}, + new int[] { 3, 12}, + null, + new int[] { 1, 1}, + new int[] { 1, 1}, + true); + } + + // Make sure graph is unchanged if it's already flat + public void testAlreadyFlatten() throws Exception { + TokenStream in = new CannedTokenStream(0, 12, new Token[] { + token("wtf", 1, 1, 0, 3), + token("what", 0, 1, 0, 3), + token("wow", 0, 1, 0, 3), + token("the", 1, 1, 0, 3), + token("that's", 0, 1, 0, 3), + token("fudge", 1, 1, 0, 3), + token("funny", 0, 1, 0, 3), + token("happened", 1, 1, 4, 12) + }); + + TokenStream out = new FlattenGraphFilter(in); + + // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: + assertTokenStreamContents(out, + new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"}, + new int[] {0, 0, 0, 0, 0, 0, 0, 4}, + new int[] {3, 3, 3, 3, 3, 3, 3, 12}, + new int[] {1, 0, 0, 1, 0, 1, 0, 1}, + new int[] {1, 1, 1, 1, 1, 1, 1, 1}, + 12); + } + + public void testWTF1() throws Exception { + + // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input: + TokenStream in = new CannedTokenStream(0, 12, new Token[] { + token("wtf", 1, 5, 0, 3), + token("what", 0, 1, 0, 3), + token("wow", 0, 3, 0, 3), + token("the", 1, 1, 0, 3), + token("fudge", 1, 3, 0, 3), + token("that's", 1, 1, 0, 3), + token("funny", 1, 1, 0, 3), + token("happened", 1, 1, 4, 12) + }); + + + TokenStream out = new FlattenGraphFilter(in); + + // ... 
but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: + assertTokenStreamContents(out, + new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"}, + new int[] {0, 0, 0, 0, 0, 0, 0, 4}, + new int[] {3, 3, 3, 3, 3, 3, 3, 12}, + new int[] {1, 0, 0, 1, 0, 1, 0, 1}, + new int[] {3, 1, 1, 1, 1, 1, 1, 1}, + 12); + + } + + /** Same as testWTF1 except the "wtf" token comes out later */ + public void testWTF2() throws Exception { + + // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input: + TokenStream in = new CannedTokenStream(0, 12, new Token[] { + token("what", 1, 1, 0, 3), + token("wow", 0, 3, 0, 3), + token("wtf", 0, 5, 0, 3), + token("the", 1, 1, 0, 3), + token("fudge", 1, 3, 0, 3), + token("that's", 1, 1, 0, 3), + token("funny", 1, 1, 0, 3), + token("happened", 1, 1, 4, 12) + }); + + + TokenStream out = new FlattenGraphFilter(in); + + // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: + assertTokenStreamContents(out, + new String[] {"what", "wow", "wtf", "the", "that's", "fudge", "funny", "happened"}, + new int[] {0, 0, 0, 0, 0, 0, 0, 4}, + new int[] {3, 3, 3, 3, 3, 3, 3, 12}, + new int[] {1, 0, 0, 1, 0, 1, 0, 1}, + new int[] {1, 1, 3, 1, 1, 1, 1, 1}, + 12); + + } + + public void testNonGreedySynonyms() throws Exception { + // This is just "hypothetical" for Lucene today, because SynFilter is + // greedy: when two syn rules match on overlapping tokens, only one + // (greedily) wins. This test pretends all syn matches could match: + + TokenStream in = new CannedTokenStream(0, 20, new Token[] { + token("wizard", 1, 1, 0, 6), + token("wizard_of_oz", 0, 3, 0, 12), + token("of", 1, 1, 7, 9), + token("oz", 1, 1, 10, 12), + token("oz_screams", 0, 2, 10, 20), + token("screams", 1, 1, 13, 20), + }); + + + TokenStream out = new FlattenGraphFilter(in); + + // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: + assertTokenStreamContents(out, + new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"}, + new int[] {0, 0, 7, 10, 10, 13}, + new int[] {6, 12, 9, 12, 20, 20}, + new int[] {1, 0, 1, 1, 0, 1}, + new int[] {1, 3, 1, 1, 2, 1}, + 20); + + } + + public void testNonGraph() throws Exception { + TokenStream in = new CannedTokenStream(0, 22, new Token[] { + token("hello", 1, 1, 0, 5), + token("pseudo", 1, 1, 6, 12), + token("world", 1, 1, 13, 18), + token("fun", 1, 1, 19, 22), + }); + + + TokenStream out = new FlattenGraphFilter(in); + + // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: + assertTokenStreamContents(out, + new String[] {"hello", "pseudo", "world", "fun"}, + new int[] {0, 6, 13, 19}, + new int[] {5, 12, 18, 22}, + new int[] {1, 1, 1, 1}, + new int[] {1, 1, 1, 1}, + 22); + } + + public void testSimpleHole() throws Exception { + TokenStream in = new CannedTokenStream(0, 13, new Token[] { + token("hello", 1, 1, 0, 5), + token("hole", 2, 1, 6, 10), + token("fun", 1, 1, 11, 13), + }); + + + TokenStream out = new FlattenGraphFilter(in); + + // ... 
but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: + assertTokenStreamContents(out, + new String[] {"hello", "hole", "fun"}, + new int[] {0, 6, 11}, + new int[] {5, 10, 13}, + new int[] {1, 2, 1}, + new int[] {1, 1, 1}, + 13); + } + + public void testHoleUnderSyn() throws Exception { + // Tests a StopFilter after SynFilter where a stopword in a syn is removed + // + // wizard of oz -> woz syn, but then "of" becomes a hole + + TokenStream in = new CannedTokenStream(0, 12, new Token[] { + token("wizard", 1, 1, 0, 6), + token("woz", 0, 3, 0, 12), + token("oz", 2, 1, 10, 12), + }); + + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents(out, + new String[] {"wizard", "woz", "oz"}, + new int[] {0, 0, 10}, + new int[] {6, 12, 12}, + new int[] {1, 0, 2}, + new int[] {1, 3, 1}, + 12); + } + + public void testStrangelyNumberedNodes() throws Exception { + + // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!) + TokenStream in = new CannedTokenStream(0, 27, new Token[] { + token("dog", 1, 3, 0, 5), + token("puppy", 0, 3, 0, 5), + token("flies", 3, 1, 6, 11), + }); + + TokenStream out = new FlattenGraphFilter(in); + + assertTokenStreamContents(out, + new String[] {"dog", "puppy", "flies"}, + new int[] {0, 0, 6}, + new int[] {5, 5, 11}, + new int[] {1, 0, 1}, + new int[] {1, 1, 1}, + 27); + } + + public void testTwoLongParallelPaths() throws Exception { + + // "a a a a a a" in parallel with "b b b b b b" + TokenStream in = new CannedTokenStream(0, 11, new Token[] { + token("a", 1, 1, 0, 1), + token("b", 0, 2, 0, 1), + token("a", 1, 2, 2, 3), + token("b", 1, 2, 2, 3), + token("a", 1, 2, 4, 5), + token("b", 1, 2, 4, 5), + token("a", 1, 2, 6, 7), + token("b", 1, 2, 6, 7), + token("a", 1, 2, 8, 9), + token("b", 1, 2, 8, 9), + token("a", 1, 2, 10, 11), + token("b", 1, 2, 10, 11), + }); + + + TokenStream out = new FlattenGraphFilter(in); + + // ... becomes flattened to a single path with overlapping a/b token between each node: + assertTokenStreamContents(out, + new String[] {"a", "b", "a", "b", "a", "b", "a", "b", "a", "b", "a", "b"}, + new int[] {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10}, + new int[] {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11}, + new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}, + new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + 11); + + } + + // NOTE: TestSynonymGraphFilter's testRandomSyns also tests FlattenGraphFilter +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java new file mode 100644 index 00000000000..edf2d2a96c5 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java @@ -0,0 +1,1956 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.synonym; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockGraphTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.TestUtil; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonTestUtil; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.TooComplexToDeterminizeException; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.io.StringReader; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class TestSynonymGraphFilter extends BaseTokenStreamTestCase { + + /** Set as a side effect by {@link #getAnalyzer} and {@link #getFlattenAnalyzer}. 
*/ + private SynonymGraphFilter synFilter; + private FlattenGraphFilter flattenFilter; + + public void testBasicKeepOrigOneOutput() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b", + new String[] {"c", "x", "a", "b"}, + new int[] { 0, 2, 2, 4}, + new int[] { 1, 5, 3, 5}, + new String[] {"word", "SYNONYM", "word", "word"}, + new int[] { 1, 1, 0, 1}, + new int[] { 1, 2, 1, 1}); + a.close(); + } + + public void testMixedKeepOrig() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", true); + add(b, "e f", "y", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b c e f g", + new String[] {"c", "x", "a", "b", "c", "y", "g"}, + new int[] { 0, 2, 2, 4, 6, 8, 12}, + new int[] { 1, 5, 3, 5, 7, 11, 13}, + new String[] {"word", "SYNONYM", "word", "word", "word", "SYNONYM", "word"}, + new int[] { 1, 1, 0, 1, 1, 1, 1}, + new int[] { 1, 2, 1, 1, 1, 1, 1}); + a.close(); + } + + public void testNoParseAfterBuffer() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "b a", "x", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "b b b", + new String[] {"b", "b", "b"}, + new int[] { 0, 2, 4}, + new int[] { 1, 3, 5}, + new String[] {"word", "word", "word"}, + new int[] { 1, 1, 1}, + new int[] { 1, 1, 1}); + a.close(); + } + + public void testOneInputMultipleOutputKeepOrig() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", true); + add(b, "a b", "y", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b c", + new String[] {"c", "x", "y", "a", "b", "c"}, + new int[] { 0, 2, 2, 2, 4, 6}, + new int[] { 1, 5, 5, 3, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, + new int[] { 1, 1, 0, 0, 1, 1, 1, 1}, + new int[] { 1, 2, 2, 1, 1, 1, 1, 1}); + a.close(); + } + + /** + * Verify type of token and positionLength after analyzer. 
+ */ + public void testPositionLengthAndTypeSimple() throws Exception { + String testFile = + "spider man, spiderman"; + + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + assertAnalyzesToPositions(analyzer, "spider man", + new String[]{"spiderman", "spider", "man"}, + new String[]{"SYNONYM", "word", "word"}, + new int[]{1, 0, 1}, + new int[]{2, 1, 1}); + } + + /** + * parse a syn file with some escaped syntax chars + */ + public void testEscapedStuff() throws Exception { + String testFile = + "a\\=>a => b\\=>b\n" + + "a\\,a => b\\,b"; + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + assertAnalyzesTo(analyzer, "ball", + new String[]{"ball"}, + new int[]{1}); + + assertAnalyzesTo(analyzer, "a=>a", + new String[]{"b=>b"}, + new int[]{1}); + + assertAnalyzesTo(analyzer, "a,a", + new String[]{"b,b"}, + new int[]{1}); + analyzer.close(); + } + + /** + * parse a syn file with bad syntax + */ + public void testInvalidAnalyzesToNothingOutput() throws Exception { + String testFile = "a => 1"; + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, false); + SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); + try { + parser.parse(new StringReader(testFile)); + fail("didn't get expected exception"); + } catch (ParseException expected) { + // expected exc + } + analyzer.close(); + } + + /** + * parse a syn file with bad syntax + */ + public void testInvalidDoubleMap() throws Exception { + String testFile = "a => b => c"; + Analyzer analyzer = new MockAnalyzer(random()); + SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); + try { + parser.parse(new StringReader(testFile)); + fail("didn't get expected exception"); + } catch (ParseException expected) { + // expected exc + } + analyzer.close(); + } + + /** + * Tests some simple examples from the solr wiki + */ + public void testSimple() throws Exception { + String testFile = + "i-pod, ipod, ipoooood\n" + + "foo => foo bar\n" + + "foo => baz\n" + + "this test, that testing"; + + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + assertAnalyzesTo(analyzer, "ball", + new String[]{"ball"}, + new int[]{1}); + + assertAnalyzesTo(analyzer, "i-pod", + new String[]{"ipod", "ipoooood", "i-pod"}, + new int[]{1, 0, 0}); + + assertAnalyzesTo(analyzer, "foo", + new String[]{"foo", "baz", "bar"}, + new int[]{1, 0, 1}); + + assertAnalyzesTo(analyzer, "this test", + new String[]{"that", "this", "testing", "test"}, + new int[]{1, 0, 1, 0}); + analyzer.close(); + } + + public void testBufferLength() throws Exception { + String testFile = + "c => 8 2 5 6 7\n" + + "f c e d f, 1\n" + + "c g a f d, 6 5 5\n" + + "e c => 4\n" + + "g => 5\n" + + "a g b f e => 5 0 7 7\n" + + "b => 1"; + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + String doc = "b c g a f b d"; + String[] expected = new String[]{"1", "8", "2", "5", "6", "7", "5", "a", "f", "1", "d"}; + assertAnalyzesTo(analyzer, doc, expected); + } + + private Analyzer solrSynsToAnalyzer(String syns) throws IOException, ParseException { + Analyzer analyzer = new MockAnalyzer(random()); + SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer); + parser.parse(new StringReader(syns)); + analyzer.close(); + return getFlattenAnalyzer(parser, true); + } + + public void testMoreThanOneLookAhead() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b c d", "x", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "a b c e", + new String[] {"a", "b", "c", "e"}, + new int[] { 0, 2, 4, 6}, + new 
int[] { 1, 3, 5, 7}, + new String[] {"word", "word", "word", "word"}, + new int[] { 1, 1, 1, 1}, + new int[] { 1, 1, 1, 1}); + a.close(); + } + + public void testLookaheadAfterParse() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "b b", "x", true); + add(b, "b", "y", true); + + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, "b a b b", + new String[] {"y", "b", "a", "x", "b", "b"}, + new int[] {0, 0, 2, 4, 4, 6}, + new int[] {1, 1, 3, 7, 5, 7}, + null, + new int[] {1, 0, 1, 1, 0, 1}, + new int[] {1, 1, 1, 2, 1, 1}, + true); + } + + public void testLookaheadSecondParse() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "b b b", "x", true); + add(b, "b", "y", true); + + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, "b b", + new String[] {"y", "b", "y", "b"}, + new int[] { 0, 0, 2, 2}, + new int[] { 1, 1, 3, 3}, + null, + new int[] { 1, 0, 1, 0}, + new int[] { 1, 1, 1, 1}, + true); + } + + public void testOneInputMultipleOutputNoKeepOrig() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", false); + add(b, "a b", "y", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b c", + new String[] {"c", "x", "y", "c"}, + new int[] { 0, 2, 2, 6}, + new int[] { 1, 5, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "word"}, + new int[] { 1, 1, 0, 1}, + new int[] { 1, 1, 1, 1}); + a.close(); + } + + public void testOneInputMultipleOutputMixedKeepOrig() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", true); + add(b, "a b", "y", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b c", + new String[] {"c", "x", "y", "a", "b", "c"}, + new int[] { 0, 2, 2, 2, 4, 6}, + new int[] { 1, 5, 5, 3, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "word", "word", "word"}, + new int[] { 1, 1, 0, 0, 1, 1, 1, 1}, + new int[] { 1, 2, 2, 1, 1, 1, 1, 1}); + a.close(); + } + + public void testSynAtEnd() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c d e a b", + new String[] {"c", "d", "e", "x", "a", "b"}, + new int[] { 0, 2, 4, 6, 6, 8}, + new int[] { 1, 3, 5, 9, 7, 9}, + new String[] {"word", "word", "word", "SYNONYM", "word", "word"}, + new int[] { 1, 1, 1, 1, 0, 1}, + new int[] { 1, 1, 1, 2, 1, 1}); + a.close(); + } + + public void testTwoSynsInARow() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a", "x", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a a b", + new String[] {"c", "x", "x", "b"}, + new int[] { 0, 2, 4, 6}, + new int[] { 1, 3, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "word"}, + new int[] { 1, 1, 1, 1}, + new int[] { 1, 1, 1, 1}); + a.close(); + } + + public void testBasicKeepOrigTwoOutputs() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x y", true); + add(b, "a b", "m n o", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b d", + new String[] {"c", "x", "m", "a", "y", "n", "o", "b", "d"}, + new int[] { 0, 2, 2, 2, 2, 2, 2, 4, 6}, + new int[] { 1, 5, 5, 3, 5, 5, 5, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, + new int[] { 1, 1, 0, 0, 1, 1, 1, 1, 1}, + new int[] { 1, 1, 2, 4, 4, 1, 2, 1, 1}); + a.close(); + } + + public void testNoCaptureIfNoMatch() throws Exception 
{ + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x y", true); + + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, + "c d d", + new String[] {"c", "d", "d"}, + new int[] { 0, 2, 4}, + new int[] { 1, 3, 5}, + new String[] {"word", "word", "word"}, + new int[] { 1, 1, 1}, + new int[] { 1, 1, 1}); + assertEquals(0, synFilter.getCaptureCount()); + a.close(); + } + + public void testBasicNotKeepOrigOneOutput() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b", + new String[] {"c", "x"}, + new int[] {0, 2}, + new int[] {1, 5}, + new String[] {"word", "SYNONYM"}, + new int[] {1, 1}, + new int[] {1, 1}); + a.close(); + } + + public void testBasicNoKeepOrigTwoOutputs() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x y", false); + add(b, "a b", "m n o", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b d", + new String[] {"c", "x", "m", "y", "n", "o", "d"}, + new int[] { 0, 2, 2, 2, 2, 2, 6}, + new int[] { 1, 5, 5, 5, 5, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"}, + new int[] { 1, 1, 0, 1, 1, 1, 1}, + new int[] { 1, 1, 2, 3, 1, 1, 1}); + a.close(); + } + + public void testIgnoreCase() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x y", false); + add(b, "a b", "m n o", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c A B D", + new String[] {"c", "x", "m", "y", "n", "o", "D"}, + new int[] { 0, 2, 2, 2, 2, 2, 6}, + new int[] { 1, 5, 5, 5, 5, 5, 7}, + new String[] {"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"}, + new int[] { 1, 1, 0, 1, 1, 1, 1}, + new int[] { 1, 1, 2, 3, 1, 1, 1}); + a.close(); + } + + public void testDoNotIgnoreCase() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x y", false); + add(b, "a b", "m n o", false); + + Analyzer a = getAnalyzer(b, false); + assertAnalyzesTo(a, + "c A B D", + new String[] {"c", "A", "B", "D"}, + new int[] { 0, 2, 4, 6}, + new int[] { 1, 3, 5, 7}, + new String[] {"word", "word", "word", "word"}, + new int[] { 1, 1, 1, 1}, + new int[] { 1, 1, 1, 1}); + a.close(); + } + + public void testBufferedFinish1() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b c", "m n o", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a b", + new String[] {"c", "a", "b"}, + new int[] { 0, 2, 4}, + new int[] { 1, 3, 5}, + new String[] {"word", "word", "word"}, + new int[] { 1, 1, 1}, + new int[] { 1, 1, 1}); + a.close(); + } + + public void testBufferedFinish2() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "m n o", false); + add(b, "d e", "m n o", false); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "c a d", + new String[] {"c", "a", "d"}, + new int[] { 0, 2, 4}, + new int[] { 1, 3, 5}, + new String[] {"word", "word", "word"}, + new int[] { 1, 1, 1}, + new int[] { 1, 1, 1}); + a.close(); + } + + public void testCanReuse() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b", "x", true); + Analyzer a = getAnalyzer(b, true); + for(int i=0;i<10;i++) { + assertAnalyzesTo(a, + "c a b", + new String[] {"c", "x", "a", "b"}, + new int[] { 0, 2, 2, 4}, + new int[] { 1, 5, 3, 5}, + new String[] {"word", "SYNONYM", "word", "word"}, + new int[] { 1, 1, 
0, 1}, + new int[] { 1, 2, 1, 1}); + } + a.close(); + } + + /** Multiple input tokens map to a single output token */ + public void testManyToOne() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b c", "z", true); + + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "a b c d", + new String[] {"z", "a", "b", "c", "d"}, + new int[] { 0, 0, 2, 4, 6}, + new int[] { 5, 1, 3, 5, 7}, + new String[] {"SYNONYM", "word", "word", "word", "word"}, + new int[] { 1, 0, 1, 1, 1}, + new int[] { 3, 1, 1, 1, 1}); + a.close(); + } + + public void testBufferAfterMatch() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "a b c d", "x", true); + add(b, "a b", "y", false); + + // The 'c' token has to be buffered because SynGraphFilter + // needs to know whether a b c d -> x matches: + Analyzer a = getAnalyzer(b, true); + assertAnalyzesTo(a, + "f a b c e", + new String[] {"f", "y", "c", "e"}, + new int[] { 0, 2, 6, 8}, + new int[] { 1, 5, 7, 9}, + new String[] {"word", "SYNONYM", "word", "word"}, + new int[] { 1, 1, 1, 1}, + new int[] { 1, 1, 1, 1}); + a.close(); + } + + public void testZeroSyns() throws Exception { + Tokenizer tokenizer = new MockTokenizer(); + tokenizer.setReader(new StringReader("aa bb")); + try { + new SynonymGraphFilter(tokenizer, new SynonymMap.Builder(true).build(), true); + fail("did not hit expected exception"); + } catch (IllegalArgumentException iae) { + // expected + assertEquals("fst must be non-null", iae.getMessage()); + } + } + + public void testOutputHangsOffEnd() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + // b hangs off the end (no input token under it): + add(b, "a", "a b", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + assertAnalyzesTo(a, "a", + new String[] {"a", "b"}, + new int[] { 0, 0}, + new int[] { 1, 1}, + null, + new int[] { 1, 1}, + new int[] { 1, 1}, + true); + a.close(); + } + + public void testDedup() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "a b", + new String[]{"ab"}, + new int[]{1}); + a.close(); + } + + public void testNoDedup() throws Exception { + // dedup is false: + SynonymMap.Builder b = new SynonymMap.Builder(false); + final boolean keepOrig = false; + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "a b", + new String[]{"ab", "ab", "ab"}, + new int[]{1, 0, 0}); + a.close(); + } + + public void testMatching() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add(b, "a b", "ab", keepOrig); + add(b, "a c", "ac", keepOrig); + add(b, "a", "aa", keepOrig); + add(b, "b", "bb", keepOrig); + add(b, "z x c v", "zxcv", keepOrig); + add(b, "x c", "xc", keepOrig); + + Analyzer a = getFlattenAnalyzer(b, true); + + checkOneTerm(a, "$", "$"); + checkOneTerm(a, "a", "aa"); + checkOneTerm(a, "b", "bb"); + + assertAnalyzesTo(a, "a $", + new String[]{"aa", "$"}, + new int[]{1, 1}); + + assertAnalyzesTo(a, "$ a", + new String[]{"$", "aa"}, + new int[]{1, 1}); + + assertAnalyzesTo(a, "a a", + new String[]{"aa", "aa"}, + new int[]{1, 1}); + + assertAnalyzesTo(a, "z x c v", + new String[]{"zxcv"}, + new int[]{1}); + + 
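+    // "z x c v" does not match here (no trailing "v"), so only the "x c" rule fires; "z" and "$" pass through: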
assertAnalyzesTo(a, "z x c $", + new String[]{"z", "xc", "$"}, + new int[]{1, 1, 1}); + a.close(); + } + + public void testBasic1() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + add(b, "a", "foo", true); + add(b, "a b", "bar fee", true); + add(b, "b c", "dog collar", true); + add(b, "c d", "dog harness holder extras", true); + add(b, "m c e", "dog barks loudly", false); + add(b, "i j k", "feep", true); + + add(b, "e f", "foo bar", false); + add(b, "e f", "baz bee", false); + + add(b, "z", "boo", false); + add(b, "y", "bee", true); + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "a b c", + new String[] {"bar", "a", "fee", "b", "c"}, + new int[] {1, 0, 1, 0, 1}); + + assertAnalyzesTo(a, "x a b c d", + new String[] {"x", "bar", "a", "fee", "b", "dog", "c", "harness", "d", "holder", "extras"}, + new int[] {1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1}); + + assertAnalyzesTo(a, "a b a", + new String[] {"bar", "a", "fee", "b", "foo", "a"}, + new int[] {1, 0, 1, 0, 1, 0}); + + // outputs no longer add to one another: + assertAnalyzesTo(a, "c d c d", + new String[] {"dog", "c", "harness", "d", "holder", "extras", "dog", "c", "harness", "d", "holder", "extras"}, + new int[] {1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1}); + + // two outputs for same input + assertAnalyzesTo(a, "e f", + new String[] {"foo", "baz", "bar", "bee"}, + new int[] {1, 0, 1, 0}); + + // verify multi-word / single-output offsets: + assertAnalyzesTo(a, "g i j k g", + new String[] {"g", "feep", "i", "j", "k", "g"}, + new int[] {1, 1, 0, 1, 1, 1}); + + // mixed keepOrig true/false: + assertAnalyzesTo(a, "a m c e x", + new String[] {"foo", "a", "dog", "barks", "loudly", "x"}, + new int[] {1, 0, 1, 1, 1, 1}); + assertAnalyzesTo(a, "c d m c e x", + new String[] {"dog", "c", "harness", "d", "holder", "extras", "dog", "barks", "loudly","x"}, + new int[] {1, 0, 1, 0, 1, 1, 1, 1, 1, 1}); + assertTrue(synFilter.getCaptureCount() > 0); + + // no captureStates when no syns matched + assertAnalyzesTo(a, "p q r s t", + new String[] {"p", "q", "r", "s", "t"}, + new int[] {1, 1, 1, 1, 1}); + assertEquals(0, synFilter.getCaptureCount()); + + // captureStates are necessary for the single-token syn case: + assertAnalyzesTo(a, "p q z y t", + new String[] {"p", "q", "boo", "bee", "y", "t"}, + new int[] {1, 1, 1, 1, 0, 1}); + assertTrue(synFilter.getCaptureCount() > 0); + } + + public void testBasic2() throws Exception { + boolean keepOrig = true; + do { + keepOrig = !keepOrig; + + SynonymMap.Builder b = new SynonymMap.Builder(true); + add(b,"aaa", "aaaa1 aaaa2 aaaa3", keepOrig); + add(b, "bbb", "bbbb1 bbbb2", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + + if (keepOrig) { + assertAnalyzesTo(a, "xyzzy bbb pot of gold", + new String[] {"xyzzy", "bbbb1", "bbb", "bbbb2", "pot", "of", "gold"}, + new int[] {1, 1, 0, 1, 1, 1, 1}); + assertAnalyzesTo(a, "xyzzy aaa pot of gold", + new String[] {"xyzzy", "aaaa1", "aaa", "aaaa2", "aaaa2", "pot", "of", "gold"}, + new int[] {1, 1, 0, 1, 1, 1, 1, 1}); + } else { + assertAnalyzesTo(a, "xyzzy bbb pot of gold", + new String[] {"xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold"}, + new int[] {1, 1, 1, 1, 1, 1}); + assertAnalyzesTo(a, "xyzzy aaa pot of gold", + new String[] {"xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold"}, + new int[] {1, 1, 1, 1, 1, 1, 1}); + } + } while (keepOrig); + } + + /** If we expand synonyms during indexing, it's a bit better than + * SynonymFilter is today, but still necessarily has false + * positive and negative PhraseQuery matches because we 
do not + * index posLength, so we lose information. */ + public void testFlattenedGraph() throws Exception { + + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "wtf", "what the fudge", true); + + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "wtf happened", + new String[] {"what", "wtf", "the", "fudge", "happened"}, + new int[] { 0, 0, 0, 0, 4}, + new int[] { 3, 3, 3, 3, 12}, + null, + new int[] { 1, 0, 1, 1, 1}, + new int[] { 1, 3, 1, 1, 1}, + true); + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, a); + Document doc = new Document(); + doc.add(newTextField("field", "wtf happened", Field.Store.NO)); + w.addDocument(doc); + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + // Good (this should not match, and doesn't): + assertEquals(0, s.count(new PhraseQuery("field", "what", "happened"))); + + // Bad (this should match, but doesn't): + assertEquals(0, s.count(new PhraseQuery("field", "wtf", "happened"))); + + // Good (this should match, and does): + assertEquals(1, s.count(new PhraseQuery("field", "what", "the", "fudge", "happened"))); + + // Bad (this should not match, but does): + assertEquals(1, s.count(new PhraseQuery("field", "wtf", "the"))); + + IOUtils.close(r, dir); + } + + // Needs TermAutomatonQuery, which is in sandbox still: + /* + public void testAccurateGraphQuery1() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "wtf happened", Field.Store.NO)); + w.addDocument(doc); + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "what the fudge", "wtf", true); + + SynonymMap map = b.build(); + + TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); + + TokenStream in = new CannedTokenStream(0, 23, new Token[] { + token("what", 1, 1, 0, 4), + token("the", 1, 1, 5, 8), + token("fudge", 1, 1, 9, 14), + token("happened", 1, 1, 15, 23), + }); + + assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + in = new CannedTokenStream(0, 12, new Token[] { + token("wtf", 1, 1, 0, 3), + token("happened", 1, 1, 4, 12), + }); + + assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + // "what happened" should NOT match: + in = new CannedTokenStream(0, 13, new Token[] { + token("what", 1, 1, 0, 4), + token("happened", 1, 1, 5, 13), + }); + assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + IOUtils.close(r, dir); + } + */ + + /** If we expand synonyms at search time, the results are correct. 
*/ + // Needs TermAutomatonQuery, which is in sandbox still: + /* + public void testAccurateGraphQuery2() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "say wtf happened", Field.Store.NO)); + w.addDocument(doc); + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "what the fudge", "wtf", true); + + SynonymMap map = b.build(); + + TokenStream in = new CannedTokenStream(0, 26, new Token[] { + token("say", 1, 1, 0, 3), + token("what", 1, 1, 3, 7), + token("the", 1, 1, 8, 11), + token("fudge", 1, 1, 12, 17), + token("happened", 1, 1, 18, 26), + }); + + TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); + + assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + // "what happened" should NOT match: + in = new CannedTokenStream(0, 13, new Token[] { + token("what", 1, 1, 0, 4), + token("happened", 1, 1, 5, 13), + }); + assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + IOUtils.close(r, dir); + } + */ + + // Needs TermAutomatonQuery, which is in sandbox still: + /* + public void testAccurateGraphQuery3() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + Document doc = new Document(); + doc.add(newTextField("field", "say what the fudge happened", Field.Store.NO)); + w.addDocument(doc); + IndexReader r = w.getReader(); + w.close(); + + IndexSearcher s = newSearcher(r); + + SynonymMap.Builder b = new SynonymMap.Builder(); + add(b, "wtf", "what the fudge", true); + + SynonymMap map = b.build(); + + TokenStream in = new CannedTokenStream(0, 15, new Token[] { + token("say", 1, 1, 0, 3), + token("wtf", 1, 1, 3, 6), + token("happened", 1, 1, 7, 15), + }); + + TokenStreamToTermAutomatonQuery ts2q = new TokenStreamToTermAutomatonQuery(); + + assertEquals(1, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + // "what happened" should NOT match: + in = new CannedTokenStream(0, 13, new Token[] { + token("what", 1, 1, 0, 4), + token("happened", 1, 1, 5, 13), + }); + assertEquals(0, s.count(ts2q.toQuery("field", new SynonymGraphFilter(in, map, true)))); + + IOUtils.close(r, dir); + } + + private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { + final Token t = new Token(term, startOffset, endOffset); + t.setPositionIncrement(posInc); + t.setPositionLength(posLength); + return t; + } + */ + + private String randomNonEmptyString() { + while(true) { + String s = TestUtil.randomUnicodeString(random()).trim(); + //String s = TestUtil.randomSimpleString(random()).trim(); + if (s.length() != 0 && s.indexOf('\u0000') == -1) { + return s; + } + } + } + + // Adds MockGraphTokenFilter after SynFilter: + public void testRandomGraphAfter() throws Exception { + final int numIters = atLeast(3); + for (int i = 0; i < numIters; i++) { + SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); + final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); + } + final SynonymMap map = b.build(); + final boolean ignoreCase = random().nextBoolean(); + final boolean doFlatten = random().nextBoolean(); + + final Analyzer analyzer = new Analyzer() { + @Override + 
protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true); + TokenStream syns = new SynonymGraphFilter(tokenizer, map, ignoreCase); + TokenStream graph = new MockGraphTokenFilter(random(), syns); + if (doFlatten) { + graph = new FlattenGraphFilter(graph); + } + return new TokenStreamComponents(tokenizer, graph); + } + }; + + checkRandomData(random(), analyzer, 100); + analyzer.close(); + } + } + + public void testEmptyStringInput() throws IOException { + final int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); + final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); + } + final boolean ignoreCase = random().nextBoolean(); + + Analyzer analyzer = getAnalyzer(b, ignoreCase); + + checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); + analyzer.close(); + } + } + + /** simple random test, doesn't verify correctness. + * does verify it doesnt throw exceptions, or that the stream doesn't misbehave + */ + public void testRandom2() throws Exception { + final int numIters = atLeast(3); + for (int i = 0; i < numIters; i++) { + SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); + final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); + } + final boolean ignoreCase = random().nextBoolean(); + final boolean doFlatten = random().nextBoolean(); + + Analyzer analyzer; + if (doFlatten) { + analyzer = getFlattenAnalyzer(b, ignoreCase); + } else { + analyzer = getAnalyzer(b, ignoreCase); + } + + checkRandomData(random(), analyzer, 100); + analyzer.close(); + } + } + + /** simple random test like testRandom2, but for larger docs + */ + public void testRandomHuge() throws Exception { + final int numIters = atLeast(3); + for (int i = 0; i < numIters; i++) { + SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); + final int numEntries = atLeast(10); + if (VERBOSE) { + System.out.println("TEST: iter=" + i + " numEntries=" + numEntries); + } + for (int j = 0; j < numEntries; j++) { + add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); + } + final boolean ignoreCase = random().nextBoolean(); + final boolean doFlatten = random().nextBoolean(); + + Analyzer analyzer; + if (doFlatten) { + analyzer = getFlattenAnalyzer(b, ignoreCase); + } else { + analyzer = getAnalyzer(b, ignoreCase); + } + + checkRandomData(random(), analyzer, 100, 1024); + analyzer.close(); + } + } + + public void testEmptyTerm() throws IOException { + final int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean()); + final int numEntries = atLeast(10); + for (int j = 0; j < numEntries; j++) { + add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean()); + } + final boolean ignoreCase = random().nextBoolean(); + + final Analyzer analyzer = getAnalyzer(b, ignoreCase); + + checkAnalysisConsistency(random(), analyzer, random().nextBoolean(), ""); + analyzer.close(); + } + } + + // LUCENE-3375 + public void testVanishingTermsNoFlatten() throws Exception { + String testFile = + "aaa => aaaa1 aaaa2 aaaa3\n" + + "bbb => bbbb1 bbbb2\n"; + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + 
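+    // xyzzy bbb pot of gold -> xyzzy bbbb1 bbbb2 pot of gold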
assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", + new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" }); + + // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold + assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", + new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" }); + analyzer.close(); + } + + // LUCENE-3375 + public void testVanishingTermsWithFlatten() throws Exception { + String testFile = + "aaa => aaaa1 aaaa2 aaaa3\n" + + "bbb => bbbb1 bbbb2\n"; + + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + assertAnalyzesTo(analyzer, "xyzzy bbb pot of gold", + new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" }); + + // xyzzy aaa pot of gold -> xyzzy aaaa1 aaaa2 aaaa3 gold + assertAnalyzesTo(analyzer, "xyzzy aaa pot of gold", + new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" }); + analyzer.close(); + } + + public void testBuilderDedup() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, "a b", + new String[] { "ab" }, + new int[] { 1 }); + a.close(); + } + + public void testBuilderNoDedup() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(false); + final boolean keepOrig = false; + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + add(b, "a b", "ab", keepOrig); + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, "a b", + new String[] { "ab", "ab", "ab" }, + new int[] { 1, 0, 0 }); + a.close(); + } + + public void testRecursion1() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add(b, "zoo", "zoo", keepOrig); + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "$", "zoo" }, + new int[] { 1, 1, 1, 1 }); + a.close(); + } + + public void testRecursion2() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = false; + add(b, "zoo", "zoo", keepOrig); + add(b, "zoo", "zoo zoo", keepOrig); + Analyzer a = getAnalyzer(b, true); + + // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo"); + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" }, + new int[] { 1, 0, 1, 1, 0, 1, 1, 1, 0, 1 }); + a.close(); + } + + public void testRecursion3() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add(b, "zoo zoo", "zoo", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[]{"zoo", "zoo", "zoo", "$", "zoo"}, + new int[]{1, 0, 1, 1, 1}); + a.close(); + } + + public void testRecursion4() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add(b, "zoo zoo", "zoo", keepOrig); + add(b, "zoo", "zoo zoo", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + assertAnalyzesTo(a, "zoo zoo $ zoo", + new String[]{"zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo"}, + new int[]{1, 0, 1, 1, 1, 0, 1}); + a.close(); + } + + public void testKeepOrig() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add(b, "a b", "ab", keepOrig); + add(b, "a c", "ac", keepOrig); + add(b, "a", "aa", keepOrig); + add(b, "b", "bb", keepOrig); + 
add(b, "z x c v", "zxcv", keepOrig); + add(b, "x c", "xc", keepOrig); + Analyzer a = getAnalyzer(b, true); + + assertAnalyzesTo(a, "$", + new String[] { "$" }, + new int[] { 1 }); + assertAnalyzesTo(a, "a", + new String[] { "aa", "a" }, + new int[] { 1, 0 }); + assertAnalyzesTo(a, "a", + new String[] { "aa", "a" }, + new int[] { 1, 0 }); + assertAnalyzesTo(a, "$ a", + new String[] { "$", "aa", "a" }, + new int[] { 1, 1, 0 }); + assertAnalyzesTo(a, "a $", + new String[] { "aa", "a", "$" }, + new int[] { 1, 0, 1 }); + assertAnalyzesTo(a, "$ a !", + new String[] { "$", "aa", "a", "!" }, + new int[] { 1, 1, 0, 1 }); + assertAnalyzesTo(a, "a a", + new String[] { "aa", "a", "aa", "a" }, + new int[] { 1, 0, 1, 0 }); + assertAnalyzesTo(a, "b", + new String[] { "bb", "b" }, + new int[] { 1, 0 }); + assertAnalyzesTo(a, "z x c v", + new String[] { "zxcv", "z", "x", "c", "v" }, + new int[] { 1, 0, 1, 1, 1 }); + assertAnalyzesTo(a, "z x c $", + new String[] { "z", "xc", "x", "c", "$" }, + new int[] { 1, 1, 0, 1, 1 }); + a.close(); + } + + /** + * verify type of token and positionLengths on synonyms of different word counts, with non preserving, explicit rules. + */ + public void testNonPreservingMultiwordSynonyms() throws Exception { + String testFile = + "aaa => two words\n" + + "bbb => one two, very many multiple words\n" + + "ee ff, gg, h i j k, h i => one\n" + + "cc dd => usa,united states,u s a,united states of america"; + + Analyzer analyzer = solrSynsToAnalyzer(testFile); + + assertAnalyzesTo(analyzer, "aaa", + new String[]{"two", "words"}, + new int[]{0, 0}, + new int[]{3, 3}, + new String[]{"SYNONYM", "SYNONYM"}, + new int[]{1, 1}, + new int[]{1, 1}); + + assertAnalyzesToPositions(analyzer, "amazing aaa", + new String[]{"amazing", "two", "words"}, + new String[]{"word", "SYNONYM", "SYNONYM"}, + new int[]{1, 1, 1}, + new int[]{1, 1, 1}); + + assertAnalyzesTo(analyzer, "p bbb s", + new String[]{"p", "one", "very", "two", "many", "multiple", "words", "s"}, + new int[]{0, 2, 2, 2, 2, 2, 2, 6}, + new int[]{1, 5, 5, 5, 5, 5, 5, 7}, + new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word"}, + new int[]{1, 1, 0, 1, 0, 1, 1, 1}, + new int[]{1, 1, 1, 3, 1, 1, 1, 1}); + + assertAnalyzesTo(analyzer, "p ee ff s", + new String[]{"p", "one", "s"}, + new int[]{0, 2, 8}, + new int[]{1, 7, 9}, + new String[]{"word", "SYNONYM", "word"}, + new int[]{1, 1, 1}, + new int[]{1, 1, 1}); + + assertAnalyzesTo(analyzer, "p h i j s", + new String[]{"p", "one", "j", "s"}, + new int[]{0, 2, 6, 8}, + new int[]{1, 5, 7, 9}, + new String[]{"word", "SYNONYM", "word", "word"}, + new int[]{1, 1, 1, 1}, + new int[]{1, 1, 1, 1}); + + analyzer.close(); + } + + private Analyzer getAnalyzer(SynonymMap.Builder b, final boolean ignoreCase) throws IOException { + final SynonymMap map = b.build(); + return new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + // Make a local variable so testRandomHuge doesn't share it across threads! 
+ SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); + TestSynonymGraphFilter.this.flattenFilter = null; + TestSynonymGraphFilter.this.synFilter = synFilter; + return new TokenStreamComponents(tokenizer, synFilter); + } + }; + } + + /** Appends FlattenGraphFilter too */ + private Analyzer getFlattenAnalyzer(SynonymMap.Builder b, boolean ignoreCase) throws IOException { + final SynonymMap map = b.build(); + return new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); + // Make a local variable so testRandomHuge doesn't share it across threads! + SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase); + FlattenGraphFilter flattenFilter = new FlattenGraphFilter(synFilter); + TestSynonymGraphFilter.this.synFilter = synFilter; + TestSynonymGraphFilter.this.flattenFilter = flattenFilter; + return new TokenStreamComponents(tokenizer, flattenFilter); + } + }; + } + + private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) { + if (VERBOSE) { + //System.out.println(" add input=" + input + " output=" + output + " keepOrig=" + keepOrig); + } + CharsRefBuilder inputCharsRef = new CharsRefBuilder(); + SynonymMap.Builder.join(input.split(" +"), inputCharsRef); + + CharsRefBuilder outputCharsRef = new CharsRefBuilder(); + SynonymMap.Builder.join(output.split(" +"), outputCharsRef); + + b.add(inputCharsRef.get(), outputCharsRef.get(), keepOrig); + } + + private char[] randomBinaryChars(int minLen, int maxLen, double bias, char base) { + int len = TestUtil.nextInt(random(), minLen, maxLen); + char[] chars = new char[len]; + for(int i=0;i 0) { + b.append(' '); + } + b.append(c); + } + return b.toString(); + } + + private static class OneSyn { + char[] in; + char[] out; + boolean keepOrig; + + @Override + public String toString() { + return toTokenString(in) + " --> " + toTokenString(out) + " (keepOrig=" + keepOrig + ")"; + } + } + + public void testRandomSyns() throws Exception { + int synCount = atLeast(10); + double bias = random().nextDouble(); + boolean dedup = random().nextBoolean(); + + boolean flatten = random().nextBoolean(); + + SynonymMap.Builder b = new SynonymMap.Builder(dedup); + List syns = new ArrayList<>(); + // Makes random syns from random a / b tokens, mapping to random x / y tokens + if (VERBOSE) { + System.out.println("TEST: make " + synCount + " syns"); + System.out.println(" bias for a over b=" + bias); + System.out.println(" dedup=" + dedup); + System.out.println(" flatten=" + flatten); + } + + int maxSynLength = 0; + + for(int i=0;i states = new HashSet<>(); + states.add(0); + Transition t = new Transition(); + for(int i=0;i nextStates = new HashSet<>(); + for(int state : states) { + int count = a.initTransition(state, t); + for(int j=0;j= t.min && digit <= t.max) { + nextStates.add(t.dest); + } + } + } + states = nextStates; + if (states.isEmpty()) { + return false; + } + } + + for(int state : states) { + if (a.isAccept(state)) { + return true; + } + } + + return false; + } + + /** Stupid, slow brute-force, yet hopefully bug-free, synonym filter. 
*/ + private Automaton slowSynFilter(String doc, List syns, boolean flatten) { + String[] tokens = doc.split(" +"); + if (VERBOSE) { + System.out.println(" doc has " + tokens.length + " tokens"); + } + int i=0; + Automaton.Builder a = new Automaton.Builder(); + int lastState = a.createState(); + while (i flatStates; + if (flatten) { + flatStates = new ArrayList<>(); + } else { + flatStates = null; + } + + if (keepOrig) { + // Add path for the original tokens + addSidePath(a, lastState, nextState, matches.get(0).in, flatStates); + } + + for(OneSyn syn : matches) { + addSidePath(a, lastState, nextState, syn.out, flatStates); + } + + i += matches.get(0).in.length; + } else { + a.addTransition(lastState, nextState, tokens[i].charAt(0)); + i++; + } + + lastState = nextState; + } + + a.setAccept(lastState, true); + + return topoSort(a.finish()); + } + + /** Just creates a side path from startState to endState with the provided tokens. */ + private static void addSidePath(Automaton.Builder a, int startState, int endState, char[] tokens, List flatStates) { + int lastState = startState; + for(int i=0;i= flatStates.size()) { + nextState = a.createState(); + if (flatStates != null) { + assert i == flatStates.size(); + flatStates.add(nextState); + } + } else { + nextState = flatStates.get(i); + } + a.addTransition(lastState, nextState, tokens[i]); + + lastState = nextState; + } + } + + private Automaton toAutomaton(TokenStream ts) throws IOException { + PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + Automaton a = new Automaton(); + int srcNode = -1; + int destNode = -1; + int state = a.createState(); + while (ts.incrementToken()) { + assert termAtt.length() == 1; + char c = termAtt.charAt(0); + int posInc = posIncAtt.getPositionIncrement(); + if (posInc != 0) { + srcNode += posInc; + while (state < srcNode) { + state = a.createState(); + } + } + destNode = srcNode + posLenAtt.getPositionLength(); + while (state < destNode) { + state = a.createState(); + } + a.addTransition(srcNode, destNode, c); + } + ts.end(); + ts.close(); + a.finishState(); + a.setAccept(destNode, true); + return a; + } + + /* + private String toDot(TokenStream ts) throws IOException { + PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); + PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class); + ts.reset(); + int srcNode = -1; + int destNode = -1; + + StringBuilder b = new StringBuilder(); + b.append("digraph Automaton {\n"); + b.append(" rankdir = LR\n"); + b.append(" node [width=0.2, height=0.2, fontsize=8]\n"); + b.append(" initial [shape=plaintext,label=\"\"]\n"); + b.append(" initial -> 0\n"); + + while (ts.incrementToken()) { + int posInc = posIncAtt.getPositionIncrement(); + if (posInc != 0) { + srcNode += posInc; + b.append(" "); + b.append(srcNode); + b.append(" [shape=circle,label=\"" + srcNode + "\"]\n"); + } + destNode = srcNode + posLenAtt.getPositionLength(); + b.append(" "); + b.append(srcNode); + b.append(" -> "); + b.append(destNode); + b.append(" [label=\""); + b.append(termAtt); + b.append("\""); + if (typeAtt.type().equals("word") == false) { + b.append(" color=red"); + } + b.append("]\n"); + } + 
ts.end(); + ts.close(); + + b.append('}'); + return b.toString(); + } + */ + + /** Renumbers nodes according to their topo sort */ + private Automaton topoSort(Automaton in) { + int[] newToOld = Operations.topoSortStates(in); + int[] oldToNew = new int[newToOld.length]; + + Automaton.Builder a = new Automaton.Builder(); + //System.out.println("remap:"); + for(int i=0;i " + i); + if (in.isAccept(newToOld[i])) { + a.setAccept(i, true); + //System.out.println(" **"); + } + } + + Transition t = new Transition(); + for(int i=0;i>> 1; + + final int[] synonymsIdxs = new int[count]; + for (int i = 0; i < count; i++) { + synonymsIdxs[i] = bytesReader.readVInt(); + } + + BytesRef scratchBytes = new BytesRef(); + map.words.get(synonymsIdxs[2], scratchBytes); + + int synonymLength = 1; + for (int i = scratchBytes.offset; i < scratchBytes.offset + scratchBytes.length; i++) { + if (scratchBytes.bytes[i] == SynonymMap.WORD_SEPARATOR) { + synonymLength++; + } + } + + assertEquals(count, 3); + assertEquals(synonymLength, 4); + + assertAnalyzesTo(analyzer, "spider man", + new String[]{"spiderman", "spider", "man"}, + new int[]{0, 0, 7}, + new int[]{10, 6, 10}, + new String[]{"SYNONYM", "word", "word"}, + new int[]{1, 0, 1}, + new int[]{2, 1, 1}); + + assertAnalyzesToPositions(analyzer, "amazing spider man", + new String[]{"amazing", "spiderman", "spider", "man"}, + new String[]{"word", "SYNONYM", "word", "word"}, + new int[]{1, 1, 0, 1}, + new int[]{1, 2, 1, 1}); + + // System.out.println(toDot(getAnalyzer(parser, true).tokenStream("field", new StringReader("the usa is wealthy")))); + + assertAnalyzesTo(analyzer, "the united states of america is wealthy", + new String[]{"the", "usa", "united", "u", "united", "states", "s", "states", "a", "of", "america", "is", "wealthy"}, + new int[] {0, 4, 4, 4, 4, 11, 11, 11, 18, 18, 21, 29, 32}, + new int[] {3, 28, 10, 10, 10, 28, 17, 17, 28, 20, 28, 31, 39}, + new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "word", "word", "word", "word"}, + new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, + new int[] {1, 4, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1}); + + assertAnalyzesToPositions(analyzer, "spiderman", + new String[]{"spider", "spiderman", "man"}, + new String[]{"SYNONYM", "word", "SYNONYM"}, + new int[]{1, 0, 1}, + new int[]{1, 2, 1}); + + assertAnalyzesTo(analyzer, "spiderman enemies", + new String[]{"spider", "spiderman", "man", "enemies"}, + new int[]{0, 0, 0, 10}, + new int[]{9, 9, 9, 17}, + new String[]{"SYNONYM", "word", "SYNONYM", "word"}, + new int[]{1, 0, 1, 1}, + new int[]{1, 2, 1, 1}); + + assertAnalyzesTo(analyzer, "the usa is wealthy", + new String[]{"the", "united", "u", "united", "usa", "states", "s", "states", "a", "of", "america", "is", "wealthy"}, + new int[] {0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 11}, + new int[] {3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 18}, + new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, + new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, + new int[] {1, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 1, 1}); + + assertAllStrings(analyzer, "the usa is wealthy", new String[] { + "the usa is wealthy", + "the united states is wealthy", + "the u s a is wealthy", + "the united states of america is wealthy", + // Wrong. Here only due to "sausagization" of the multi word synonyms. 
+ "the u states is wealthy", + "the u states a is wealthy", + "the u s of america is wealthy", + "the u states of america is wealthy", + "the united s a is wealthy", + "the united states a is wealthy", + "the united s of america is wealthy"}); + + assertAnalyzesTo(analyzer, "the united states is wealthy", + new String[]{"the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "is", "wealthy"}, + new int[] {0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21}, + new int[] {3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28}, + new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, + new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, + new int[] {1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1}, + false); + + assertAnalyzesTo(analyzer, "the united states of balance", + new String[]{"the", "usa", "u", "united", "united", "s", "states", "states", "a", "of", "america", "of", "balance"}, + new int[] {0, 4, 4, 4, 4, 11, 11, 11, 11, 11, 11, 18, 21}, + new int[] {3, 17, 10, 10, 10, 17, 17, 17, 17, 17, 17, 20, 28}, + new String[]{"word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "word", "SYNONYM", "SYNONYM", "SYNONYM", "word", "word"}, + new int[] {1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1}, + new int[] {1, 4, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1}); + + analyzer.close(); + } + + public void testMultiwordOffsets() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add(b, "national hockey league", "nhl", keepOrig); + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "national hockey league", + new String[]{"nhl", "national", "hockey", "league"}, + new int[]{0, 0, 9, 16}, + new int[]{22, 8, 15, 22}, + new int[]{1, 0, 1, 1}); + a.close(); + } + + public void testIncludeOrig() throws Exception { + SynonymMap.Builder b = new SynonymMap.Builder(true); + final boolean keepOrig = true; + add(b, "a b", "ab", keepOrig); + add(b, "a c", "ac", keepOrig); + add(b, "a", "aa", keepOrig); + add(b, "b", "bb", keepOrig); + add(b, "z x c v", "zxcv", keepOrig); + add(b, "x c", "xc", keepOrig); + + Analyzer a = getFlattenAnalyzer(b, true); + + assertAnalyzesTo(a, "$", + new String[]{"$"}, + new int[]{1}); + assertAnalyzesTo(a, "a", + new String[]{"aa", "a"}, + new int[]{1, 0}); + assertAnalyzesTo(a, "a", + new String[]{"aa", "a"}, + new int[]{1, 0}); + assertAnalyzesTo(a, "$ a", + new String[]{"$", "aa", "a"}, + new int[]{1, 1, 0}); + assertAnalyzesTo(a, "a $", + new String[]{"aa", "a", "$"}, + new int[]{1, 0, 1}); + assertAnalyzesTo(a, "$ a !", + new String[]{"$", "aa", "a", "!"}, + new int[]{1, 1, 0, 1}); + assertAnalyzesTo(a, "a a", + new String[]{"aa", "a", "aa", "a"}, + new int[]{1, 0, 1, 0}); + assertAnalyzesTo(a, "b", + new String[]{"bb", "b"}, + new int[]{1, 0}); + assertAnalyzesTo(a, "z x c v", + new String[]{"zxcv", "z", "x", "c", "v"}, + new int[]{1, 0, 1, 1, 1}); + assertAnalyzesTo(a, "z x c $", + new String[]{"z", "xc", "x", "c", "$"}, + new int[]{1, 1, 0, 1, 1}); + a.close(); + } + + /** + * Helper method to validate all strings that can be generated from a token stream. + * Uses {@link TokenStreamToAutomaton} to create an automaton. Asserts the finite strings of the automaton are all + * and only the given valid strings. + * @param analyzer analyzer containing the SynonymFilter under test. + * @param text text to be analyzed. + * @param expectedStrings all expected finite strings. 
+ */ + public void assertAllStrings(Analyzer analyzer, String text, String[] expectedStrings) throws IOException { + TokenStream tokenStream = analyzer.tokenStream("dummy", text); + try { + Automaton automaton = new TokenStreamToAutomaton().toAutomaton(tokenStream); + Set finiteStrings = AutomatonTestUtil.getFiniteStringsRecursive(automaton, -1); + + assertEquals("Invalid resulting strings count. Expected " + expectedStrings.length + " was " + finiteStrings.size(), + expectedStrings.length, finiteStrings.size()); + + Set expectedStringsSet = new HashSet<>(Arrays.asList(expectedStrings)); + + BytesRefBuilder scratchBytesRefBuilder = new BytesRefBuilder(); + for (IntsRef ir: finiteStrings) { + String s = Util.toBytesRef(ir, scratchBytesRefBuilder).utf8ToString().replace((char) TokenStreamToAutomaton.POS_SEP, ' '); + assertTrue("Unexpected string found: " + s, expectedStringsSet.contains(s)); + } + } finally { + tokenStream.close(); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java index 0dd449c9961..e4a5bd912bd 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automaton.java @@ -579,14 +579,15 @@ public class Automaton implements Accountable { /** Returns the dot (graphviz) representation of this automaton. * This is extremely useful for visualizing the automaton. */ public String toDot() { - // TODO: breadth first search so we can see get layered output... + // TODO: breadth first search so we can get layered output... StringBuilder b = new StringBuilder(); b.append("digraph Automaton {\n"); b.append(" rankdir = LR\n"); + b.append(" node [width=0.2, height=0.2, fontsize=8]\n"); final int numStates = getNumStates(); if (numStates > 0) { - b.append(" initial [shape=plaintext,label=\"0\"]\n"); + b.append(" initial [shape=plaintext,label=\"\"]\n"); b.append(" initial -> 0\n"); } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index eedb5336624..718a9089ce2 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -370,10 +370,8 @@ final public class Operations { } /** Returns true if these two automata accept exactly the - * same language. This is a costly computation! Note - * also that a1 and a2 will be determinized as a side - * effect. Both automata must be determinized and have - * no dead states! */ + * same language. This is a costly computation! Both automata + * must be determinized and have no dead states! */ public static boolean sameLanguage(Automaton a1, Automaton a2) { if (a1 == a2) { return true; diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java index 4ce81ab35a9..7be9339914d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StatePair.java @@ -79,7 +79,9 @@ public class StatePair { */ @Override public int hashCode() { - return s1 ^ s2; + // Don't use s1 ^ s2 since it's vulnerable to the case where s1 == s2 always --> hashCode = 0, e.g. 
if you call Operations.sameLanguage, + // passing the same automaton against itself: + return s1 * 31 + s2; } @Override diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java index 0bb623fe36d..c23bc7d322b 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java @@ -184,22 +184,22 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { assertEquals("term "+i, output[i], termAtt.toString()); if (startOffsets != null) { - assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset()); + assertEquals("startOffset " + i + " term=" + termAtt, startOffsets[i], offsetAtt.startOffset()); } if (endOffsets != null) { - assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset()); + assertEquals("endOffset " + i + " term=" + termAtt, endOffsets[i], offsetAtt.endOffset()); } if (types != null) { - assertEquals("type "+i, types[i], typeAtt.type()); + assertEquals("type " + i + " term=" + termAtt, types[i], typeAtt.type()); } if (posIncrements != null) { - assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement()); + assertEquals("posIncrement " + i + " term=" + termAtt, posIncrements[i], posIncrAtt.getPositionIncrement()); } if (posLengths != null) { - assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength()); + assertEquals("posLength " + i + " term=" + termAtt, posLengths[i], posLengthAtt.getPositionLength()); } if (keywordAtts != null) { - assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword()); + assertEquals("keywordAtt " + i + " term=" + termAtt, keywordAtts[i], keywordAtt.isKeyword()); } // we can enforce some basic things about a few attributes even if the caller doesn't check: @@ -207,13 +207,13 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); if (finalOffset != null) { - assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue()); - assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(), + assertTrue("startOffset (= " + startOffset + ") must be <= finalOffset (= " + finalOffset + ") term=" + termAtt, startOffset <= finalOffset.intValue()); + assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue() + " term=" + termAtt, endOffset <= finalOffset.intValue()); } if (offsetsAreCorrect) { - assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset); + assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " term=" + termAtt, offsetAtt.startOffset() >= lastStartOffset); lastStartOffset = offsetAtt.startOffset(); } @@ -235,7 +235,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase { // We've seen a token leaving from this position // before; verify the startOffset is the same: //System.out.println(" + vs " + pos + " -> " + startOffset); - assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset); + assertEquals(i + " 
@@ -235,7 +235,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
           // We've seen a token leaving from this position
           // before; verify the startOffset is the same:
           //System.out.println("  + vs " + pos + " -> " + startOffset);
-          assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
+          assertEquals(i + " inconsistent startOffset: pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset);
         }
 
         final int endPos = pos + posLength;
@@ -248,7 +248,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
           // We've seen a token arriving to this position
           // before; verify the endOffset is the same:
           //System.out.println("  + ve " + endPos + " -> " + endOffset);
-          assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
+          assertEquals("inconsistent endOffset " + i + " pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset);
         }
       }
     }
@@ -350,16 +350,19 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) throws IOException {
     checkResetException(a, input);
+    checkAnalysisConsistency(random(), a, true, input);
     assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, null, input.length());
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[]) throws IOException {
     checkResetException(a, input);
+    checkAnalysisConsistency(random(), a, true, input);
     assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
   }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], boolean offsetsAreCorrect) throws IOException {
     checkResetException(a, input);
+    checkAnalysisConsistency(random(), a, true, input, offsetsAreCorrect);
     assertTokenStreamContents(a.tokenStream("dummy", input), output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
   }
 
@@ -378,6 +381,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
     assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
   }
+
+  public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, types, posIncrements, posLengths);
+  }
 
   public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws IOException {
     assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
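The new overload makes it possible to check token types alongside position increments and lengths without asserting offsets; a hypothetical call (the terms and types shown are illustrative only, given some Analyzer a) might look like this:

    // Hypothetical usage of the new overload: terms + types + posIncrements + posLengths.
    assertAnalyzesToPositions(a, "wifi network",
                              new String[] {"wifi", "network"}, // expected terms
                              new String[] {"word", "word"},    // expected types
                              new int[] {1, 1},                 // expected position increments
                              new int[] {1, 1});                // expected position lengths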
@@ -598,7 +605,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     try {
       for (int i = 0; i < iterations; i++) {
         String text;
-        
+
         if (random.nextInt(10) == 7) {
           // real data from linedocs
           text = docs.nextDoc().get("body");
@@ -622,7 +629,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
           // synthetic
           text = TestUtil.randomAnalysisString(random, maxWordLength, simple);
         }
-        
+
         try {
          checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
          if (iw != null) {
@@ -768,7 +775,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       } catch (IllegalStateException ise) {
         // Catch & ignore MockTokenizer's
         // anger...
-        if ("end() called before incrementToken() returned false!".equals(ise.getMessage())) {
+        if (ise.getMessage().contains("end() called in wrong state=")) {
           // OK
         } else {
           throw ise;
@@ -793,7 +800,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       } catch (IllegalStateException ise) {
         // Catch & ignore MockTokenizer's
         // anger...
-        if ("end() called before incrementToken() returned false!".equals(ise.getMessage())) {
+        if (ise.getMessage().contains("end() called in wrong state=")) {
           // OK
         } else {
           throw ise;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
index 62567219e4c..76b71c122e7 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -103,6 +103,7 @@ public class MockTokenizer extends Tokenizer {
   public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
     this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
   }
+
   /** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */
   public MockTokenizer() {
     this(WHITESPACE, true);
@@ -316,7 +317,7 @@ public class MockTokenizer extends Tokenizer {
         // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
         // these tests should disable this check (in general you should consume the entire stream)
         if (streamState != State.INCREMENT_FALSE) {
-          fail("end() called before incrementToken() returned false!");
+          fail("end() called in wrong state=" + streamState + "!");
         }
       } finally {
         streamState = State.END;
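Because MockTokenizer's failure message now embeds the current state, the catch blocks above switched from an exact-match comparison to a substring check; a small illustration (the state name shown is only an example, not necessarily an actual MockTokenizer state):

    // Illustration only: the appended state varies, so contains() is used instead of equals().
    String msg = "end() called in wrong state=RESET!";                                      // example message
    boolean oldCheck = "end() called before incrementToken() returned false!".equals(msg);  // false with the new wording
    boolean newCheck = msg.contains("end() called in wrong state=");                        // still true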