diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java index 7ede190b61d..c1fa1f7cba1 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java @@ -17,22 +17,6 @@ package org.apache.lucene.analysis.synonym; -/** - * This filter "casts" token graphs down into a "flat" form, - * for indexing. This is an inherently lossy process: nodes (positions) - * along side paths are forcefully merged. - * - *
In general this means the output graph will accept token sequences - * that the input graph did not accept, and will also fail to accept - * token sequences that the input graph did accept. - * - *
This is only necessary at indexing time because Lucene cannot yet index
- * an arbitrary token graph. At search time there are better options, e.g.
- * the experimental TermAutomatonQuery
in sandbox.
- *
- * @lucene.experimental
- */
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -49,7 +33,12 @@ import org.apache.lucene.util.RollingBuffer;
* Converts an incoming graph token stream, such as one from
* {@link SynonymGraphFilter}, into a flat form so that
* all nodes form a single linear chain with no side paths. Every
- * path through the graph touches every node.
+ * path through the graph touches every node. This is necessary
+ * when indexing a graph token stream, because the index does not
+ * save {@link PositionLengthAttribute} and so it cannot
+ * preserve the graph structure. However, at search time,
+ * query parsers can correctly handle the graph and this token
+ * filter should not be used.
*
*
If the graph was not already flat to start, this * is likely a lossy process, i.e. it will often cause the @@ -234,7 +223,11 @@ public final class FlattenGraphFilter extends TokenFilter { // which would otherwise happen if the replacement has more tokens // than the input: int startOffset = Math.max(lastStartOffset, output.startOffset); - offsetAtt.setOffset(startOffset, outputEndNode.endOffset); + + // We must do this in case the incoming tokens have broken offsets: + int endOffset = Math.max(startOffset, outputEndNode.endOffset); + + offsetAtt.setOffset(startOffset, endOffset); lastStartOffset = startOffset; if (inputNode.nextOut == inputNode.tokens.size()) { @@ -382,7 +375,7 @@ public final class FlattenGraphFilter extends TokenFilter { // NOTE, shady: don't call super.end, because we did already from incrementToken } - clearAttributes(); + clearAttributes(); if (done) { // On exc, done is false, and we will not have set these: posIncAtt.setPositionIncrement(finalPosInc);