mirror of https://github.com/apache/lucene.git
LUCENE-6664: be more robust to broken token stream offsets
This commit is contained in:
parent
b4a002f7d8
commit
f6fb6941bb
|
@ -17,22 +17,6 @@
|
|||
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/**
|
||||
* This filter "casts" token graphs down into a "flat" form,
|
||||
* for indexing. This is an inherently lossy process: nodes (positions)
|
||||
* along side paths are forcefully merged.
|
||||
*
|
||||
* <p>In general this means the output graph will accept token sequences
|
||||
* that the input graph did not accept, and will also fail to accept
|
||||
* token sequences that the input graph did accept.
|
||||
*
|
||||
* <p>This is only necessary at indexing time because Lucene cannot yet index
|
||||
* an arbitrary token graph. At search time there are better options, e.g.
|
||||
* the experimental <code>TermAutomatonQuery</code> in sandbox.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -49,7 +33,12 @@ import org.apache.lucene.util.RollingBuffer;
|
|||
* Converts an incoming graph token stream, such as one from
|
||||
* {@link SynonymGraphFilter}, into a flat form so that
|
||||
* all nodes form a single linear chain with no side paths. Every
|
||||
* path through the graph touches every node.
|
||||
* path through the graph touches every node. This is necessary
|
||||
* when indexing a graph token stream, because the index does not
|
||||
* save {@link PositionLengthAttribute} and so it cannot
|
||||
* preserve the graph structure. However, at search time,
|
||||
* query parsers can correctly handle the graph and this token
|
||||
* filter should <b>not</b> be used.
|
||||
*
|
||||
* <p>If the graph was not already flat to start, this
|
||||
* is likely a lossy process, i.e. it will often cause the
|
||||
|
@ -234,7 +223,11 @@ public final class FlattenGraphFilter extends TokenFilter {
|
|||
// which would otherwise happen if the replacement has more tokens
|
||||
// than the input:
|
||||
int startOffset = Math.max(lastStartOffset, output.startOffset);
|
||||
offsetAtt.setOffset(startOffset, outputEndNode.endOffset);
|
||||
|
||||
// We must do this in case the incoming tokens have broken offsets:
|
||||
int endOffset = Math.max(startOffset, outputEndNode.endOffset);
|
||||
|
||||
offsetAtt.setOffset(startOffset, endOffset);
|
||||
lastStartOffset = startOffset;
|
||||
|
||||
if (inputNode.nextOut == inputNode.tokens.size()) {
|
||||
|
|
Loading…
Reference in New Issue