diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java index ea6f6cd542b..fe8ed72357c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java @@ -195,6 +195,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter { private int savedStartOffset; private int savedEndOffset; private AttributeSource.State savedState; + private int lastStartOffset; // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. @@ -373,12 +374,24 @@ public final class WordDelimiterGraphFilter extends TokenFilter { int endPart = bufferedParts[4*bufferedPos+3]; bufferedPos++; + int startOffset; + int endOffset; + if (hasIllegalOffsets) { - offsetAttribute.setOffset(savedStartOffset, savedEndOffset); + startOffset = savedStartOffset; + endOffset = savedEndOffset; } else { - offsetAttribute.setOffset(savedStartOffset + startPart, savedStartOffset + endPart); + startOffset = savedStartOffset + startPart; + endOffset = savedStartOffset + endPart; } + // never let offsets go backwards: + startOffset = Math.max(startOffset, lastStartOffset); + endOffset = Math.max(endOffset, lastStartOffset); + + offsetAttribute.setOffset(startOffset, endOffset); + lastStartOffset = startOffset; + if (termPart == null) { termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart); } else { @@ -402,6 +415,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter { super.reset(); accumPosInc = 0; savedState = null; + lastStartOffset = 0; concat.clear(); concatAll.clear(); }