From bf45ab79ec4bb61232557803401d970e6eff171d Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 19 Dec 2023 11:17:38 +0100 Subject: [PATCH] Beef up `Terms#intersect` checks in `CheckIndex`. (#12926) Now also testing what happens with a non-null `startTerm`. This found bugs in `DirectPostingsFormat`. --- .../codecs/memory/DirectPostingsFormat.java | 17 +++-- .../org/apache/lucene/index/CheckIndex.java | 72 ++++++++++++++----- 2 files changed, 64 insertions(+), 25 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 4c355f314f1..8966fcbc6de 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -994,7 +994,10 @@ public final class DirectPostingsFormat extends PostingsFormat { while (label > states[i].transitionMax) { states[i].transitionUpto++; - assert states[i].transitionUpto < states[i].transitionCount; + if (states[i].transitionUpto >= states[i].transitionCount) { + // All transitions compare less than the required label + break; + } transitionAccessor.getNextTransition(states[i].transition); states[i].transitionMin = states[i].transition.min; states[i].transitionMax = states[i].transition.max; @@ -1119,12 +1122,14 @@ public final class DirectPostingsFormat extends PostingsFormat { } } - final int termOffset = termOffsets[termOrd]; - final int termLen = termOffsets[1 + termOrd] - termOffset; + if (termOrd >= 0) { + final int termOffset = termOffsets[termOrd]; + final int termLen = termOffsets[1 + termOrd] - termOffset; - if (termOrd >= 0 && !startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) { - stateUpto -= skipUpto; - termOrd--; + if (!startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) { + stateUpto -= skipUpto; + termOrd--; + } } // if (DEBUG) { // System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index cd601743f29..a555ce40001 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -2305,31 +2305,26 @@ public final class CheckIndex implements Closeable { } // Test Terms#intersect - TermsEnum allTerms = terms.iterator(); // An automaton that should match a good number of terms - Automaton a = + Automaton automaton = Operations.concatenate( Arrays.asList( Automata.makeAnyBinary(), Automata.makeCharRange('a', 'e'), Automata.makeAnyBinary())); - a = Operations.determinize(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); - CompiledAutomaton ca = new CompiledAutomaton(a); - ByteRunAutomaton runAutomaton = new ByteRunAutomaton(a); - TermsEnum filteredTerms = terms.intersect(ca, null); - for (BytesRef term = allTerms.next(); term != null; term = allTerms.next()) { - if (runAutomaton.run(term.bytes, term.offset, term.length)) { - BytesRef filteredTerm = filteredTerms.next(); - if (Objects.equals(term, filteredTerm) == false) { - throw new CheckIndexException( - "Expected next filtered term: " + term + ", but got " + filteredTerm); - } - } - } - BytesRef filteredTerm = filteredTerms.next(); - if (filteredTerm != null) { - throw new CheckIndexException("Expected exhausted TermsEnum, but got " + filteredTerm); - } + BytesRef startTerm = null; + checkTermsIntersect(terms, automaton, startTerm); + + startTerm = new BytesRef(); + checkTermsIntersect(terms, automaton, startTerm); + + automaton = Automata.makeAnyBinary(); + startTerm = new BytesRef(new byte[] {'l'}); + checkTermsIntersect(terms, automaton, startTerm); + + // a term that likely compares greater than every other term in the dictionary + startTerm = new BytesRef(new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF}); + checkTermsIntersect(terms, automaton, startTerm); } } @@ -2370,6 +2365,45 @@ public final class CheckIndex implements Closeable { return status; } + private static void checkTermsIntersect(Terms terms, Automaton automaton, BytesRef startTerm) + throws IOException { + TermsEnum allTerms = terms.iterator(); + automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton); + ByteRunAutomaton runAutomaton = new ByteRunAutomaton(automaton); + TermsEnum filteredTerms = terms.intersect(compiledAutomaton, startTerm); + BytesRef term; + if (startTerm != null) { + switch (allTerms.seekCeil(startTerm)) { + case FOUND: + term = allTerms.next(); + break; + case NOT_FOUND: + term = allTerms.term(); + break; + case END: + default: + term = null; + break; + } + } else { + term = allTerms.next(); + } + for (; term != null; term = allTerms.next()) { + if (runAutomaton.run(term.bytes, term.offset, term.length)) { + BytesRef filteredTerm = filteredTerms.next(); + if (Objects.equals(term, filteredTerm) == false) { + throw new CheckIndexException( + "Expected next filtered term: " + term + ", but got " + filteredTerm); + } + } + } + BytesRef filteredTerm = filteredTerms.next(); + if (filteredTerm != null) { + throw new CheckIndexException("Expected exhausted TermsEnum, but got " + filteredTerm); + } + } + /** * For use in tests only. *