Use Arrays.compareUnsigned instead of a hand-written comparison loop. (#13252)

2024-04-19 16:01:16 +08:00 · 2024-04-19 16:01:16 +08:00 · 3024e66e4a
parent 1f1181a079
commit 3024e66e4a
3 changed files with 55 additions and 41 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -288,6 +288,8 @@ Improvements
 Optimizations
 ---------------------

+* GITHUB#13252: Replace hand-written comparison loops with Arrays.compareUnsigned in SegmentTermsEnum. (zhouhui)
+
 * GITHUB#12996: Reduce ArrayUtil#grow in decompress. (Zhang Chao)

 * GITHUB#13115: Short circuit queued flush check when flush on update is disabled (Prabhat Sharma)
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java
@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene90.blocktree;

 import java.io.IOException;
 import java.io.PrintStream;
+import java.util.Arrays;
 import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.index.BaseTermsEnum;
 import org.apache.lucene.index.ImpactsEnum;
@ -387,31 +388,18 @@ final class SegmentTermsEnum extends BaseTermsEnum {
      }

      if (cmp == 0) {
-        final int targetUptoMid = targetUpto;
-
        // Second compare the rest of the term, but
        // don't save arc/output/frame; we only do this
        // to find out if the target term is before,
        // equal or after the current term
-        final int targetLimit2 = Math.min(target.length, term.length());
-        while (targetUpto < targetLimit2) {
-          cmp =
-              (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
-          // if (DEBUG) {
-          //    System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" +
-          // targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset +
-          // targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
-          // }
-          if (cmp != 0) {
-            break;
-          }
-          targetUpto++;
-        }
-
-        if (cmp == 0) {
-          cmp = term.length() - target.length;
-        }
-        targetUpto = targetUptoMid;
+        cmp =
+            Arrays.compareUnsigned(
+                term.bytes(),
+                targetUpto,
+                term.length(),
+                target.bytes,
+                target.offset + targetUpto,
+                target.offset + target.length);
      }

      if (cmp < 0) {
@ -666,28 +654,16 @@ final class SegmentTermsEnum extends BaseTermsEnum {
      }

      if (cmp == 0) {
-        final int targetUptoMid = targetUpto;
        // Second compare the rest of the term, but
        // don't save arc/output/frame:
-        final int targetLimit2 = Math.min(target.length, term.length());
-        while (targetUpto < targetLimit2) {
-          cmp =
-              (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
-          // if (DEBUG) {
-          // System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit
-          // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto])
-          // + " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")");
-          // }
-          if (cmp != 0) {
-            break;
-          }
-          targetUpto++;
-        }
-
-        if (cmp == 0) {
-          cmp = term.length() - target.length;
-        }
-        targetUpto = targetUptoMid;
+        cmp =
+            Arrays.compareUnsigned(
+                term.bytes(),
+                targetUpto,
+                term.length(),
+                target.bytes,
+                target.offset + targetUpto,
+                target.offset + target.length);
      }

      if (cmp < 0) {
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BasePostingsFormatTestCase.java
@ -369,6 +369,42 @@ public abstract class BasePostingsFormatTestCase extends BaseIndexFileFormatTest
    dir.close();
  }

+  // Index 10000 single-term docs, then seekExact/seekCeil to existing terms chosen in random order.
+  public void testDisorder() throws Exception {
+    Directory dir = newDirectory();
+
+    IndexWriterConfig iwc = newIndexWriterConfig(null);
+    iwc.setCodec(getCodec());
+    iwc.setMergePolicy(newTieredMergePolicy());
+    IndexWriter iw = new IndexWriter(dir, iwc);
+
+    for (int i = 0; i < 10000; i++) {
+      Document document = new Document();
+      document.add(new StringField("id", i + "", Field.Store.NO));
+      iw.addDocument(document);
+    }
+    iw.commit();
+    iw.forceMerge(1);
+
+    DirectoryReader reader = DirectoryReader.open(iw);
+    TermsEnum termsEnum = getOnlyLeafReader(reader).terms("id").iterator();
+
+    for (int i = 0; i < 20000; i++) {
+      int n = random().nextInt(0, 10000);
+      BytesRef target = new BytesRef(n + "");
+      // seekExact must find the indexed term and leave the enum positioned on it.
+      assertTrue(termsEnum.seekExact(target));
+      assertEquals(termsEnum.term(), target);
+      // seekCeil on the same existing term must report FOUND and stay on that term.
+      assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(target));
+      assertEquals(termsEnum.term(), target);
+    }
+
+    reader.close();
+    iw.close();
+    dir.close();
+  }
+
  protected void subCheckBinarySearch(TermsEnum termsEnum) throws Exception {}

  public void testBinarySearchTermLeaf() throws Exception {