From 126ac9a5fe00fbbc6870ef25ae3fc6af6cd7c557 Mon Sep 17 00:00:00 2001
From: Mike McCandless
Date: Sat, 27 Feb 2016 10:26:20 -0500
Subject: [PATCH] BytesRefHash.sort always sorts in unicode order
---
.../miscellaneous/StemmerOverrideFilter.java | 2 +-
.../lucene/index/SortedDocValuesWriter.java | 2 +-
.../index/SortedSetDocValuesWriter.java | 2 +-
.../lucene/index/TermsHashPerField.java | 2 +-
.../apache/lucene/search/ScoringRewrite.java | 2 +-
.../org/apache/lucene/util/BytesRefHash.java | 6 +--
.../apache/lucene/util/TestBytesRefHash.java | 37 ++++++++++++++++---
.../search/join/TermsIncludingScoreQuery.java | 2 +-
.../apache/lucene/search/join/TermsQuery.java | 2 +-
.../lucene/index/memory/MemoryIndex.java | 2 +-
.../index/BaseDocValuesFormatTestCase.java | 2 +-
11 files changed, 42 insertions(+), 19 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
index e78137ec669..32423e96e93 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
@@ -205,7 +205,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
FST.INPUT_TYPE.BYTE4, outputs);
- final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ final int[] sort = hash.sort();
IntsRefBuilder intsSpare = new IntsRefBuilder();
final int size = hash.size();
BytesRef spare = new BytesRef();
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 2d8557ba9d6..6517218e1b4 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -112,7 +112,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
final int valueCount = hash.size();
final PackedLongValues ords = pending.build();
- final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ final int[] sortedValues = hash.sort();
final int[] ordMap = new int[valueCount];
for(int ord=0;ord {
/** Collapse the hash table and sort in-place; also sets
* this.sortedTermIDs to the results */
public int[] sortPostings() {
- sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ sortedTermIDs = bytesHash.sort();
return sortedTermIDs;
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
index e0917eb49b7..3a62e1599f4 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
@@ -109,7 +109,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite {
final int size = col.terms.size();
if (size > 0) {
- final int sort[] = col.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ final int sort[] = col.terms.sort();
final float[] boost = col.array.boost;
final TermContext[] termStates = col.array.termState;
for (int i = 0; i < size; i++) {
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
index 25b74a620e0..82cce033b96 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
@@ -156,11 +156,9 @@ public final class BytesRefHash {
* Note: This is a destructive operation. {@link #clear()} must be called in
* order to reuse this {@link BytesRefHash} instance.
*
- *
- * @param comp
- * the {@link Comparator} used for sorting
*/
- public int[] sort(final Comparator<BytesRef> comp) {
+ public int[] sort() {
+ final Comparator<BytesRef> comp = BytesRef.getUTF8SortedAsUnicodeComparator();
final int[] compact = compact();
new IntroSorter() {
@Override
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
index e44b283a44b..50d921bff48 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
@@ -17,14 +17,16 @@
package org.apache.lucene.util;
+import java.util.Arrays;
import java.util.BitSet;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map.Entry;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
-import java.util.Map.Entry;
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
import org.junit.Before;
@@ -166,16 +168,41 @@ public class TestBytesRefHash extends LuceneTestCase {
}
}
+  /**
+   * Decodes {@code input} into an array holding one entry per code point, in string order.
+   */
+  private static int[] codePoints(String input) {
+    final int[] decoded = new int[Character.codePointCount(input, 0, input.length())];
+    int charIndex = 0;
+    int slot = 0;
+    while (charIndex < input.length()) {
+      final int codePoint = input.codePointAt(charIndex);
+      decoded[slot] = codePoint;
+      slot++;
+      // Advance by however many chars this code point occupied.
+      charIndex += Character.charCount(codePoint);
+    }
+    return decoded;
+  }
+
/**
* Test method for
- * {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}.
+ * {@link org.apache.lucene.util.BytesRefHash#sort()}.
*/
@Test
public void testSort() {
BytesRefBuilder ref = new BytesRefBuilder();
int num = atLeast(2);
for (int j = 0; j < num; j++) {
- SortedSet<String> strings = new TreeSet<>();
+
+      // Sorts by unicode code point order (is there a simple way, e.g. a Collator?)
+      // This is the order BytesRefHash.sort() now always uses; String.compareTo() would
+      // compare UTF-16 code units instead, so the natural TreeSet ordering cannot be used.
+      SortedSet<String> strings = new TreeSet<>(new Comparator<String>() {
+          @Override
+          public int compare(String a, String b) {
+            int[] aCodePoints = codePoints(a);
+            int[] bCodePoints = codePoints(b);
+            // The first position where the code points differ decides the order.
+            for(int i=0;i<Math.min(aCodePoints.length, bCodePoints.length);i++) {
+              if (aCodePoints[i] < bCodePoints[i]) {
+                return -1;
+              } else if (aCodePoints[i] > bCodePoints[i]) {
+                return 1;
+              }
+            }
+            // One input is a prefix of the other: the shorter one sorts first.  Array
+            // lengths are non-negative, so this subtraction cannot overflow.
+            return aCodePoints.length - bCodePoints.length;
+          }
+        });
for (int i = 0; i < 797; i++) {
String str;
do {
@@ -185,9 +212,7 @@ public class TestBytesRefHash extends LuceneTestCase {
hash.add(ref.get());
strings.add(str);
}
- // We use the UTF-16 comparator here, because we need to be able to
- // compare to native String.compareTo() [UTF-16]:
- int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
+ int[] sort = hash.sort();
assertTrue(strings.size() < sort.length);
int i = 0;
BytesRef scratch = new BytesRef();
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
index 7c03103445b..65ab1f02775 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
@@ -55,7 +55,7 @@ class TermsIncludingScoreQuery extends Query {
this.terms = terms;
this.scores = scores;
this.originalQuery = originalQuery;
- this.ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ this.ords = terms.sort();
this.unwrittenOriginalQuery = originalQuery;
}
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
index eabc72a94d0..11b201d6342 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
@@ -48,7 +48,7 @@ class TermsQuery extends MultiTermQuery {
super(field);
this.fromQuery = fromQuery;
this.terms = terms;
- ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ ords = terms.sort();
}
@Override
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index 29e60ba50b1..c6667249279 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -758,7 +758,7 @@ public class MemoryIndex {
*/
public void sortTerms() {
if (sortedTerms == null) {
- sortedTerms = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ sortedTerms = terms.sort();
}
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
index 79cfa0fbcb8..5a8a99f4d15 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
@@ -1144,7 +1144,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
w.commit();
IndexReader reader = w.getReader();
SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field");
- int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ int[] sort = hash.sort();
BytesRef expected = new BytesRef();
assertEquals(hash.size(), docValues.getValueCount());
for (int i = 0; i < hash.size(); i++) {