From 126ac9a5fe00fbbc6870ef25ae3fc6af6cd7c557 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Sat, 27 Feb 2016 10:26:20 -0500 Subject: [PATCH] BytesRefHash.sort always sorts in unicode order --- .../miscellaneous/StemmerOverrideFilter.java | 2 +- .../lucene/index/SortedDocValuesWriter.java | 2 +- .../index/SortedSetDocValuesWriter.java | 2 +- .../lucene/index/TermsHashPerField.java | 2 +- .../apache/lucene/search/ScoringRewrite.java | 2 +- .../org/apache/lucene/util/BytesRefHash.java | 6 +-- .../apache/lucene/util/TestBytesRefHash.java | 37 ++++++++++++++++--- .../search/join/TermsIncludingScoreQuery.java | 2 +- .../apache/lucene/search/join/TermsQuery.java | 2 +- .../lucene/index/memory/MemoryIndex.java | 2 +- .../index/BaseDocValuesFormatTestCase.java | 2 +- 11 files changed, 42 insertions(+), 19 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java index e78137ec669..32423e96e93 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java @@ -205,7 +205,7 @@ public final class StemmerOverrideFilter extends TokenFilter { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder builder = new org.apache.lucene.util.fst.Builder<>( FST.INPUT_TYPE.BYTE4, outputs); - final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + final int[] sort = hash.sort(); IntsRefBuilder intsSpare = new IntsRefBuilder(); final int size = hash.size(); BytesRef spare = new BytesRef(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java index 2d8557ba9d6..6517218e1b4 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java @@ -112,7 +112,7 @@ class SortedDocValuesWriter extends DocValuesWriter { final int valueCount = hash.size(); final PackedLongValues ords = pending.build(); - final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + final int[] sortedValues = hash.sort(); final int[] ordMap = new int[valueCount]; for(int ord=0;ord { /** Collapse the hash table and sort in-place; also sets * this.sortedTermIDs to the results */ public int[] sortPostings() { - sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + sortedTermIDs = bytesHash.sort(); return sortedTermIDs; } diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java index e0917eb49b7..3a62e1599f4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java @@ -109,7 +109,7 @@ public abstract class ScoringRewrite extends TermCollectingRewrite { final int size = col.terms.size(); if (size > 0) { - final int sort[] = col.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + final int sort[] = col.terms.sort(); final float[] boost = col.array.boost; final TermContext[] termStates = col.array.termState; for (int i = 0; i < size; i++) { diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java index 25b74a620e0..82cce033b96 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java @@ -156,11 +156,9 @@ public final class BytesRefHash { * Note: This is a destructive operation. {@link #clear()} must be called in * order to reuse this {@link BytesRefHash} instance. *

- * - * @param comp - * the {@link Comparator} used for sorting */ - public int[] sort(final Comparator comp) { + public int[] sort() { + final Comparator comp = BytesRef.getUTF8SortedAsUnicodeComparator(); final int[] compact = compact(); new IntroSorter() { @Override diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java index e44b283a44b..50d921bff48 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java @@ -17,14 +17,16 @@ package org.apache.lucene.util; +import java.util.Arrays; import java.util.BitSet; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.Map.Entry; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; -import java.util.Map.Entry; import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException; import org.junit.Before; @@ -166,16 +168,41 @@ public class TestBytesRefHash extends LuceneTestCase { } } + private static int[] codePoints(String input) { + int length = Character.codePointCount(input, 0, input.length()); + int word[] = new int[length]; + for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { + word[j++] = cp = input.codePointAt(i); + } + return word; + } + /** * Test method for - * {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}. + * {@link org.apache.lucene.util.BytesRefHash#sort()}. */ @Test public void testSort() { BytesRefBuilder ref = new BytesRefBuilder(); int num = atLeast(2); for (int j = 0; j < num; j++) { - SortedSet strings = new TreeSet<>(); + + // Sorts by unicode code point order (is there a simple way, e.g. a Collator?) 
+ SortedSet<String> strings = new TreeSet<>(new Comparator<String>() { + @Override + public int compare(String a, String b) { + int[] aCodePoints = codePoints(a); + int[] bCodePoints = codePoints(b); + for(int i=0;i<Math.min(aCodePoints.length, bCodePoints.length);i++) { + if (aCodePoints[i] < bCodePoints[i]) { + return -1; + } else if (aCodePoints[i] > bCodePoints[i]) { + return 1; + } + } + return aCodePoints.length - bCodePoints.length; + } + }); for (int i = 0; i < 797; i++) { String str; do { @@ -185,9 +212,7 @@ public class TestBytesRefHash extends LuceneTestCase { hash.add(ref.get()); strings.add(str); } - // We use the UTF-16 comparator here, because we need to be able to - // compare to native String.compareTo() [UTF-16]: - int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); + int[] sort = hash.sort(); assertTrue(strings.size() < sort.length); int i = 0; BytesRef scratch = new BytesRef(); diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java index 7c03103445b..65ab1f02775 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java @@ -55,7 +55,7 @@ class TermsIncludingScoreQuery extends Query { this.terms = terms; this.scores = scores; this.originalQuery = originalQuery; - this.ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + this.ords = terms.sort(); this.unwrittenOriginalQuery = originalQuery; } diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java index eabc72a94d0..11b201d6342 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java @@ -48,7 +48,7 @@ class TermsQuery extends MultiTermQuery { super(field); this.fromQuery = fromQuery; this.terms = terms; - ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + ords = terms.sort(); } 
@Override diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 29e60ba50b1..c6667249279 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -758,7 +758,7 @@ public class MemoryIndex { */ public void sortTerms() { if (sortedTerms == null) { - sortedTerms = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + sortedTerms = terms.sort(); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 79cfa0fbcb8..5a8a99f4d15 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -1144,7 +1144,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes w.commit(); IndexReader reader = w.getReader(); SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field"); - int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); + int[] sort = hash.sort(); BytesRef expected = new BytesRef(); assertEquals(hash.size(), docValues.getValueCount()); for (int i = 0; i < hash.size(); i++) {