mirror of https://github.com/apache/lucene.git
BytesRefHash.sort always sorts in Unicode order
This commit is contained in:
parent
70440bbbd2
commit
126ac9a5fe
|
@ -205,7 +205,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
|
||||||
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
|
||||||
org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
|
org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
|
||||||
FST.INPUT_TYPE.BYTE4, outputs);
|
FST.INPUT_TYPE.BYTE4, outputs);
|
||||||
final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
final int[] sort = hash.sort();
|
||||||
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
IntsRefBuilder intsSpare = new IntsRefBuilder();
|
||||||
final int size = hash.size();
|
final int size = hash.size();
|
||||||
BytesRef spare = new BytesRef();
|
BytesRef spare = new BytesRef();
|
||||||
|
|
|
@ -112,7 +112,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
|
||||||
final int valueCount = hash.size();
|
final int valueCount = hash.size();
|
||||||
final PackedLongValues ords = pending.build();
|
final PackedLongValues ords = pending.build();
|
||||||
|
|
||||||
final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
final int[] sortedValues = hash.sort();
|
||||||
final int[] ordMap = new int[valueCount];
|
final int[] ordMap = new int[valueCount];
|
||||||
|
|
||||||
for(int ord=0;ord<valueCount;ord++) {
|
for(int ord=0;ord<valueCount;ord++) {
|
||||||
|
|
|
@ -152,7 +152,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
|
||||||
final PackedLongValues ords = pending.build();
|
final PackedLongValues ords = pending.build();
|
||||||
final PackedLongValues ordCounts = pendingCounts.build();
|
final PackedLongValues ordCounts = pendingCounts.build();
|
||||||
|
|
||||||
final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
final int[] sortedValues = hash.sort();
|
||||||
final int[] ordMap = new int[valueCount];
|
final int[] ordMap = new int[valueCount];
|
||||||
|
|
||||||
for(int ord=0;ord<valueCount;ord++) {
|
for(int ord=0;ord<valueCount;ord++) {
|
||||||
|
|
|
@ -93,7 +93,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
/** Collapse the hash table and sort in-place; also sets
|
/** Collapse the hash table and sort in-place; also sets
|
||||||
* this.sortedTermIDs to the results */
|
* this.sortedTermIDs to the results */
|
||||||
public int[] sortPostings() {
|
public int[] sortPostings() {
|
||||||
sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
sortedTermIDs = bytesHash.sort();
|
||||||
return sortedTermIDs;
|
return sortedTermIDs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -109,7 +109,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
|
||||||
|
|
||||||
final int size = col.terms.size();
|
final int size = col.terms.size();
|
||||||
if (size > 0) {
|
if (size > 0) {
|
||||||
final int sort[] = col.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
final int sort[] = col.terms.sort();
|
||||||
final float[] boost = col.array.boost;
|
final float[] boost = col.array.boost;
|
||||||
final TermContext[] termStates = col.array.termState;
|
final TermContext[] termStates = col.array.termState;
|
||||||
for (int i = 0; i < size; i++) {
|
for (int i = 0; i < size; i++) {
|
||||||
|
|
|
@ -156,11 +156,9 @@ public final class BytesRefHash {
|
||||||
* Note: This is a destructive operation. {@link #clear()} must be called in
|
* Note: This is a destructive operation. {@link #clear()} must be called in
|
||||||
* order to reuse this {@link BytesRefHash} instance.
|
* order to reuse this {@link BytesRefHash} instance.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
|
||||||
* @param comp
|
|
||||||
* the {@link Comparator} used for sorting
|
|
||||||
*/
|
*/
|
||||||
public int[] sort(final Comparator<BytesRef> comp) {
|
public int[] sort() {
|
||||||
|
final Comparator<BytesRef> comp = BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
final int[] compact = compact();
|
final int[] compact = compact();
|
||||||
new IntroSorter() {
|
new IntroSorter() {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -17,14 +17,16 @@
|
||||||
package org.apache.lucene.util;
|
package org.apache.lucene.util;
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Map.Entry;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.SortedSet;
|
import java.util.SortedSet;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.Map.Entry;
|
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
|
import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
|
@ -166,16 +168,41 @@ public class TestBytesRefHash extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static int[] codePoints(String input) {
|
||||||
|
int length = Character.codePointCount(input, 0, input.length());
|
||||||
|
int word[] = new int[length];
|
||||||
|
for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
|
||||||
|
word[j++] = cp = input.codePointAt(i);
|
||||||
|
}
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test method for
|
* Test method for
|
||||||
* {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}.
|
* {@link org.apache.lucene.util.BytesRefHash#sort()}.
|
||||||
*/
|
*/
|
||||||
@Test
|
@Test
|
||||||
public void testSort() {
|
public void testSort() {
|
||||||
BytesRefBuilder ref = new BytesRefBuilder();
|
BytesRefBuilder ref = new BytesRefBuilder();
|
||||||
int num = atLeast(2);
|
int num = atLeast(2);
|
||||||
for (int j = 0; j < num; j++) {
|
for (int j = 0; j < num; j++) {
|
||||||
SortedSet<String> strings = new TreeSet<>();
|
|
||||||
|
// Sorts by unicode code point order (is there a simple way, e.g. a Collator?)
|
||||||
|
SortedSet<String> strings = new TreeSet<>(new Comparator<String>() {
|
||||||
|
@Override
|
||||||
|
public int compare(String a, String b) {
|
||||||
|
int[] aCodePoints = codePoints(a);
|
||||||
|
int[] bCodePoints = codePoints(b);
|
||||||
|
for(int i=0;i<Math.min(aCodePoints.length, bCodePoints.length);i++) {
|
||||||
|
if (aCodePoints[i] < bCodePoints[i]) {
|
||||||
|
return -1;
|
||||||
|
} else if (aCodePoints[i] > bCodePoints[i]) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return aCodePoints.length - bCodePoints.length;
|
||||||
|
}
|
||||||
|
});
|
||||||
for (int i = 0; i < 797; i++) {
|
for (int i = 0; i < 797; i++) {
|
||||||
String str;
|
String str;
|
||||||
do {
|
do {
|
||||||
|
@ -185,9 +212,7 @@ public class TestBytesRefHash extends LuceneTestCase {
|
||||||
hash.add(ref.get());
|
hash.add(ref.get());
|
||||||
strings.add(str);
|
strings.add(str);
|
||||||
}
|
}
|
||||||
// We use the UTF-16 comparator here, because we need to be able to
|
int[] sort = hash.sort();
|
||||||
// compare to native String.compareTo() [UTF-16]:
|
|
||||||
int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
|
|
||||||
assertTrue(strings.size() < sort.length);
|
assertTrue(strings.size() < sort.length);
|
||||||
int i = 0;
|
int i = 0;
|
||||||
BytesRef scratch = new BytesRef();
|
BytesRef scratch = new BytesRef();
|
||||||
|
|
|
@ -55,7 +55,7 @@ class TermsIncludingScoreQuery extends Query {
|
||||||
this.terms = terms;
|
this.terms = terms;
|
||||||
this.scores = scores;
|
this.scores = scores;
|
||||||
this.originalQuery = originalQuery;
|
this.originalQuery = originalQuery;
|
||||||
this.ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
this.ords = terms.sort();
|
||||||
this.unwrittenOriginalQuery = originalQuery;
|
this.unwrittenOriginalQuery = originalQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@ class TermsQuery extends MultiTermQuery {
|
||||||
super(field);
|
super(field);
|
||||||
this.fromQuery = fromQuery;
|
this.fromQuery = fromQuery;
|
||||||
this.terms = terms;
|
this.terms = terms;
|
||||||
ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
ords = terms.sort();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -758,7 +758,7 @@ public class MemoryIndex {
|
||||||
*/
|
*/
|
||||||
public void sortTerms() {
|
public void sortTerms() {
|
||||||
if (sortedTerms == null) {
|
if (sortedTerms == null) {
|
||||||
sortedTerms = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
sortedTerms = terms.sort();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1144,7 +1144,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
|
||||||
w.commit();
|
w.commit();
|
||||||
IndexReader reader = w.getReader();
|
IndexReader reader = w.getReader();
|
||||||
SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field");
|
SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field");
|
||||||
int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
|
int[] sort = hash.sort();
|
||||||
BytesRef expected = new BytesRef();
|
BytesRef expected = new BytesRef();
|
||||||
assertEquals(hash.size(), docValues.getValueCount());
|
assertEquals(hash.size(), docValues.getValueCount());
|
||||||
for (int i = 0; i < hash.size(); i++) {
|
for (int i = 0; i < hash.size(); i++) {
|
||||||
|
|
Loading…
Reference in New Issue