mirror of https://github.com/apache/lucene.git
SOLR-2530: Remove Noggit CharArr from FieldType
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1127326 13f79535-47bb-0310-9956-ffa450edef68
parent 530b894c60
commit 68a840c2b7
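The change is mechanical but wide-reaching: call sites that converted each term with BytesRef.utf8ToString() inside a loop now reuse a single CharsRef scratch buffer through the new BytesRef.utf8ToChars(CharsRef), so the char[] is grown once and recycled across terms instead of being reallocated per term. A minimal before/after sketch of the pattern, using only APIs that appear in this diff (the TermCollector class and the TermsEnum loop are illustrative, not part of the commit):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;

class TermCollector {
  // Before: utf8ToString() allocates a fresh char[] (and String) for every term.
  static List<String> collectOld(TermsEnum te) throws IOException {
    List<String> out = new ArrayList<String>();
    BytesRef term;
    while ((term = te.next()) != null) {
      out.add(term.utf8ToString());
    }
    return out;
  }

  // After: one CharsRef is grown on demand and reused for every term;
  // only the final String per term is still allocated.
  static List<String> collectNew(TermsEnum te) throws IOException {
    List<String> out = new ArrayList<String>();
    final CharsRef spare = new CharsRef();
    BytesRef term;
    while ((term = te.next()) != null) {
      out.add(term.utf8ToChars(spare).toString());
    }
    return out;
  }
}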
@@ -26,6 +26,7 @@ import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.index.TermVectorOffsetInfo;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;

 /**
  * <code>FieldTermStack</code> is a stack that keeps query terms in the specified field

@@ -80,16 +81,16 @@ public class FieldTermStack {
     Set<String> termSet = fieldQuery.getTermSet( fieldName );
     // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
     if( termSet == null ) return;
-
+    final CharsRef spare = new CharsRef();
     for( BytesRef term : tpv.getTerms() ){
-      if( !termSet.contains( term.utf8ToString() ) ) continue;
+      if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;
       int index = tpv.indexOf( term );
       TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
       if( tvois == null ) return; // just return to make null snippets
       int[] poss = tpv.getTermPositions( index );
       if( poss == null ) return; // just return to make null snippets
       for( int i = 0; i < tvois.length; i++ )
-        termList.add( new TermInfo( term.utf8ToString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
+        termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
     }

     // sort by position
@@ -41,6 +41,7 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.util.BitVector;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;

 /**
  * Represented as a coupled graph of class instances, this

@@ -228,12 +229,13 @@ public class InstantiatedIndex
     if (fieldsC != null) {
       FieldsEnum fieldsEnum = fieldsC.iterator();
       String field;
+      final CharsRef spare = new CharsRef();
       while((field = fieldsEnum.next()) != null) {
         if (fields == null || fields.contains(field)) {
           TermsEnum termsEnum = fieldsEnum.terms();
           BytesRef text;
           while((text = termsEnum.next()) != null) {
-            String termText = text.utf8ToString();
+            String termText = text.utf8ToChars(spare).toString();
             InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
             final long totalTermFreq = termsEnum.totalTermFreq();
             if (totalTermFreq != -1) {
@@ -18,6 +18,7 @@ package org.apache.lucene.search.regex;
  */

 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.regexp.CharacterIterator;
 import org.apache.regexp.RE;

@@ -104,11 +105,11 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {

 class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
   private RE regexp;
-  private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+  private final CharsRef utf16 = new CharsRef(10);
   private final CharacterIterator utf16wrapper = new CharacterIterator() {

     public char charAt(int pos) {
-      return utf16.result[pos];
+      return utf16.chars[pos];
     }

     public boolean isEnd(int pos) {

@@ -120,7 +121,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
     }

     public String substring(int beginIndex, int endIndex) {
-      return new String(utf16.result, beginIndex, endIndex - beginIndex);
+      return new String(utf16.chars, beginIndex, endIndex - beginIndex);
     }

   };
@@ -21,6 +21,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;

 /**

@@ -95,25 +96,11 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
 class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
   private final Pattern pattern;
   private final Matcher matcher;
-  private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
-  private final CharSequence utf16wrapper = new CharSequence() {
-
-    public int length() {
-      return utf16.length;
-    }
-
-    public char charAt(int index) {
-      return utf16.result[index];
-    }
-
-    public CharSequence subSequence(int start, int end) {
-      return new String(utf16.result, start, end - start);
-    }
-  };
+  private final CharsRef utf16 = new CharsRef(10);

   public JavaUtilRegexMatcher(String regex, int flags) {
     this.pattern = Pattern.compile(regex, flags);
-    this.matcher = this.pattern.matcher(utf16wrapper);
+    this.matcher = this.pattern.matcher(utf16);
   }

   public boolean match(BytesRef term) {
@@ -48,6 +48,7 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.PriorityQueue;


@@ -850,8 +851,9 @@ public final class MoreLikeThis {
     {
       BytesRef[] terms = vector.getTerms();
       int freqs[]=vector.getTermFrequencies();
+      final CharsRef spare = new CharsRef();
       for (int j = 0; j < terms.length; j++) {
-        String term = terms[j].utf8ToString();
+        final String term = terms[j].utf8ToChars(spare).toString();

         if(isNoiseWord(term)){
           continue;
@@ -23,6 +23,7 @@ import java.util.zip.DataFormatException;
 import java.io.ByteArrayOutputStream;

 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;

 /** Simple utility class providing static methods to

@@ -118,9 +119,9 @@ public class CompressionTools {
   /** Decompress the byte array previously returned by
    *  compressString back into a String */
   public static String decompressString(byte[] value) throws DataFormatException {
-    UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
     final byte[] bytes = decompress(value);
+    CharsRef result = new CharsRef(bytes.length);
     UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result);
-    return new String(result.result, 0, result.length);
+    return new String(result.chars, 0, result.length);
   }
 }
@@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.automaton.fst.Builder;

@@ -236,7 +237,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
     private int tf;
     private Bits skipDocs;
     private final BytesRef scratch = new BytesRef(10);
-    private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result();
+    private final CharsRef scratchUTF16 = new CharsRef(10);

     public SimpleTextDocsEnum() {
       this.inStart = SimpleTextFieldsReader.this.in;

@@ -286,7 +287,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
           return docID;
         }
         UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
-        docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length);
+        docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
         termFreq = 0;
         first = false;
       } else if (scratch.startsWith(POS)) {

@@ -323,8 +324,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
     private Bits skipDocs;
     private final BytesRef scratch = new BytesRef(10);
     private final BytesRef scratch2 = new BytesRef(10);
-    private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result();
-    private final UnicodeUtil.UTF16Result scratchUTF16_2 = new UnicodeUtil.UTF16Result();
+    private final CharsRef scratchUTF16 = new CharsRef(10);
+    private final CharsRef scratchUTF16_2 = new CharsRef(10);
     private BytesRef payload;
     private long nextDocStart;

@@ -368,7 +369,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
           return docID;
         }
         UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
-        docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length);
+        docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
         tf = 0;
         posStart = in.getFilePointer();
         first = false;

@@ -400,7 +401,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
       readLine(in, scratch);
       assert scratch.startsWith(POS): "got line=" + scratch.utf8ToString();
       UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
-      final int pos = ArrayUtil.parseInt(scratchUTF16_2.result, 0, scratchUTF16_2.length);
+      final int pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
       final long fp = in.getFilePointer();
       readLine(in, scratch);
       if (scratch.startsWith(PAYLOAD)) {
@@ -18,7 +18,6 @@ package org.apache.lucene.util;
  */

 import java.util.Comparator;
-import java.io.UnsupportedEncodingException;

 /** Represents byte[], as a slice (offset + length) into an
  *  existing byte[].

@@ -122,6 +121,7 @@ public final class BytesRef implements Comparable<BytesRef> {
   public void copy(char text[], int offset, int length) {
     UnicodeUtil.UTF16toUTF8(text, offset, length, this);
   }

   public boolean bytesEquals(BytesRef other) {
     if (length == other.length) {
       int otherUpto = other.offset;

@@ -198,13 +198,15 @@ public final class BytesRef implements Comparable<BytesRef> {
   /** Interprets stored bytes as UTF8 bytes, returning the
    *  resulting string */
   public String utf8ToString() {
-    try {
-      return new String(bytes, offset, length, "UTF-8");
-    } catch (UnsupportedEncodingException uee) {
-      // should not happen -- UTF8 is presumably supported
-      // by all JREs
-      throw new RuntimeException(uee);
-    }
+    final CharsRef ref = new CharsRef(length);
+    UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref);
+    return ref.toString();
   }

+  /** Interprets stored bytes as UTF8 bytes into the given {@link CharsRef} */
+  public CharsRef utf8ToChars(CharsRef ref) {
+    UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref);
+    return ref;
+  }
+
   /** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */
@@ -0,0 +1,218 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Represents char[], as a slice (offset + length) into an existing char[].
+ *
+ * @lucene.internal
+ */
+public final class CharsRef implements Comparable<CharsRef>, CharSequence {
+  private static final char[] EMPTY_ARRAY = new char[0];
+  public char[] chars = EMPTY_ARRAY; // never null, so copy(CharsRef) is safe from any constructor
+  public int offset;
+  public int length;
+
+  /**
+   * Creates a new {@link CharsRef} initialized with an empty, zero-length array
+   */
+  public CharsRef() {
+    this(EMPTY_ARRAY, 0, 0);
+  }
+
+  /**
+   * Creates a new {@link CharsRef} initialized with an array of the given
+   * capacity
+   */
+  public CharsRef(int capacity) {
+    chars = new char[capacity];
+  }
+
+  /**
+   * Creates a new {@link CharsRef} initialized with the given array, offset and
+   * length
+   */
+  public CharsRef(char[] chars, int offset, int length) {
+    assert chars != null;
+    assert chars.length >= offset + length;
+    this.chars = chars;
+    this.offset = offset;
+    this.length = length;
+  }
+
+  /**
+   * Creates a new {@link CharsRef} initialized with the given String's character
+   * array
+   */
+  public CharsRef(String string) {
+    this.chars = string.toCharArray();
+    this.offset = 0;
+    this.length = chars.length;
+  }
+
+  /**
+   * Creates a new {@link CharsRef} and copies the contents of the source into
+   * the new instance.
+   * @see #copy(CharsRef)
+   */
+  public CharsRef(CharsRef other) {
+    copy(other);
+  }
+
+  @Override
+  public Object clone() {
+    return new CharsRef(this);
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 0;
+    final int end = offset + length;
+    for (int i = offset; i < end; i++) {
+      result = prime * result + chars[i];
+    }
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+
+    if (other instanceof CharsRef) {
+      return charsEquals((CharsRef) other);
+    }
+
+    if (other instanceof CharSequence) {
+      final CharSequence seq = (CharSequence) other;
+      if (length == seq.length()) {
+        int n = length;
+        int i = offset;
+        int j = 0;
+        while (n-- != 0) {
+          if (chars[i++] != seq.charAt(j++))
+            return false;
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public boolean charsEquals(CharsRef other) {
+    if (length == other.length) {
+      int otherUpto = other.offset;
+      final char[] otherChars = other.chars;
+      final int end = offset + length;
+      for (int upto = offset; upto < end; upto++, otherUpto++) {
+        if (chars[upto] != otherChars[otherUpto]) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /** Signed int order comparison */
+  public int compareTo(CharsRef other) {
+    if (this == other)
+      return 0;
+
+    final char[] aChars = this.chars;
+    int aUpto = this.offset;
+    final char[] bChars = other.chars;
+    int bUpto = other.offset;
+
+    final int aStop = aUpto + Math.min(this.length, other.length);
+
+    while (aUpto < aStop) {
+      int aInt = aChars[aUpto++];
+      int bInt = bChars[bUpto++];
+      if (aInt > bInt) {
+        return 1;
+      } else if (aInt < bInt) {
+        return -1;
+      }
+    }
+
+    // One is a prefix of the other, or, they are equal:
+    return this.length - other.length;
+  }
+
+  /**
+   * Copies the given {@link CharsRef} referenced content into this instance
+   * starting at offset 0.
+   *
+   * @param other
+   *          the {@link CharsRef} to copy
+   */
+  public void copy(CharsRef other) {
+    chars = ArrayUtil.grow(chars, other.length);
+    System.arraycopy(other.chars, other.offset, chars, 0, other.length);
+    length = other.length;
+    offset = 0;
+  }
+
+  public void grow(int newLength) {
+    if (chars.length < newLength) {
+      chars = ArrayUtil.grow(chars, newLength);
+    }
+  }
+
+  /**
+   * Copies the given array into this CharsRef starting at offset 0
+   */
+  public void copy(char[] otherChars, int otherOffset, int otherLength) {
+    this.offset = 0;
+    append(otherChars, otherOffset, otherLength);
+  }
+
+  /**
+   * Appends the given array to this CharsRef starting at the current offset
+   */
+  public void append(char[] otherChars, int otherOffset, int otherLength) {
+    grow(this.offset + otherLength);
+    System.arraycopy(otherChars, otherOffset, this.chars, this.offset,
+        otherLength);
+    this.length = otherLength;
+  }
+
+  @Override
+  public String toString() {
+    return new String(chars, offset, length);
+  }
+
+  @Override
+  public int length() {
+    return length;
+  }
+
+  @Override
+  public char charAt(int index) {
+    return chars[offset + index];
+  }
+
+  @Override
+  public CharSequence subSequence(int start, int end) {
+    return new CharsRef(chars, offset + start, end - start);
+  }
+}
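CharsRef implements CharSequence, which is what lets JavaUtilRegexMatcher above hand it straight to Pattern.matcher(...) and drop the old hand-written wrapper. A small usage sketch against the class as added above; note that equals() is content-based against any CharSequence, so the comparison is deliberately asymmetric with String.equals():

import org.apache.lucene.util.CharsRef;

public class CharsRefDemo {
  public static void main(String[] args) {
    CharsRef ref = new CharsRef("lucene");

    // CharSequence view over the underlying char[] slice.
    System.out.println(ref.length());   // 6
    System.out.println(ref.charAt(0));  // l
    System.out.println(ref.toString()); // lucene

    // Content comparison works against any CharSequence...
    System.out.println(ref.equals("lucene")); // true
    // ...but not the other way around: a String only equals other Strings.
    System.out.println("lucene".equals(ref)); // false

    // The buffer is reusable: copy() rewrites from offset 0, growing as needed.
    ref.copy(new char[] {'s', 'o', 'l', 'r'}, 0, 4);
    System.out.println(ref.toString()); // solr
  }
}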
@@ -95,6 +95,19 @@ package org.apache.lucene.util;

 public final class UnicodeUtil {

+  /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
+   *  one would normally encounter, and definitely bigger than any UTF-8 terms.
+   *  <p>
+   *  WARNING: This is not a valid UTF8 Term
+   **/
+  public static final BytesRef BIG_TERM = new BytesRef(
+      new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
+  ); // TODO this is unrelated here find a better place for it
+
   public static void main(String[] args) {
     System.out.println(Character.toChars(0x10FFFF + 1));
   }

   private UnicodeUtil() {} // no instance

   public static final int UNI_SUR_HIGH_START = 0xD800;

@@ -112,33 +125,6 @@ public final class UnicodeUtil {
       Character.MIN_SUPPLEMENTARY_CODE_POINT -
       (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;

-  /**
-   * @lucene.internal
-   */
-  public static final class UTF16Result {
-    public char[] result = new char[10];
-    public int[] offsets = new int[10];
-    public int length;
-
-    public void setLength(int newLength) {
-      if (result.length < newLength)
-        result = ArrayUtil.grow(result, newLength);
-      length = newLength;
-    }
-
-    public void copyText(UTF16Result other) {
-      setLength(other.length);
-      System.arraycopy(other.result, 0, result, 0, length);
-    }
-
-    public void copyText(String other) {
-      final int otherLength = other.length();
-      setLength(otherLength);
-      other.getChars(0, otherLength, result, 0);
-      length = otherLength;
-    }
-  }
-
   /** Encode characters from a char[] source, starting at
    *  offset for length chars. Returns a hash of the resulting bytes. After encoding, result.offset will always be 0. */
   public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) {

@@ -302,135 +288,6 @@ public final class UnicodeUtil {
     result.length = upto;
   }

-  /** Convert UTF8 bytes into UTF16 characters.  If offset
-   *  is non-zero, conversion starts at that starting point
-   *  in utf8, re-using the results from the previous call
-   *  up until offset. */
-  public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {
-
-    final int end = offset + length;
-    char[] out = result.result;
-    if (result.offsets.length <= end) {
-      result.offsets = ArrayUtil.grow(result.offsets, end+1);
-    }
-    final int[] offsets = result.offsets;
-
-    // If incremental decoding fell in the middle of a
-    // single unicode character, rollback to its start:
-    int upto = offset;
-    while(offsets[upto] == -1)
-      upto--;
-
-    int outUpto = offsets[upto];
-
-    // Pre-allocate for worst case 1-for-1
-    if (outUpto+length >= out.length) {
-      out = result.result = ArrayUtil.grow(out, outUpto+length+1);
-    }
-
-    while (upto < end) {
-
-      final int b = utf8[upto]&0xff;
-      final int ch;
-
-      offsets[upto++] = outUpto;
-
-      if (b < 0xc0) {
-        assert b < 0x80;
-        ch = b;
-      } else if (b < 0xe0) {
-        ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
-        offsets[upto++] = -1;
-      } else if (b < 0xf0) {
-        ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
-        offsets[upto++] = -1;
-        offsets[upto++] = -1;
-      } else {
-        assert b < 0xf8;
-        ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
-        offsets[upto++] = -1;
-        offsets[upto++] = -1;
-        offsets[upto++] = -1;
-      }
-
-      if (ch <= UNI_MAX_BMP) {
-        // target is a character <= 0xFFFF
-        out[outUpto++] = (char) ch;
-      } else {
-        // target is a character in range 0xFFFF - 0x10FFFF
-        out[outUpto++] = (char) ((ch >> HALF_SHIFT) + 0xD7C0 /* UNI_SUR_HIGH_START - 64 */);
-        out[outUpto++] = (char) ((ch & HALF_MASK) + UNI_SUR_LOW_START);
-      }
-    }
-    offsets[upto] = outUpto;
-    result.length = outUpto;
-  }
-
-  /**
-   * Get the next valid UTF-16 String in UTF-16 order.
-   * <p>
-   * If the input String is already valid, it is returned.
-   * Otherwise the next String in code unit order is returned.
-   * </p>
-   * @param s input String (possibly with unpaired surrogates)
-   * @return next valid UTF-16 String in UTF-16 order
-   */
-  public static String nextValidUTF16String(String s) {
-    if (validUTF16String(s))
-      return s;
-    else {
-      UTF16Result chars = new UTF16Result();
-      chars.copyText(s);
-      nextValidUTF16String(chars);
-      return new String(chars.result, 0, chars.length);
-    }
-  }
-
-  public static void nextValidUTF16String(UTF16Result s) {
-    final int size = s.length;
-    for (int i = 0; i < size; i++) {
-      char ch = s.result[i];
-      if (ch >= UnicodeUtil.UNI_SUR_HIGH_START
-          && ch <= UnicodeUtil.UNI_SUR_HIGH_END) {
-        if (i < size - 1) {
-          i++;
-          char nextCH = s.result[i];
-          if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START
-              && nextCH <= UnicodeUtil.UNI_SUR_LOW_END) {
-            // Valid surrogate pair
-          } else
-          // Unmatched high surrogate
-            if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated
-              s.setLength(i + 1);
-              s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;
-              return;
-            } else { // SMP already enumerated
-              if (s.result[i - 1] == UnicodeUtil.UNI_SUR_HIGH_END) {
-                s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
-                s.setLength(i);
-              } else {
-                s.result[i - 1]++;
-                s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;
-                s.setLength(i + 1);
-              }
-              return;
-            }
-        } else {
-          // Unmatched high surrogate in final position, SMP not yet enumerated
-          s.setLength(i + 2);
-          s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START;
-          return;
-        }
-      } else if (ch >= UnicodeUtil.UNI_SUR_LOW_START
-          && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
-        // Unmatched low surrogate, SMP already enumerated
-        s.setLength(i + 1);
-        s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
-        return;
-      }
-    }
-  }
-
   // Only called from assert
   /*
   private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {

@@ -705,4 +562,51 @@ public final class UnicodeUtil {
     }
     return sb.toString();
   }
+
+  /**
+   * Interprets the given byte array as UTF-8 and converts to UTF-16. The {@link CharsRef} will be extended if
+   * it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+   * <p>
+   * NOTE: Full characters are read, even if this reads past the length passed (and
+   * can result in an ArrayIndexOutOfBoundsException if invalid UTF-8 is passed).
+   * Explicit checks for valid UTF-8 are not performed.
+   */
+  public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) {
+    int out_offset = chars.offset = 0;
+    final char[] out = chars.chars = ArrayUtil.grow(chars.chars, length);
+    final int limit = offset + length;
+    while (offset < limit) {
+      int b = utf8[offset++]&0xff;
+      if (b < 0xc0) {
+        assert b < 0x80;
+        out[out_offset++] = (char)b;
+      } else if (b < 0xe0) {
+        out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
+      } else if (b < 0xf0) {
+        out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
+        offset += 2;
+      } else {
+        assert b < 0xf8;
+        int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
+        offset += 3;
+        if (ch < UNI_MAX_BMP) {
+          out[out_offset++] = (char)ch;
+        } else {
+          int chHalf = ch - 0x0010000;
+          out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
+          out[out_offset++] = (char) ((chHalf & HALF_MASK) + 0xDC00);
+        }
+      }
+    }
+    chars.length = out_offset - chars.offset;
+  }
+
+  /**
+   * Utility method for {@link #UTF8toUTF16(byte[], int, int, CharsRef)}
+   * @see #UTF8toUTF16(byte[], int, int, CharsRef)
+   */
+  public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) {
+    UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars);
+  }
+
 }
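The removed UTF8toUTF16 kept an offsets array so that decoding could resume part-way through a byte stream (the incremental case covered by the LUCENE-510 test deleted from TestIndexWriter below); its replacement is a single full pass into a reusable CharsRef. A round-trip sketch using only calls that appear in this diff (the BytesRef(String) constructor is the same one testUTF8UTF16CharsRef uses below):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;

public class RoundTrip {
  public static void main(String[] args) {
    String s = "f\u00f6\u00f6 bar"; // non-ASCII, to exercise multi-byte UTF-8

    // String -> UTF-8 bytes.
    BytesRef bytes = new BytesRef(s);

    // UTF-8 bytes -> UTF-16 chars, into a reusable scratch CharsRef.
    CharsRef chars = new CharsRef(bytes.length); // worst case: one char per byte
    UnicodeUtil.UTF8toUTF16(bytes, chars);

    System.out.println(s.equals(chars.toString())); // true
  }
}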
@@ -21,6 +21,7 @@ package org.apache.lucene.index.codecs.preflexrw;
 import java.io.IOException;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.util.BytesRef;

@@ -107,14 +108,14 @@ final class TermInfosWriter {
   }

   // Currently used only by assert statements
-  UnicodeUtil.UTF16Result utf16Result1;
-  UnicodeUtil.UTF16Result utf16Result2;
+  CharsRef utf16Result1;
+  CharsRef utf16Result2;
   private final BytesRef scratchBytes = new BytesRef();

   // Currently used only by assert statements
   private boolean initUTF16Results() {
-    utf16Result1 = new UnicodeUtil.UTF16Result();
-    utf16Result2 = new UnicodeUtil.UTF16Result();
+    utf16Result1 = new CharsRef(10);
+    utf16Result2 = new CharsRef(10);
     return true;
   }

@@ -145,8 +146,8 @@ final class TermInfosWriter {
     len = utf16Result2.length;

     for(int i=0;i<len;i++) {
-      final char ch1 = utf16Result1.result[i];
-      final char ch2 = utf16Result2.result[i];
+      final char ch1 = utf16Result1.chars[i];
+      final char ch2 = utf16Result2.chars[i];
       if (ch1 != ch2)
         return ch1-ch2;
     }
@@ -68,6 +68,7 @@ import org.apache.lucene.store.NoLockFactory;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.SingleInstanceLockFactory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.UnicodeUtil;

@@ -1631,7 +1632,7 @@ public class TestIndexWriter extends LuceneTestCase {
   public void testAllUnicodeChars() throws Throwable {

     BytesRef utf8 = new BytesRef(10);
-    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    CharsRef utf16 = new CharsRef(10);
     char[] chars = new char[2];
     for(int ch=0;ch<0x0010FFFF;ch++) {

@@ -1654,7 +1655,7 @@ public class TestIndexWriter extends LuceneTestCase {
       assertEquals("codepoint " + ch, s1, s2);

       UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
-      assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length));
+      assertEquals("codepoint " + ch, s1, new String(utf16.chars, 0, utf16.length));

       byte[] b = s1.getBytes("UTF-8");
       assertEquals(utf8.length, b.length);

@@ -1721,7 +1722,7 @@ public class TestIndexWriter extends LuceneTestCase {
     char[] expected = new char[20];

     BytesRef utf8 = new BytesRef(20);
-    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    CharsRef utf16 = new CharsRef(20);

     int num = 100000 * RANDOM_MULTIPLIER;
     for (int iter = 0; iter < num; iter++) {

@@ -1738,62 +1739,7 @@ public class TestIndexWriter extends LuceneTestCase {
       UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
       assertEquals(utf16.length, 20);
       for(int i=0;i<20;i++)
-        assertEquals(expected[i], utf16.result[i]);
-    }
-  }
-
-  // LUCENE-510
-  public void testIncrementalUnicodeStrings() throws Throwable {
-    char[] buffer = new char[20];
-    char[] expected = new char[20];
-
-    BytesRef utf8 = new BytesRef(new byte[20]);
-    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
-    UnicodeUtil.UTF16Result utf16a = new UnicodeUtil.UTF16Result();
-
-    boolean hasIllegal = false;
-    byte[] last = new byte[60];
-
-    int num = 100000 * RANDOM_MULTIPLIER;
-    for (int iter = 0; iter < num; iter++) {
-
-      final int prefix;
-
-      if (iter == 0 || hasIllegal)
-        prefix = 0;
-      else
-        prefix = nextInt(20);
-
-      hasIllegal = fillUnicode(buffer, expected, prefix, 20-prefix);
-
-      UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
-      if (!hasIllegal) {
-        byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
-        assertEquals(b.length, utf8.length);
-        for(int i=0;i<b.length;i++)
-          assertEquals(b[i], utf8.bytes[i]);
-      }
-
-      int bytePrefix = 20;
-      if (iter == 0 || hasIllegal)
-        bytePrefix = 0;
-      else
-        for(int i=0;i<20;i++)
-          if (last[i] != utf8.bytes[i]) {
-            bytePrefix = i;
-            break;
-          }
-      System.arraycopy(utf8.bytes, 0, last, 0, utf8.length);
-
-      UnicodeUtil.UTF8toUTF16(utf8.bytes, bytePrefix, utf8.length-bytePrefix, utf16);
-      assertEquals(20, utf16.length);
-      for(int i=0;i<20;i++)
-        assertEquals(expected[i], utf16.result[i]);
-
-      UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16a);
-      assertEquals(20, utf16a.length);
-      for(int i=0;i<20;i++)
-        assertEquals(expected[i], utf16a.result[i]);
+        assertEquals(expected[i], utf16.chars[i]);
     }
   }

@@ -35,6 +35,7 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.codecs.CodecProvider;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;

@@ -114,7 +115,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {

   private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
     CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton);
-    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+    CharsRef utf16 = new CharsRef(10);

     private SimpleAutomatonTermsEnum(TermsEnum tenum) throws IOException {
       super(tenum);

@@ -124,7 +125,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
     @Override
     protected AcceptStatus accept(BytesRef term) throws IOException {
       UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
-      return runAutomaton.run(utf16.result, 0, utf16.length) ?
+      return runAutomaton.run(utf16.chars, 0, utf16.length) ?
           AcceptStatus.YES : AcceptStatus.NO;
     }
   }
@@ -85,37 +85,6 @@ package org.apache.lucene.util;
  */

 public class TestUnicodeUtil extends LuceneTestCase {
-  public void testNextValidUTF16String() {
-    // valid UTF-16
-    assertEquals("dogs", UnicodeUtil.nextValidUTF16String("dogs"));
-    assertEquals("dogs\uD802\uDC02", UnicodeUtil
-        .nextValidUTF16String("dogs\uD802\uDC02"));
-
-    // an illegal combination, where we have not yet enumerated into the supp
-    // plane so we increment to H + \uDC00 (the lowest possible trail surrogate)
-    assertEquals("dogs\uD801\uDC00", UnicodeUtil
-        .nextValidUTF16String("dogs\uD801"));
-    assertEquals("dogs\uD801\uDC00", UnicodeUtil
-        .nextValidUTF16String("dogs\uD801b"));
-    assertEquals("dogs\uD801\uDC00", UnicodeUtil
-        .nextValidUTF16String("dogs\uD801\uD800"));
-
-    // an illegal combination where we have already enumerated the trail
-    // we must increment the lead and start the trail back at the beginning.
-    assertEquals("dogs\uD802\uDC00", UnicodeUtil
-        .nextValidUTF16String("dogs\uD801\uE001"));
-
-    // an illegal combination where we have exhausted the supp plane
-    // we must now move to the lower bmp.
-    assertEquals("dogs\uE000", UnicodeUtil
-        .nextValidUTF16String("dogs\uDBFF\uE001"));
-
-    // an unpaired trail surrogate. this is invalid when not preceded by a lead
-    // surrogate. in this case we have to bump to \uE000 (the lowest possible
-    // "upper BMP")
-    assertEquals("dogs\uE000", UnicodeUtil.nextValidUTF16String("dogs\uDC00"));
-    assertEquals("\uE000", UnicodeUtil.nextValidUTF16String("\uDC00dogs"));
-  }
-
   public void testCodePointCount() {
     BytesRef utf8 = new BytesRef(20);

@@ -197,4 +166,19 @@ public class TestUnicodeUtil extends LuceneTestCase {
       assertTrue(rc == -1);
     }
   }
+
+  public void testUTF8UTF16CharsRef() {
+    for (int i = 0; i < 3989 * RANDOM_MULTIPLIER; i++) {
+      String unicode = _TestUtil.randomRealisticUnicodeString(random);
+      BytesRef ref = new BytesRef(unicode);
+      char[] arr = new char[1 + random.nextInt(100)];
+      int offset = random.nextInt(arr.length);
+      int len = random.nextInt(arr.length - offset);
+      CharsRef cRef = new CharsRef(arr, offset, len);
+      UnicodeUtil.UTF8toUTF16(ref, cRef);
+      assertEquals(cRef.toString(), unicode);
+      assertEquals(cRef, unicode); // CharSeq
+      assertEquals(cRef, ref.utf8ToString()); // CharSeq
+    }
+  }
 }
@@ -24,6 +24,7 @@ import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.BytesRef;

@@ -143,13 +144,14 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
    */
   public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
     HashSet<String> stopWords = new HashSet<String>();
-    Terms terms = MultiFields.getTerms(reader, fieldName);
+    final Terms terms = MultiFields.getTerms(reader, fieldName);
+    final CharsRef spare = new CharsRef();
     if (terms != null) {
-      TermsEnum te = terms.iterator();
+      final TermsEnum te = terms.iterator();
       BytesRef text;
       while ((text = te.next()) != null) {
         if (te.docFreq() > maxDocFreq) {
-          stopWords.add(text.utf8ToString());
+          stopWords.add(text.utf8ToChars(spare).toString());
         }
       }
     }
@@ -34,6 +34,7 @@ import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;

 /**

@@ -322,7 +323,7 @@ public class DirectSpellChecker {
    */
   public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
       boolean morePopular, float accuracy) throws IOException {
-
+    final CharsRef spare = new CharsRef();
     String text = term.text();
     if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
       return new SuggestWord[0];

@@ -358,11 +359,11 @@ public class DirectSpellChecker {
     int inspections = numSug * maxInspections;

     // try ed=1 first, in case we get lucky
-    terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy);
+    terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
     if (maxEdits > 1 && terms.size() < inspections) {
       HashSet<ScoreTerm> moreTerms = new HashSet<ScoreTerm>();
       moreTerms.addAll(terms);
-      moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy));
+      moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
       terms = moreTerms;
     }

@@ -372,7 +373,7 @@ public class DirectSpellChecker {
     int index = suggestions.length - 1;
     for (ScoreTerm s : terms) {
       SuggestWord suggestion = new SuggestWord();
-      suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToString();
+      suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToChars(spare).toString();
       suggestion.score = s.score;
       suggestion.freq = s.docfreq;
       suggestions[index--] = suggestion;

@@ -388,7 +389,7 @@ public class DirectSpellChecker {
   }

   private Collection<ScoreTerm> suggestSimilar(Term term, int numSug,
-      IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
+      IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRef spare) throws IOException {

     AttributeSource atts = new AttributeSource();
     MaxNonCompetitiveBoostAttribute maxBoostAtt =

@@ -425,7 +426,7 @@ public class DirectSpellChecker {
         // undo FuzzyTermsEnum's scale factor for a real scaled lev score
         score = boost / e.getScaleFactor() + e.getMinSimilarity();
       } else {
-        termAsString = candidateTerm.utf8ToString();
+        termAsString = candidateTerm.utf8ToChars(spare).toString();
         score = distance.getDistance(term.text(), termAsString);
       }

@@ -25,6 +25,7 @@ import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.BytesRef;

@@ -42,6 +43,7 @@ public class HighFrequencyDictionary implements Dictionary {
   private IndexReader reader;
   private String field;
   private float thresh;
+  private final CharsRef spare = new CharsRef();

   public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
     this.reader = reader;

@@ -89,7 +91,7 @@ public class HighFrequencyDictionary implements Dictionary {
       }
       hasNextCalled = false;

-      return (actualTerm != null) ? actualTerm.utf8ToString() : null;
+      return (actualTerm != null) ? actualTerm.utf8ToChars(spare).toString() : null;
     }

     public boolean hasNext() {
@@ -23,6 +23,7 @@ import java.util.Iterator;

 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.util.StringHelper;

@@ -56,6 +57,7 @@ public class LuceneDictionary implements Dictionary {
   final class LuceneIterator implements Iterator<String> {
     private TermsEnum termsEnum;
     private BytesRef pendingTerm;
+    private final CharsRef spare = new CharsRef();

     LuceneIterator() {
       try {

@@ -74,7 +76,7 @@ public class LuceneDictionary implements Dictionary {
         return null;
       }

-      String result = pendingTerm.utf8ToString();
+      final String result = pendingTerm.utf8ToChars(spare).toString();

       try {
         pendingTerm = termsEnum.next();
@@ -27,6 +27,7 @@ import org.apache.lucene.index.Payload;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeReflector;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.SorterTemplate;
 import org.apache.solr.analysis.CharFilterFactory;
 import org.apache.solr.analysis.TokenFilterFactory;

@@ -39,8 +40,6 @@ import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.FieldType;

-import org.apache.noggit.CharArr;
-
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.*;

@@ -235,18 +234,13 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {

     FieldType fieldType = context.getFieldType();

-    final CharArr textBuf = new CharArr();
     for (int i = 0, c = tokens.size(); i < c; i++) {
       AttributeSource token = tokens.get(i);
       final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
       final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
       BytesRef rawBytes = termAtt.getBytesRef();
       termAtt.fillBytesRef();
-
-      textBuf.reset();
-      fieldType.indexedToReadable(rawBytes, textBuf);
-      final String text = textBuf.toString();
-
+      final String text = fieldType.indexedToReadable(rawBytes, new CharsRef(rawBytes.length)).toString();
       tokenNamedList.add("text", text);

       if (token.hasAttribute(CharTermAttribute.class)) {
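This hunk is the core of SOLR-2530: the Noggit CharArr scratch buffer is gone, and FieldType.indexedToReadable is handed a CharsRef instead. The FieldType change itself is not shown in this diff, but the call sites here and in SimpleStats imply that the method now fills the passed CharsRef and returns it as a CharSequence. A hedged sketch of a caller under that assumed signature:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.schema.FieldType;

class ReadableTerms {
  // Old style: CharArr buf = new CharArr(); buf.reset(); ft.indexedToReadable(term, buf);
  // New style, as the TermsComponent and SimpleFacets hunks below use it:
  static String toReadable(FieldType ft, BytesRef indexedTerm, CharsRef scratch) {
    // indexedToReadable fills (and, judging by the SimpleStats call site,
    // returns) the scratch buffer; the return type is assumed here.
    return ft.indexedToReadable(indexedTerm, scratch).toString();
  }
}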
@@ -46,6 +46,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.analysis.CharFilterFactory;

@@ -232,6 +233,7 @@ public class LukeRequestHandler extends RequestHandlerBase

   private static SimpleOrderedMap<Object> getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema ) throws IOException
   {
+    final CharsRef spare = new CharsRef();
     SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
     for( Object o : doc.getFields() ) {
       Fieldable fieldable = (Fieldable)o;

@@ -265,7 +267,7 @@ public class LukeRequestHandler extends RequestHandlerBase
       if( v != null ) {
         SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
         for( int i=0; i<v.size(); i++ ) {
-          tfv.add( v.getTerms()[i].utf8ToString(), v.getTermFrequencies()[i] );
+          tfv.add( v.getTerms()[i].utf8ToChars(spare).toString(), v.getTermFrequencies()[i] );
         }
         f.add( "termVector", tfv );
       }

@@ -624,7 +626,7 @@ public class LukeRequestHandler extends RequestHandlerBase
   private static Map<String,TopTermQueue> getTopTerms( IndexReader reader, Set<String> fields, int numTerms, Set<String> junkWords ) throws Exception
   {
     Map<String,TopTermQueue> info = new HashMap<String, TopTermQueue>();
-
+    final CharsRef spare = new CharsRef();
     Fields fieldsC = MultiFields.getFields(reader);
     if (fieldsC != null) {
       FieldsEnum fieldsEnum = fieldsC.iterator();

@@ -634,7 +636,7 @@ public class LukeRequestHandler extends RequestHandlerBase
         TermsEnum termsEnum = fieldsEnum.terms();
         BytesRef text;
         while((text = termsEnum.next()) != null) {
-          String t = text.utf8ToString();
+          String t = text.utf8ToChars(spare).toString();

           // Compute distinct terms for every field
           TopTermQueue tiq = info.get( field );
@@ -24,6 +24,7 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.*;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.ReaderUtil;
 import org.apache.solr.cloud.CloudDescriptor;
 import org.apache.solr.cloud.ZkController;

@@ -455,7 +456,7 @@ public class QueryComponent extends SearchComponent
   {
     SolrQueryRequest req = rb.req;
     SolrQueryResponse rsp = rb.rsp;
-
+    final CharsRef spare = new CharsRef();
     // The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't
     // currently have an option to return sort field values.  Because of this, we
     // take the documents given and re-derive the sort values.

@@ -524,7 +525,7 @@ public class QueryComponent extends SearchComponent
           // String field in Lucene, which returns the terms
           // data as BytesRef:
           if (val instanceof BytesRef) {
-            field.setValue(((BytesRef)val).utf8ToString());
+            field.setValue(((BytesRef)val).utf8ToChars(spare).toString());
             val = ft.toObject(field);
           }

@@ -23,6 +23,7 @@ import java.util.Map;

 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.noggit.CharArr;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.StatsParams;

@@ -270,19 +271,15 @@ class SimpleStats {
       }
       finfo[i++] = new FieldFacetStats( f, si, ft, 0 );
     }

+    final CharsRef spare = new CharsRef();
     final BytesRef tempBR = new BytesRef();
-    final CharArr spare = new CharArr();

     DocIterator iter = docs.iterator();
     while (iter.hasNext()) {
       int docID = iter.nextDoc();
       BytesRef raw = all.getTermText(docID, tempBR);
       Double v = null;
       if( raw != null ) {
-        spare.reset();
-        all.ft.indexedToReadable(raw, spare);
-        v = Double.parseDouble(spare.toString());
+        v = Double.parseDouble(all.ft.indexedToReadable(raw, spare).toString());
         allstats.accumulate(v);
       }
       else {
@@ -18,7 +18,7 @@ package org.apache.solr.handler.component;

 import org.apache.lucene.index.*;
 import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.*;
 import org.apache.solr.common.util.NamedList;

@@ -178,8 +178,7 @@ public class TermsComponent extends SearchComponent {

     int i = 0;
     BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
-    CharArr external = new CharArr();
-
+    CharsRef external = new CharsRef();
     while (term != null && (i<limit || sort)) {
       boolean externalized = false; // did we fill in "external" yet for this term?

@@ -189,8 +188,8 @@ public class TermsComponent extends SearchComponent {
       if (pattern != null) {
         // indexed text or external text?
        // TODO: support "raw" mode?
-        external.reset();
         ft.indexedToReadable(term, external);
         externalized = true;
         if (!pattern.matcher(external).matches()) {
           term = termsEnum.next();
           continue;

@@ -213,13 +212,9 @@ public class TermsComponent extends SearchComponent {

           // TODO: handle raw somehow
           if (!externalized) {
-            external.reset();
             ft.indexedToReadable(term, external);
           }
-          String label = external.toString();
-
-
-          fieldTerms.add(label, docFreq);
+          fieldTerms.add(external.toString(), docFreq);
           i++;
         }
       }

@@ -230,7 +225,6 @@ public class TermsComponent extends SearchComponent {
       if (sort) {
         for (CountPair<BytesRef, Integer> item : queue) {
           if (i >= limit) break;
-          external.reset();
           ft.indexedToReadable(item.key, external);
           fieldTerms.add(external.toString(), item.val);
           i++;
@@ -23,9 +23,11 @@ import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.search.Filter;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.ReaderUtil;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.Direct16;
 import org.apache.lucene.util.packed.Direct32;
 import org.apache.lucene.util.packed.Direct8;

@@ -37,7 +39,6 @@ import org.apache.solr.schema.FieldType;
 import org.apache.solr.search.DocSet;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.apache.solr.util.BoundedTreeSet;
-import org.apache.solr.util.ByteUtils;

 import java.io.IOException;
 import java.util.*;

@@ -244,7 +245,7 @@ class PerSegmentSingleValuedFaceting {
     BytesRef prefixRef = new BytesRef(prefix);
     startTermIndex = si.binarySearchLookup(prefixRef, tempBR);
     if (startTermIndex<0) startTermIndex=-startTermIndex-1;
-    prefixRef.append(ByteUtils.bigTerm);
+    prefixRef.append(UnicodeUtil.BIG_TERM);
     // TODO: we could constrain the lower endpoint if we had a binarySearch method that allowed passing start/end
     endTermIndex = si.binarySearchLookup(prefixRef, tempBR);
     assert endTermIndex < 0;

@@ -339,6 +340,8 @@ abstract class FacetCollector {

 // This collector expects facets to be collected in index order
 class CountSortedFacetCollector extends FacetCollector {
+  private final CharsRef spare = new CharsRef();
+
   final int offset;
   final int limit;
   final int maxsize;

@@ -360,7 +363,7 @@ class CountSortedFacetCollector extends FacetCollector {
       // NOTE: we use c>min rather than c>=min as an optimization because we are going in
       // index order, so we already know that the keys are ordered.  This can be very
       // important if a lot of the counts are repeated (like zero counts would be).
-      queue.add(new SimpleFacets.CountPair<String,Integer>(term.utf8ToString(), count));
+      queue.add(new SimpleFacets.CountPair<String,Integer>(term.utf8ToChars(spare).toString(), count));
       if (queue.size()>=maxsize) min=queue.last().val;
     }
     return false;

@@ -383,12 +386,13 @@ class CountSortedFacetCollector extends FacetCollector {

 // This collector expects facets to be collected in index order
 class IndexSortedFacetCollector extends FacetCollector {
+  private final CharsRef spare = new CharsRef();

   int offset;
   int limit;
   final int mincount;
   final NamedList<Integer> res = new NamedList<Integer>();


   public IndexSortedFacetCollector(int offset, int limit, int mincount) {
     this.offset = offset;
     this.limit = limit>0 ? limit : Integer.MAX_VALUE;

@@ -407,7 +411,7 @@ class IndexSortedFacetCollector extends FacetCollector {
       }

       if (limit > 0) {
-        res.add(term.utf8ToString(), count);
+        res.add(term.utf8ToChars(spare).toString(), count);
         limit--;
       }

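PerSegmentSingleValuedFaceting above and SimpleFacets below bound a prefix range with the same trick: append UnicodeUtil.BIG_TERM (ten 0xff bytes, bigger than any valid UTF-8 term) to the prefix and binary-search for the result; the probe can never exist, so the negative insertion point marks the first ordinal past the block of terms sharing the prefix. A schematic sketch of the idea over a plain sorted array (the real code searches BytesRef ordinals via binarySearchLookup):

import java.util.Arrays;

public class PrefixRange {
  public static void main(String[] args) {
    // Stand-in for the sorted term dictionary.
    String[] terms = {"apple", "bar", "barn", "bars", "baz", "cat"};
    String prefix = "bar";

    // Start of the range: insertion point of the prefix itself.
    int start = Arrays.binarySearch(terms, prefix);
    if (start < 0) start = -start - 1;

    // End of the range: probe with a suffix bigger than any real continuation,
    // the role UnicodeUtil.BIG_TERM plays for UTF-8 terms above.
    int end = Arrays.binarySearch(terms, prefix + Character.MAX_VALUE);
    end = -end - 1; // the probe never exists, so the result is always negative

    System.out.println(Arrays.toString(Arrays.copyOfRange(terms, start, end)));
    // prints [bar, barn, bars]
  }
}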
@@ -21,12 +21,13 @@ import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
import org.apache.lucene.util.packed.PackedInts;
-import org.apache.noggit.CharArr;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.RequiredSolrParams;

@@ -41,7 +42,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.*;
import org.apache.solr.search.*;
import org.apache.solr.util.BoundedTreeSet;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.DateMathParser;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.util.LongPriorityQueue;

@@ -109,7 +109,7 @@ public class SimpleFacets {
if (localParams == null) return;

// remove local params unless it's a query
-if (type != FacetParams.FACET_QUERY) {
+if (type != FacetParams.FACET_QUERY) { // TODO Cut over to an Enum here
facetValue = localParams.get(CommonParams.VALUE);
}

@@ -128,7 +128,7 @@ public class SimpleFacets {
String excludeStr = localParams.get(CommonParams.EXCLUDE);
if (excludeStr == null) return;

-Map tagMap = (Map)req.getContext().get("tags");
+Map<?,?> tagMap = (Map<?,?>)req.getContext().get("tags");
if (tagMap != null && rb != null) {
List<String> excludeTagList = StrUtils.splitSmart(excludeStr,',');

@@ -137,7 +137,7 @@ public class SimpleFacets {
Object olst = tagMap.get(excludeTag);
// tagMap has entries of List<String,List<QParser>>, but subject to change in the future
if (!(olst instanceof Collection)) continue;
-for (Object o : (Collection)olst) {
+for (Object o : (Collection<?>)olst) {
if (!(o instanceof QParser)) continue;
QParser qp = (QParser)o;
excludeSet.put(qp.getQuery(), Boolean.TRUE);

@@ -435,7 +435,7 @@ public class SimpleFacets {
if (prefix!=null) {
startTermIndex = si.binarySearchLookup(prefixRef, br);
if (startTermIndex<0) startTermIndex=-startTermIndex-1;
-prefixRef.append(ByteUtils.bigTerm);
+prefixRef.append(UnicodeUtil.BIG_TERM);
endTermIndex = si.binarySearchLookup(prefixRef, br);
assert endTermIndex < 0;
endTermIndex = -endTermIndex-1;

@@ -446,8 +446,7 @@ public class SimpleFacets {
final int nTerms=endTermIndex-startTermIndex;
int missingCount = -1;

-CharArr spare = new CharArr();
+final CharsRef charsRef = new CharsRef(10);
if (nTerms>0 && docs.size() >= mincount) {

// count collection array only needs to be as big as the number of terms we are

@@ -547,10 +546,8 @@ public class SimpleFacets {
long pair = sorted[i];
int c = (int)(pair >>> 32);
int tnum = Integer.MAX_VALUE - (int)pair;

-spare.reset();
-ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), spare);
-res.add(spare.toString(), c);
+ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), charsRef);
+res.add(charsRef.toString(), c);
}

} else {

@@ -567,9 +564,8 @@ public class SimpleFacets {
int c = counts[i];
if (c<mincount || --off>=0) continue;
if (--lim<0) break;
-spare.reset();
-ft.indexedToReadable(si.lookup(startTermIndex+i, br), spare);
-res.add(spare.toString(), c);
+ft.indexedToReadable(si.lookup(startTermIndex+i, br), charsRef);
+res.add(charsRef.toString(), c);
}
}
}

@@ -657,7 +653,7 @@ public class SimpleFacets {
}

DocsEnum docsEnum = null;
-CharArr spare = new CharArr();
+CharsRef charsRef = new CharsRef(10);

if (docs.size() >= mincount) {
while (term != null) {

@@ -742,9 +738,8 @@ public class SimpleFacets {
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
-spare.reset();
-ft.indexedToReadable(term, spare);
-res.add(spare.toString(), c);
+ft.indexedToReadable(term, charsRef);
+res.add(charsRef.toString(), c);
}
}
}

@@ -757,9 +752,8 @@ public class SimpleFacets {
for (CountPair<BytesRef,Integer> p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
-spare.reset();
-ft.indexedToReadable(p.key, spare);
-res.add(spare.toString(), p.val);
+ft.indexedToReadable(p.key, charsRef);
+res.add(charsRef.toString(), p.val);
}
}

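Note: every per-term loop in SimpleFacets now follows the same shape: fill one request-scoped CharsRef through the new FieldType.indexedToReadable(BytesRef, CharsRef) and materialize a String only for the response entry. A sketch under the assumption of the APIs shown in this diff; the terms list and count are placeholders:

    import java.util.List;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;
    import org.apache.solr.common.util.NamedList;
    import org.apache.solr.schema.FieldType;

    class FacetLabelsSketch {
      // charsRef is reused across all terms; indexedToReadable overwrites it
      // each iteration, so only the response String is newly allocated.
      static void add(FieldType ft, List<BytesRef> terms, NamedList<Integer> res, int count) {
        final CharsRef charsRef = new CharsRef(10); // grows as needed
        for (BytesRef term : terms) {
          ft.indexedToReadable(term, charsRef);
          res.add(charsRef.toString(), count);
        }
      }
    }
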
@@ -24,7 +24,6 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.StringHelper;
-import org.apache.noggit.CharArr;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;

@@ -33,13 +32,14 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.*;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils;
import org.apache.solr.handler.component.StatsValues;
import org.apache.solr.handler.component.FieldFacetStats;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;

import java.io.IOException;
import java.util.HashMap;

@@ -227,13 +227,13 @@ public class UnInvertedField extends DocTermOrds {

TermsEnum te = getOrdTermsEnum(searcher.getIndexReader());
if (prefix != null && prefix.length() > 0) {
-BytesRef prefixBr = new BytesRef(prefix);
+final BytesRef prefixBr = new BytesRef(prefix);
if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) {
startTerm = numTermsInField;
} else {
startTerm = (int) te.ord();
}
-prefixBr.append(ByteUtils.bigTerm);
+prefixBr.append(UnicodeUtil.BIG_TERM);
if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) {
endTerm = numTermsInField;
} else {

@@ -331,8 +331,7 @@ public class UnInvertedField extends DocTermOrds {
}
}
}

-CharArr spare = new CharArr();
+final CharsRef charsRef = new CharsRef();

int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;

@@ -408,7 +407,7 @@ public class UnInvertedField extends DocTermOrds {
for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
int idx = indirect[i];
int tnum = (int)sorted[idx];
-String label = getReadableValue(getTermValue(te, tnum), ft, spare);
+final String label = getReadableValue(getTermValue(te, tnum), ft, charsRef);
//System.out.println(" label=" + label);
res.setName(idx - sortedIdxStart, label);
}

@@ -428,7 +427,7 @@ public class UnInvertedField extends DocTermOrds {
if (c<mincount || --off>=0) continue;
if (--lim<0) break;

-String label = getReadableValue(getTermValue(te, i), ft, spare);
+final String label = getReadableValue(getTermValue(te, i), ft, charsRef);
res.add(label, c);
}
}

@@ -582,14 +581,12 @@ public class UnInvertedField extends DocTermOrds {
}
}
}

+final CharsRef charsRef = new CharsRef();
// add results in index order
-CharArr spare = new CharArr();

for (i = 0; i < numTermsInField; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
if (c == 0) continue;
-String label = getReadableValue(getTermValue(te, i), ft, spare);
+String label = getReadableValue(getTermValue(te, i), ft, charsRef);
// TODO: we should avoid this re-parse
Double value = Double.parseDouble(label);

@@ -621,14 +618,8 @@ public class UnInvertedField extends DocTermOrds {

}

-String getReadableValue(BytesRef termval, FieldType ft, CharArr spare) {
-if (spare == null) {
-spare = new CharArr();
-} else {
-spare.reset();
-}
-ft.indexedToReadable(termval, spare);
-return spare.toString();
+String getReadableValue(BytesRef termval, FieldType ft, CharsRef charsRef) {
+return ft.indexedToReadable(termval, charsRef).toString();
}

/** may return a reused BytesRef */

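Note: UnicodeUtil.BIG_TERM takes over the role of the deleted ByteUtils.bigTerm: appending it to a prefix yields a term that sorts after every real UTF-8 term with that prefix, which turns a prefix match into a seekable term range. A sketch, with an illustrative prefix value:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.UnicodeUtil;

    class PrefixBoundSketch {
      // Returns an exclusive upper bound for the term range covered by
      // 'prefix': ten 0xff bytes compare greater than any valid UTF-8 term.
      static BytesRef upperBound(String prefix) {
        BytesRef br = new BytesRef(prefix);
        br.append(UnicodeUtil.BIG_TERM);
        return br;
      }
    }

The simplified getReadableValue also shows the new contract paying off: indexedToReadable fills the passed CharsRef and returns it, so the null-check and reset dance collapses into a single chained call.
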
@@ -19,7 +19,7 @@ package org.apache.solr.schema;

import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.QParser;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.OrdFieldSource;

@@ -130,13 +130,17 @@ public class BoolField extends FieldType {
return ch=='T' ? "true" : "false";
}

+private static final CharsRef TRUE = new CharsRef("true");
+private static final CharsRef FALSE = new CharsRef("false");
+
@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
if (input.length > 0 && input.bytes[input.offset] == 'T') {
-out.write("true");
+charsRef.copy(TRUE);
} else {
-out.write("false");
+charsRef.copy(FALSE);
}
+return charsRef;
}

@Override

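Note: the shared TRUE/FALSE constants are safe precisely because indexedToReadable copies their contents into the caller's buffer with CharsRef.copy rather than handing out the constants themselves. A usage sketch under the assumptions of this diff (the indexed form "T"/"F" is confirmed by the hunk above; the configured field instance is hypothetical):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;
    import org.apache.solr.schema.BoolField;

    class BoolFieldUsageSketch {
      // 'boolField' stands in for a schema-configured BoolField instance.
      static String label(BoolField boolField, boolean v) {
        CharsRef out = new CharsRef();                        // caller-owned buffer
        boolField.indexedToReadable(new BytesRef(v ? "T" : "F"), out);
        return out.toString();                                // "true" or "false"
      }
    }
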
@@ -23,14 +23,13 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.function.*;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.DateMathParser;

import java.io.IOException;

@@ -131,6 +130,8 @@ public class DateField extends FieldType {

protected static String NOW = "NOW";
protected static char Z = 'Z';
+private static char[] Z_ARRAY = new char[] {Z};


@Override
public String toInternal(String val) {

@@ -184,7 +185,7 @@ public class DateField extends FieldType {
public Fieldable createField(SchemaField field, Object value, float boost) {
// Convert to a string before indexing
if(value instanceof Date) {
-value = toInternal( (Date)value ) + 'Z';
+value = toInternal( (Date)value ) + Z;
}
return super.createField(field, value, boost);
}

@@ -199,9 +200,10 @@ public class DateField extends FieldType {
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-ByteUtils.UTF8toUTF16(input, out);
-out.write(Z);
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+input.utf8ToChars(charsRef);
+charsRef.append(Z_ARRAY, 0, 1);
+return charsRef;
}

@Override

@@ -479,10 +481,8 @@ class DateFieldSource extends FieldCacheSource {
if (ord == 0) {
return null;
} else {
-BytesRef br = termsIndex.lookup(ord, new BytesRef());
-CharArr spare = new CharArr();
-ft.indexedToReadable(br, spare);
-return spare.toString();
+final BytesRef br = termsIndex.lookup(ord, spare);
+return ft.indexedToReadable(br, spareChars).toString();
}
}

@@ -492,7 +492,7 @@ class DateFieldSource extends FieldCacheSource {
if (ord == 0) {
return null;
} else {
-BytesRef br = termsIndex.lookup(ord, new BytesRef());
+final BytesRef br = termsIndex.lookup(ord, new BytesRef());
return ft.toObject(null, br);
}
}

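Note: DateField's readable form is just the indexed UTF-8 bytes decoded to UTF-16 plus a trailing 'Z'; Z_ARRAY exists only so the suffix can go through CharsRef.append(char[], int, int) without a per-call allocation. A sketch of the resulting shape (the date value is illustrative):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;

    class DateReadableSketch {
      private static final char[] Z_ARRAY = new char[] {'Z'};

      // Decode the indexed bytes into 'out', then append the literal 'Z',
      // e.g. "2011-05-25T12:00:00" becomes "2011-05-25T12:00:00Z".
      static CharsRef toReadable(BytesRef indexed, CharsRef out) {
        indexed.utf8ToChars(out);
        out.append(Z_ARRAY, 0, 1);
        return out;
      }
    }
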
@@ -30,8 +30,8 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.noggit.CharArr;
import org.apache.solr.analysis.SolrAnalyzer;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;

@@ -39,7 +39,6 @@ import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.Sorting;
import org.apache.solr.search.function.ValueSource;
-import org.apache.solr.util.ByteUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


@@ -354,9 +353,9 @@ public abstract class FieldType extends FieldProperties {
}

public Object toObject(SchemaField sf, BytesRef term) {
-CharArr ext = new CharArr(term.length);
-indexedToReadable(term, ext);
-Fieldable f = createField(sf, ext.toString(), 1.0f);
+final CharsRef ref = new CharsRef(term.length);
+indexedToReadable(term, ref);
+final Fieldable f = createField(sf, ref.toString(), 1.0f);
return toObject(f);
}

@@ -365,9 +364,10 @@ public abstract class FieldType extends FieldProperties {
return indexedForm;
}

-/** Given an indexed term, append the human readable representation to out */
-public void indexedToReadable(BytesRef input, CharArr out) {
-ByteUtils.UTF8toUTF16(input, out);
+/** Given an indexed term, append the human readable representation */
+public CharsRef indexedToReadable(BytesRef input, CharsRef output) {
+input.utf8ToChars(output);
+return output;
}

/** Given the stored field, return the human readable representation */

@@ -390,7 +390,7 @@ public abstract class FieldType extends FieldProperties {

/** Given the readable value, return the term value that will match it. */
public void readableToIndexed(CharSequence val, BytesRef result) {
-String internal = readableToIndexed(val.toString());
+final String internal = readableToIndexed(val.toString());
UnicodeUtil.UTF16toUTF8(internal, 0, internal.length(), result);
}

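Note: the base-class contract is the important part of this hunk: indexedToReadable(BytesRef, CharsRef) must fill the CharsRef it is given and return that same instance, so callers can chain .toString() and delegating subclasses stay trivial. A sketch of a conforming override; the extra uppercasing step is hypothetical and only illustrates where custom decoding would go:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;

    class UpperCaseFieldSketch /* would extend FieldType in real code */ {
      // Fill-and-return: callers rely on getting back the CharsRef they passed.
      public CharsRef indexedToReadable(BytesRef input, CharsRef output) {
        input.utf8ToChars(output);                      // default UTF-8 -> UTF-16 step
        String upper = output.toString().toUpperCase(); // hypothetical custom decoding
        output.copy(upper.toCharArray(), 0, upper.length());
        return output;
      }
    }
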
@@ -19,7 +19,7 @@ package org.apache.solr.schema;

import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueDouble;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;

@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;


@@ -78,9 +77,12 @@ public class SortableDoubleField extends FieldType {
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+// TODO: this could be more efficient, but the sortable types should be deprecated instead
+input.utf8ToChars(charsRef);
+final char[] indexedToReadable = indexedToReadable(charsRef.toString()).toCharArray();
+charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+return charsRef;
}

@Override

@@ -90,9 +92,6 @@ public class SortableDoubleField extends FieldType {
}
}
-
-
-

class SortableDoubleFieldSource extends FieldCacheSource {
protected double defVal;

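Note: SortableFloatField, SortableIntField and SortableLongField below repeat this exact pattern: the CharsRef serves once as UTF-16 scratch for the raw indexed bytes, the String-based indexedToReadable(String) does the actual decoding, and the result is copied back over the same CharsRef. A condensed sketch of that round trip; decode() stands in for NumberUtils.SortableStr2doubleStr and friends:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;

    class SortableReadableSketch {
      // 1) bytes -> UTF-16 scratch, 2) decode via the String API,
      // 3) overwrite the scratch with the readable form.
      static CharsRef toReadable(BytesRef input, CharsRef charsRef) {
        final char[] readable = decode(input.utf8ToChars(charsRef).toString()).toCharArray();
        charsRef.copy(readable, 0, readable.length);
        return charsRef;
      }

      // Placeholder for the per-type sortable-string decoding.
      static String decode(String indexedForm) { return indexedForm; }
    }
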
@@ -19,7 +19,7 @@ package org.apache.solr.schema;

import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueFloat;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;

@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;


@@ -77,10 +76,11 @@ public class SortableFloatField extends FieldType {
return NumberUtils.SortableStr2floatStr(indexedForm);
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+// TODO: this could be more efficient, but the sortable types should be deprecated instead
+final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
+charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+return charsRef;
}

@Override

@@ -19,7 +19,7 @@ package org.apache.solr.schema;

import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueInt;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;

@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;


@@ -75,10 +74,11 @@ public class SortableIntField extends FieldType {
return NumberUtils.SortableStr2int(indexedForm);
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+// TODO: this could be more efficient, but the sortable types should be deprecated instead
+final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
+charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+return charsRef;
}

@Override

@@ -19,7 +19,7 @@ package org.apache.solr.schema;

import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueLong;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;

@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;


@@ -67,10 +66,11 @@ public class SortableLongField extends FieldType {
return NumberUtils.SortableStr2long(indexedForm);
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+// TODO: this could be more efficient, but the sortable types should be deprecated instead
+final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
+charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+return charsRef;
}

@Override

@@ -23,7 +23,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.QParser;
-import org.apache.solr.util.ByteUtils;

import java.util.Map;
import java.io.IOException;

@@ -54,7 +53,7 @@ public class StrField extends FieldType {

@Override
public Object toObject(SchemaField sf, BytesRef term) {
-return ByteUtils.UTF8toUTF16(term);
+return term.utf8ToString();
}
}

@@ -18,12 +18,9 @@
package org.apache.solr.schema;

import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.FieldCacheSource;
import org.apache.solr.search.function.StringIndexDocValues;
-import org.apache.solr.util.ByteUtils;

import java.io.IOException;
import java.util.Map;

@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
-import org.apache.solr.util.ByteUtils;

import java.util.Map;
import java.util.List;

@@ -81,7 +80,7 @@ public class TextField extends FieldType {

@Override
public Object toObject(SchemaField sf, BytesRef term) {
-return ByteUtils.UTF8toUTF16(term);
+return term.utf8ToString();
}

@Override

@@ -17,7 +17,6 @@

package org.apache.solr.schema;

-import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.QParser;
import org.apache.solr.response.TextResponseWriter;

@@ -26,6 +25,7 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;

import java.util.Map;
import java.util.Date;

@@ -111,10 +111,10 @@ public class TrieDateField extends DateField {
public String indexedToReadable(String _indexedForm) {
return wrappedField.indexedToReadable(_indexedForm);
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-wrappedField.indexedToReadable(input, out);
+public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+// TODO: this could be more efficient, but the sortable types should be deprecated instead
+return wrappedField.indexedToReadable(input, charsRef);
}

@Override

@@ -26,8 +26,8 @@ import org.apache.lucene.search.cache.FloatValuesCreator;
import org.apache.lucene.search.cache.IntValuesCreator;
import org.apache.lucene.search.cache.LongValuesCreator;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
-import org.apache.noggit.CharArr;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;

@@ -296,7 +296,7 @@ public class TrieField extends FieldType {
@Override
public String readableToIndexed(String val) {
// TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts!
-BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
+final BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
readableToIndexed(val, bytes);
return bytes.utf8ToString();
}

@@ -363,31 +363,29 @@ public class TrieField extends FieldType {
}

@Override
-public void indexedToReadable(BytesRef input, CharArr out) {
-BytesRef indexedForm = input;
-String s;
-
+public CharsRef indexedToReadable(BytesRef indexedForm, CharsRef charsRef) {
+final char[] value;
switch (type) {
case INTEGER:
-s = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) );
+value = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) ).toCharArray();
break;
case FLOAT:
-s = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) );
+value = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) ).toCharArray();
break;
case LONG:
-s = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) );
+value = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) ).toCharArray();
break;
case DOUBLE:
-s = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) );
+value = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) ).toCharArray();
break;
case DATE:
-s = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) );
+value = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ).toCharArray();
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
}

-out.write(s);
+charsRef.copy(value, 0, value.length);
+return charsRef;
}

@Override

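Note: for trie fields the readable value never exists as indexed text; it is recomputed from the prefix-coded bytes, and the only CharsRef-specific step is the final copy of the decoded char[] into the caller's buffer. A sketch of the INTEGER case, using NumericUtils.prefixCodedToInt exactly as shown above:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.NumericUtils;

    class TrieReadableSketch {
      // Decode a prefix-coded int term and materialize it into 'charsRef'.
      static CharsRef intToReadable(BytesRef indexedForm, CharsRef charsRef) {
        final char[] value = Integer.toString(NumericUtils.prefixCodedToInt(indexedForm)).toCharArray();
        charsRef.copy(value, 0, value.length);
        return charsRef;
      }
    }
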
@@ -21,11 +21,11 @@ import org.apache.lucene.search.*;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
import org.apache.lucene.util.packed.PackedInts;
-import org.apache.solr.util.ByteUtils;

import java.io.IOException;

@@ -34,7 +34,7 @@ public class MissingStringLastComparatorSource extends FieldComparatorSource {
private final BytesRef missingValueProxy;

public MissingStringLastComparatorSource() {
-this(ByteUtils.bigTerm);
+this(UnicodeUtil.BIG_TERM);
}

/** Creates a {@link FieldComparatorSource} that sorts null last in a normal ascending sort.

@@ -17,14 +17,13 @@
package org.apache.solr.search;

import org.apache.lucene.util.BytesRef;
-import org.apache.solr.util.ByteUtils;

public class MutableValueStr extends MutableValue {
public BytesRef value = new BytesRef();

@Override
public Object toObject() {
-return exists ? ByteUtils.UTF8toUTF16(value) : null;
+return exists ? value.utf8ToString() : null;
}

@Override

@@ -22,7 +22,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.BytesRef;
-import org.apache.solr.util.ByteUtils;

import java.io.IOException;
import java.util.Map;

@@ -43,8 +42,7 @@ public class IDFValueSource extends DocFreqValueSource {
IndexSearcher searcher = (IndexSearcher)context.get("searcher");
Similarity sim = searcher.getSimilarityProvider().get(field);
// todo: we need docFreq that takes a BytesRef
-String strVal = ByteUtils.UTF8toUTF16(indexedBytes);
-int docfreq = searcher.docFreq(new Term(indexedField, strVal));
+int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes.utf8ToString()));
float idf = sim.idf(docfreq, searcher.maxDoc());
return new ConstDoubleDocValues(idf, this);
}

@@ -21,10 +21,9 @@ import org.apache.lucene.search.FieldCache;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.MutableValueStr;
-import org.apache.solr.util.ByteUtils;

import java.io.IOException;

@@ -36,7 +35,7 @@ public abstract class StringIndexDocValues extends DocValues {
protected final ValueSource vs;
protected final MutableValueStr val = new MutableValueStr();
protected final BytesRef spare = new BytesRef();
-protected final CharArr spareChars = new CharArr();
+protected final CharsRef spareChars = new CharsRef();

public StringIndexDocValues(ValueSource vs, AtomicReaderContext context, String field) throws IOException {
try {

@@ -75,8 +74,7 @@ public abstract class StringIndexDocValues extends DocValues {
int ord=termsIndex.getOrd(doc);
if (ord==0) return null;
termsIndex.lookup(ord, spare);
-spareChars.reset();
-ByteUtils.UTF8toUTF16(spare, spareChars);
+spare.utf8ToChars(spareChars);
return spareChars.toString();
}

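Note: strVal() now reuses two scratch buffers per DocValues instance: the BytesRef is refilled by the terms index and the CharsRef receives its UTF-16 decoding. A simplified sketch of the decoding half, using only APIs confirmed elsewhere in this diff (the terms-index lookup itself is omitted):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;

    class TwoBufferSketch {
      private final CharsRef spareChars = new CharsRef(); // refilled per call

      // 'termBytes' stands in for the BytesRef filled by termsIndex.lookup().
      String strVal(BytesRef termBytes) {
        termBytes.utf8ToChars(spareChars); // decode into the reused CharsRef
        return spareChars.toString();      // allocate only the result String
      }
    }
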
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.util;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
-
-
-public class ByteUtils {
-  /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
-   * one would normally encounter, and definitely bigger than any UTF-8 terms */
-  public static final BytesRef bigTerm = new BytesRef(
-      new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
-  );
-
-  /** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
-   * Full characters are read, even if this reads past the length passed (and can result in
-   * an ArrayOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.
-   * The char[] out should probably have enough room to hold the worst case of each byte becoming a Java char.
-   */
-  public static int UTF8toUTF16(byte[] utf8, int offset, int len, char[] out, int out_offset) {
-    int out_start = out_offset;
-    final int limit = offset + len;
-    while (offset < limit) {
-      int b = utf8[offset++]&0xff;
-
-      if (b < 0xc0) {
-        assert b < 0x80;
-        out[out_offset++] = (char)b;
-      } else if (b < 0xe0) {
-        out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
-      } else if (b < 0xf0) {
-        out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
-        offset += 2;
-      } else {
-        assert b < 0xf8;
-        int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
-        offset += 3;
-        if (ch < 0xffff) {
-          out[out_offset++] = (char)ch;
-        } else {
-          int chHalf = ch - 0x0010000;
-          out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
-          out[out_offset++] = (char) ((chHalf & 0x3FFL) + 0xDC00);
-        }
-      }
-    }
-
-    return out_offset - out_start;
-  }
-
-  /** Convert UTF8 bytes into UTF16 characters. */
-  public static void UTF8toUTF16(BytesRef utf8, CharArr out) {
-    // TODO: do in chunks if the input is large
-    out.reserve(utf8.length);
-    int n = UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, out.getArray(), out.getEnd());
-    out.setEnd(out.getEnd() + n);
-  }
-
-  /** Convert UTF8 bytes into a String */
-  public static String UTF8toUTF16(BytesRef utf8) {
-    char[] out = new char[utf8.length];
-    int n = UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, out, 0);
-    return new String(out,0,n);
-  }
-}

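Note: nothing in the deleted ByteUtils is left without a replacement in Lucene's util package; every call site touched by this commit maps onto one of three equivalents. A summary sketch of the mapping, using only APIs that appear elsewhere in this diff:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.UnicodeUtil;

    class ByteUtilsMigrationSketch {
      static void examples(BytesRef utf8, CharsRef scratch) {
        // ByteUtils.bigTerm                      -> UnicodeUtil.BIG_TERM
        BytesRef bound = UnicodeUtil.BIG_TERM;
        // ByteUtils.UTF8toUTF16(utf8, charArr)   -> BytesRef.utf8ToChars(CharsRef)
        CharsRef chars = utf8.utf8ToChars(scratch);
        // ByteUtils.UTF8toUTF16(utf8) to String  -> BytesRef.utf8ToString()
        String s = utf8.utf8ToString();
      }
    }
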
@@ -19,6 +19,7 @@
org.apache.lucene.util.AttributeSource,
org.apache.lucene.util.Attribute,
org.apache.lucene.util.BytesRef,
+org.apache.lucene.util.CharsRef,
org.apache.lucene.analysis.TokenStream,
org.apache.lucene.index.Payload,
org.apache.lucene.analysis.CharReader,

@@ -32,8 +33,7 @@
org.apache.solr.schema.FieldType,
org.apache.solr.schema.SchemaField,
org.apache.solr.common.util.XML,
-javax.servlet.jsp.JspWriter,java.io.IOException,
-org.apache.noggit.CharArr
+javax.servlet.jsp.JspWriter,java.io.IOException
"%>
<%@ page import="java.io.Reader"%>
<%@ page import="java.io.StringReader"%>

@@ -287,9 +287,7 @@
bytes = new BytesRef(spare);
rawText = (token.hasAttribute(CharTermAttribute.class)) ?
token.getAttribute(CharTermAttribute.class).toString() : null;
-final CharArr textBuf = new CharArr(bytes.length);
-ft.indexedToReadable(bytes, textBuf);
-text = textBuf.toString();
+text = ft.indexedToReadable(bytes, new CharsRef()).toString();
token.reflectWith(new AttributeReflector() {
public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
// leave out position and raw term