SOLR-2530: Remove Noggit CharArr from FieldType

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1127326 13f79535-47bb-0310-9956-ffa450edef68
Simon Willnauer 2011-05-24 22:44:36 +00:00
parent 530b894c60
commit 68a840c2b7
44 changed files with 492 additions and 549 deletions
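In short, call sites that previously reset and filled a Noggit CharArr now hand a reusable org.apache.lucene.util.CharsRef to the API and chain on the returned reference. A minimal before/after sketch of the call-site migration (ft and term are placeholders for any FieldType and indexed term):

// before: a Noggit CharArr is reset, filled, then copied out
CharArr out = new CharArr();
out.reset();
ft.indexedToReadable(term, out);
String readable = out.toString();

// after: a Lucene CharsRef is filled and returned, so the call chains
CharsRef spare = new CharsRef();
String readable2 = ft.indexedToReadable(term, spare).toString();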

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
/**
* <code>FieldTermStack</code> is a stack that keeps query terms in the specified field
@ -80,16 +81,16 @@ public class FieldTermStack {
Set<String> termSet = fieldQuery.getTermSet( fieldName );
// just return to make null snippet if un-matched fieldName specified when fieldMatch == true
if( termSet == null ) return;
final CharsRef spare = new CharsRef();
for( BytesRef term : tpv.getTerms() ){
if( !termSet.contains( term.utf8ToString() ) ) continue;
if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;
int index = tpv.indexOf( term );
TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
if( tvois == null ) return; // just return to make null snippets
int[] poss = tpv.getTermPositions( index );
if( poss == null ) return; // just return to make null snippets
for( int i = 0; i < tvois.length; i++ )
termList.add( new TermInfo( term.utf8ToString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
}
// sort by position
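The spare CharsRef introduced above exists to avoid a per-term allocation: BytesRef.utf8ToString() decodes into a fresh buffer on every call, while utf8ToChars(spare) decodes into the caller's reusable buffer. A sketch of the loop pattern this diff adopts (terms is a placeholder array):

final CharsRef spare = new CharsRef();
for (BytesRef term : terms) {
  // utf8ToChars fills spare's char[] (grown as needed); only the final
  // toString() allocates, instead of a char[] plus a String per utf8ToString()
  String text = term.utf8ToChars(spare).toString();
  System.out.println(text);
}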

View File

@ -41,6 +41,7 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.util.BitVector;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
/**
* Represented as a coupled graph of class instances, this
@ -228,12 +229,13 @@ public class InstantiatedIndex
if (fieldsC != null) {
FieldsEnum fieldsEnum = fieldsC.iterator();
String field;
final CharsRef spare = new CharsRef();
while((field = fieldsEnum.next()) != null) {
if (fields == null || fields.contains(field)) {
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
while((text = termsEnum.next()) != null) {
String termText = text.utf8ToString();
String termText = text.utf8ToChars(spare).toString();
InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText);
final long totalTermFreq = termsEnum.totalTermFreq();
if (totalTermFreq != -1) {

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search.regex;
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.regexp.CharacterIterator;
import org.apache.regexp.RE;
@ -104,11 +105,11 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
private RE regexp;
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private final CharsRef utf16 = new CharsRef(10);
private final CharacterIterator utf16wrapper = new CharacterIterator() {
public char charAt(int pos) {
return utf16.result[pos];
return utf16.chars[pos];
}
public boolean isEnd(int pos) {
@ -120,7 +121,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
}
public String substring(int beginIndex, int endIndex) {
return new String(utf16.result, beginIndex, endIndex - beginIndex);
return new String(utf16.chars, beginIndex, endIndex - beginIndex);
}
};

View File

@ -21,6 +21,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
/**
@ -95,25 +96,11 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
private final Pattern pattern;
private final Matcher matcher;
private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private final CharSequence utf16wrapper = new CharSequence() {
public int length() {
return utf16.length;
}
public char charAt(int index) {
return utf16.result[index];
}
public CharSequence subSequence(int start, int end) {
return new String(utf16.result, start, end - start);
}
};
private final CharsRef utf16 = new CharsRef(10);
public JavaUtilRegexMatcher(String regex, int flags) {
this.pattern = Pattern.compile(regex, flags);
this.matcher = this.pattern.matcher(utf16wrapper);
this.matcher = this.pattern.matcher(utf16);
}
public boolean match(BytesRef term) {
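Because CharsRef implements CharSequence, a java.util.regex.Matcher can read it directly, which is what lets this diff delete the anonymous CharSequence wrapper. A sketch of how the match path can look once the buffer is shared (the reset() call is an assumption: the matcher must re-read the CharsRef after it is refilled):

private final CharsRef utf16 = new CharsRef(10);
private final Matcher matcher = Pattern.compile("lucen.*").matcher(utf16);

public boolean match(BytesRef term) {
  // refill the shared CharsRef, then reset so the matcher re-reads it
  UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
  return matcher.reset().matches();
}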

View File

@ -48,6 +48,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
@ -850,8 +851,9 @@ public final class MoreLikeThis {
{
BytesRef[] terms = vector.getTerms();
int freqs[]=vector.getTermFrequencies();
final CharsRef spare = new CharsRef();
for (int j = 0; j < terms.length; j++) {
String term = terms[j].utf8ToString();
final String term = terms[j].utf8ToChars(spare).toString();
if(isNoiseWord(term)){
continue;

View File

@ -23,6 +23,7 @@ import java.util.zip.DataFormatException;
import java.io.ByteArrayOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
/** Simple utility class providing static methods to
@ -118,9 +119,9 @@ public class CompressionTools {
/** Decompress the byte array previously returned by
* compressString back into a String */
public static String decompressString(byte[] value) throws DataFormatException {
UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result();
final byte[] bytes = decompress(value);
CharsRef result = new CharsRef(bytes.length);
UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result);
return new String(result.result, 0, result.length);
return new String(result.chars, 0, result.length);
}
}
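A usage sketch of the round trip; compressString is the existing counterpart named in the javadoc, and decompressString declares DataFormatException for malformed input:

byte[] compressed = CompressionTools.compressString("lucene in action");
String restored = CompressionTools.decompressString(compressed);
assert restored.equals("lucene in action");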

View File

@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.fst.Builder;
@ -236,7 +237,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
private int tf;
private Bits skipDocs;
private final BytesRef scratch = new BytesRef(10);
private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result();
private final CharsRef scratchUTF16 = new CharsRef(10);
public SimpleTextDocsEnum() {
this.inStart = SimpleTextFieldsReader.this.in;
@ -286,7 +287,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
return docID;
}
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length);
docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
termFreq = 0;
first = false;
} else if (scratch.startsWith(POS)) {
@ -323,8 +324,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
private Bits skipDocs;
private final BytesRef scratch = new BytesRef(10);
private final BytesRef scratch2 = new BytesRef(10);
private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result();
private final UnicodeUtil.UTF16Result scratchUTF16_2 = new UnicodeUtil.UTF16Result();
private final CharsRef scratchUTF16 = new CharsRef(10);
private final CharsRef scratchUTF16_2 = new CharsRef(10);
private BytesRef payload;
private long nextDocStart;
@ -368,7 +369,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
return docID;
}
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16);
docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length);
docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
tf = 0;
posStart = in.getFilePointer();
first = false;
@ -400,7 +401,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
readLine(in, scratch);
assert scratch.startsWith(POS): "got line=" + scratch.utf8ToString();
UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2);
final int pos = ArrayUtil.parseInt(scratchUTF16_2.result, 0, scratchUTF16_2.length);
final int pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
final long fp = in.getFilePointer();
readLine(in, scratch);
if (scratch.startsWith(PAYLOAD)) {

View File

@ -18,7 +18,6 @@ package org.apache.lucene.util;
*/
import java.util.Comparator;
import java.io.UnsupportedEncodingException;
/** Represents byte[], as a slice (offset + length) into an
* existing byte[].
@ -122,6 +121,7 @@ public final class BytesRef implements Comparable<BytesRef> {
public void copy(char text[], int offset, int length) {
UnicodeUtil.UTF16toUTF8(text, offset, length, this);
}
public boolean bytesEquals(BytesRef other) {
if (length == other.length) {
int otherUpto = other.offset;
@ -198,13 +198,15 @@ public final class BytesRef implements Comparable<BytesRef> {
/** Interprets stored bytes as UTF8 bytes, returning the
* resulting string */
public String utf8ToString() {
try {
return new String(bytes, offset, length, "UTF-8");
} catch (UnsupportedEncodingException uee) {
// should not happen -- UTF8 is presumably supported
// by all JREs
throw new RuntimeException(uee);
}
final CharsRef ref = new CharsRef(length);
UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref);
return ref.toString();
}
/** Interprets stored bytes as UTF8 bytes into the given {@link CharsRef} */
public CharsRef utf8ToChars(CharsRef ref) {
UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref);
return ref;
}
/** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */
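One aliasing caveat for the new utf8ToChars: it returns the very CharsRef it was given, overwriting any previous contents, so a caller that needs to retain a decoded value must copy it out. A sketch (term1 and term2 are placeholders):

CharsRef spare = new CharsRef();
String first = term1.utf8ToChars(spare).toString(); // copied out as a String, safe
term2.utf8ToChars(spare);                           // spare now holds term2's chars
CharsRef retained = new CharsRef(spare);            // private copy via the copy constructor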

View File

@ -0,0 +1,218 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Represents char[], as a slice (offset + length) into an existing char[].
*
* @lucene.internal
*/
public final class CharsRef implements Comparable<CharsRef>, CharSequence {
private static final char[] EMPTY_ARRAY = new char[0];
public char[] chars;
public int offset;
public int length;
/**
Creates a new {@link CharsRef} initialized with an empty, zero-length array
*/
public CharsRef() {
this(EMPTY_ARRAY, 0, 0);
}
/**
* Creates a new {@link CharsRef} initialized with an array of the given
* capacity
*/
public CharsRef(int capacity) {
chars = new char[capacity];
}
/**
* Creates a new {@link CharsRef} initialized with the given array, offset and
* length
*/
public CharsRef(char[] chars, int offset, int length) {
assert chars != null;
assert chars.length >= offset + length;
this.chars = chars;
this.offset = offset;
this.length = length;
}
/**
Creates a new {@link CharsRef} initialized with the given String's character
* array
*/
public CharsRef(String string) {
this.chars = string.toCharArray();
this.offset = 0;
this.length = chars.length;
}
/**
* Creates a new {@link CharsRef} and copies the contents of the source into
* the new instance.
* @see #copy(CharsRef)
*/
public CharsRef(CharsRef other) {
copy(other);
}
@Override
public Object clone() {
return new CharsRef(this);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 0;
final int end = offset + length;
for (int i = offset; i < end; i++) {
result = prime * result + chars[i];
}
return result;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (other instanceof CharsRef) {
return charsEquals((CharsRef) other);
}
if (other instanceof CharSequence) {
final CharSequence seq = (CharSequence) other;
if (length == seq.length()) {
int n = length;
int i = offset;
int j = 0;
while (n-- != 0) {
if (chars[i++] != seq.charAt(j++))
return false;
}
return true;
}
}
return false;
}
public boolean charsEquals(CharsRef other) {
if (length == other.length) {
int otherUpto = other.offset;
final char[] otherChars = other.chars;
final int end = offset + length;
for (int upto = offset; upto < end; upto++, otherUpto++) {
if (chars[upto] != otherChars[otherUpto]) {
return false;
}
}
return true;
} else {
return false;
}
}
/** Signed int order comparison */
public int compareTo(CharsRef other) {
if (this == other)
return 0;
final char[] aChars = this.chars;
int aUpto = this.offset;
final char[] bChars = other.chars;
int bUpto = other.offset;
final int aStop = aUpto + Math.min(this.length, other.length);
while (aUpto < aStop) {
int aInt = aChars[aUpto++];
int bInt = bChars[bUpto++];
if (aInt > bInt) {
return 1;
} else if (aInt < bInt) {
return -1;
}
}
// One is a prefix of the other, or, they are equal:
return this.length - other.length;
}
/**
* Copies the given {@link CharsRef} referenced content into this instance
* starting at offset 0.
*
* @param other
* the {@link CharsRef} to copy
*/
public void copy(CharsRef other) {
chars = ArrayUtil.grow(chars, other.length);
System.arraycopy(other.chars, other.offset, chars, 0, other.length);
length = other.length;
offset = 0;
}
public void grow(int newLength) {
if (chars.length < newLength) {
chars = ArrayUtil.grow(chars, newLength);
}
}
/**
* Copies the given array into this CharsRef starting at offset 0
*/
public void copy(char[] otherChars, int otherOffset, int otherLength) {
this.offset = 0;
this.length = 0;
append(otherChars, otherOffset, otherLength);
}
/**
Appends the given array to the end of this CharsRef
*/
public void append(char[] otherChars, int otherOffset, int otherLength) {
grow(this.offset + this.length + otherLength);
System.arraycopy(otherChars, otherOffset, this.chars, this.offset + this.length,
otherLength);
this.length += otherLength;
}
@Override
public String toString() {
return new String(chars, offset, length);
}
@Override
public int length() {
return length;
}
@Override
public char charAt(int index) {
return chars[offset + index];
}
@Override
public CharSequence subSequence(int start, int end) {
return new CharsRef(chars, offset + start, end - start);
}
}
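A small usage sketch of the new class:

CharsRef ref = new CharsRef("lucene");
assert ref.length() == 6 && ref.charAt(0) == 'l';
assert ref.equals("lucene");     // equals() accepts any CharSequence
CharsRef copy = new CharsRef();
copy.copy(ref);                  // grows copy.chars and copies the content in
assert ref.compareTo(copy) == 0 && ref.hashCode() == copy.hashCode();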

View File

@ -94,6 +94,19 @@ package org.apache.lucene.util;
*/
public final class UnicodeUtil {
/** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
* one would normally encounter, and definitely bigger than any UTF-8 terms.
* <p>
* WARNING: This is not a valid UTF-8 term
**/
public static final BytesRef BIG_TERM = new BytesRef(
new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
); // TODO: this is unrelated here; find a better place for it
public static void main(String[] args) {
System.out.println(Character.toChars(0x10FFFF + 1));
}
private UnicodeUtil() {} // no instance
@ -112,33 +125,6 @@ public final class UnicodeUtil {
Character.MIN_SUPPLEMENTARY_CODE_POINT -
(UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START;
/**
* @lucene.internal
*/
public static final class UTF16Result {
public char[] result = new char[10];
public int[] offsets = new int[10];
public int length;
public void setLength(int newLength) {
if (result.length < newLength)
result = ArrayUtil.grow(result, newLength);
length = newLength;
}
public void copyText(UTF16Result other) {
setLength(other.length);
System.arraycopy(other.result, 0, result, 0, length);
}
public void copyText(String other) {
final int otherLength = other.length();
setLength(otherLength);
other.getChars(0, otherLength, result, 0);
length = otherLength;
}
}
/** Encode characters from a char[] source, starting at
* offset for length chars. Returns a hash of the resulting bytes. After encoding, result.offset will always be 0. */
public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) {
@ -302,135 +288,6 @@ public final class UnicodeUtil {
result.length = upto;
}
/** Convert UTF8 bytes into UTF16 characters. If offset
* is non-zero, conversion starts at that starting point
* in utf8, re-using the results from the previous call
* up until offset. */
public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) {
final int end = offset + length;
char[] out = result.result;
if (result.offsets.length <= end) {
result.offsets = ArrayUtil.grow(result.offsets, end+1);
}
final int[] offsets = result.offsets;
// If incremental decoding fell in the middle of a
// single unicode character, rollback to its start:
int upto = offset;
while(offsets[upto] == -1)
upto--;
int outUpto = offsets[upto];
// Pre-allocate for worst case 1-for-1
if (outUpto+length >= out.length) {
out = result.result = ArrayUtil.grow(out, outUpto+length+1);
}
while (upto < end) {
final int b = utf8[upto]&0xff;
final int ch;
offsets[upto++] = outUpto;
if (b < 0xc0) {
assert b < 0x80;
ch = b;
} else if (b < 0xe0) {
ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f);
offsets[upto++] = -1;
} else if (b < 0xf0) {
ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f);
offsets[upto++] = -1;
offsets[upto++] = -1;
} else {
assert b < 0xf8;
ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f);
offsets[upto++] = -1;
offsets[upto++] = -1;
offsets[upto++] = -1;
}
if (ch <= UNI_MAX_BMP) {
// target is a character <= 0xFFFF
out[outUpto++] = (char) ch;
} else {
// target is a character in range 0xFFFF - 0x10FFFF
out[outUpto++] = (char) ((ch >> HALF_SHIFT) + 0xD7C0 /* UNI_SUR_HIGH_START - 64 */);
out[outUpto++] = (char) ((ch & HALF_MASK) + UNI_SUR_LOW_START);
}
}
offsets[upto] = outUpto;
result.length = outUpto;
}
/**
* Get the next valid UTF-16 String in UTF-16 order.
* <p>
* If the input String is already valid, it is returned.
* Otherwise the next String in code unit order is returned.
* </p>
* @param s input String (possibly with unpaired surrogates)
* @return next valid UTF-16 String in UTF-16 order
*/
public static String nextValidUTF16String(String s) {
if (validUTF16String(s))
return s;
else {
UTF16Result chars = new UTF16Result();
chars.copyText(s);
nextValidUTF16String(chars);
return new String(chars.result, 0, chars.length);
}
}
public static void nextValidUTF16String(UTF16Result s) {
final int size = s.length;
for (int i = 0; i < size; i++) {
char ch = s.result[i];
if (ch >= UnicodeUtil.UNI_SUR_HIGH_START
&& ch <= UnicodeUtil.UNI_SUR_HIGH_END) {
if (i < size - 1) {
i++;
char nextCH = s.result[i];
if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START
&& nextCH <= UnicodeUtil.UNI_SUR_LOW_END) {
// Valid surrogate pair
} else
// Unmatched high surrogate
if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated
s.setLength(i + 1);
s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;
return;
} else { // SMP already enumerated
if (s.result[i - 1] == UnicodeUtil.UNI_SUR_HIGH_END) {
s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
s.setLength(i);
} else {
s.result[i - 1]++;
s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START;
s.setLength(i + 1);
}
return;
}
} else {
// Unmatched high surrogate in final position, SMP not yet enumerated
s.setLength(i + 2);
s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START;
return;
}
} else if (ch >= UnicodeUtil.UNI_SUR_LOW_START
&& ch <= UnicodeUtil.UNI_SUR_LOW_END) {
// Unmatched low surrogate, SMP already enumerated
s.setLength(i + 1);
s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1);
return;
}
}
}
// Only called from assert
/*
private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
@ -705,4 +562,51 @@ public final class UnicodeUtil {
}
return sb.toString();
}
/**
* Interprets the given byte array as UTF-8 and converts to UTF-16. The {@link CharsRef} will be extended if
* it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
* <p>
* NOTE: Full characters are read, even if this reads past the length passed (and
* can result in an ArrayIndexOutOfBoundsException if invalid UTF-8 is passed).
* Explicit checks for valid UTF-8 are not performed.
*/
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) {
int out_offset = chars.offset = 0;
final char[] out = chars.chars = ArrayUtil.grow(chars.chars, length);
final int limit = offset + length;
while (offset < limit) {
int b = utf8[offset++]&0xff;
if (b < 0xc0) {
assert b < 0x80;
out[out_offset++] = (char)b;
} else if (b < 0xe0) {
out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
} else if (b < 0xf0) {
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
offset += 2;
} else {
assert b < 0xf8;
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
offset += 3;
if (ch < UNI_MAX_BMP) {
out[out_offset++] = (char)ch;
} else {
int chHalf = ch - 0x0010000;
out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
out[out_offset++] = (char) ((chHalf & HALF_MASK) + 0xDC00);
}
}
}
chars.length = out_offset - chars.offset;
}
/**
* Utility method for {@link #UTF8toUTF16(byte[], int, int, CharsRef)}
* @see #UTF8toUTF16(byte[], int, int, CharsRef)
*/
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) {
UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars);
}
}
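A round-trip sketch of the replacement decoder, including a supplementary character that needs a surrogate pair; BytesRef(String) encodes the UTF-16 input to UTF-8, as the tests in this commit use it:

BytesRef utf8 = new BytesRef("z\uD852\uDF62"); // 'z' plus U+24B62, a 4-byte UTF-8 sequence
CharsRef utf16 = new CharsRef(10);
UnicodeUtil.UTF8toUTF16(utf8, utf16);          // grows utf16.chars if needed, sets offset to 0
assert utf16.toString().equals("z\uD852\uDF62");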

View File

@ -21,6 +21,7 @@ package org.apache.lucene.index.codecs.preflexrw;
import java.io.IOException;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.util.BytesRef;
@ -107,14 +108,14 @@ final class TermInfosWriter {
}
// Currently used only by assert statements
UnicodeUtil.UTF16Result utf16Result1;
UnicodeUtil.UTF16Result utf16Result2;
CharsRef utf16Result1;
CharsRef utf16Result2;
private final BytesRef scratchBytes = new BytesRef();
// Currently used only by assert statements
private boolean initUTF16Results() {
utf16Result1 = new UnicodeUtil.UTF16Result();
utf16Result2 = new UnicodeUtil.UTF16Result();
utf16Result1 = new CharsRef(10);
utf16Result2 = new CharsRef(10);
return true;
}
@ -145,8 +146,8 @@ final class TermInfosWriter {
len = utf16Result2.length;
for(int i=0;i<len;i++) {
final char ch1 = utf16Result1.result[i];
final char ch2 = utf16Result2.result[i];
final char ch1 = utf16Result1.chars[i];
final char ch2 = utf16Result2.chars[i];
if (ch1 != ch2)
return ch1-ch2;
}

View File

@ -68,6 +68,7 @@ import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SingleInstanceLockFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.UnicodeUtil;
@ -1631,7 +1632,7 @@ public class TestIndexWriter extends LuceneTestCase {
public void testAllUnicodeChars() throws Throwable {
BytesRef utf8 = new BytesRef(10);
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
CharsRef utf16 = new CharsRef(10);
char[] chars = new char[2];
for(int ch=0;ch<0x0010FFFF;ch++) {
@ -1654,7 +1655,7 @@ public class TestIndexWriter extends LuceneTestCase {
assertEquals("codepoint " + ch, s1, s2);
UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
assertEquals("codepoint " + ch, s1, new String(utf16.result, 0, utf16.length));
assertEquals("codepoint " + ch, s1, new String(utf16.chars, 0, utf16.length));
byte[] b = s1.getBytes("UTF-8");
assertEquals(utf8.length, b.length);
@ -1721,7 +1722,7 @@ public class TestIndexWriter extends LuceneTestCase {
char[] expected = new char[20];
BytesRef utf8 = new BytesRef(20);
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
CharsRef utf16 = new CharsRef(20);
int num = 100000 * RANDOM_MULTIPLIER;
for (int iter = 0; iter < num; iter++) {
@ -1738,62 +1739,7 @@ public class TestIndexWriter extends LuceneTestCase {
UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
assertEquals(utf16.length, 20);
for(int i=0;i<20;i++)
assertEquals(expected[i], utf16.result[i]);
}
}
// LUCENE-510
public void testIncrementalUnicodeStrings() throws Throwable {
char[] buffer = new char[20];
char[] expected = new char[20];
BytesRef utf8 = new BytesRef(new byte[20]);
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
UnicodeUtil.UTF16Result utf16a = new UnicodeUtil.UTF16Result();
boolean hasIllegal = false;
byte[] last = new byte[60];
int num = 100000 * RANDOM_MULTIPLIER;
for (int iter = 0; iter < num; iter++) {
final int prefix;
if (iter == 0 || hasIllegal)
prefix = 0;
else
prefix = nextInt(20);
hasIllegal = fillUnicode(buffer, expected, prefix, 20-prefix);
UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
if (!hasIllegal) {
byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
assertEquals(b.length, utf8.length);
for(int i=0;i<b.length;i++)
assertEquals(b[i], utf8.bytes[i]);
}
int bytePrefix = 20;
if (iter == 0 || hasIllegal)
bytePrefix = 0;
else
for(int i=0;i<20;i++)
if (last[i] != utf8.bytes[i]) {
bytePrefix = i;
break;
}
System.arraycopy(utf8.bytes, 0, last, 0, utf8.length);
UnicodeUtil.UTF8toUTF16(utf8.bytes, bytePrefix, utf8.length-bytePrefix, utf16);
assertEquals(20, utf16.length);
for(int i=0;i<20;i++)
assertEquals(expected[i], utf16.result[i]);
UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16a);
assertEquals(20, utf16a.length);
for(int i=0;i<20;i++)
assertEquals(expected[i], utf16a.result[i]);
assertEquals(expected[i], utf16.chars[i]);
}
}

View File

@ -35,6 +35,7 @@ import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
@ -114,7 +115,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton);
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
CharsRef utf16 = new CharsRef(10);
private SimpleAutomatonTermsEnum(TermsEnum tenum) throws IOException {
super(tenum);
@ -124,7 +125,7 @@ public class TestRegexpRandom2 extends LuceneTestCase {
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
return runAutomaton.run(utf16.result, 0, utf16.length) ?
return runAutomaton.run(utf16.chars, 0, utf16.length) ?
AcceptStatus.YES : AcceptStatus.NO;
}
}

View File

@ -85,37 +85,6 @@ package org.apache.lucene.util;
*/
public class TestUnicodeUtil extends LuceneTestCase {
public void testNextValidUTF16String() {
// valid UTF-16
assertEquals("dogs", UnicodeUtil.nextValidUTF16String("dogs"));
assertEquals("dogs\uD802\uDC02", UnicodeUtil
.nextValidUTF16String("dogs\uD802\uDC02"));
// an illegal combination, where we have not yet enumerated into the supp
// plane so we increment to H + \uDC00 (the lowest possible trail surrogate)
assertEquals("dogs\uD801\uDC00", UnicodeUtil
.nextValidUTF16String("dogs\uD801"));
assertEquals("dogs\uD801\uDC00", UnicodeUtil
.nextValidUTF16String("dogs\uD801b"));
assertEquals("dogs\uD801\uDC00", UnicodeUtil
.nextValidUTF16String("dogs\uD801\uD800"));
// an illegal combination where we have already enumerated the trail
// we must increment the lead and start the trail back at the beginning.
assertEquals("dogs\uD802\uDC00", UnicodeUtil
.nextValidUTF16String("dogs\uD801\uE001"));
// an illegal combination where we have exhausted the supp plane
// we must now move to the lower bmp.
assertEquals("dogs\uE000", UnicodeUtil
.nextValidUTF16String("dogs\uDBFF\uE001"));
// an unpaired trail surrogate. this is invalid when not preceded by a lead
// surrogate. in this case we have to bump to \uE000 (the lowest possible
// "upper BMP")
assertEquals("dogs\uE000", UnicodeUtil.nextValidUTF16String("dogs\uDC00"));
assertEquals("\uE000", UnicodeUtil.nextValidUTF16String("\uDC00dogs"));
}
public void testCodePointCount() {
BytesRef utf8 = new BytesRef(20);
@ -197,4 +166,19 @@ public class TestUnicodeUtil extends LuceneTestCase {
assertTrue(rc == -1);
}
}
public void testUTF8UTF16CharsRef() {
for (int i = 0; i < 3989 * RANDOM_MULTIPLIER; i++) {
String unicode = _TestUtil.randomRealisticUnicodeString(random);
BytesRef ref = new BytesRef(unicode);
char[] arr = new char[1 + random.nextInt(100)];
int offset = random.nextInt(arr.length);
int len = random.nextInt(arr.length - offset);
CharsRef cRef = new CharsRef(arr, offset, len);
UnicodeUtil.UTF8toUTF16(ref, cRef);
assertEquals(cRef.toString(), unicode);
assertEquals(cRef, unicode); // CharSeq
assertEquals(cRef, ref.utf8ToString()); // CharSeq
}
}
}

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.MultiFields;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.BytesRef;
@ -143,13 +144,14 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
*/
public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
HashSet<String> stopWords = new HashSet<String>();
Terms terms = MultiFields.getTerms(reader, fieldName);
final Terms terms = MultiFields.getTerms(reader, fieldName);
final CharsRef spare = new CharsRef();
if (terms != null) {
TermsEnum te = terms.iterator();
final TermsEnum te = terms.iterator();
BytesRef text;
while ((text = te.next()) != null) {
if (te.docFreq() > maxDocFreq) {
stopWords.add(text.utf8ToString());
stopWords.add(text.utf8ToChars(spare).toString());
}
}
}

View File

@ -34,6 +34,7 @@ import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
@ -322,7 +323,7 @@ public class DirectSpellChecker {
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
boolean morePopular, float accuracy) throws IOException {
final CharsRef spare = new CharsRef();
String text = term.text();
if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
return new SuggestWord[0];
@ -358,11 +359,11 @@ public class DirectSpellChecker {
int inspections = numSug * maxInspections;
// try ed=1 first, in case we get lucky
terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy);
terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
if (maxEdits > 1 && terms.size() < inspections) {
HashSet<ScoreTerm> moreTerms = new HashSet<ScoreTerm>();
moreTerms.addAll(terms);
moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy));
moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
terms = moreTerms;
}
@ -372,7 +373,7 @@ public class DirectSpellChecker {
int index = suggestions.length - 1;
for (ScoreTerm s : terms) {
SuggestWord suggestion = new SuggestWord();
suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToString();
suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToChars(spare).toString();
suggestion.score = s.score;
suggestion.freq = s.docfreq;
suggestions[index--] = suggestion;
@ -388,7 +389,7 @@ public class DirectSpellChecker {
}
private Collection<ScoreTerm> suggestSimilar(Term term, int numSug,
IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException {
IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRef spare) throws IOException {
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
@ -425,7 +426,7 @@ public class DirectSpellChecker {
// undo FuzzyTermsEnum's scale factor for a real scaled lev score
score = boost / e.getScaleFactor() + e.getMinSimilarity();
} else {
termAsString = candidateTerm.utf8ToString();
termAsString = candidateTerm.utf8ToChars(spare).toString();
score = distance.getDistance(term.text(), termAsString);
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.BytesRef;
@ -42,6 +43,7 @@ public class HighFrequencyDictionary implements Dictionary {
private IndexReader reader;
private String field;
private float thresh;
private final CharsRef spare = new CharsRef();
public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
this.reader = reader;
@ -89,7 +91,7 @@ public class HighFrequencyDictionary implements Dictionary {
}
hasNextCalled = false;
return (actualTerm != null) ? actualTerm.utf8ToString() : null;
return (actualTerm != null) ? actualTerm.utf8ToChars(spare).toString() : null;
}
public boolean hasNext() {

View File

@ -23,6 +23,7 @@ import java.util.Iterator;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.StringHelper;
@ -56,6 +57,7 @@ public class LuceneDictionary implements Dictionary {
final class LuceneIterator implements Iterator<String> {
private TermsEnum termsEnum;
private BytesRef pendingTerm;
private final CharsRef spare = new CharsRef();
LuceneIterator() {
try {
@ -74,7 +76,7 @@ public class LuceneDictionary implements Dictionary {
return null;
}
String result = pendingTerm.utf8ToString();
final String result = pendingTerm.utf8ToChars(spare).toString();
try {
pendingTerm = termsEnum.next();

View File

@ -27,6 +27,7 @@ import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.SorterTemplate;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
@ -39,8 +40,6 @@ import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.noggit.CharArr;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
@ -235,18 +234,13 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
FieldType fieldType = context.getFieldType();
final CharArr textBuf = new CharArr();
for (int i = 0, c = tokens.size(); i < c; i++) {
AttributeSource token = tokens.get(i);
final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
BytesRef rawBytes = termAtt.getBytesRef();
termAtt.fillBytesRef();
textBuf.reset();
fieldType.indexedToReadable(rawBytes, textBuf);
final String text = textBuf.toString();
final String text = fieldType.indexedToReadable(rawBytes, new CharsRef(rawBytes.length)).toString();
tokenNamedList.add("text", text);
if (token.hasAttribute(CharTermAttribute.class)) {

View File

@ -46,6 +46,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.analysis.CharFilterFactory;
@ -232,6 +233,7 @@ public class LukeRequestHandler extends RequestHandlerBase
private static SimpleOrderedMap<Object> getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema ) throws IOException
{
final CharsRef spare = new CharsRef();
SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
for( Object o : doc.getFields() ) {
Fieldable fieldable = (Fieldable)o;
@ -265,7 +267,7 @@ public class LukeRequestHandler extends RequestHandlerBase
if( v != null ) {
SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
for( int i=0; i<v.size(); i++ ) {
tfv.add( v.getTerms()[i].utf8ToString(), v.getTermFrequencies()[i] );
tfv.add( v.getTerms()[i].utf8ToChars(spare).toString(), v.getTermFrequencies()[i] );
}
f.add( "termVector", tfv );
}
@ -624,7 +626,7 @@ public class LukeRequestHandler extends RequestHandlerBase
private static Map<String,TopTermQueue> getTopTerms( IndexReader reader, Set<String> fields, int numTerms, Set<String> junkWords ) throws Exception
{
Map<String,TopTermQueue> info = new HashMap<String, TopTermQueue>();
final CharsRef spare = new CharsRef();
Fields fieldsC = MultiFields.getFields(reader);
if (fieldsC != null) {
FieldsEnum fieldsEnum = fieldsC.iterator();
@ -634,7 +636,7 @@ public class LukeRequestHandler extends RequestHandlerBase
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
while((text = termsEnum.next()) != null) {
String t = text.utf8ToString();
String t = text.utf8ToChars(spare).toString();
// Compute distinct terms for every field
TopTermQueue tiq = info.get( field );

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.ReaderUtil;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.ZkController;
@ -455,7 +456,7 @@ public class QueryComponent extends SearchComponent
{
SolrQueryRequest req = rb.req;
SolrQueryResponse rsp = rb.rsp;
final CharsRef spare = new CharsRef();
// The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't
// currently have an option to return sort field values. Because of this, we
// take the documents given and re-derive the sort values.
@ -524,7 +525,7 @@ public class QueryComponent extends SearchComponent
// String field in Lucene, which returns the terms
// data as BytesRef:
if (val instanceof BytesRef) {
field.setValue(((BytesRef)val).utf8ToString());
field.setValue(((BytesRef)val).utf8ToChars(spare).toString());
val = ft.toObject(field);
}

View File

@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.noggit.CharArr;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
@ -270,19 +271,15 @@ class SimpleStats {
}
finfo[i++] = new FieldFacetStats( f, si, ft, 0 );
}
final CharsRef spare = new CharsRef();
final BytesRef tempBR = new BytesRef();
final CharArr spare = new CharArr();
DocIterator iter = docs.iterator();
while (iter.hasNext()) {
int docID = iter.nextDoc();
BytesRef raw = all.getTermText(docID, tempBR);
Double v = null;
if( raw != null ) {
spare.reset();
all.ft.indexedToReadable(raw, spare);
v = Double.parseDouble(spare.toString());
v = Double.parseDouble(all.ft.indexedToReadable(raw, spare).toString());
allstats.accumulate(v);
}
else {

View File

@ -18,7 +18,7 @@ package org.apache.solr.handler.component;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.*;
import org.apache.solr.common.util.NamedList;
@ -178,8 +178,7 @@ public class TermsComponent extends SearchComponent {
int i = 0;
BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
CharArr external = new CharArr();
CharsRef external = new CharsRef();
while (term != null && (i<limit || sort)) {
boolean externalized = false; // did we fill in "external" yet for this term?
@ -189,8 +188,8 @@ public class TermsComponent extends SearchComponent {
if (pattern != null) {
// indexed text or external text?
// TODO: support "raw" mode?
external.reset();
ft.indexedToReadable(term, external);
externalized = true;
if (!pattern.matcher(external).matches()) {
term = termsEnum.next();
continue;
@ -213,13 +212,9 @@ public class TermsComponent extends SearchComponent {
// TODO: handle raw somehow
if (!externalized) {
external.reset();
ft.indexedToReadable(term, external);
}
String label = external.toString();
fieldTerms.add(label, docFreq);
fieldTerms.add(external.toString(), docFreq);
i++;
}
}
@ -230,7 +225,6 @@ public class TermsComponent extends SearchComponent {
if (sort) {
for (CountPair<BytesRef, Integer> item : queue) {
if (i >= limit) break;
external.reset();
ft.indexedToReadable(item.key, external);
fieldTerms.add(external.toString(), item.val);
i++;

View File

@ -23,9 +23,11 @@ import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
@ -37,7 +39,6 @@ import org.apache.solr.schema.FieldType;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
import java.util.*;
@ -244,7 +245,7 @@ class PerSegmentSingleValuedFaceting {
BytesRef prefixRef = new BytesRef(prefix);
startTermIndex = si.binarySearchLookup(prefixRef, tempBR);
if (startTermIndex<0) startTermIndex=-startTermIndex-1;
prefixRef.append(ByteUtils.bigTerm);
prefixRef.append(UnicodeUtil.BIG_TERM);
// TODO: we could constrain the lower endpoint if we had a binarySearch method that allowed passing start/end
endTermIndex = si.binarySearchLookup(prefixRef, tempBR);
assert endTermIndex < 0;
@ -339,6 +340,8 @@ abstract class FacetCollector {
// This collector expects facets to be collected in index order
class CountSortedFacetCollector extends FacetCollector {
private final CharsRef spare = new CharsRef();
final int offset;
final int limit;
final int maxsize;
@ -360,7 +363,7 @@ class CountSortedFacetCollector extends FacetCollector {
// NOTE: we use c>min rather than c>=min as an optimization because we are going in
// index order, so we already know that the keys are ordered. This can be very
// important if a lot of the counts are repeated (like zero counts would be).
queue.add(new SimpleFacets.CountPair<String,Integer>(term.utf8ToString(), count));
queue.add(new SimpleFacets.CountPair<String,Integer>(term.utf8ToChars(spare).toString(), count));
if (queue.size()>=maxsize) min=queue.last().val;
}
return false;
@ -383,12 +386,13 @@ class CountSortedFacetCollector extends FacetCollector {
// This collector expects facets to be collected in index order
class IndexSortedFacetCollector extends FacetCollector {
private final CharsRef spare = new CharsRef();
int offset;
int limit;
final int mincount;
final NamedList<Integer> res = new NamedList<Integer>();
public IndexSortedFacetCollector(int offset, int limit, int mincount) {
this.offset = offset;
this.limit = limit>0 ? limit : Integer.MAX_VALUE;
@ -407,7 +411,7 @@ class IndexSortedFacetCollector extends FacetCollector {
}
if (limit > 0) {
res.add(term.utf8ToString(), count);
res.add(term.utf8ToChars(spare).toString(), count);
limit--;
}
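The BIG_TERM swap above keeps the old ByteUtils.bigTerm trick: appending a run of 0xff bytes to a prefix yields a key that sorts after every valid UTF-8 term sharing that prefix, so two binary searches bound the whole prefix range. A sketch using the names from this hunk:

BytesRef prefixRef = new BytesRef(prefix);
int startTermIndex = si.binarySearchLookup(prefixRef, tempBR);
if (startTermIndex < 0) startTermIndex = -startTermIndex - 1;     // not found: use insertion point
prefixRef.append(UnicodeUtil.BIG_TERM);                           // now greater than any real term with this prefix
int endTermIndex = -si.binarySearchLookup(prefixRef, tempBR) - 1; // always negative: BIG_TERM is never a real term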

View File

@ -21,12 +21,13 @@ import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.noggit.CharArr;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.RequiredSolrParams;
@ -41,7 +42,6 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.*;
import org.apache.solr.search.*;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.DateMathParser;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.util.LongPriorityQueue;
@ -109,7 +109,7 @@ public class SimpleFacets {
if (localParams == null) return;
// remove local params unless it's a query
if (type != FacetParams.FACET_QUERY) {
if (type != FacetParams.FACET_QUERY) { // TODO Cut over to an Enum here
facetValue = localParams.get(CommonParams.VALUE);
}
@ -128,7 +128,7 @@ public class SimpleFacets {
String excludeStr = localParams.get(CommonParams.EXCLUDE);
if (excludeStr == null) return;
Map tagMap = (Map)req.getContext().get("tags");
Map<?,?> tagMap = (Map<?,?>)req.getContext().get("tags");
if (tagMap != null && rb != null) {
List<String> excludeTagList = StrUtils.splitSmart(excludeStr,',');
@ -137,7 +137,7 @@ public class SimpleFacets {
Object olst = tagMap.get(excludeTag);
// tagMap has entries of List<String,List<QParser>>, but subject to change in the future
if (!(olst instanceof Collection)) continue;
for (Object o : (Collection)olst) {
for (Object o : (Collection<?>)olst) {
if (!(o instanceof QParser)) continue;
QParser qp = (QParser)o;
excludeSet.put(qp.getQuery(), Boolean.TRUE);
@ -435,7 +435,7 @@ public class SimpleFacets {
if (prefix!=null) {
startTermIndex = si.binarySearchLookup(prefixRef, br);
if (startTermIndex<0) startTermIndex=-startTermIndex-1;
prefixRef.append(ByteUtils.bigTerm);
prefixRef.append(UnicodeUtil.BIG_TERM);
endTermIndex = si.binarySearchLookup(prefixRef, br);
assert endTermIndex < 0;
endTermIndex = -endTermIndex-1;
@ -446,8 +446,7 @@ public class SimpleFacets {
final int nTerms=endTermIndex-startTermIndex;
int missingCount = -1;
CharArr spare = new CharArr();
final CharsRef charsRef = new CharsRef(10);
if (nTerms>0 && docs.size() >= mincount) {
// count collection array only needs to be as big as the number of terms we are
@ -547,10 +546,8 @@ public class SimpleFacets {
long pair = sorted[i];
int c = (int)(pair >>> 32);
int tnum = Integer.MAX_VALUE - (int)pair;
spare.reset();
ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), spare);
res.add(spare.toString(), c);
ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), charsRef);
res.add(charsRef.toString(), c);
}
} else {
@ -567,9 +564,8 @@ public class SimpleFacets {
int c = counts[i];
if (c<mincount || --off>=0) continue;
if (--lim<0) break;
spare.reset();
ft.indexedToReadable(si.lookup(startTermIndex+i, br), spare);
res.add(spare.toString(), c);
ft.indexedToReadable(si.lookup(startTermIndex+i, br), charsRef);
res.add(charsRef.toString(), c);
}
}
}
@ -657,7 +653,7 @@ public class SimpleFacets {
}
DocsEnum docsEnum = null;
CharArr spare = new CharArr();
CharsRef charsRef = new CharsRef(10);
if (docs.size() >= mincount) {
while (term != null) {
@ -742,9 +738,8 @@ public class SimpleFacets {
} else {
if (c >= mincount && --off<0) {
if (--lim<0) break;
spare.reset();
ft.indexedToReadable(term, spare);
res.add(spare.toString(), c);
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
}
}
@ -757,9 +752,8 @@ public class SimpleFacets {
for (CountPair<BytesRef,Integer> p : queue) {
if (--off>=0) continue;
if (--lim<0) break;
spare.reset();
ft.indexedToReadable(p.key, spare);
res.add(spare.toString(), p.val);
ft.indexedToReadable(p.key, charsRef);
res.add(charsRef.toString(), p.val);
}
}

View File

@ -24,7 +24,6 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.StringHelper;
import org.apache.noggit.CharArr;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;
@ -33,13 +32,14 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.*;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.LongPriorityQueue;
import org.apache.solr.util.PrimUtils;
import org.apache.solr.handler.component.StatsValues;
import org.apache.solr.handler.component.FieldFacetStats;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.util.HashMap;
@ -227,13 +227,13 @@ public class UnInvertedField extends DocTermOrds {
TermsEnum te = getOrdTermsEnum(searcher.getIndexReader());
if (prefix != null && prefix.length() > 0) {
BytesRef prefixBr = new BytesRef(prefix);
final BytesRef prefixBr = new BytesRef(prefix);
if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) {
startTerm = numTermsInField;
} else {
startTerm = (int) te.ord();
}
prefixBr.append(ByteUtils.bigTerm);
prefixBr.append(UnicodeUtil.BIG_TERM);
if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) {
endTerm = numTermsInField;
} else {
@ -331,8 +331,7 @@ public class UnInvertedField extends DocTermOrds {
}
}
}
CharArr spare = new CharArr();
final CharsRef charsRef = new CharsRef();
int off=offset;
int lim=limit>=0 ? limit : Integer.MAX_VALUE;
@ -408,7 +407,7 @@ public class UnInvertedField extends DocTermOrds {
for (int i=sortedIdxStart; i<sortedIdxEnd; i++) {
int idx = indirect[i];
int tnum = (int)sorted[idx];
String label = getReadableValue(getTermValue(te, tnum), ft, spare);
final String label = getReadableValue(getTermValue(te, tnum), ft, charsRef);
//System.out.println(" label=" + label);
res.setName(idx - sortedIdxStart, label);
}
@ -428,7 +427,7 @@ public class UnInvertedField extends DocTermOrds {
if (c<mincount || --off>=0) continue;
if (--lim<0) break;
String label = getReadableValue(getTermValue(te, i), ft, spare);
final String label = getReadableValue(getTermValue(te, i), ft, charsRef);
res.add(label, c);
}
}
@ -582,14 +581,12 @@ public class UnInvertedField extends DocTermOrds {
}
}
}
final CharsRef charsRef = new CharsRef();
// add results in index order
CharArr spare = new CharArr();
for (i = 0; i < numTermsInField; i++) {
int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
if (c == 0) continue;
String label = getReadableValue(getTermValue(te, i), ft, spare);
String label = getReadableValue(getTermValue(te, i), ft, charsRef);
// TODO: we should avoid this re-parse
Double value = Double.parseDouble(label);
@ -621,14 +618,8 @@ public class UnInvertedField extends DocTermOrds {
}
String getReadableValue(BytesRef termval, FieldType ft, CharArr spare) {
if (spare == null) {
spare = new CharArr();
} else {
spare.reset();
}
ft.indexedToReadable(termval, spare);
return spare.toString();
String getReadableValue(BytesRef termval, FieldType ft, CharsRef charsRef) {
return ft.indexedToReadable(termval, charsRef).toString();
}
/** may return a reused BytesRef */

View File

@ -19,7 +19,7 @@ package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.QParser;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.OrdFieldSource;
@ -130,13 +130,17 @@ public class BoolField extends FieldType {
return ch=='T' ? "true" : "false";
}
private static final CharsRef TRUE = new CharsRef("true");
private static final CharsRef FALSE = new CharsRef("false");
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
if (input.length > 0 && input.bytes[input.offset] == 'T') {
out.write("true");
charsRef.copy(TRUE);
} else {
out.write("false");
charsRef.copy(FALSE);
}
return charsRef;
}
@Override
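With the shared TRUE/FALSE constants, the readable form is produced by copying into the caller's buffer instead of writing characters to a CharArr. A usage sketch (boolField and term are placeholders):

CharsRef spare = new CharsRef();
// any indexed value whose first byte is 'T' reads back as "true", everything else as "false"
String readable = boolField.indexedToReadable(term, spare).toString();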

View File

@ -23,14 +23,13 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.function.*;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.DateMathParser;
import java.io.IOException;
@ -131,6 +130,8 @@ public class DateField extends FieldType {
protected static String NOW = "NOW";
protected static char Z = 'Z';
private static char[] Z_ARRAY = new char[] {Z};
@Override
public String toInternal(String val) {
@ -184,7 +185,7 @@ public class DateField extends FieldType {
public Fieldable createField(SchemaField field, Object value, float boost) {
// Convert to a string before indexing
if(value instanceof Date) {
value = toInternal( (Date)value ) + 'Z';
value = toInternal( (Date)value ) + Z;
}
return super.createField(field, value, boost);
}
@ -199,9 +200,10 @@ public class DateField extends FieldType {
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
ByteUtils.UTF8toUTF16(input, out);
out.write(Z);
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
input.utf8ToChars(charsRef);
charsRef.append(Z_ARRAY, 0, 1);
return charsRef;
}
@Override
@ -479,10 +481,8 @@ class DateFieldSource extends FieldCacheSource {
if (ord == 0) {
return null;
} else {
BytesRef br = termsIndex.lookup(ord, new BytesRef());
CharArr spare = new CharArr();
ft.indexedToReadable(br, spare);
return spare.toString();
final BytesRef br = termsIndex.lookup(ord, spare);
return ft.indexedToReadable(br, spareChars).toString();
}
}
@ -492,7 +492,7 @@ class DateFieldSource extends FieldCacheSource {
if (ord == 0) {
return null;
} else {
BytesRef br = termsIndex.lookup(ord, new BytesRef());
final BytesRef br = termsIndex.lookup(ord, new BytesRef());
return ft.toObject(null, br);
}
}
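DateField's override decodes the UTF-8 term and then appends the 'Z' marker. A usage sketch (dateField and term are placeholders):

CharsRef spare = new CharsRef();
String readable = dateField.indexedToReadable(term, spare).toString();
// e.g. "2011-05-24T22:44:36Z", with the trailing 'Z' appended by indexedToReadable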

View File

@ -30,8 +30,8 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.noggit.CharArr;
import org.apache.solr.analysis.SolrAnalyzer;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@ -39,7 +39,6 @@ import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.Sorting;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.util.ByteUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -354,9 +353,9 @@ public abstract class FieldType extends FieldProperties {
}
public Object toObject(SchemaField sf, BytesRef term) {
CharArr ext = new CharArr(term.length);
indexedToReadable(term, ext);
Fieldable f = createField(sf, ext.toString(), 1.0f);
final CharsRef ref = new CharsRef(term.length);
indexedToReadable(term, ref);
final Fieldable f = createField(sf, ref.toString(), 1.0f);
return toObject(f);
}
@ -365,9 +364,10 @@ public abstract class FieldType extends FieldProperties {
return indexedForm;
}
/** Given an indexed term, append the human readable representation to out */
public void indexedToReadable(BytesRef input, CharArr out) {
ByteUtils.UTF8toUTF16(input, out);
/** Given an indexed term, append the human readable representation to the given CharsRef and return it */
public CharsRef indexedToReadable(BytesRef input, CharsRef output) {
input.utf8ToChars(output);
return output;
}
/** Given the stored field, return the human readable representation */
@@ -390,7 +390,7 @@ public abstract class FieldType extends FieldProperties {
/** Given the readable value, return the term value that will match it. */
public void readableToIndexed(CharSequence val, BytesRef result) {
String internal = readableToIndexed(val.toString());
final String internal = readableToIndexed(val.toString());
UnicodeUtil.UTF16toUTF8(internal, 0, internal.length(), result);
}
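The base-class contract is the heart of the patch: instead of writing into a Noggit CharArr, callers hand in a reusable CharsRef and get the same reference back, so tight loops can decode terms without per-term allocation. A sketch of both directions (fieldType, termsEnum, and the loop body are assumed):

    final CharsRef spare = new CharsRef();
    final BytesRef indexed = new BytesRef();
    // readable -> indexed: UTF-16 is encoded straight into the BytesRef
    fieldType.readableToIndexed("some value", indexed);
    // indexed -> readable: the same CharsRef is reused across terms
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      String readable = fieldType.indexedToReadable(term, spare).toString();
      // ... use readable ...
    }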

View File

@@ -19,7 +19,7 @@ package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueDouble;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
@@ -78,9 +77,12 @@ public class SortableDoubleField extends FieldType {
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
input.utf8ToChars(charsRef);
final char[] indexedToReadable = indexedToReadable(charsRef.toString()).toCharArray();
charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
return charsRef;
}
@Override
@@ -90,9 +92,6 @@ public class SortableDoubleField extends FieldType {
}
}
class SortableDoubleFieldSource extends FieldCacheSource {
protected double defVal;
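All four Sortable*Field types share the same interim pattern (the float, int, and long variants that follow are identical apart from the NumberUtils call): decode the raw term to a String, run the existing String-based indexedToReadable, then copy the result back over the caller's CharsRef so the same instance is returned. Sketched in isolation (input and charsRef assumed):

    input.utf8ToChars(charsRef);                                      // raw indexed chars
    final char[] readable = indexedToReadable(charsRef.toString()).toCharArray();
    charsRef.copy(readable, 0, readable.length);                      // overwrite contents in place
    return charsRef;                                                  // same instance the caller passed in

As the TODO notes, the String/char[] round trip is wasteful, but the sortable types are slated for deprecation rather than optimization.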

View File

@@ -19,7 +19,7 @@ package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueFloat;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
@@ -77,10 +76,11 @@ public class SortableFloatField extends FieldType {
return NumberUtils.SortableStr2floatStr(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
return charsRef;
}
@Override

View File

@@ -19,7 +19,7 @@ package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueInt;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
@@ -75,10 +74,11 @@ public class SortableIntField extends FieldType {
return NumberUtils.SortableStr2int(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
return charsRef;
}
@Override

View File

@@ -19,7 +19,7 @@ package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValueLong;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
@@ -67,10 +66,11 @@ public class SortableLongField extends FieldType {
return NumberUtils.SortableStr2long(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
return charsRef;
}
@Override

View File

@@ -23,7 +23,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.QParser;
import org.apache.solr.util.ByteUtils;
import java.util.Map;
import java.io.IOException;
@@ -54,7 +53,7 @@ public class StrField extends FieldType {
@Override
public Object toObject(SchemaField sf, BytesRef term) {
return ByteUtils.UTF8toUTF16(term);
return term.utf8ToString();
}
}

View File

@@ -18,12 +18,9 @@
package org.apache.solr.schema;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.FieldCacheSource;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
import java.util.Map;

View File

@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.util.ByteUtils;
import java.util.Map;
import java.util.List;
@@ -81,7 +80,7 @@ public class TextField extends FieldType {
@Override
public Object toObject(SchemaField sf, BytesRef term) {
return ByteUtils.UTF8toUTF16(term);
return term.utf8ToString();
}
@Override

View File

@@ -17,7 +17,6 @@
package org.apache.solr.schema;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.QParser;
import org.apache.solr.response.TextResponseWriter;
@@ -26,6 +25,7 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import java.util.Map;
import java.util.Date;
@@ -111,10 +111,10 @@ public class TrieDateField extends DateField {
public String indexedToReadable(String _indexedForm) {
return wrappedField.indexedToReadable(_indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
wrappedField.indexedToReadable(input, out);
public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
return wrappedField.indexedToReadable(input, charsRef);
}
@Override

View File

@@ -26,8 +26,8 @@ import org.apache.lucene.search.cache.FloatValuesCreator;
import org.apache.lucene.search.cache.IntValuesCreator;
import org.apache.lucene.search.cache.LongValuesCreator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.noggit.CharArr;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
@@ -296,7 +296,7 @@ public class TrieField extends FieldType {
@Override
public String readableToIndexed(String val) {
// TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts!
BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
final BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
readableToIndexed(val, bytes);
return bytes.utf8ToString();
}
@@ -363,31 +363,29 @@ public class TrieField extends FieldType {
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
BytesRef indexedForm = input;
String s;
public CharsRef indexedToReadable(BytesRef indexedForm, CharsRef charsRef) {
final char[] value;
switch (type) {
case INTEGER:
s = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) );
value = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) ).toCharArray();
break;
case FLOAT:
s = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) );
value = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) ).toCharArray();
break;
case LONG:
s = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) );
value = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) ).toCharArray();
break;
case DOUBLE:
s = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) );
value = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) ).toCharArray();
break;
case DATE:
s = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) );
value = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ).toCharArray();
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
}
out.write(s);
charsRef.copy(value, 0, value.length);
return charsRef;
}
@Override
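TrieField first decodes the prefix-coded term according to its numeric type, then copies the characters into the shared CharsRef. For instance, a LONG term renders like this (a sketch; term is assumed to hold a value previously written with NumericUtils prefix coding, and charsRef is the caller's buffer):

    final long v = NumericUtils.prefixCodedToLong(term);      // undo the prefix coding
    final char[] chars = Long.toString(v).toCharArray();
    charsRef.copy(chars, 0, chars.length);                    // hand back via the shared CharsRef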

View File

@@ -21,11 +21,11 @@ import org.apache.lucene.search.*;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
@@ -34,7 +34,7 @@ public class MissingStringLastComparatorSource extends FieldComparatorSource {
private final BytesRef missingValueProxy;
public MissingStringLastComparatorSource() {
this(ByteUtils.bigTerm);
this(UnicodeUtil.BIG_TERM);
}
/** Creates a {@link FieldComparatorSource} that sorts null last in a normal ascending sort.

View File

@@ -17,14 +17,13 @@
package org.apache.solr.search;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.util.ByteUtils;
public class MutableValueStr extends MutableValue {
public BytesRef value = new BytesRef();
@Override
public Object toObject() {
return exists ? ByteUtils.UTF8toUTF16(value) : null;
return exists ? value.utf8ToString() : null;
}
@Override

View File

@@ -22,7 +22,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
import java.util.Map;
@@ -43,8 +42,7 @@ public class IDFValueSource extends DocFreqValueSource {
IndexSearcher searcher = (IndexSearcher)context.get("searcher");
Similarity sim = searcher.getSimilarityProvider().get(field);
// todo: we need docFreq that takes a BytesRef
String strVal = ByteUtils.UTF8toUTF16(indexedBytes);
int docfreq = searcher.docFreq(new Term(indexedField, strVal));
int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes.utf8ToString()));
float idf = sim.idf(docfreq, searcher.maxDoc());
return new ConstDoubleDocValues(idf, this);
}

View File

@@ -21,10 +21,9 @@ import org.apache.lucene.search.FieldCache;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.search.MutableValue;
import org.apache.solr.search.MutableValueStr;
import org.apache.solr.util.ByteUtils;
import java.io.IOException;
@@ -36,7 +35,7 @@ public abstract class StringIndexDocValues extends DocValues {
protected final ValueSource vs;
protected final MutableValueStr val = new MutableValueStr();
protected final BytesRef spare = new BytesRef();
protected final CharArr spareChars = new CharArr();
protected final CharsRef spareChars = new CharsRef();
public StringIndexDocValues(ValueSource vs, AtomicReaderContext context, String field) throws IOException {
try {
@@ -75,8 +74,7 @@ public abstract class StringIndexDocValues extends DocValues {
int ord=termsIndex.getOrd(doc);
if (ord==0) return null;
termsIndex.lookup(ord, spare);
spareChars.reset();
ByteUtils.UTF8toUTF16(spare, spareChars);
spare.utf8ToChars(spareChars);
return spareChars.toString();
}
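With spare and spareChars now protected scratch fields, subclasses can render per-document values with no intermediate garbage beyond the final String; the DateFieldSource change above does exactly this. A hypothetical subclass override for illustration (ft is assumed to be a FieldType field held by the subclass):

    @Override
    public String strVal(int doc) {
      final int ord = termsIndex.getOrd(doc);
      if (ord == 0) return null;                         // missing value
      final BytesRef br = termsIndex.lookup(ord, spare); // reuse the byte scratch
      return ft.indexedToReadable(br, spareChars).toString();
    }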

View File

@@ -1,81 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
public class ByteUtils {
/** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
* one would normally encounter, and definitely bigger than any UTF-8 terms */
public static final BytesRef bigTerm = new BytesRef(
new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
);
/** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
* Full characters are read, even if this reads past the length passed (and can result in
* an ArrayIndexOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.
* The char[] out should probably have enough room to hold the worst case of each byte becoming a Java char.
*/
public static int UTF8toUTF16(byte[] utf8, int offset, int len, char[] out, int out_offset) {
int out_start = out_offset;
final int limit = offset + len;
while (offset < limit) {
int b = utf8[offset++]&0xff;
if (b < 0xc0) {
assert b < 0x80;
out[out_offset++] = (char)b;
} else if (b < 0xe0) {
out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
} else if (b < 0xf0) {
out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
offset += 2;
} else {
assert b < 0xf8;
int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
offset += 3;
if (ch < 0xffff) {
out[out_offset++] = (char)ch;
} else {
int chHalf = ch - 0x0010000;
out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
out[out_offset++] = (char) ((chHalf & 0x3FFL) + 0xDC00);
}
}
}
return out_offset - out_start;
}
/** Convert UTF8 bytes into UTF16 characters. */
public static void UTF8toUTF16(BytesRef utf8, CharArr out) {
// TODO: do in chunks if the input is large
out.reserve(utf8.length);
int n = UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, out.getArray(), out.getEnd());
out.setEnd(out.getEnd() + n);
}
/** Convert UTF8 bytes into a String */
public static String UTF8toUTF16(BytesRef utf8) {
char[] out = new char[utf8.length];
int n = UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, out, 0);
return new String(out,0,n);
}
}
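With the class deleted outright, every former ByteUtils entry point has a Lucene-side equivalent, which is what the call-site changes above switch to. The mapping, sketched (the bytesRef instance is assumed):

    String s = bytesRef.utf8ToString();                     // was ByteUtils.UTF8toUTF16(BytesRef)
    CharsRef chars = bytesRef.utf8ToChars(new CharsRef());  // was ByteUtils.UTF8toUTF16(BytesRef, CharArr)
    BytesRef sentinel = UnicodeUtil.BIG_TERM;               // was ByteUtils.bigTerm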

View File

@@ -19,6 +19,7 @@
org.apache.lucene.util.AttributeSource,
org.apache.lucene.util.Attribute,
org.apache.lucene.util.BytesRef,
org.apache.lucene.util.CharsRef,
org.apache.lucene.analysis.TokenStream,
org.apache.lucene.index.Payload,
org.apache.lucene.analysis.CharReader,
@@ -32,8 +33,7 @@
org.apache.solr.schema.FieldType,
org.apache.solr.schema.SchemaField,
org.apache.solr.common.util.XML,
javax.servlet.jsp.JspWriter,java.io.IOException,
org.apache.noggit.CharArr
javax.servlet.jsp.JspWriter,java.io.IOException
"%>
<%@ page import="java.io.Reader"%>
<%@ page import="java.io.StringReader"%>
@@ -287,9 +287,7 @@
bytes = new BytesRef(spare);
rawText = (token.hasAttribute(CharTermAttribute.class)) ?
token.getAttribute(CharTermAttribute.class).toString() : null;
final CharArr textBuf = new CharArr(bytes.length);
ft.indexedToReadable(bytes, textBuf);
text = textBuf.toString();
text = ft.indexedToReadable(bytes, new CharsRef()).toString();
token.reflectWith(new AttributeReflector() {
public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
// leave out position and raw term