From 68a840c2b75d93173b1ee6f5e0dcc55e2e6fdc84 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 24 May 2011 22:44:36 +0000 Subject: [PATCH] SOLR-2530: Remove Noggit CharArr from FieldType git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1127326 13f79535-47bb-0310-9956-ffa450edef68 --- .../vectorhighlight/FieldTermStack.java | 7 +- .../store/instantiated/InstantiatedIndex.java | 4 +- .../regex/JakartaRegexpCapabilities.java | 7 +- .../regex/JavaUtilRegexCapabilities.java | 19 +- .../lucene/search/similar/MoreLikeThis.java | 4 +- .../lucene/document/CompressionTools.java | 5 +- .../simpletext/SimpleTextFieldsReader.java | 13 +- .../java/org/apache/lucene/util/BytesRef.java | 18 +- .../java/org/apache/lucene/util/CharsRef.java | 218 ++++++++++++++++++ .../org/apache/lucene/util/UnicodeUtil.java | 216 +++++------------ .../codecs/preflexrw/TermInfosWriter.java | 13 +- .../apache/lucene/index/TestIndexWriter.java | 64 +---- .../lucene/search/TestRegexpRandom2.java | 5 +- .../apache/lucene/util/TestUnicodeUtil.java | 46 ++-- .../query/QueryAutoStopWordAnalyzer.java | 8 +- .../search/spell/DirectSpellChecker.java | 13 +- .../search/spell/HighFrequencyDictionary.java | 4 +- .../lucene/search/spell/LuceneDictionary.java | 4 +- .../handler/AnalysisRequestHandlerBase.java | 10 +- .../handler/admin/LukeRequestHandler.java | 8 +- .../handler/component/QueryComponent.java | 5 +- .../handler/component/StatsComponent.java | 9 +- .../handler/component/TermsComponent.java | 14 +- .../PerSegmentSingleValuedFaceting.java | 14 +- .../org/apache/solr/request/SimpleFacets.java | 38 ++- .../apache/solr/request/UnInvertedField.java | 31 +-- .../org/apache/solr/schema/BoolField.java | 12 +- .../org/apache/solr/schema/DateField.java | 22 +- .../org/apache/solr/schema/FieldType.java | 18 +- .../solr/schema/SortableDoubleField.java | 13 +- .../solr/schema/SortableFloatField.java | 10 +- .../apache/solr/schema/SortableIntField.java | 10 +- .../apache/solr/schema/SortableLongField.java | 10 +- .../java/org/apache/solr/schema/StrField.java | 3 +- .../apache/solr/schema/StrFieldSource.java | 3 - .../org/apache/solr/schema/TextField.java | 3 +- .../org/apache/solr/schema/TrieDateField.java | 8 +- .../org/apache/solr/schema/TrieField.java | 24 +- .../MissingStringLastComparatorSource.java | 4 +- .../apache/solr/search/MutableValueStr.java | 3 +- .../solr/search/function/IDFValueSource.java | 4 +- .../search/function/StringIndexDocValues.java | 8 +- .../java/org/apache/solr/util/ByteUtils.java | 81 ------- solr/src/webapp/web/admin/analysis.jsp | 8 +- 44 files changed, 492 insertions(+), 549 deletions(-) create mode 100644 lucene/src/java/org/apache/lucene/util/CharsRef.java delete mode 100755 solr/src/java/org/apache/solr/util/ByteUtils.java diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java index 9ff5b4d86ad..949d47e749b 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; /** * FieldTermStack is a stack that keeps query terms in the specified 
field @@ -80,16 +81,16 @@ public class FieldTermStack { Set termSet = fieldQuery.getTermSet( fieldName ); // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if( termSet == null ) return; - + final CharsRef spare = new CharsRef(); for( BytesRef term : tpv.getTerms() ){ - if( !termSet.contains( term.utf8ToString() ) ) continue; + if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue; int index = tpv.indexOf( term ); TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); if( tvois == null ) return; // just return to make null snippets int[] poss = tpv.getTermPositions( index ); if( poss == null ) return; // just return to make null snippets for( int i = 0; i < tvois.length; i++ ) - termList.add( new TermInfo( term.utf8ToString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); + termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); } // sort by position diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java index b98f3cb337e..302480eec07 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java @@ -41,6 +41,7 @@ import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.util.BitVector; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; /** * Represented as a coupled graph of class instances, this @@ -228,12 +229,13 @@ public class InstantiatedIndex if (fieldsC != null) { FieldsEnum fieldsEnum = fieldsC.iterator(); String field; + final CharsRef spare = new CharsRef(); while((field = fieldsEnum.next()) != null) { if (fields == null || fields.contains(field)) { TermsEnum termsEnum = fieldsEnum.terms(); BytesRef text; while((text = termsEnum.next()) != null) { - String termText = text.utf8ToString(); + String termText = text.utf8ToChars(spare).toString(); InstantiatedTerm instantiatedTerm = new InstantiatedTerm(field, termText); final long totalTermFreq = termsEnum.totalTermFreq(); if (totalTermFreq != -1) { diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java index 96fc2dff18a..f1c5dac2754 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java @@ -18,6 +18,7 @@ package org.apache.lucene.search.regex; */ import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.regexp.CharacterIterator; import org.apache.regexp.RE; @@ -104,11 +105,11 @@ public class JakartaRegexpCapabilities implements RegexCapabilities { class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher { private RE regexp; - private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + private final CharsRef utf16 = new CharsRef(10); private final CharacterIterator utf16wrapper = new CharacterIterator() { public char charAt(int pos) { - return utf16.result[pos]; + return utf16.chars[pos]; } 
public boolean isEnd(int pos) { @@ -120,7 +121,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities { } public String substring(int beginIndex, int endIndex) { - return new String(utf16.result, beginIndex, endIndex - beginIndex); + return new String(utf16.chars, beginIndex, endIndex - beginIndex); } }; diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java index f1a238d74cc..f24bc2b8d90 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java @@ -21,6 +21,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; /** @@ -95,25 +96,11 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities { class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher { private final Pattern pattern; private final Matcher matcher; - private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); - private final CharSequence utf16wrapper = new CharSequence() { - - public int length() { - return utf16.length; - } - - public char charAt(int index) { - return utf16.result[index]; - } - - public CharSequence subSequence(int start, int end) { - return new String(utf16.result, start, end - start); - } - }; + private final CharsRef utf16 = new CharsRef(10); public JavaUtilRegexMatcher(String regex, int flags) { this.pattern = Pattern.compile(regex, flags); - this.matcher = this.pattern.matcher(utf16wrapper); + this.matcher = this.pattern.matcher(utf16); } public boolean match(BytesRef term) { diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java index e2a2851008d..0b97af58b12 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java @@ -48,6 +48,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.PriorityQueue; @@ -850,8 +851,9 @@ public final class MoreLikeThis { { BytesRef[] terms = vector.getTerms(); int freqs[]=vector.getTermFrequencies(); + final CharsRef spare = new CharsRef(); for (int j = 0; j < terms.length; j++) { - String term = terms[j].utf8ToString(); + final String term = terms[j].utf8ToChars(spare).toString(); if(isNoiseWord(term)){ continue; diff --git a/lucene/src/java/org/apache/lucene/document/CompressionTools.java b/lucene/src/java/org/apache/lucene/document/CompressionTools.java index 1746b85db53..78e5949d312 100644 --- a/lucene/src/java/org/apache/lucene/document/CompressionTools.java +++ b/lucene/src/java/org/apache/lucene/document/CompressionTools.java @@ -23,6 +23,7 @@ import java.util.zip.DataFormatException; import java.io.ByteArrayOutputStream; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; /** Simple utility class providing static methods to @@ -118,9 +119,9 @@ public class CompressionTools { 
/** Decompress the byte array previously returned by * compressString back into a String */ public static String decompressString(byte[] value) throws DataFormatException { - UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result(); final byte[] bytes = decompress(value); + CharsRef result = new CharsRef(bytes.length); UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result); - return new String(result.result, 0, result.length); + return new String(result.chars, 0, result.length); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java index ea74a6b6627..5f5b68cb044 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.fst.Builder; @@ -236,7 +237,7 @@ class SimpleTextFieldsReader extends FieldsProducer { private int tf; private Bits skipDocs; private final BytesRef scratch = new BytesRef(10); - private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result(); + private final CharsRef scratchUTF16 = new CharsRef(10); public SimpleTextDocsEnum() { this.inStart = SimpleTextFieldsReader.this.in; @@ -286,7 +287,7 @@ class SimpleTextFieldsReader extends FieldsProducer { return docID; } UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16); - docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length); + docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); termFreq = 0; first = false; } else if (scratch.startsWith(POS)) { @@ -323,8 +324,8 @@ class SimpleTextFieldsReader extends FieldsProducer { private Bits skipDocs; private final BytesRef scratch = new BytesRef(10); private final BytesRef scratch2 = new BytesRef(10); - private final UnicodeUtil.UTF16Result scratchUTF16 = new UnicodeUtil.UTF16Result(); - private final UnicodeUtil.UTF16Result scratchUTF16_2 = new UnicodeUtil.UTF16Result(); + private final CharsRef scratchUTF16 = new CharsRef(10); + private final CharsRef scratchUTF16_2 = new CharsRef(10); private BytesRef payload; private long nextDocStart; @@ -368,7 +369,7 @@ class SimpleTextFieldsReader extends FieldsProducer { return docID; } UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+DOC.length, scratch.length-DOC.length, scratchUTF16); - docID = ArrayUtil.parseInt(scratchUTF16.result, 0, scratchUTF16.length); + docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length); tf = 0; posStart = in.getFilePointer(); first = false; @@ -400,7 +401,7 @@ class SimpleTextFieldsReader extends FieldsProducer { readLine(in, scratch); assert scratch.startsWith(POS): "got line=" + scratch.utf8ToString(); UnicodeUtil.UTF8toUTF16(scratch.bytes, scratch.offset+POS.length, scratch.length-POS.length, scratchUTF16_2); - final int pos = ArrayUtil.parseInt(scratchUTF16_2.result, 0, scratchUTF16_2.length); + final int pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length); final long fp = in.getFilePointer(); readLine(in, scratch); if (scratch.startsWith(PAYLOAD)) 
{ diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java index 8099722afe5..22d65457b1f 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -18,7 +18,6 @@ package org.apache.lucene.util; */ import java.util.Comparator; -import java.io.UnsupportedEncodingException; /** Represents byte[], as a slice (offset + length) into an * existing byte[]. @@ -122,6 +121,7 @@ public final class BytesRef implements Comparable { public void copy(char text[], int offset, int length) { UnicodeUtil.UTF16toUTF8(text, offset, length, this); } + public boolean bytesEquals(BytesRef other) { if (length == other.length) { int otherUpto = other.offset; @@ -198,13 +198,15 @@ public final class BytesRef implements Comparable { /** Interprets stored bytes as UTF8 bytes, returning the * resulting string */ public String utf8ToString() { - try { - return new String(bytes, offset, length, "UTF-8"); - } catch (UnsupportedEncodingException uee) { - // should not happen -- UTF8 is presumably supported - // by all JREs - throw new RuntimeException(uee); - } + final CharsRef ref = new CharsRef(length); + UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref); + return ref.toString(); + } + + /** Interprets stored bytes as UTF8 bytes into the given {@link CharsRef} */ + public CharsRef utf8ToChars(CharsRef ref) { + UnicodeUtil.UTF8toUTF16(bytes, offset, length, ref); + return ref; } /** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */ diff --git a/lucene/src/java/org/apache/lucene/util/CharsRef.java b/lucene/src/java/org/apache/lucene/util/CharsRef.java new file mode 100644 index 00000000000..618ae707f9c --- /dev/null +++ b/lucene/src/java/org/apache/lucene/util/CharsRef.java @@ -0,0 +1,218 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Represents char[], as a slice (offset + length) into an existing char[]. 
+ * + * @lucene.internal + */ +public final class CharsRef implements Comparable<CharsRef>, CharSequence { + private static final char[] EMPTY_ARRAY = new char[0]; + public char[] chars; + public int offset; + public int length; + + /** + * Creates a new {@link CharsRef} initialized with an empty, zero-length array + */ + public CharsRef() { + this(EMPTY_ARRAY, 0, 0); + } + + /** + * Creates a new {@link CharsRef} initialized with an array of the given + * capacity + */ + public CharsRef(int capacity) { + chars = new char[capacity]; + } + + /** + * Creates a new {@link CharsRef} initialized with the given array, offset and + * length + */ + public CharsRef(char[] chars, int offset, int length) { + assert chars != null; + assert chars.length >= offset + length; + this.chars = chars; + this.offset = offset; + this.length = length; + } + + /** + * Creates a new {@link CharsRef} initialized with the given String's character + * array + */ + public CharsRef(String string) { + this.chars = string.toCharArray(); + this.offset = 0; + this.length = chars.length; + } + + /** + * Creates a new {@link CharsRef} and copies the contents of the source into + * the new instance. + * @see #copy(CharsRef) + */ + public CharsRef(CharsRef other) { + copy(other); + } + + @Override + public Object clone() { + return new CharsRef(this); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 0; + final int end = offset + length; + for (int i = offset; i < end; i++) { + result = prime * result + chars[i]; + } + return result; + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other instanceof CharsRef) { + return charsEquals((CharsRef) other); + } + + if (other instanceof CharSequence) { + final CharSequence seq = (CharSequence) other; + if (length == seq.length()) { + int n = length; + int i = offset; + int j = 0; + while (n-- != 0) { + if (chars[i++] != seq.charAt(j++)) + return false; + } + return true; + } + } + return false; + } + + public boolean charsEquals(CharsRef other) { + if (length == other.length) { + int otherUpto = other.offset; + final char[] otherChars = other.chars; + final int end = offset + length; + for (int upto = offset; upto < end; upto++, otherUpto++) { + if (chars[upto] != otherChars[otherUpto]) { + return false; + } + } + return true; + } else { + return false; + } + } + + /** Signed int order comparison */ + public int compareTo(CharsRef other) { + if (this == other) + return 0; + + final char[] aChars = this.chars; + int aUpto = this.offset; + final char[] bChars = other.chars; + int bUpto = other.offset; + + final int aStop = aUpto + Math.min(this.length, other.length); + + while (aUpto < aStop) { + int aInt = aChars[aUpto++]; + int bInt = bChars[bUpto++]; + if (aInt > bInt) { + return 1; + } else if (aInt < bInt) { + return -1; + } + } + + // One is a prefix of the other, or, they are equal: + return this.length - other.length; + } + + /** + * Copies the contents of the given {@link CharsRef} into this instance + * starting at offset 0.
+ * + * @param other + * the {@link CharsRef} to copy + */ + public void copy(CharsRef other) { + chars = ArrayUtil.grow(chars, other.length); + System.arraycopy(other.chars, other.offset, chars, 0, other.length); + length = other.length; + offset = 0; + } + + public void grow(int newLength) { + if (chars.length < newLength) { + chars = ArrayUtil.grow(chars, newLength); + } + } + + /** + * Copies the given array into this CharsRef starting at offset 0 + */ + public void copy(char[] otherChars, int otherOffset, int otherLength) { + this.offset = 0; + append(otherChars, otherOffset, otherLength); + } + + /** + * Appends the given array to this CharsRef starting at the current offset + */ + public void append(char[] otherChars, int otherOffset, int otherLength) { + grow(this.offset + otherLength); + System.arraycopy(otherChars, otherOffset, this.chars, this.offset, + otherLength); + this.length = otherLength; + } + + @Override + public String toString() { + return new String(chars, offset, length); + } + + @Override + public int length() { + return length; + } + + @Override + public char charAt(int index) { + return chars[offset + index]; + } + + @Override + public CharSequence subSequence(int start, int end) { + return new CharsRef(chars, offset + start, end - start); + } +} \ No newline at end of file diff --git a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java index 32268ac5d93..a3c689891d5 100644 --- a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -94,6 +94,19 @@ package org.apache.lucene.util; */ public final class UnicodeUtil { + + /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms + * one would normally encounter, and definitely bigger than any UTF-8 terms. + *
+ * WARNING: This is not a valid UTF8 Term + **/ + public static final BytesRef BIG_TERM = new BytesRef( + new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1} + ); // TODO this is unrelated here find a better place for it + + public static void main(String[] args) { + System.out.println(Character.toChars(0x10FFFF + 1)); + } private UnicodeUtil() {} // no instance @@ -112,33 +125,6 @@ public final class UnicodeUtil { Character.MIN_SUPPLEMENTARY_CODE_POINT - (UNI_SUR_HIGH_START << HALF_SHIFT) - UNI_SUR_LOW_START; - /** - * @lucene.internal - */ - public static final class UTF16Result { - public char[] result = new char[10]; - public int[] offsets = new int[10]; - public int length; - - public void setLength(int newLength) { - if (result.length < newLength) - result = ArrayUtil.grow(result, newLength); - length = newLength; - } - - public void copyText(UTF16Result other) { - setLength(other.length); - System.arraycopy(other.result, 0, result, 0, length); - } - - public void copyText(String other) { - final int otherLength = other.length(); - setLength(otherLength); - other.getChars(0, otherLength, result, 0); - length = otherLength; - } - } - /** Encode characters from a char[] source, starting at * offset for length chars. Returns a hash of the resulting bytes. After encoding, result.offset will always be 0. */ public static int UTF16toUTF8WithHash(final char[] source, final int offset, final int length, BytesRef result) { @@ -302,135 +288,6 @@ public final class UnicodeUtil { result.length = upto; } - /** Convert UTF8 bytes into UTF16 characters. If offset - * is non-zero, conversion starts at that starting point - * in utf8, re-using the results from the previous call - * up until offset. */ - public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) { - - final int end = offset + length; - char[] out = result.result; - if (result.offsets.length <= end) { - result.offsets = ArrayUtil.grow(result.offsets, end+1); - } - final int[] offsets = result.offsets; - - // If incremental decoding fell in the middle of a - // single unicode character, rollback to its start: - int upto = offset; - while(offsets[upto] == -1) - upto--; - - int outUpto = offsets[upto]; - - // Pre-allocate for worst case 1-for-1 - if (outUpto+length >= out.length) { - out = result.result = ArrayUtil.grow(out, outUpto+length+1); - } - - while (upto < end) { - - final int b = utf8[upto]&0xff; - final int ch; - - offsets[upto++] = outUpto; - - if (b < 0xc0) { - assert b < 0x80; - ch = b; - } else if (b < 0xe0) { - ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f); - offsets[upto++] = -1; - } else if (b < 0xf0) { - ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f); - offsets[upto++] = -1; - offsets[upto++] = -1; - } else { - assert b < 0xf8; - ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f); - offsets[upto++] = -1; - offsets[upto++] = -1; - offsets[upto++] = -1; - } - - if (ch <= UNI_MAX_BMP) { - // target is a character <= 0xFFFF - out[outUpto++] = (char) ch; - } else { - // target is a character in range 0xFFFF - 0x10FFFF - out[outUpto++] = (char) ((ch >> HALF_SHIFT) + 0xD7C0 /* UNI_SUR_HIGH_START - 64 */); - out[outUpto++] = (char) ((ch & HALF_MASK) + UNI_SUR_LOW_START); - } - } - offsets[upto] = outUpto; - result.length = outUpto; - } - - /** - * Get the next valid UTF-16 String in UTF-16 order. - *
- * If the input String is already valid, it is returned. - * Otherwise the next String in code unit order is returned. - *
- * @param s input String (possibly with unpaired surrogates) - * @return next valid UTF-16 String in UTF-16 order - */ - public static String nextValidUTF16String(String s) { - if (validUTF16String(s)) - return s; - else { - UTF16Result chars = new UTF16Result(); - chars.copyText(s); - nextValidUTF16String(chars); - return new String(chars.result, 0, chars.length); - } - } - - public static void nextValidUTF16String(UTF16Result s) { - final int size = s.length; - for (int i = 0; i < size; i++) { - char ch = s.result[i]; - if (ch >= UnicodeUtil.UNI_SUR_HIGH_START - && ch <= UnicodeUtil.UNI_SUR_HIGH_END) { - if (i < size - 1) { - i++; - char nextCH = s.result[i]; - if (nextCH >= UnicodeUtil.UNI_SUR_LOW_START - && nextCH <= UnicodeUtil.UNI_SUR_LOW_END) { - // Valid surrogate pair - } else - // Unmatched high surrogate - if (nextCH < UnicodeUtil.UNI_SUR_LOW_START) { // SMP not enumerated - s.setLength(i + 1); - s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START; - return; - } else { // SMP already enumerated - if (s.result[i - 1] == UnicodeUtil.UNI_SUR_HIGH_END) { - s.result[i - 1] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1); - s.setLength(i); - } else { - s.result[i - 1]++; - s.result[i] = (char) UnicodeUtil.UNI_SUR_LOW_START; - s.setLength(i + 1); - } - return; - } - } else { - // Unmatched high surrogate in final position, SMP not yet enumerated - s.setLength(i + 2); - s.result[i + 1] = (char) UnicodeUtil.UNI_SUR_LOW_START; - return; - } - } else if (ch >= UnicodeUtil.UNI_SUR_LOW_START - && ch <= UnicodeUtil.UNI_SUR_LOW_END) { - // Unmatched low surrogate, SMP already enumerated - s.setLength(i + 1); - s.result[i] = (char) (UnicodeUtil.UNI_SUR_LOW_END + 1); - return; - } - } - } - // Only called from assert /* private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) { @@ -705,4 +562,51 @@ public final class UnicodeUtil { } return sb.toString(); } + + /** + * Interprets the given byte array as UTF-8 and converts to UTF-16. The {@link CharsRef} will be extended if + * it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + *
+ * NOTE: Full characters are read, even if this reads past the length passed (and + * can result in an ArrayOutOfBoundsException if invalid UTF-8 is passed). + * Explicit checks for valid UTF-8 are not performed. + */ + public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) { + int out_offset = chars.offset = 0; + final char[] out = chars.chars = ArrayUtil.grow(chars.chars, length); + final int limit = offset + length; + while (offset < limit) { + int b = utf8[offset++]&0xff; + if (b < 0xc0) { + assert b < 0x80; + out[out_offset++] = (char)b; + } else if (b < 0xe0) { + out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f)); + } else if (b < 0xf0) { + out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f)); + offset += 2; + } else { + assert b < 0xf8; + int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f); + offset += 3; + if (ch < UNI_MAX_BMP) { + out[out_offset++] = (char)ch; + } else { + int chHalf = ch - 0x0010000; + out[out_offset++] = (char) ((chHalf >> 10) + 0xD800); + out[out_offset++] = (char) ((chHalf & HALF_MASK) + 0xDC00); + } + } + } + chars.length = out_offset - chars.offset; + } + + /** + * Utility method for {@link #UTF8toUTF16(byte[], int, int, CharsRef)} + * @see #UTF8toUTF16(byte[], int, int, CharsRef) + */ + public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { + UTF8toUTF16(bytesRef.bytes, bytesRef.offset, bytesRef.length, chars); + } + } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java index 782cd3a2a01..a676908a6c9 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java @@ -21,6 +21,7 @@ package org.apache.lucene.index.codecs.preflexrw; import java.io.IOException; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.util.BytesRef; @@ -107,14 +108,14 @@ final class TermInfosWriter { } // Currently used only by assert statements - UnicodeUtil.UTF16Result utf16Result1; - UnicodeUtil.UTF16Result utf16Result2; + CharsRef utf16Result1; + CharsRef utf16Result2; private final BytesRef scratchBytes = new BytesRef(); // Currently used only by assert statements private boolean initUTF16Results() { - utf16Result1 = new UnicodeUtil.UTF16Result(); - utf16Result2 = new UnicodeUtil.UTF16Result(); + utf16Result1 = new CharsRef(10); + utf16Result2 = new CharsRef(10); return true; } @@ -145,8 +146,8 @@ final class TermInfosWriter { len = utf16Result2.length; for(int i=0;i stopWords = new HashSet(); - Terms terms = MultiFields.getTerms(reader, fieldName); + final Terms terms = MultiFields.getTerms(reader, fieldName); + final CharsRef spare = new CharsRef(); if (terms != null) { - TermsEnum te = terms.iterator(); + final TermsEnum te = terms.iterator(); BytesRef text; while ((text = te.next()) != null) { if (te.docFreq() > maxDocFreq) { - stopWords.add(text.utf8ToString()); + stopWords.add(text.utf8ToChars(spare).toString()); } } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java 
b/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index 89d92c09708..e94d81a14d7 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -34,6 +34,7 @@ import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.automaton.LevenshteinAutomata; /** @@ -322,7 +323,7 @@ public class DirectSpellChecker { */ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, boolean morePopular, float accuracy) throws IOException { - + final CharsRef spare = new CharsRef(); String text = term.text(); if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength) return new SuggestWord[0]; @@ -358,11 +359,11 @@ public class DirectSpellChecker { int inspections = numSug * maxInspections; // try ed=1 first, in case we get lucky - terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy); + terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare); if (maxEdits > 1 && terms.size() < inspections) { HashSet moreTerms = new HashSet(); moreTerms.addAll(terms); - moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy)); + moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare)); terms = moreTerms; } @@ -372,7 +373,7 @@ public class DirectSpellChecker { int index = suggestions.length - 1; for (ScoreTerm s : terms) { SuggestWord suggestion = new SuggestWord(); - suggestion.string = s.termAsString != null ? s.termAsString : s.term.utf8ToString(); + suggestion.string = s.termAsString != null ? 
s.termAsString : s.term.utf8ToChars(spare).toString(); suggestion.score = s.score; suggestion.freq = s.docfreq; suggestions[index--] = suggestion; @@ -388,7 +389,7 @@ public class DirectSpellChecker { } private Collection suggestSimilar(Term term, int numSug, - IndexReader ir, int docfreq, int editDistance, float accuracy) throws IOException { + IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRef spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = @@ -425,7 +426,7 @@ public class DirectSpellChecker { // undo FuzzyTermsEnum's scale factor for a real scaled lev score score = boost / e.getScaleFactor() + e.getMinSimilarity(); } else { - termAsString = candidateTerm.utf8ToString(); + termAsString = candidateTerm.utf8ToChars(spare).toString(); score = distance.getDistance(term.text(), termAsString); } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java index 8a02ace041c..3d39f509ab3 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.MultiFields; import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.BytesRef; @@ -42,6 +43,7 @@ public class HighFrequencyDictionary implements Dictionary { private IndexReader reader; private String field; private float thresh; + private final CharsRef spare = new CharsRef(); public HighFrequencyDictionary(IndexReader reader, String field, float thresh) { this.reader = reader; @@ -89,7 +91,7 @@ public class HighFrequencyDictionary implements Dictionary { } hasNextCalled = false; - return (actualTerm != null) ? actualTerm.utf8ToString() : null; + return (actualTerm != null) ? 
actualTerm.utf8ToChars(spare).toString() : null; } public boolean hasNext() { diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java index 3ab41c2813c..b70be783113 100755 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java @@ -23,6 +23,7 @@ import java.util.Iterator; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.index.Terms; import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.StringHelper; @@ -56,6 +57,7 @@ public class LuceneDictionary implements Dictionary { final class LuceneIterator implements Iterator { private TermsEnum termsEnum; private BytesRef pendingTerm; + private final CharsRef spare = new CharsRef(); LuceneIterator() { try { @@ -74,7 +76,7 @@ public class LuceneDictionary implements Dictionary { return null; } - String result = pendingTerm.utf8ToString(); + final String result = pendingTerm.utf8ToChars(spare).toString(); try { pendingTerm = termsEnum.next(); diff --git a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java index 7049c94262e..bb0e73d9061 100644 --- a/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java +++ b/solr/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java @@ -27,6 +27,7 @@ import org.apache.lucene.index.Payload; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeReflector; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.SorterTemplate; import org.apache.solr.analysis.CharFilterFactory; import org.apache.solr.analysis.TokenFilterFactory; @@ -39,8 +40,6 @@ import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.FieldType; -import org.apache.noggit.CharArr; - import java.io.IOException; import java.io.StringReader; import java.util.*; @@ -235,18 +234,13 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase { FieldType fieldType = context.getFieldType(); - final CharArr textBuf = new CharArr(); for (int i = 0, c = tokens.size(); i < c; i++) { AttributeSource token = tokens.get(i); final NamedList tokenNamedList = new SimpleOrderedMap(); final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class); BytesRef rawBytes = termAtt.getBytesRef(); termAtt.fillBytesRef(); - - textBuf.reset(); - fieldType.indexedToReadable(rawBytes, textBuf); - final String text = textBuf.toString(); - + final String text = fieldType.indexedToReadable(rawBytes, new CharsRef(rawBytes.length)).toString(); tokenNamedList.add("text", text); if (token.hasAttribute(CharTermAttribute.class)) { diff --git a/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java b/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java index b4c189d6033..17ff7730984 100644 --- a/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java +++ b/solr/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java @@ -46,6 +46,7 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import 
org.apache.lucene.store.Directory; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.BytesRef; import org.apache.solr.analysis.CharFilterFactory; @@ -232,6 +233,7 @@ public class LukeRequestHandler extends RequestHandlerBase private static SimpleOrderedMap getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema ) throws IOException { + final CharsRef spare = new CharsRef(); SimpleOrderedMap finfo = new SimpleOrderedMap(); for( Object o : doc.getFields() ) { Fieldable fieldable = (Fieldable)o; @@ -265,7 +267,7 @@ public class LukeRequestHandler extends RequestHandlerBase if( v != null ) { SimpleOrderedMap tfv = new SimpleOrderedMap(); for( int i=0; i getTopTerms( IndexReader reader, Set fields, int numTerms, Set junkWords ) throws Exception { Map info = new HashMap(); - + final CharsRef spare = new CharsRef(); Fields fieldsC = MultiFields.getFields(reader); if (fieldsC != null) { FieldsEnum fieldsEnum = fieldsC.iterator(); @@ -634,7 +636,7 @@ public class LukeRequestHandler extends RequestHandlerBase TermsEnum termsEnum = fieldsEnum.terms(); BytesRef text; while((text = termsEnum.next()) != null) { - String t = text.utf8ToString(); + String t = text.utf8ToChars(spare).toString(); // Compute distinct terms for every field TopTermQueue tiq = info.get( field ); diff --git a/solr/src/java/org/apache/solr/handler/component/QueryComponent.java b/solr/src/java/org/apache/solr/handler/component/QueryComponent.java index 56a8d7e095a..1d26b374d06 100644 --- a/solr/src/java/org/apache/solr/handler/component/QueryComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/QueryComponent.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.*; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.ReaderUtil; import org.apache.solr.cloud.CloudDescriptor; import org.apache.solr.cloud.ZkController; @@ -455,7 +456,7 @@ public class QueryComponent extends SearchComponent { SolrQueryRequest req = rb.req; SolrQueryResponse rsp = rb.rsp; - + final CharsRef spare = new CharsRef(); // The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't // currently have an option to return sort field values. Because of this, we // take the documents given and re-derive the sort values. 
@@ -524,7 +525,7 @@ public class QueryComponent extends SearchComponent // String field in Lucene, which returns the terms // data as BytesRef: if (val instanceof BytesRef) { - field.setValue(((BytesRef)val).utf8ToString()); + field.setValue(((BytesRef)val).utf8ToChars(spare).toString()); val = ft.toObject(field); } diff --git a/solr/src/java/org/apache/solr/handler/component/StatsComponent.java b/solr/src/java/org/apache/solr/handler/component/StatsComponent.java index 64af5b95ebb..a8b0c4a65d4 100644 --- a/solr/src/java/org/apache/solr/handler/component/StatsComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/StatsComponent.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.search.FieldCache; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.noggit.CharArr; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.StatsParams; @@ -270,19 +271,15 @@ class SimpleStats { } finfo[i++] = new FieldFacetStats( f, si, ft, 0 ); } - + final CharsRef spare = new CharsRef(); final BytesRef tempBR = new BytesRef(); - final CharArr spare = new CharArr(); - DocIterator iter = docs.iterator(); while (iter.hasNext()) { int docID = iter.nextDoc(); BytesRef raw = all.getTermText(docID, tempBR); Double v = null; if( raw != null ) { - spare.reset(); - all.ft.indexedToReadable(raw, spare); - v = Double.parseDouble(spare.toString()); + v = Double.parseDouble(all.ft.indexedToReadable(raw, spare).toString()); allstats.accumulate(v); } else { diff --git a/solr/src/java/org/apache/solr/handler/component/TermsComponent.java b/solr/src/java/org/apache/solr/handler/component/TermsComponent.java index ba99b3fe3b0..4ce9f9f507a 100644 --- a/solr/src/java/org/apache/solr/handler/component/TermsComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/TermsComponent.java @@ -18,7 +18,7 @@ package org.apache.solr.handler.component; import org.apache.lucene.index.*; import org.apache.lucene.util.BytesRef; -import org.apache.noggit.CharArr; +import org.apache.lucene.util.CharsRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.*; import org.apache.solr.common.util.NamedList; @@ -178,8 +178,7 @@ public class TermsComponent extends SearchComponent { int i = 0; BoundedTreeSet> queue = (sort ? 
new BoundedTreeSet>(limit) : null); - CharArr external = new CharArr(); - + CharsRef external = new CharsRef(); while (term != null && (i item : queue) { if (i >= limit) break; - external.reset(); ft.indexedToReadable(item.key, external); fieldTerms.add(external.toString(), item.val); i++; diff --git a/solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java b/solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java index 0b003552f1a..02b1f596eb3 100755 --- a/solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java +++ b/solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java @@ -23,9 +23,11 @@ import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.Filter; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.packed.Direct16; import org.apache.lucene.util.packed.Direct32; import org.apache.lucene.util.packed.Direct8; @@ -37,7 +39,6 @@ import org.apache.solr.schema.FieldType; import org.apache.solr.search.DocSet; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.util.BoundedTreeSet; -import org.apache.solr.util.ByteUtils; import java.io.IOException; import java.util.*; @@ -244,7 +245,7 @@ class PerSegmentSingleValuedFaceting { BytesRef prefixRef = new BytesRef(prefix); startTermIndex = si.binarySearchLookup(prefixRef, tempBR); if (startTermIndex<0) startTermIndex=-startTermIndex-1; - prefixRef.append(ByteUtils.bigTerm); + prefixRef.append(UnicodeUtil.BIG_TERM); // TODO: we could constrain the lower endpoint if we had a binarySearch method that allowed passing start/end endTermIndex = si.binarySearchLookup(prefixRef, tempBR); assert endTermIndex < 0; @@ -339,6 +340,8 @@ abstract class FacetCollector { // This collector expects facets to be collected in index order class CountSortedFacetCollector extends FacetCollector { + private final CharsRef spare = new CharsRef(); + final int offset; final int limit; final int maxsize; @@ -360,7 +363,7 @@ class CountSortedFacetCollector extends FacetCollector { // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). - queue.add(new SimpleFacets.CountPair(term.utf8ToString(), count)); + queue.add(new SimpleFacets.CountPair(term.utf8ToChars(spare).toString(), count)); if (queue.size()>=maxsize) min=queue.last().val; } return false; @@ -383,12 +386,13 @@ class CountSortedFacetCollector extends FacetCollector { // This collector expects facets to be collected in index order class IndexSortedFacetCollector extends FacetCollector { + private final CharsRef spare = new CharsRef(); + int offset; int limit; final int mincount; final NamedList res = new NamedList(); - public IndexSortedFacetCollector(int offset, int limit, int mincount) { this.offset = offset; this.limit = limit>0 ? 
limit : Integer.MAX_VALUE; @@ -407,7 +411,7 @@ class IndexSortedFacetCollector extends FacetCollector { } if (limit > 0) { - res.add(term.utf8ToString(), count); + res.add(term.utf8ToChars(spare).toString(), count); limit--; } diff --git a/solr/src/java/org/apache/solr/request/SimpleFacets.java b/solr/src/java/org/apache/solr/request/SimpleFacets.java index dc81e6a5c17..ed9040ac7d2 100644 --- a/solr/src/java/org/apache/solr/request/SimpleFacets.java +++ b/solr/src/java/org/apache/solr/request/SimpleFacets.java @@ -21,12 +21,13 @@ import org.apache.lucene.index.*; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.*; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.packed.Direct16; import org.apache.lucene.util.packed.Direct32; import org.apache.lucene.util.packed.Direct8; import org.apache.lucene.util.packed.PackedInts; -import org.apache.noggit.CharArr; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.params.RequiredSolrParams; @@ -41,7 +42,6 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.schema.*; import org.apache.solr.search.*; import org.apache.solr.util.BoundedTreeSet; -import org.apache.solr.util.ByteUtils; import org.apache.solr.util.DateMathParser; import org.apache.solr.handler.component.ResponseBuilder; import org.apache.solr.util.LongPriorityQueue; @@ -109,7 +109,7 @@ public class SimpleFacets { if (localParams == null) return; // remove local params unless it's a query - if (type != FacetParams.FACET_QUERY) { + if (type != FacetParams.FACET_QUERY) { // TODO Cut over to an Enum here facetValue = localParams.get(CommonParams.VALUE); } @@ -128,7 +128,7 @@ public class SimpleFacets { String excludeStr = localParams.get(CommonParams.EXCLUDE); if (excludeStr == null) return; - Map tagMap = (Map)req.getContext().get("tags"); + Map tagMap = (Map)req.getContext().get("tags"); if (tagMap != null && rb != null) { List excludeTagList = StrUtils.splitSmart(excludeStr,','); @@ -137,7 +137,7 @@ public class SimpleFacets { Object olst = tagMap.get(excludeTag); // tagMap has entries of List>, but subject to change in the future if (!(olst instanceof Collection)) continue; - for (Object o : (Collection)olst) { + for (Object o : (Collection)olst) { if (!(o instanceof QParser)) continue; QParser qp = (QParser)o; excludeSet.put(qp.getQuery(), Boolean.TRUE); @@ -435,7 +435,7 @@ public class SimpleFacets { if (prefix!=null) { startTermIndex = si.binarySearchLookup(prefixRef, br); if (startTermIndex<0) startTermIndex=-startTermIndex-1; - prefixRef.append(ByteUtils.bigTerm); + prefixRef.append(UnicodeUtil.BIG_TERM); endTermIndex = si.binarySearchLookup(prefixRef, br); assert endTermIndex < 0; endTermIndex = -endTermIndex-1; @@ -446,8 +446,7 @@ public class SimpleFacets { final int nTerms=endTermIndex-startTermIndex; int missingCount = -1; - - CharArr spare = new CharArr(); + final CharsRef charsRef = new CharsRef(10); if (nTerms>0 && docs.size() >= mincount) { // count collection array only needs to be as big as the number of terms we are @@ -547,10 +546,8 @@ public class SimpleFacets { long pair = sorted[i]; int c = (int)(pair >>> 32); int tnum = Integer.MAX_VALUE - (int)pair; - - spare.reset(); - ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), spare); - res.add(spare.toString(), c); + 
ft.indexedToReadable(si.lookup(startTermIndex+tnum, br), charsRef); + res.add(charsRef.toString(), c); } } else { @@ -567,9 +564,8 @@ public class SimpleFacets { int c = counts[i]; if (c=0) continue; if (--lim<0) break; - spare.reset(); - ft.indexedToReadable(si.lookup(startTermIndex+i, br), spare); - res.add(spare.toString(), c); + ft.indexedToReadable(si.lookup(startTermIndex+i, br), charsRef); + res.add(charsRef.toString(), c); } } } @@ -657,7 +653,7 @@ public class SimpleFacets { } DocsEnum docsEnum = null; - CharArr spare = new CharArr(); + CharsRef charsRef = new CharsRef(10); if (docs.size() >= mincount) { while (term != null) { @@ -742,9 +738,8 @@ public class SimpleFacets { } else { if (c >= mincount && --off<0) { if (--lim<0) break; - spare.reset(); - ft.indexedToReadable(term, spare); - res.add(spare.toString(), c); + ft.indexedToReadable(term, charsRef); + res.add(charsRef.toString(), c); } } } @@ -757,9 +752,8 @@ public class SimpleFacets { for (CountPair p : queue) { if (--off>=0) continue; if (--lim<0) break; - spare.reset(); - ft.indexedToReadable(p.key, spare); - res.add(spare.toString(), p.val); + ft.indexedToReadable(p.key, charsRef); + res.add(charsRef.toString(), p.val); } } diff --git a/solr/src/java/org/apache/solr/request/UnInvertedField.java b/solr/src/java/org/apache/solr/request/UnInvertedField.java index da7fd273187..25371d12956 100755 --- a/solr/src/java/org/apache/solr/request/UnInvertedField.java +++ b/solr/src/java/org/apache/solr/request/UnInvertedField.java @@ -24,7 +24,6 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.util.StringHelper; -import org.apache.noggit.CharArr; import org.apache.solr.common.params.FacetParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.SolrException; @@ -33,13 +32,14 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.TrieField; import org.apache.solr.search.*; -import org.apache.solr.util.ByteUtils; import org.apache.solr.util.LongPriorityQueue; import org.apache.solr.util.PrimUtils; import org.apache.solr.handler.component.StatsValues; import org.apache.solr.handler.component.FieldFacetStats; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; import java.io.IOException; import java.util.HashMap; @@ -227,13 +227,13 @@ public class UnInvertedField extends DocTermOrds { TermsEnum te = getOrdTermsEnum(searcher.getIndexReader()); if (prefix != null && prefix.length() > 0) { - BytesRef prefixBr = new BytesRef(prefix); + final BytesRef prefixBr = new BytesRef(prefix); if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) { startTerm = numTermsInField; } else { startTerm = (int) te.ord(); } - prefixBr.append(ByteUtils.bigTerm); + prefixBr.append(UnicodeUtil.BIG_TERM); if (te.seek(prefixBr, true) == TermsEnum.SeekStatus.END) { endTerm = numTermsInField; } else { @@ -331,8 +331,7 @@ public class UnInvertedField extends DocTermOrds { } } } - - CharArr spare = new CharArr(); + final CharsRef charsRef = new CharsRef(); int off=offset; int lim=limit>=0 ? 
limit : Integer.MAX_VALUE; @@ -408,7 +407,7 @@ public class UnInvertedField extends DocTermOrds { for (int i=sortedIdxStart; i 0 && input.bytes[input.offset] == 'T') { - out.write("true"); + charsRef.copy(TRUE); } else { - out.write("false"); + charsRef.copy(FALSE); } + return charsRef; } @Override diff --git a/solr/src/java/org/apache/solr/schema/DateField.java b/solr/src/java/org/apache/solr/schema/DateField.java index 198dfae9f06..d7bcef0ca0a 100644 --- a/solr/src/java/org/apache/solr/schema/DateField.java +++ b/solr/src/java/org/apache/solr/schema/DateField.java @@ -23,14 +23,13 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.util.BytesRef; -import org.apache.noggit.CharArr; +import org.apache.lucene.util.CharsRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.DateUtil; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.TextResponseWriter; import org.apache.solr.search.QParser; import org.apache.solr.search.function.*; -import org.apache.solr.util.ByteUtils; import org.apache.solr.util.DateMathParser; import java.io.IOException; @@ -131,6 +130,8 @@ public class DateField extends FieldType { protected static String NOW = "NOW"; protected static char Z = 'Z'; + private static char[] Z_ARRAY = new char[] {Z}; + @Override public String toInternal(String val) { @@ -184,7 +185,7 @@ public class DateField extends FieldType { public Fieldable createField(SchemaField field, Object value, float boost) { // Convert to a string before indexing if(value instanceof Date) { - value = toInternal( (Date)value ) + 'Z'; + value = toInternal( (Date)value ) + Z; } return super.createField(field, value, boost); } @@ -199,9 +200,10 @@ public class DateField extends FieldType { } @Override - public void indexedToReadable(BytesRef input, CharArr out) { - ByteUtils.UTF8toUTF16(input, out); - out.write(Z); + public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) { + input.utf8ToChars(charsRef); + charsRef.append(Z_ARRAY, 0, 1); + return charsRef; } @Override @@ -479,10 +481,8 @@ class DateFieldSource extends FieldCacheSource { if (ord == 0) { return null; } else { - BytesRef br = termsIndex.lookup(ord, new BytesRef()); - CharArr spare = new CharArr(); - ft.indexedToReadable(br, spare); - return spare.toString(); + final BytesRef br = termsIndex.lookup(ord, spare); + return ft.indexedToReadable(br, spareChars).toString(); } } @@ -492,7 +492,7 @@ class DateFieldSource extends FieldCacheSource { if (ord == 0) { return null; } else { - BytesRef br = termsIndex.lookup(ord, new BytesRef()); + final BytesRef br = termsIndex.lookup(ord, new BytesRef()); return ft.toObject(null, br); } } diff --git a/solr/src/java/org/apache/solr/schema/FieldType.java b/solr/src/java/org/apache/solr/schema/FieldType.java index 2c78c6ecd08..325064bf9b7 100644 --- a/solr/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/src/java/org/apache/solr/schema/FieldType.java @@ -30,8 +30,8 @@ import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; -import org.apache.noggit.CharArr; import org.apache.solr.analysis.SolrAnalyzer; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; @@ -39,7 +39,6 @@ import 
diff --git a/solr/src/java/org/apache/solr/schema/FieldType.java b/solr/src/java/org/apache/solr/schema/FieldType.java
index 2c78c6ecd08..325064bf9b7 100644
--- a/solr/src/java/org/apache/solr/schema/FieldType.java
+++ b/solr/src/java/org/apache/solr/schema/FieldType.java
@@ -30,8 +30,8 @@ import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.UnicodeUtil;
-import org.apache.noggit.CharArr;
 import org.apache.solr.analysis.SolrAnalyzer;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
@@ -39,7 +39,6 @@ import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.Sorting;
 import org.apache.solr.search.function.ValueSource;
-import org.apache.solr.util.ByteUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -354,9 +353,9 @@ public abstract class FieldType extends FieldProperties {
   }
 
   public Object toObject(SchemaField sf, BytesRef term) {
-    CharArr ext = new CharArr(term.length);
-    indexedToReadable(term, ext);
-    Fieldable f = createField(sf, ext.toString(), 1.0f);
+    final CharsRef ref = new CharsRef(term.length);
+    indexedToReadable(term, ref);
+    final Fieldable f = createField(sf, ref.toString(), 1.0f);
     return toObject(f);
   }
 
@@ -365,9 +364,10 @@
     return indexedForm;
   }
 
-  /** Given an indexed term, append the human readable representation to out */
-  public void indexedToReadable(BytesRef input, CharArr out) {
-    ByteUtils.UTF8toUTF16(input, out);
+  /** Given an indexed term, append the human readable representation*/
+  public CharsRef indexedToReadable(BytesRef input, CharsRef output) {
+    input.utf8ToChars(output);
+    return output;
   }
 
   /** Given the stored field, return the human readable representation */
@@ -390,7 +390,7 @@
   /** Given the readable value, return the term value that will match it. */
   public void readableToIndexed(CharSequence val, BytesRef result) {
-    String internal = readableToIndexed(val.toString());
+    final String internal = readableToIndexed(val.toString());
     UnicodeUtil.UTF16toUTF8(internal, 0, internal.length(), result);
   }
 
diff --git a/solr/src/java/org/apache/solr/schema/SortableDoubleField.java b/solr/src/java/org/apache/solr/schema/SortableDoubleField.java
index 4df79f506f2..0a0d53a6fe1 100644
--- a/solr/src/java/org/apache/solr/schema/SortableDoubleField.java
+++ b/solr/src/java/org/apache/solr/schema/SortableDoubleField.java
@@ -19,7 +19,7 @@ package org.apache.solr.schema;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
 import org.apache.solr.search.MutableValueDouble;
 import org.apache.solr.search.MutableValue;
 import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
 import org.apache.solr.search.function.StringIndexDocValues;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
 import org.apache.solr.util.NumberUtils;
 import org.apache.solr.response.TextResponseWriter;
 
@@ -78,9 +77,12 @@ public class SortableDoubleField extends FieldType {
   }
 
   @Override
-  public void indexedToReadable(BytesRef input, CharArr out) {
+  public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
     // TODO: this could be more efficient, but the sortable types should be deprecated instead
-    out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+    input.utf8ToChars(charsRef);
+    final char[] indexedToReadable = indexedToReadable(charsRef.toString()).toCharArray();
+    charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+    return charsRef;
   }
 
   @Override
@@ -90,9 +92,6 @@
   }
 }
-
-
-
 class SortableDoubleFieldSource extends FieldCacheSource {
   protected double defVal;
 
diff --git a/solr/src/java/org/apache/solr/schema/SortableFloatField.java b/solr/src/java/org/apache/solr/schema/SortableFloatField.java
index 6f7dc5e1e6c..876a5f9eede 100644
--- a/solr/src/java/org/apache/solr/schema/SortableFloatField.java
+++ b/solr/src/java/org/apache/solr/schema/SortableFloatField.java
@@ -19,7 +19,7 @@ package org.apache.solr.schema;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
 import org.apache.solr.search.MutableValueFloat;
 import org.apache.solr.search.MutableValue;
 import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
 import org.apache.solr.search.function.StringIndexDocValues;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
 import org.apache.solr.util.NumberUtils;
 import org.apache.solr.response.TextResponseWriter;
 
@@ -77,10 +76,11 @@ public class SortableFloatField extends FieldType {
     return NumberUtils.SortableStr2floatStr(indexedForm);
   }
 
-  @Override
-  public void indexedToReadable(BytesRef input, CharArr out) {
+  public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
     // TODO: this could be more efficient, but the sortable types should be deprecated instead
-    out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+    final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
+    charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+    return charsRef;
   }
 
   @Override
diff --git a/solr/src/java/org/apache/solr/schema/SortableIntField.java b/solr/src/java/org/apache/solr/schema/SortableIntField.java
index 012aa5b16e1..74f52f80b91 100644
--- a/solr/src/java/org/apache/solr/schema/SortableIntField.java
+++ b/solr/src/java/org/apache/solr/schema/SortableIntField.java
@@ -19,7 +19,7 @@ package org.apache.solr.schema;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
 import org.apache.solr.search.MutableValueInt;
 import org.apache.solr.search.MutableValue;
 import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
 import org.apache.solr.search.function.StringIndexDocValues;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
 import org.apache.solr.util.NumberUtils;
 import org.apache.solr.response.TextResponseWriter;
 
@@ -75,10 +74,11 @@ public class SortableIntField extends FieldType {
     return NumberUtils.SortableStr2int(indexedForm);
  }
 
-  @Override
-  public void indexedToReadable(BytesRef input, CharArr out) {
+  public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
     // TODO: this could be more efficient, but the sortable types should be deprecated instead
-    out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+    final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
+    charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+    return charsRef;
  }
 
   @Override
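The four Sortable*Field classes all adopt the same interim pattern: decode the indexed bytes into the CharsRef, run the existing String-based indexedToReadable, then copy the converted characters back into the same CharsRef so the caller still gets a reusable buffer. A compressed sketch of that round trip (illustrative only, not part of the patch; the String-level conversion is a stub standing in for the NumberUtils.SortableStr2* calls):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;

    // Illustrative sketch, not part of the patch: the decode -> String conversion -> copy-back
    // shape shared by SortableDouble/Float/Int/LongField.indexedToReadable in the hunks above.
    public class SortableReadableSketch {
      // Stand-in for the NumberUtils.SortableStr2* conversions used by the real fields.
      static String indexedToReadable(String indexedForm) {
        return indexedForm.trim();
      }

      static CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
        input.utf8ToChars(charsRef);                  // fill the buffer with the raw indexed form
        final char[] readable = indexedToReadable(charsRef.toString()).toCharArray();
        charsRef.copy(readable, 0, readable.length);  // overwrite it with the readable form
        return charsRef;
      }

      public static void main(String[] args) {
        System.out.println(indexedToReadable(new BytesRef(" 42 "), new CharsRef()).toString());
      }
    }

As the in-code TODO notes, this still allocates a String and a char[] per term; the buffer reuse only helps the caller, which is accepted here because the sortable types are slated for deprecation.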
diff --git a/solr/src/java/org/apache/solr/schema/SortableLongField.java b/solr/src/java/org/apache/solr/schema/SortableLongField.java
index 7ddbdbcc89f..6a6e8b49ee2 100644
--- a/solr/src/java/org/apache/solr/schema/SortableLongField.java
+++ b/solr/src/java/org/apache/solr/schema/SortableLongField.java
@@ -19,7 +19,7 @@ package org.apache.solr.schema;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
 import org.apache.solr.search.MutableValueLong;
 import org.apache.solr.search.MutableValue;
 import org.apache.solr.search.QParser;
@@ -29,7 +29,6 @@ import org.apache.solr.search.function.DocValues;
 import org.apache.solr.search.function.StringIndexDocValues;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.solr.util.ByteUtils;
 import org.apache.solr.util.NumberUtils;
 import org.apache.solr.response.TextResponseWriter;
 
@@ -67,10 +66,11 @@ public class SortableLongField extends FieldType {
     return NumberUtils.SortableStr2long(indexedForm);
   }
 
-  @Override
-  public void indexedToReadable(BytesRef input, CharArr out) {
+  public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
     // TODO: this could be more efficient, but the sortable types should be deprecated instead
-    out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
+    final char[] indexedToReadable = indexedToReadable(input.utf8ToChars(charsRef).toString()).toCharArray();
+    charsRef.copy(indexedToReadable, 0, indexedToReadable.length);
+    return charsRef;
   }
 
   @Override
diff --git a/solr/src/java/org/apache/solr/schema/StrField.java b/solr/src/java/org/apache/solr/schema/StrField.java
index e4749ecb31f..ae07be17121 100644
--- a/solr/src/java/org/apache/solr/schema/StrField.java
+++ b/solr/src/java/org/apache/solr/schema/StrField.java
@@ -23,7 +23,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.function.ValueSource;
 import org.apache.solr.search.QParser;
-import org.apache.solr.util.ByteUtils;
 
 import java.util.Map;
 import java.io.IOException;
@@ -54,7 +53,7 @@ public class StrField extends FieldType {
 
   @Override
   public Object toObject(SchemaField sf, BytesRef term) {
-    return ByteUtils.UTF8toUTF16(term);
+    return term.utf8ToString();
   }
 }
diff --git a/solr/src/java/org/apache/solr/schema/StrFieldSource.java b/solr/src/java/org/apache/solr/schema/StrFieldSource.java
index 727d37d461d..aa5ea23aff0 100755
--- a/solr/src/java/org/apache/solr/schema/StrFieldSource.java
+++ b/solr/src/java/org/apache/solr/schema/StrFieldSource.java
@@ -18,12 +18,9 @@ package org.apache.solr.schema;
 
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
 import org.apache.solr.search.function.DocValues;
 import org.apache.solr.search.function.FieldCacheSource;
 import org.apache.solr.search.function.StringIndexDocValues;
-import org.apache.solr.util.ByteUtils;
 
 import java.io.IOException;
 import java.util.Map;
diff --git a/solr/src/java/org/apache/solr/schema/TextField.java b/solr/src/java/org/apache/solr/schema/TextField.java
index 21409733dec..b33ad1502a3 100644
--- a/solr/src/java/org/apache/solr/schema/TextField.java
+++ b/solr/src/java/org/apache/solr/schema/TextField.java
@@ -34,7 +34,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
-import org.apache.solr.util.ByteUtils;
 
 import java.util.Map;
 import java.util.List;
@@ -81,7 +80,7 @@ public class TextField extends FieldType {
 
   @Override
   public Object toObject(SchemaField sf, BytesRef term) {
-    return ByteUtils.UTF8toUTF16(term);
+    return term.utf8ToString();
   }
 
   @Override
diff --git a/solr/src/java/org/apache/solr/schema/TrieDateField.java b/solr/src/java/org/apache/solr/schema/TrieDateField.java
index 8d58fa55213..e4ebf765398 100755
--- a/solr/src/java/org/apache/solr/schema/TrieDateField.java
+++ b/solr/src/java/org/apache/solr/schema/TrieDateField.java
@@ -17,7 +17,6 @@ package org.apache.solr.schema;
 
-import org.apache.noggit.CharArr;
 import org.apache.solr.search.function.ValueSource;
 import org.apache.solr.search.QParser;
 import org.apache.solr.response.TextResponseWriter;
@@ -26,6 +25,7 @@ import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.NumericRangeQuery;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 
 import java.util.Map;
 import java.util.Date;
@@ -111,10 +111,10 @@ public class TrieDateField extends DateField {
   public String indexedToReadable(String _indexedForm) {
     return wrappedField.indexedToReadable(_indexedForm);
   }
 
-  @Override
-  public void indexedToReadable(BytesRef input, CharArr out) {
-    wrappedField.indexedToReadable(input, out);
+  public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
+    // TODO: this could be more efficient, but the sortable types should be deprecated instead
+    return wrappedField.indexedToReadable(input, charsRef);
   }
 
   @Override
diff --git a/solr/src/java/org/apache/solr/schema/TrieField.java b/solr/src/java/org/apache/solr/schema/TrieField.java
index eb78e1bbfd8..608596a8dc7 100644
--- a/solr/src/java/org/apache/solr/schema/TrieField.java
+++ b/solr/src/java/org/apache/solr/schema/TrieField.java
@@ -26,8 +26,8 @@ import org.apache.lucene.search.cache.FloatValuesCreator;
 import org.apache.lucene.search.cache.IntValuesCreator;
 import org.apache.lucene.search.cache.LongValuesCreator;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.NumericUtils;
-import org.apache.noggit.CharArr;
 import org.apache.solr.analysis.*;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.response.TextResponseWriter;
@@ -296,7 +296,7 @@ public class TrieField extends FieldType {
   @Override
   public String readableToIndexed(String val) {
     // TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts!
-    BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
+    final BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
     readableToIndexed(val, bytes);
     return bytes.utf8ToString();
   }
@@ -363,31 +363,29 @@
   }
 
   @Override
-  public void indexedToReadable(BytesRef input, CharArr out) {
-    BytesRef indexedForm = input;
-    String s;
-
+  public CharsRef indexedToReadable(BytesRef indexedForm, CharsRef charsRef) {
+    final char[] value;
     switch (type) {
       case INTEGER:
-        s = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) );
+        value = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) ).toCharArray();
         break;
       case FLOAT:
-        s = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) );
+        value = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) ).toCharArray();
         break;
       case LONG:
-        s = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) );
+        value = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) ).toCharArray();
         break;
      case DOUBLE:
-        s = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) );
+        value = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) ).toCharArray();
         break;
      case DATE:
-        s = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) );
+        value = dateField.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) ).toCharArray();
         break;
      default:
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
     }
-
-    out.write(s);
+    charsRef.copy(value, 0, value.length);
+    return charsRef;
   }
 
   @Override
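TrieField's rewrite above decodes the prefix-coded numeric term with NumericUtils and copies the textual form into the shared CharsRef. A rough round-trip sketch follows (illustrative only, not part of the patch; prefixCodedToLong(BytesRef) and BUF_SIZE_LONG appear in the hunks above, while the BytesRef-based longToPrefixCoded encode call is an assumption about the trunk NumericUtils API):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.CharsRef;
    import org.apache.lucene.util.NumericUtils;

    // Illustrative sketch, not part of the patch: encode a long the way a trie field would,
    // then decode it back into a reusable CharsRef using the same calls as the hunk above.
    public class TrieReadableSketch {
      public static void main(String[] args) {
        final BytesRef indexed = new BytesRef(NumericUtils.BUF_SIZE_LONG);
        NumericUtils.longToPrefixCoded(42L, 0, indexed);   // assumed BytesRef-based encoder
        final CharsRef readable = new CharsRef();
        final char[] value = Long.toString(NumericUtils.prefixCodedToLong(indexed)).toCharArray();
        readable.copy(value, 0, value.length);             // same copy-into-buffer step as the patch
        System.out.println(readable.toString());           // prints 42
      }
    }
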
diff --git a/solr/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java b/solr/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java
index 4248750f744..a9d0cb0c0d7 100644
--- a/solr/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java
+++ b/solr/src/java/org/apache/solr/search/MissingStringLastComparatorSource.java
@@ -21,11 +21,11 @@ import org.apache.lucene.search.*;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.Direct16;
 import org.apache.lucene.util.packed.Direct32;
 import org.apache.lucene.util.packed.Direct8;
 import org.apache.lucene.util.packed.PackedInts;
-import org.apache.solr.util.ByteUtils;
 
 import java.io.IOException;
 
@@ -34,7 +34,7 @@ public class MissingStringLastComparatorSource extends FieldComparatorSource {
   private final BytesRef missingValueProxy;
 
   public MissingStringLastComparatorSource() {
-    this(ByteUtils.bigTerm);
+    this(UnicodeUtil.BIG_TERM);
   }
 
   /** Creates a {@link FieldComparatorSource} that sorts null last in a normal ascending sort.
diff --git a/solr/src/java/org/apache/solr/search/MutableValueStr.java b/solr/src/java/org/apache/solr/search/MutableValueStr.java
index 03eabb89460..3c97a8f5a2f 100755
--- a/solr/src/java/org/apache/solr/search/MutableValueStr.java
+++ b/solr/src/java/org/apache/solr/search/MutableValueStr.java
@@ -17,14 +17,13 @@ package org.apache.solr.search;
 
 import org.apache.lucene.util.BytesRef;
-import org.apache.solr.util.ByteUtils;
 
 public class MutableValueStr extends MutableValue {
   public BytesRef value = new BytesRef();
 
   @Override
   public Object toObject() {
-    return exists ? ByteUtils.UTF8toUTF16(value) : null;
+    return exists ? value.utf8ToString() : null;
   }
 
   @Override
diff --git a/solr/src/java/org/apache/solr/search/function/IDFValueSource.java b/solr/src/java/org/apache/solr/search/function/IDFValueSource.java
index 522e94623f3..4e7f4431c3a 100755
--- a/solr/src/java/org/apache/solr/search/function/IDFValueSource.java
+++ b/solr/src/java/org/apache/solr/search/function/IDFValueSource.java
@@ -22,7 +22,6 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.util.BytesRef;
-import org.apache.solr.util.ByteUtils;
 
 import java.io.IOException;
 import java.util.Map;
@@ -43,8 +42,7 @@ public class IDFValueSource extends DocFreqValueSource {
     IndexSearcher searcher = (IndexSearcher)context.get("searcher");
     Similarity sim = searcher.getSimilarityProvider().get(field);
     // todo: we need docFreq that takes a BytesRef
-    String strVal = ByteUtils.UTF8toUTF16(indexedBytes);
-    int docfreq = searcher.docFreq(new Term(indexedField, strVal));
+    int docfreq = searcher.docFreq(new Term(indexedField, indexedBytes.utf8ToString()));
     float idf = sim.idf(docfreq, searcher.maxDoc());
     return new ConstDoubleDocValues(idf, this);
   }
diff --git a/solr/src/java/org/apache/solr/search/function/StringIndexDocValues.java b/solr/src/java/org/apache/solr/search/function/StringIndexDocValues.java
index 16d5a14b9ea..95d7d0cd823 100755
--- a/solr/src/java/org/apache/solr/search/function/StringIndexDocValues.java
+++ b/solr/src/java/org/apache/solr/search/function/StringIndexDocValues.java
@@ -21,10 +21,9 @@ import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
+import org.apache.lucene.util.CharsRef;
 import org.apache.solr.search.MutableValue;
 import org.apache.solr.search.MutableValueStr;
-import org.apache.solr.util.ByteUtils;
 
 import java.io.IOException;
 
@@ -36,7 +35,7 @@ public abstract class StringIndexDocValues extends DocValues {
   protected final ValueSource vs;
   protected final MutableValueStr val = new MutableValueStr();
   protected final BytesRef spare = new BytesRef();
-  protected final CharArr spareChars = new CharArr();
+  protected final CharsRef spareChars = new CharsRef();
 
   public StringIndexDocValues(ValueSource vs, AtomicReaderContext context, String field) throws IOException {
     try {
@@ -75,8 +74,7 @@ public abstract class StringIndexDocValues extends DocValues {
     int ord=termsIndex.getOrd(doc);
     if (ord==0) return null;
     termsIndex.lookup(ord, spare);
-    spareChars.reset();
-    ByteUtils.UTF8toUTF16(spare, spareChars);
+    spare.utf8ToChars(spareChars);
     return spareChars.toString();
   }
 
diff --git a/solr/src/java/org/apache/solr/util/ByteUtils.java b/solr/src/java/org/apache/solr/util/ByteUtils.java
deleted file mode 100755
index 87a5b80fe31..00000000000
--- a/solr/src/java/org/apache/solr/util/ByteUtils.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.util;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.noggit.CharArr;
-
-
-public class ByteUtils {
-  /** A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
-   *  one would normally encounter, and definitely bigger than any UTF-8 terms */
-  public static final BytesRef bigTerm = new BytesRef(
-      new byte[] {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
-  );
-
-  /** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
-   * Full characters are read, even if this reads past the length passed (and can result in
-   * an ArrayOutOfBoundsException if invalid UTF8 is passed).  Explicit checks for valid UTF8 are not performed.
-   * The char[] out should probably have enough room to hold the worst case of each byte becoming a Java char.
-   */
-  public static int UTF8toUTF16(byte[] utf8, int offset, int len, char[] out, int out_offset) {
-    int out_start = out_offset;
-    final int limit = offset + len;
-    while (offset < limit) {
-      int b = utf8[offset++]&0xff;
-
-      if (b < 0xc0) {
-        assert b < 0x80;
-        out[out_offset++] = (char)b;
-      } else if (b < 0xe0) {
-        out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
-      } else if (b < 0xf0) {
-        out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
-        offset += 2;
-      } else {
-        assert b < 0xf8;
-        int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
-        offset += 3;
-        if (ch < 0xffff) {
-          out[out_offset++] = (char)ch;
-        } else {
-          int chHalf = ch - 0x0010000;
-          out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
-          out[out_offset++] = (char) ((chHalf & 0x3FFL) + 0xDC00);
-        }
-      }
-    }
-
-    return out_offset - out_start;
-  }
-
-  /** Convert UTF8 bytes into UTF16 characters. */
-  public static void UTF8toUTF16(BytesRef utf8, CharArr out) {
-    // TODO: do in chunks if the input is large
-    out.reserve(utf8.length);
-    int n = UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, out.getArray(), out.getEnd());
-    out.setEnd(out.getEnd() + n);
-  }
-
-  /** Convert UTF8 bytes into a String */
-  public static String UTF8toUTF16(BytesRef utf8) {
-    char[] out = new char[utf8.length];
-    int n = UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, out, 0);
-    return new String(out,0,n);
-  }
-}
diff --git a/solr/src/webapp/web/admin/analysis.jsp b/solr/src/webapp/web/admin/analysis.jsp
index 43c8ae5b5e3..e517eb0f842 100644
--- a/solr/src/webapp/web/admin/analysis.jsp
+++ b/solr/src/webapp/web/admin/analysis.jsp
@@ -19,6 +19,7 @@
   org.apache.lucene.util.AttributeSource,
   org.apache.lucene.util.Attribute,
   org.apache.lucene.util.BytesRef,
+  org.apache.lucene.util.CharsRef,
   org.apache.lucene.analysis.TokenStream,
   org.apache.lucene.index.Payload,
   org.apache.lucene.analysis.CharReader,
@@ -32,8 +33,7 @@
   org.apache.solr.schema.FieldType,
   org.apache.solr.schema.SchemaField,
   org.apache.solr.common.util.XML,
-  javax.servlet.jsp.JspWriter,java.io.IOException,
-  org.apache.noggit.CharArr
+  javax.servlet.jsp.JspWriter,java.io.IOException
   "%>
 <%@ page import="java.io.Reader"%>
 <%@ page import="java.io.StringReader"%>
@@ -287,9 +287,7 @@
       bytes = new BytesRef(spare);
       rawText = (token.hasAttribute(CharTermAttribute.class)) ?
         token.getAttribute(CharTermAttribute.class).toString() : null;
-      final CharArr textBuf = new CharArr(bytes.length);
-      ft.indexedToReadable(bytes, textBuf);
-      text = textBuf.toString();
+      text = ft.indexedToReadable(bytes, new CharsRef()).toString();
       token.reflectWith(new AttributeReflector() {
         public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
           // leave out position and raw term