From 25bca6f16513fda0bdd2ab670633bac26dbf5d6e Mon Sep 17 00:00:00 2001 From: Noble Paul Date: Tue, 20 Nov 2018 14:58:32 +1100 Subject: [PATCH] SOLR-12992: When using binary format, ExportWriter to directly copy BytesRef instead of creating new String --- solr/CHANGES.txt | 3 + .../handler/export/StringFieldWriter.java | 23 +- .../apache/solr/update/TransactionLog.java | 2 +- .../org/apache/solr/common/MapWriter.java | 9 + .../util/ByteArrayUtf8CharSequence.java | 251 ++++++++++++++++++ .../apache/solr/common/util/ByteUtils.java | 3 +- .../solr/common/util/FastOutputStream.java | 19 +- .../apache/solr/common/util/JavaBinCodec.java | 63 ++++- .../apache/solr/common/util/MurmurHash2.java | 100 +++++++ .../apache/solr/common/util/TextWriter.java | 2 +- .../solr/common/util/Utf8CharSequence.java | 63 +++++ .../common/util/Utf8CharSequenceTest.java | 61 +++++ 12 files changed, 584 insertions(+), 15 deletions(-) create mode 100644 solr/solrj/src/java/org/apache/solr/common/util/ByteArrayUtf8CharSequence.java create mode 100644 solr/solrj/src/java/org/apache/solr/common/util/MurmurHash2.java create mode 100644 solr/solrj/src/java/org/apache/solr/common/util/Utf8CharSequence.java create mode 100644 solr/solrj/src/test/org/apache/solr/common/util/Utf8CharSequenceTest.java diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 6a897dfac32..79941423890 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -134,6 +134,9 @@ Improvements * SOLR-12881: Remove unneeded import statements (Peter Somogyi via Erick Erickson) +* SOLR-12992: When using binary format, ExportWriter to directly copy BytesRef instead of + creating new String (noble) + ================== 7.6.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. 
diff --git a/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java b/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java index 60718702a37..c14e4d7770d 100644 --- a/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java +++ b/solr/core/src/java/org/apache/solr/handler/export/StringFieldWriter.java @@ -25,12 +25,25 @@ import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRefBuilder; import org.apache.solr.common.MapWriter; +import org.apache.solr.common.util.ByteArrayUtf8CharSequence; +import org.apache.solr.common.util.JavaBinCodec; import org.apache.solr.schema.FieldType; class StringFieldWriter extends FieldWriter { private String field; private FieldType fieldType; private CharsRefBuilder cref = new CharsRefBuilder(); + final ByteArrayUtf8CharSequence utf8 = new ByteArrayUtf8CharSequence(new byte[0], 0, 0) { + @Override + public String toString() { + String str = super.utf16; + if (str != null) return str; + fieldType.indexedToReadable(new BytesRef(super.buf, super.offset, super.length), cref); + str = cref.toString(); + super.utf16 = str; + return str; + } + }; public StringFieldWriter(String field, FieldType fieldType) { this.field = field; @@ -55,8 +68,14 @@ class StringFieldWriter extends FieldWriter { int ord = vals.ordValue(); ref = vals.lookupOrd(ord); } - fieldType.indexedToReadable(ref, cref); - ew.put(this.field, cref.toString()); + + if (ew instanceof JavaBinCodec.BinEntryWriter) { + ew.put(this.field, utf8.reset(ref.bytes, ref.offset, ref.length, null)); + } else { + fieldType.indexedToReadable(ref, cref); + String v = cref.toString(); + ew.put(this.field, v); + } return true; } } \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/update/TransactionLog.java b/solr/core/src/java/org/apache/solr/update/TransactionLog.java index b11f76a780f..a5f55c2033a 100644 --- 
a/solr/core/src/java/org/apache/solr/update/TransactionLog.java +++ b/solr/core/src/java/org/apache/solr/update/TransactionLog.java @@ -125,7 +125,7 @@ public class TransactionLog implements Closeable { } @Override - public String readExternString(DataInputInputStream fis) throws IOException { + public CharSequence readExternString(DataInputInputStream fis) throws IOException { int idx = readSize(fis); if (idx != 0) {// idx != 0 is the index of the extern string // no need to synchronize globalStringList - it's only updated before the first record is written to the log diff --git a/solr/solrj/src/java/org/apache/solr/common/MapWriter.java b/solr/solrj/src/java/org/apache/solr/common/MapWriter.java index d6c9efb5cb5..a378e1d7e6e 100644 --- a/solr/solrj/src/java/org/apache/solr/common/MapWriter.java +++ b/solr/solrj/src/java/org/apache/solr/common/MapWriter.java @@ -145,5 +145,14 @@ public interface MapWriter extends MapSerializable , NavigableObject { put(k, (Boolean) v); return this; } + + /**This is an optimization to avoid the instanceof checks. + * + */ + default EntryWriter put(CharSequence k, CharSequence v) throws IOException { + put(k, (Object)v); + return this; + + } } } diff --git a/solr/solrj/src/java/org/apache/solr/common/util/ByteArrayUtf8CharSequence.java b/solr/solrj/src/java/org/apache/solr/common/util/ByteArrayUtf8CharSequence.java new file mode 100644 index 00000000000..aecb50d9b19 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/common/util/ByteArrayUtf8CharSequence.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.common.util; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Map; + +import org.noggit.CharArr; + +/**A mutable byte[] backed Utf8CharSequence. This is quite similar to the BytesRef of Lucene + * This is designed for single-threaded use + * + */ +public class ByteArrayUtf8CharSequence implements Utf8CharSequence { + + protected byte[] buf; + protected int offset; + protected int hashCode = Integer.MIN_VALUE; + protected int length; + protected volatile String utf16; + + public ByteArrayUtf8CharSequence(String utf16) { + buf = new byte[Math.multiplyExact(utf16.length(), 3)]; + offset = 0; + length = ByteUtils.UTF16toUTF8(utf16, 0, utf16.length(), buf, 0); + if (buf.length > length) { + byte[] copy = new byte[length]; + System.arraycopy(buf, 0, copy, 0, length); + buf = copy; + } + assert isValid(); + } + + public ByteArrayUtf8CharSequence(byte[] buf, int offset, int length) { + this.buf = buf; + this.offset = offset; + this.length = length; + } + + public String getStringOrNull() { + return utf16; + } + @Override + public int write(int start, byte[] buffer, int pos) { + if (start == -1 || start >= length) return -1; + int writableBytes = Math.min(length - start, buffer.length - pos); + System.arraycopy(buf, offset + start, buffer, pos, writableBytes); + return writableBytes; + } + + @Override + public int size() { + return length; + } + + private ByteArrayUtf8CharSequence(byte[] buf, int offset, int length, String utf16, int hashCode) { + this.buf = buf; + 
this.offset = offset; + this.length = length; + this.utf16 = utf16; + this.hashCode = hashCode; + } + + @Override + public int hashCode() { + if (hashCode == Integer.MIN_VALUE) { + hashCode = MurmurHash2.hash32(buf, offset, length); + } + return hashCode; + } + + @Override + public int length() { + return _getStr().length(); + } + + @Override + public boolean equals(Object other) { + if (other == null) { + return false; + } + if (other instanceof ByteArrayUtf8CharSequence) { + ByteArrayUtf8CharSequence that = (ByteArrayUtf8CharSequence) other; + return _equals(this.buf, this.offset, this.offset + this.length, + that.buf, that.offset, that.offset + that.length); + } + return false; + } + + + @Override + public char charAt(int index) { + return _getStr().charAt(index); + } + + private String _getStr() { + if (utf16 == null) { + synchronized (this) { + if (utf16 == null) { + CharArr arr = new CharArr(); + ByteUtils.UTF8toUTF16(buf, offset, length, arr); + utf16 = arr.toString(); + } + } + } + return utf16; + } + + @Override + public CharSequence subSequence(int start, int end) { + return new ByteArrayUtf8CharSequence(_getStr().subSequence(start, end).toString()); + } + + @Override + public ByteArrayUtf8CharSequence clone() { + return new ByteArrayUtf8CharSequence(buf, offset, length, utf16, hashCode); + } + + public ByteArrayUtf8CharSequence deepCopy() { + byte[] bytes = new byte[length]; + System.arraycopy(buf, offset, bytes, 0, length); + return new ByteArrayUtf8CharSequence(bytes, 0, length, utf16, hashCode); + } + + public static Map.Entry convertCharSeq(Map.Entry result) { + if (result.getKey() instanceof Utf8CharSequence || result.getValue() instanceof Utf8CharSequence) { + return new AbstractMap.SimpleEntry(convertCharSeq(result.getKey()), convertCharSeq(result.getValue())); + } + return result; + + } + + public static Collection convertCharSeq(Collection vals) { + if (vals == null) return vals; + boolean needsCopy = false; + for (Object o : vals) { + if (o 
instanceof Utf8CharSequence) { + needsCopy = true; + break; + } + } + if (needsCopy) { + ArrayList copy = new ArrayList(vals.size()); + for (Object o : vals) copy.add(convertCharSeq(o)); + return copy; + } + return vals; + } + + public static Object convertCharSeq(Object o) { + if (o == null) return null; + if (o instanceof Utf8CharSequence) return ((Utf8CharSequence) o).toString(); + return o; + } + + + // methods in Arrays are defined stupid: they cannot use Objects.checkFromToIndex + // they throw IAE (vs IOOBE) in the case of fromIndex > toIndex. + // so this method works just like checkFromToIndex, but with that stupidity added. + private static void checkFromToIndex(int fromIndex, int toIndex, int length) { + if (fromIndex > toIndex) { + throw new IllegalArgumentException("fromIndex " + fromIndex + " > toIndex " + toIndex); + } + if (fromIndex < 0 || toIndex > length) { + throw new IndexOutOfBoundsException("Range [" + fromIndex + ", " + toIndex + ") out-of-bounds for length " + length); + } + } + + @Override + public String toString() { + return _getStr(); + } + + /** + * Behaves like Java 9's Arrays.equals + * + * @see Arrays.equals + */ + public static boolean _equals(byte[] a, int aFromIndex, int aToIndex, byte[] b, int bFromIndex, int bToIndex) { + checkFromToIndex(aFromIndex, aToIndex, a.length); + checkFromToIndex(bFromIndex, bToIndex, b.length); + int aLen = aToIndex - aFromIndex; + int bLen = bToIndex - bFromIndex; + // lengths differ: cannot be equal + if (aLen != bLen) { + return false; + } + for (int i = 0; i < aLen; i++) { + if (a[i + aFromIndex] != b[i + bFromIndex]) { + return false; + } + } + return true; + } + + + public ByteArrayUtf8CharSequence reset(byte[] bytes, int offset, int length, String str) { + this.buf = bytes; + this.offset = offset; + this.length = length; + this.utf16 = str; + this.hashCode = Integer.MIN_VALUE; + return this; + } + + /** + * Performs internal consistency checks. 
+ * Always returns true (or throws IllegalStateException) + */ + public boolean isValid() { + if (buf == null) { + throw new IllegalStateException("bytes is null"); + } + if (length < 0) { + throw new IllegalStateException("length is negative: " + length); + } + if (length > buf.length) { + throw new IllegalStateException("length is out of bounds: " + length + ",bytes.length=" + buf.length); + } + if (offset < 0) { + throw new IllegalStateException("offset is negative: " + offset); + } + if (offset > buf.length) { + throw new IllegalStateException("offset out of bounds: " + offset + ",bytes.length=" + buf.length); + } + if (offset + length < 0) { + throw new IllegalStateException("offset+length is negative: offset=" + offset + ",length=" + length); + } + if (offset + length > buf.length) { + throw new IllegalStateException("offset+length out of bounds: offset=" + offset + ",length=" + length + ",bytes.length=" + buf.length); + } + return true; + } +} diff --git a/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java b/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java index cb17ad7947b..ec4acdb6871 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java @@ -142,6 +142,7 @@ public class ByteUtils { if (upto > scratch.length - 4) { // a code point may take upto 4 bytes and we don't have enough space, so reset totalBytes += upto; + if(fos == null) throw new IOException("buffer over flow"); fos.write(scratch, 0, upto); upto = 0; } @@ -180,7 +181,7 @@ public class ByteUtils { } totalBytes += upto; - fos.write(scratch, 0, upto); + if(fos != null) fos.write(scratch, 0, upto); return totalBytes; } diff --git a/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java b/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java index 82ccd8d5ee0..7a2d00e89ef 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java +++ 
b/solr/solrj/src/java/org/apache/solr/common/util/FastOutputStream.java @@ -16,7 +16,10 @@ */ package org.apache.solr.common.util; -import java.io.*; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.OutputStream; /** Single threaded buffered OutputStream * Internal Solr use only, subject to change. @@ -229,4 +232,18 @@ public class FastOutputStream extends OutputStream implements DataOutput { this.written = written; } + /**Copies a {@link Utf8CharSequence} without making extra copies + */ + public void writeUtf8CharSeq(Utf8CharSequence utf8) throws IOException { + int start = 0; + int totalWritten = 0; + for (; ; ) { + if (pos >= buf.length) flushBuffer(); + int sz = utf8.write(start, buf, pos); + pos += sz; + totalWritten += sz; + if (totalWritten >= utf8.size()) break; + start += sz; + } + } } diff --git a/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java b/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java index 4435e189af0..2e49cf9b98c 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java @@ -115,12 +115,18 @@ public class JavaBinCodec implements PushWriter { private WritableDocFields writableDocFields; private boolean alreadyMarshalled; private boolean alreadyUnmarshalled; + private boolean readStringAsCharSeq = false; public JavaBinCodec() { resolver =null; writableDocFields =null; } + public JavaBinCodec setReadStringAsCharSeq(boolean flag) { + readStringAsCharSeq = flag; + return this; + } + /** * Use this to use this as a PushWriter. 
ensure that close() is called explicitly after use * @@ -403,7 +409,7 @@ public class JavaBinCodec implements PushWriter { return false; } - private final MapWriter.EntryWriter ew = new MapWriter.EntryWriter() { + public class BinEntryWriter implements MapWriter.EntryWriter { @Override public MapWriter.EntryWriter put(CharSequence k, Object v) throws IOException { writeExternString(k); @@ -445,7 +451,16 @@ public class JavaBinCodec implements PushWriter { writeBoolean(v); return this; } - }; + + @Override + public MapWriter.EntryWriter put(CharSequence k, CharSequence v) throws IOException { + writeExternString(k); + writeStr(v); + return this; + } + } + + private final MapWriter.EntryWriter ew = new BinEntryWriter(); public void writeMap(MapWriter val) throws IOException { @@ -832,6 +847,10 @@ public class JavaBinCodec implements PushWriter { writeTag(NULL); return; } + if (s instanceof Utf8CharSequence) { + writeUTF8Str((Utf8CharSequence) s); + return; + } int end = s.length(); int maxSize = end * ByteUtils.MAX_UTF8_BYTES_PER_CHAR; @@ -853,11 +872,11 @@ public class JavaBinCodec implements PushWriter { CharArr arr = new CharArr(); private StringBytes bytesRef = new StringBytes(bytes,0,0); - public String readStr(DataInputInputStream dis) throws IOException { + public CharSequence readStr(DataInputInputStream dis) throws IOException { return readStr(dis,null); } - public String readStr(DataInputInputStream dis, StringCache stringCache) throws IOException { + public CharSequence readStr(DataInputInputStream dis, StringCache stringCache) throws IOException { int sz = readSize(dis); if (bytes == null || bytes.length < sz) bytes = new byte[sz]; dis.readFully(bytes, 0, sz); @@ -865,8 +884,14 @@ public class JavaBinCodec implements PushWriter { return stringCache.get(bytesRef.reset(bytes, 0, sz)); } else { arr.reset(); - ByteUtils.UTF8toUTF16(bytes, 0, sz, arr); - return arr.toString(); + if (readStringAsCharSeq) { + byte[] copyBuf = new byte[sz]; + 
System.arraycopy(bytes, 0, copyBuf, 0, sz); + return new ByteArrayUtf8CharSequence(copyBuf, 0, sz); + } else { + ByteUtils.UTF8toUTF16(bytes, 0, sz, arr); + return arr.toString(); + } } } @@ -928,6 +953,8 @@ public class JavaBinCodec implements PushWriter { if (val == null) { daos.writeByte(NULL); return true; + } else if (val instanceof Utf8CharSequence) { + writeUTF8Str((Utf8CharSequence) val); } else if (val instanceof CharSequence) { writeStr((CharSequence) val); return true; @@ -990,6 +1017,10 @@ public class JavaBinCodec implements PushWriter { public void writeMap(Map val) throws IOException { writeTag(MAP, val.size()); + if (val instanceof MapWriter) { + ((MapWriter) val).writeMap(ew); + return; + } for (Map.Entry entry : val.entrySet()) { Object key = entry.getKey(); if (key instanceof String) { @@ -1060,7 +1091,7 @@ public class JavaBinCodec implements PushWriter { private int stringsCount = 0; private Map stringsMap; - private List stringsList; + private List stringsList; public void writeExternString(CharSequence s) throws IOException { if (s == null) { @@ -1078,19 +1109,33 @@ public class JavaBinCodec implements PushWriter { } - public String readExternString(DataInputInputStream fis) throws IOException { + public CharSequence readExternString(DataInputInputStream fis) throws IOException { int idx = readSize(fis); if (idx != 0) {// idx != 0 is the index of the extern string return stringsList.get(idx - 1); } else {// idx == 0 means it has a string value tagByte = fis.readByte(); - String s = readStr(fis, stringCache); + CharSequence s = readStr(fis, stringCache); + if (s != null) s = s.toString(); if (stringsList == null) stringsList = new ArrayList<>(); stringsList.add(s); return s; } } + + public void writeUTF8Str(Utf8CharSequence utf8) throws IOException { + writeTag(STR, utf8.size()); + daos.writeUtf8CharSeq(utf8); + } + + public long getTotalBytesWritten() { + if (daos != null) { + return daos.written; + } + return 0; + } + /** * Allows extension 
of {@link JavaBinCodec} to support serialization of arbitrary data types. *

diff --git a/solr/solrj/src/java/org/apache/solr/common/util/MurmurHash2.java b/solr/solrj/src/java/org/apache/solr/common/util/MurmurHash2.java new file mode 100644 index 00000000000..3d3ca26a5eb --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/common/util/MurmurHash2.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.common.util; + +/** + * This is a very fast, non-cryptographic hash suitable for general hash-based + * lookup. See http://murmurhash.googlepages.com/ for more details. + *

+ * The C version of MurmurHash 2.0 found at that site was ported to Java by + * Andrzej Bialecki (ab at getopt org). + *

+ *

+ * The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of + * hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths + * rather than raw byte arrays. + *

+ * @lucene.experimental + */ +public final class MurmurHash2 { + + public static final MurmurHash2 INSTANCE = new MurmurHash2(); + + private MurmurHash2() {} + + public static int hash(byte[] data, int seed, int offset, int len) { + int m = 0x5bd1e995; + int r = 24; + int h = seed ^ len; + int len_4 = len >> 2; + for (int i = 0; i < len_4; i++) { + int i_4 = offset + (i << 2); + int k = data[i_4 + 3]; + k = k << 8; + k = k | (data[i_4 + 2] & 0xff); + k = k << 8; + k = k | (data[i_4 + 1] & 0xff); + k = k << 8; + k = k | (data[i_4 + 0] & 0xff); + k *= m; + k ^= k >>> r; + k *= m; + h *= m; + h ^= k; + } + int len_m = len_4 << 2; + int left = len - len_m; + if (left != 0) { + if (left >= 3) { + h ^= data[offset + len - 3] << 16; + } + if (left >= 2) { + h ^= data[offset + len - 2] << 8; + } + if (left >= 1) { + h ^= data[offset + len - 1]; + } + h *= m; + } + h ^= h >>> 13; + h *= m; + h ^= h >>> 15; + return h; + } + + /** + * Generates 32 bit hash from byte array with default seed value. 
+ * + * @param data + * byte array to hash + * @param offset + * the start position in the array to hash + * @param len + * length of the array elements to hash + * @return 32 bit hash of the given array + */ + public static final int hash32(final byte[] data, int offset, int len) { + return MurmurHash2.hash(data, 0x9747b28c, offset, len); + } + + + @Override + public String toString() { + return getClass().getSimpleName(); + } +} diff --git a/solr/solrj/src/java/org/apache/solr/common/util/TextWriter.java b/solr/solrj/src/java/org/apache/solr/common/util/TextWriter.java index b8dbeb2c2cc..6aa58c6a8fa 100644 --- a/solr/solrj/src/java/org/apache/solr/common/util/TextWriter.java +++ b/solr/solrj/src/java/org/apache/solr/common/util/TextWriter.java @@ -48,7 +48,7 @@ public interface TextWriter extends PushWriter { // go in order of most common to least common, however some of the more general types like Map belong towards the end if (val == null) { writeNull(name); - } else if (val instanceof String) { + } else if (val instanceof CharSequence) { writeStr(name, val.toString(), true); // micro-optimization... using toString() avoids a cast first } else if (val instanceof Number) { diff --git a/solr/solrj/src/java/org/apache/solr/common/util/Utf8CharSequence.java b/solr/solrj/src/java/org/apache/solr/common/util/Utf8CharSequence.java new file mode 100644 index 00000000000..5b884ec7763 --- /dev/null +++ b/solr/solrj/src/java/org/apache/solr/common/util/Utf8CharSequence.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.common.util; + +import java.io.IOException; +import java.io.OutputStream; + +/**A byte[] backed String + * + */ +public interface Utf8CharSequence extends CharSequence { + + /** + * Write the bytes into a buffer. The objective is to avoid the local bytes being exposed to + * other classes if the implementation is expected to be immutable. It writes as many bytes as + * possible into the buffer and then returns how many bytes were written. It's the responsibility + * of the caller to call this method repeatedly and ensure that everything is completely written. + * + * @param start position from which to start writing + * @param buffer the buffer to write to + * @param pos position to start writing + * @return number of bytes written + */ + int write(int start, byte[] buffer, int pos); + + /** The size in UTF-8 bytes + * @return the size + */ + int size(); + + /** + * Creates a byte[] and copies into it first before writing it out to the output + * @param os The sink + */ + default void write(OutputStream os) throws IOException { + byte[] buf = new byte[1024]; + int start = 0; + int totalWritten = 0; + for (; ; ) { + int sz = write(start, buf, 0); + totalWritten += sz; + if (sz > 0) os.write(buf, 0, sz); + if (totalWritten >= size()) break; + start += sz; + } + } + +} diff --git a/solr/solrj/src/test/org/apache/solr/common/util/Utf8CharSequenceTest.java b/solr/solrj/src/test/org/apache/solr/common/util/Utf8CharSequenceTest.java new file mode 100644 index 00000000000..bff362c9882 --- /dev/null +++ 
b/solr/solrj/src/test/org/apache/solr/common/util/Utf8CharSequenceTest.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.common.util; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.solr.SolrTestCaseJ4; + +public class Utf8CharSequenceTest extends SolrTestCaseJ4 { + + public void testLargeString() throws IOException { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 100; i++) { + sb.append("Hello World!"); + } + ByteArrayUtf8CharSequence utf8 = new ByteArrayUtf8CharSequence(sb.toString()); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buf = new byte[256]; + FastOutputStream fos = new FastOutputStream(baos, buf, 0); + fos.writeUtf8CharSeq(utf8); + fos.flush(); + byte[] result = baos.toByteArray(); + ByteArrayUtf8CharSequence utf81 = new ByteArrayUtf8CharSequence(result, 0, result.length); + assertTrue(utf81.equals(utf8)); + baos.reset(); + utf8.write(baos); + result = baos.toByteArray(); + utf81 = new ByteArrayUtf8CharSequence(result, 0, result.length); + assertTrue(utf81.equals(utf8)); + + Map m0 = new 
HashMap(); + m0.put("str", utf8); + baos.reset(); + new JavaBinCodec().marshal(m0, baos); + result = baos.toByteArray(); + Map m1 = (Map) new JavaBinCodec() + .setReadStringAsCharSeq(true) + .unmarshal(new ByteArrayInputStream(result)); + utf81 = (ByteArrayUtf8CharSequence) m1.get("str"); + assertTrue(utf81.equals(utf8)); + } +}