SOLR-12992: When using binary format, ExportWriter to directly copy BytesRef instead of creating new String

This commit is contained in:
Noble Paul 2018-11-20 14:58:32 +11:00
parent 4efaecac34
commit 25bca6f165
12 changed files with 584 additions and 15 deletions

View File

@ -134,6 +134,9 @@ Improvements
* SOLR-12881: Remove unneeded import statements (Peter Somogyi via Erick Erickson)
* SOLR-12992: When using the binary format, ExportWriter now copies the BytesRef directly instead of
creating a new String (noble)
================== 7.6.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@ -25,12 +25,25 @@ import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.util.ByteArrayUtf8CharSequence;
import org.apache.solr.common.util.JavaBinCodec;
import org.apache.solr.schema.FieldType;
class StringFieldWriter extends FieldWriter {
private String field;
private FieldType fieldType;
private CharsRefBuilder cref = new CharsRefBuilder();
/**
 * Reusable UTF-8 holder that is re-pointed at each value's bytes through {@code reset(...)}.
 * Its {@code toString()} produces the readable form lazily: the bytes are converted through
 * {@code fieldType.indexedToReadable(...)} into the shared {@code cref} builder, and the result is
 * stored in the inherited {@code utf16} field so later calls return it directly.
 */
final ByteArrayUtf8CharSequence utf8 = new ByteArrayUtf8CharSequence(new byte[0], 0, 0) {
  @Override
  public String toString() {
    String readable = super.utf16;
    if (readable == null) {
      // Nothing cached for the current bytes: convert the indexed form and remember the result.
      fieldType.indexedToReadable(new BytesRef(super.buf, super.offset, super.length), cref);
      readable = cref.toString();
      super.utf16 = readable;
    }
    return readable;
  }
};
public StringFieldWriter(String field, FieldType fieldType) {
this.field = field;
@ -55,8 +68,14 @@ class StringFieldWriter extends FieldWriter {
int ord = vals.ordValue();
ref = vals.lookupOrd(ord);
}
fieldType.indexedToReadable(ref, cref);
ew.put(this.field, cref.toString());
if (ew instanceof JavaBinCodec.BinEntryWriter) {
ew.put(this.field, utf8.reset(ref.bytes, ref.offset, ref.length, null));
} else {
fieldType.indexedToReadable(ref, cref);
String v = cref.toString();
ew.put(this.field, v);
}
return true;
}
}

View File

@ -125,7 +125,7 @@ public class TransactionLog implements Closeable {
}
@Override
public String readExternString(DataInputInputStream fis) throws IOException {
public CharSequence readExternString(DataInputInputStream fis) throws IOException {
int idx = readSize(fis);
if (idx != 0) {// idx != 0 is the index of the extern string
// no need to synchronize globalStringList - it's only updated before the first record is written to the log

View File

@ -145,5 +145,14 @@ public interface MapWriter extends MapSerializable , NavigableObject {
put(k, (Boolean) v);
return this;
}
/**
 * Writes a key whose value is already known to be a {@link CharSequence}.
 *
 * <p>This overload is an optimization to avoid the instanceof checks. The default implementation
 * simply forwards to the {@code Object}-valued {@code put}; implementations may override it to
 * handle character data directly.
 *
 * @param k the entry key
 * @param v the entry value
 * @return this writer, for chaining
 */
default EntryWriter put(CharSequence k, CharSequence v) throws IOException {
  // Widen explicitly so that the Object-valued overload is the one that gets selected.
  final Object value = v;
  put(k, value);
  return this;
}
}
}

View File

@ -0,0 +1,251 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import org.noggit.CharArr;
/**
 * A mutable, {@code byte[]}-backed {@link Utf8CharSequence}, quite similar to Lucene's BytesRef.
 *
 * <p>An instance is a view over {@code buf[offset .. offset + length)}. The constructor taking a
 * {@code byte[]} and {@link #reset(byte[], int, int, String)} keep a reference to the supplied
 * array rather than copying it; use {@link #deepCopy()} to obtain an independent copy. The UTF-16
 * form is decoded on first use and cached in {@link #utf16}.
 *
 * <p>This class is designed for single-threaded use.
 */
public class ByteArrayUtf8CharSequence implements Utf8CharSequence {

  /** Backing array holding the UTF-8 bytes. */
  protected byte[] buf;
  /** Index in {@link #buf} of the first byte of this sequence. */
  protected int offset;
  /** Cached hash; {@code Integer.MIN_VALUE} means "not computed yet". */
  protected int hashCode = Integer.MIN_VALUE;
  /** Number of UTF-8 bytes in this sequence. */
  protected int length;
  /** Cached UTF-16 form, or {@code null} if it has not been decoded (or supplied) yet. */
  protected volatile String utf16;

  /**
   * Encodes the given string as UTF-8 into a freshly allocated, exactly sized array.
   *
   * @param utf16 the string to encode
   */
  public ByteArrayUtf8CharSequence(String utf16) {
    final int charCount = utf16.length();
    // Worst case sizing: three bytes per UTF-16 char; multiplyExact rejects int overflow.
    byte[] encodedBytes = new byte[Math.multiplyExact(charCount, 3)];
    final int encodedLength = ByteUtils.UTF16toUTF8(utf16, 0, charCount, encodedBytes, 0);
    if (encodedBytes.length > encodedLength) {
      // Trim the scratch array down to the bytes that were actually produced.
      final byte[] trimmed = new byte[encodedLength];
      System.arraycopy(encodedBytes, 0, trimmed, 0, encodedLength);
      encodedBytes = trimmed;
    }
    this.buf = encodedBytes;
    this.offset = 0;
    this.length = encodedLength;
    assert isValid();
  }

  /**
   * Creates a view over the given bytes. The array is referenced, not copied.
   *
   * @param buf array containing the UTF-8 bytes
   * @param offset index of the first byte
   * @param length number of bytes
   */
  public ByteArrayUtf8CharSequence(byte[] buf, int offset, int length) {
    this.buf = buf;
    this.offset = offset;
    this.length = length;
  }

  /** Used by {@link #clone()} and {@link #deepCopy()} to carry over the cached string and hash. */
  private ByteArrayUtf8CharSequence(byte[] buf, int offset, int length, String utf16, int hashCode) {
    this.buf = buf;
    this.offset = offset;
    this.length = length;
    this.utf16 = utf16;
    this.hashCode = hashCode;
  }

  /**
   * @return the cached UTF-16 string, or {@code null} if none is cached; never triggers decoding
   */
  public String getStringOrNull() {
    return utf16;
  }

  /**
   * Copies as many bytes as fit, beginning at byte {@code start} of this sequence, into
   * {@code buffer} starting at {@code pos}.
   *
   * @return the number of bytes copied, or {@code -1} when {@code start} is {@code -1} or is at or
   *         beyond the end of this sequence
   */
  @Override
  public int write(int start, byte[] buffer, int pos) {
    final boolean nothingLeft = start == -1 || start >= length;
    if (nothingLeft) {
      return -1;
    }
    final int remaining = length - start;
    final int room = buffer.length - pos;
    final int count = Math.min(remaining, room);
    System.arraycopy(buf, offset + start, buffer, pos, count);
    return count;
  }

  /** @return the number of UTF-8 bytes */
  @Override
  public int size() {
    return length;
  }

  /** Hash of the UTF-8 bytes, computed on first use and then cached. */
  @Override
  public int hashCode() {
    int h = hashCode;
    if (h == Integer.MIN_VALUE) {
      h = MurmurHash2.hash32(buf, offset, length);
      hashCode = h;
    }
    return h;
  }

  /** @return the number of UTF-16 chars; decodes the bytes if no string is cached */
  @Override
  public int length() {
    return _getStr().length();
  }

  /** Two instances are equal when their UTF-8 byte ranges have identical content. */
  @Override
  public boolean equals(Object other) {
    if (!(other instanceof ByteArrayUtf8CharSequence)) {
      // Also covers null.
      return false;
    }
    final ByteArrayUtf8CharSequence that = (ByteArrayUtf8CharSequence) other;
    return _equals(
        this.buf, this.offset, this.offset + this.length,
        that.buf, that.offset, that.offset + that.length);
  }

  @Override
  public char charAt(int index) {
    return _getStr().charAt(index);
  }

  /** Returns the cached UTF-16 string, decoding it from the bytes first if necessary. */
  private String _getStr() {
    String decoded = utf16;
    if (decoded != null) {
      return decoded;
    }
    synchronized (this) {
      decoded = utf16;
      if (decoded == null) {
        final CharArr chars = new CharArr();
        ByteUtils.UTF8toUTF16(buf, offset, length, chars);
        decoded = chars.toString();
        utf16 = decoded;
      }
      return decoded;
    }
  }

  /** Returns a new instance that holds its own encoding of the requested char range. */
  @Override
  public CharSequence subSequence(int start, int end) {
    final String piece = _getStr().substring(start, end);
    return new ByteArrayUtf8CharSequence(piece);
  }

  /** Shallow copy: the new instance shares the backing array with this one. */
  @Override
  public ByteArrayUtf8CharSequence clone() {
    return new ByteArrayUtf8CharSequence(buf, offset, length, utf16, hashCode);
  }

  /** Copy backed by its own array that contains exactly the bytes of this sequence. */
  public ByteArrayUtf8CharSequence deepCopy() {
    final byte[] ownBytes = new byte[length];
    System.arraycopy(buf, offset, ownBytes, 0, length);
    return new ByteArrayUtf8CharSequence(ownBytes, 0, length, utf16, hashCode);
  }

  /**
   * Returns an entry whose key and value have been passed through
   * {@link #convertCharSeq(Object)}, or the given entry itself when neither of them is a
   * {@link Utf8CharSequence}.
   */
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static Map.Entry convertCharSeq(Map.Entry result) {
    final Object key = result.getKey();
    final Object value = result.getValue();
    final boolean untouched = !(key instanceof Utf8CharSequence) && !(value instanceof Utf8CharSequence);
    if (untouched) {
      return result;
    }
    return new AbstractMap.SimpleEntry(convertCharSeq(key), convertCharSeq(value));
  }

  /**
   * Returns a new list in which every element has been passed through
   * {@link #convertCharSeq(Object)}, or the given collection itself when it is {@code null} or
   * contains no {@link Utf8CharSequence}.
   */
  @SuppressWarnings({"rawtypes", "unchecked"})
  public static Collection convertCharSeq(Collection vals) {
    if (vals == null || !containsUtf8CharSequence(vals)) {
      return vals;
    }
    final ArrayList converted = new ArrayList(vals.size());
    for (Object element : vals) {
      converted.add(convertCharSeq(element));
    }
    return converted;
  }

  /** Tells whether at least one element of the collection is a {@link Utf8CharSequence}. */
  @SuppressWarnings("rawtypes")
  private static boolean containsUtf8CharSequence(Collection vals) {
    for (Object element : vals) {
      if (element instanceof Utf8CharSequence) {
        return true;
      }
    }
    return false;
  }

  /**
   * Converts a {@link Utf8CharSequence} to its {@code String} form; any other object (including
   * {@code null}) is returned as is.
   */
  public static Object convertCharSeq(Object o) {
    return (o instanceof Utf8CharSequence) ? o.toString() : o;
  }

  // The range methods in java.util.Arrays cannot use Objects.checkFromToIndex because they throw
  // IllegalArgumentException (rather than IndexOutOfBoundsException) when fromIndex > toIndex.
  // This method works like checkFromToIndex, with that difference reproduced.
  private static void checkFromToIndex(int fromIndex, int toIndex, int length) {
    if (fromIndex > toIndex) {
      throw new IllegalArgumentException("fromIndex " + fromIndex + " > toIndex " + toIndex);
    }
    final boolean outside = fromIndex < 0 || toIndex > length;
    if (outside) {
      throw new IndexOutOfBoundsException("Range [" + fromIndex + ", " + toIndex + ") out-of-bounds for length " + length);
    }
  }

  @Override
  public String toString() {
    return _getStr();
  }

  /**
   * Behaves like Java 9's Arrays.equals
   *
   * @see <a href="http://download.java.net/java/jdk9/docs/api/java/util/Arrays.html#equals-byte:A-int-int-byte:A-int-int-">Arrays.equals</a>
   */
  public static boolean _equals(byte[] a, int aFromIndex, int aToIndex, byte[] b, int bFromIndex, int bToIndex) {
    checkFromToIndex(aFromIndex, aToIndex, a.length);
    checkFromToIndex(bFromIndex, bToIndex, b.length);
    // Ranges of different length can never be equal.
    if (aToIndex - aFromIndex != bToIndex - bFromIndex) {
      return false;
    }
    for (int ai = aFromIndex, bi = bFromIndex; ai < aToIndex; ai++, bi++) {
      if (a[ai] != b[bi]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Re-points this instance at another byte range and drops the cached hash.
   *
   * @param bytes array containing the UTF-8 bytes (referenced, not copied)
   * @param offset index of the first byte
   * @param length number of bytes
   * @param str the UTF-16 form of those bytes if already known, otherwise {@code null}
   * @return this instance
   */
  public ByteArrayUtf8CharSequence reset(byte[] bytes, int offset, int length, String str) {
    this.hashCode = Integer.MIN_VALUE;
    this.utf16 = str;
    this.buf = bytes;
    this.offset = offset;
    this.length = length;
    return this;
  }

  /**
   * Performs internal consistency checks.
   * Always returns true (or throws IllegalStateException)
   */
  public boolean isValid() {
    if (buf == null) {
      throw new IllegalStateException("bytes is null");
    }
    final int capacity = buf.length;
    if (length < 0) {
      throw new IllegalStateException("length is negative: " + length);
    }
    if (length > capacity) {
      throw new IllegalStateException("length is out of bounds: " + length + ",bytes.length=" + buf.length);
    }
    if (offset < 0) {
      throw new IllegalStateException("offset is negative: " + offset);
    }
    if (offset > capacity) {
      throw new IllegalStateException("offset out of bounds: " + offset + ",bytes.length=" + buf.length);
    }
    final int end = offset + length;
    if (end < 0) {
      // int overflow of offset + length
      throw new IllegalStateException("offset+length is negative: offset=" + offset + ",length=" + length);
    }
    if (end > capacity) {
      throw new IllegalStateException("offset+length out of bounds: offset=" + offset + ",length=" + length + ",bytes.length=" + buf.length);
    }
    return true;
  }
}

View File

@ -142,6 +142,7 @@ public class ByteUtils {
if (upto > scratch.length - 4) {
// a code point may take upto 4 bytes and we don't have enough space, so reset
totalBytes += upto;
if(fos == null) throw new IOException("buffer over flow");
fos.write(scratch, 0, upto);
upto = 0;
}
@ -180,7 +181,7 @@ public class ByteUtils {
}
totalBytes += upto;
fos.write(scratch, 0, upto);
if(fos != null) fos.write(scratch, 0, upto);
return totalBytes;
}

View File

@ -16,7 +16,10 @@
*/
package org.apache.solr.common.util;
import java.io.*;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
/** Single threaded buffered OutputStream
* Internal Solr use only, subject to change.
@ -229,4 +232,18 @@ public class FastOutputStream extends OutputStream implements DataOutput {
this.written = written;
}
/**
 * Copies the UTF-8 bytes of a {@link Utf8CharSequence} directly into this stream's buffer,
 * without making extra copies. The buffer is flushed whenever it is full.
 *
 * <p>An empty sequence writes nothing. The remaining byte count is checked <em>before</em> each
 * call to {@link Utf8CharSequence#write(int, byte[], int)}: an implementation may answer a request
 * at the end of its data with a negative value (ByteArrayUtf8CharSequence returns {@code -1}),
 * and adding that to {@code pos} would move the write position backwards over bytes that are
 * already buffered.
 *
 * @param utf8 the sequence to copy
 * @throws IOException if flushing fails, or if the sequence reports no progress before all of
 *         its {@code size()} bytes have been copied
 */
public void writeUtf8CharSeq(Utf8CharSequence utf8) throws IOException {
  final int size = utf8.size();
  int start = 0;
  while (start < size) {
    if (pos >= buf.length) flushBuffer();
    final int sz = utf8.write(start, buf, pos);
    if (sz <= 0) {
      // A non-positive count here would either corrupt pos or loop forever; fail loudly instead.
      throw new IOException("Utf8CharSequence made no progress: write returned " + sz
          + " at byte " + start + " of " + size);
    }
    pos += sz;
    start += sz;
  }
}
}

View File

@ -115,12 +115,18 @@ public class JavaBinCodec implements PushWriter {
private WritableDocFields writableDocFields;
private boolean alreadyMarshalled;
private boolean alreadyUnmarshalled;
private boolean readStringAsCharSeq = false;
public JavaBinCodec() {
resolver =null;
writableDocFields =null;
}
/**
 * Sets the {@code readStringAsCharSeq} flag, which is off by default. The flag is consulted by
 * {@code readStr}; when it is set, string values can be returned as
 * {@link ByteArrayUtf8CharSequence} instances instead of being decoded to {@code String}.
 *
 * @param flag the new value of the flag
 * @return this codec, for chaining
 */
public JavaBinCodec setReadStringAsCharSeq(boolean flag) {
  this.readStringAsCharSeq = flag;
  return this;
}
/**
* Use this to use this as a PushWriter. ensure that close() is called explicitly after use
*
@ -403,7 +409,7 @@ public class JavaBinCodec implements PushWriter {
return false;
}
private final MapWriter.EntryWriter ew = new MapWriter.EntryWriter() {
public class BinEntryWriter implements MapWriter.EntryWriter {
@Override
public MapWriter.EntryWriter put(CharSequence k, Object v) throws IOException {
writeExternString(k);
@ -445,7 +451,16 @@ public class JavaBinCodec implements PushWriter {
writeBoolean(v);
return this;
}
};
/**
 * Writes the key as an extern string and the value through {@code writeStr}, without going
 * through the generic {@code Object}-valued path.
 */
@Override
public MapWriter.EntryWriter put(CharSequence k, CharSequence v) throws IOException {
  JavaBinCodec.this.writeExternString(k);
  JavaBinCodec.this.writeStr(v);
  return this;
}
}
private final MapWriter.EntryWriter ew = new BinEntryWriter();
public void writeMap(MapWriter val) throws IOException {
@ -832,6 +847,10 @@ public class JavaBinCodec implements PushWriter {
writeTag(NULL);
return;
}
if (s instanceof Utf8CharSequence) {
writeUTF8Str((Utf8CharSequence) s);
return;
}
int end = s.length();
int maxSize = end * ByteUtils.MAX_UTF8_BYTES_PER_CHAR;
@ -853,11 +872,11 @@ public class JavaBinCodec implements PushWriter {
CharArr arr = new CharArr();
private StringBytes bytesRef = new StringBytes(bytes,0,0);
public String readStr(DataInputInputStream dis) throws IOException {
public CharSequence readStr(DataInputInputStream dis) throws IOException {
return readStr(dis,null);
}
public String readStr(DataInputInputStream dis, StringCache stringCache) throws IOException {
public CharSequence readStr(DataInputInputStream dis, StringCache stringCache) throws IOException {
int sz = readSize(dis);
if (bytes == null || bytes.length < sz) bytes = new byte[sz];
dis.readFully(bytes, 0, sz);
@ -865,8 +884,14 @@ public class JavaBinCodec implements PushWriter {
return stringCache.get(bytesRef.reset(bytes, 0, sz));
} else {
arr.reset();
ByteUtils.UTF8toUTF16(bytes, 0, sz, arr);
return arr.toString();
if (readStringAsCharSeq) {
byte[] copyBuf = new byte[sz];
System.arraycopy(bytes, 0, copyBuf, 0, sz);
return new ByteArrayUtf8CharSequence(copyBuf, 0, sz);
} else {
ByteUtils.UTF8toUTF16(bytes, 0, sz, arr);
return arr.toString();
}
}
}
@ -928,6 +953,8 @@ public class JavaBinCodec implements PushWriter {
if (val == null) {
daos.writeByte(NULL);
return true;
} else if (val instanceof Utf8CharSequence) {
writeUTF8Str((Utf8CharSequence) val);
} else if (val instanceof CharSequence) {
writeStr((CharSequence) val);
return true;
@ -990,6 +1017,10 @@ public class JavaBinCodec implements PushWriter {
public void writeMap(Map<?,?> val) throws IOException {
writeTag(MAP, val.size());
if (val instanceof MapWriter) {
((MapWriter) val).writeMap(ew);
return;
}
for (Map.Entry<?,?> entry : val.entrySet()) {
Object key = entry.getKey();
if (key instanceof String) {
@ -1060,7 +1091,7 @@ public class JavaBinCodec implements PushWriter {
private int stringsCount = 0;
private Map<String, Integer> stringsMap;
private List<String> stringsList;
private List<CharSequence> stringsList;
public void writeExternString(CharSequence s) throws IOException {
if (s == null) {
@ -1078,19 +1109,33 @@ public class JavaBinCodec implements PushWriter {
}
public String readExternString(DataInputInputStream fis) throws IOException {
public CharSequence readExternString(DataInputInputStream fis) throws IOException {
int idx = readSize(fis);
if (idx != 0) {// idx != 0 is the index of the extern string
return stringsList.get(idx - 1);
} else {// idx == 0 means it has a string value
tagByte = fis.readByte();
String s = readStr(fis, stringCache);
CharSequence s = readStr(fis, stringCache);
if (s != null) s = s.toString();
if (stringsList == null) stringsList = new ArrayList<>();
stringsList.add(s);
return s;
}
}
/**
 * Writes a string value that is already available as UTF-8: a {@code STR} tag carrying the byte
 * count, followed by the bytes themselves copied by the underlying stream.
 *
 * @param utf8 the value to write
 */
public void writeUTF8Str(Utf8CharSequence utf8) throws IOException {
  final int byteCount = utf8.size();
  writeTag(STR, byteCount);
  daos.writeUtf8CharSeq(utf8);
}
/**
 * @return the {@code written} counter of the underlying output stream, or {@code 0} when no
 *         output stream has been set up
 */
public long getTotalBytesWritten() {
  return daos == null ? 0 : daos.written;
}
/**
* Allows extension of {@link JavaBinCodec} to support serialization of arbitrary data types.
* <p>

View File

@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
/**
 * This is a very fast, non-cryptographic hash suitable for general hash-based
 * lookup. See http://murmurhash.googlepages.com/ for more details.
 * <p>
 * The C version of MurmurHash 2.0 found at that site was ported to Java by
 * Andrzej Bialecki (ab at getopt org).
 * </p>
 * <p>
 * The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of
 * hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths
 * rather than raw byte arrays.
 * </p>
 * @lucene.experimental
 */
public final class MurmurHash2 {

  public static final MurmurHash2 INSTANCE = new MurmurHash2();

  /** Mixing multiplier. */
  private static final int MIX = 0x5bd1e995;
  /** Right shift applied to each 4-byte word while mixing. */
  private static final int WORD_SHIFT = 24;
  /** Seed used by {@link #hash32(byte[], int, int)}. */
  private static final int DEFAULT_SEED = 0x9747b28c;

  private MurmurHash2() {}

  /**
   * Generates a 32 bit hash of {@code len} bytes of {@code data} starting at {@code offset}.
   *
   * @param data byte array to hash
   * @param seed initial value mixed into the hash
   * @param offset the start position in the array to hash
   * @param len length of the array elements to hash
   * @return 32 bit hash of the given range
   */
  public static int hash(byte[] data, int seed, int offset, int len) {
    int h = seed ^ len;

    // Body: consume the input four bytes at a time, assembled least significant byte first.
    final int wordCount = len >> 2;
    for (int word = 0; word < wordCount; word++) {
      final int base = offset + (word << 2);
      int k = (data[base + 3] << 24)
          | ((data[base + 2] & 0xff) << 16)
          | ((data[base + 1] & 0xff) << 8)
          | (data[base] & 0xff);
      k *= MIX;
      k ^= k >>> WORD_SHIFT;
      k *= MIX;
      h = (h * MIX) ^ k;
    }

    // Tail: the one to three bytes that do not fill a whole word.
    final int tail = len - (wordCount << 2);
    if (tail != 0) {
      final int end = offset + len;
      if (tail >= 3) {
        h ^= data[end - 3] << 16;
      }
      if (tail >= 2) {
        h ^= data[end - 2] << 8;
      }
      if (tail >= 1) {
        h ^= data[end - 1];
      }
      h *= MIX;
    }

    // Final mixing of the accumulated value.
    h ^= h >>> 13;
    h *= MIX;
    h ^= h >>> 15;
    return h;
  }

  /**
   * Generates 32 bit hash from byte array with default seed value.
   *
   * @param data
   *          byte array to hash
   * @param offset
   *          the start position in the array to hash
   * @param len
   *          length of the array elements to hash
   * @return 32 bit hash of the given array
   */
  public static final int hash32(final byte[] data, int offset, int len) {
    return hash(data, DEFAULT_SEED, offset, len);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName();
  }
}

View File

@ -48,7 +48,7 @@ public interface TextWriter extends PushWriter {
// go in order of most common to least common, however some of the more general types like Map belong towards the end
if (val == null) {
writeNull(name);
} else if (val instanceof String) {
} else if (val instanceof CharSequence) {
writeStr(name, val.toString(), true);
// micro-optimization... using toString() avoids a cast first
} else if (val instanceof Number) {

View File

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
import java.io.IOException;
import java.io.OutputStream;
/**
 * A byte[] backed String: a {@link CharSequence} that can also hand out its content as UTF-8
 * bytes.
 */
public interface Utf8CharSequence extends CharSequence {

  /**
   * Write the bytes into a buffer. The objective is to avoid the local bytes being exposed to
   * other classes if the implementation is expected to be immutable. It writes as many bytes as
   * possible into the buffer and then returns how many bytes were written. It is the responsibility
   * of the caller to call this method repeatedly and ensure that everything is completely written.
   *
   * <p>Callers should not ask for bytes at or beyond {@link #size()}: an implementation may answer
   * such a request with a non-positive value instead of a byte count.
   *
   * @param start position in this sequence's UTF-8 bytes from which to start copying
   * @param buffer the buffer to write to
   * @param pos position in {@code buffer} at which to start writing
   * @return number of bytes written
   */
  int write(int start, byte[] buffer, int pos);

  /**
   * The size of utf8 bytes
   *
   * @return the size
   */
  int size();

  /**
   * Creates a byte[] and copies to it first before writing it out to the output.
   *
   * <p>An empty sequence writes nothing. The number of bytes still to be copied is checked before
   * every call to {@link #write(int, byte[], int)}, so that method is never asked for bytes past
   * the end of the sequence.
   *
   * @param os The sink
   * @throws IOException if the sink fails, or if {@link #write(int, byte[], int)} reports no
   *         progress before all {@link #size()} bytes have been copied
   */
  default void write(OutputStream os) throws IOException {
    final int total = size();
    if (total <= 0) {
      return;
    }
    // No point in allocating more scratch space than there is data.
    final byte[] chunk = new byte[Math.min(total, 1024)];
    int start = 0;
    while (start < total) {
      final int sz = write(start, chunk, 0);
      if (sz <= 0) {
        // Without this guard a misbehaving implementation would spin forever or move backwards.
        throw new IOException("Utf8CharSequence made no progress: write returned " + sz
            + " at byte " + start + " of " + total);
      }
      os.write(chunk, 0, sz);
      start += sz;
    }
  }
}

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.common.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.SolrTestCaseJ4;
/**
 * Round-trips a {@link ByteArrayUtf8CharSequence} that is larger than the output buffer through
 * the three write paths: {@link FastOutputStream#writeUtf8CharSeq}, the stream-based
 * {@code write(OutputStream)}, and {@link JavaBinCodec} marshalling.
 */
public class Utf8CharSequenceTest extends SolrTestCaseJ4 {

  public void testLargeString() throws IOException {
    // 100 x 12 chars = 1200 bytes, well beyond the 256-byte buffer used below.
    final StringBuilder text = new StringBuilder();
    int remaining = 100;
    while (remaining-- > 0) {
      text.append("Hello World!");
    }
    final ByteArrayUtf8CharSequence utf8 = new ByteArrayUtf8CharSequence(text.toString());
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();

    // Path 1: copied chunk by chunk through a small FastOutputStream buffer.
    final FastOutputStream fos = new FastOutputStream(sink, new byte[256], 0);
    fos.writeUtf8CharSeq(utf8);
    fos.flush();
    assertTrue(viewOf(sink.toByteArray()).equals(utf8));

    // Path 2: written straight to an OutputStream.
    sink.reset();
    utf8.write(sink);
    assertTrue(viewOf(sink.toByteArray()).equals(utf8));

    // Path 3: marshalled as a map value and read back as a char sequence.
    final Map<String, Object> original = new HashMap<>();
    original.put("str", utf8);
    sink.reset();
    new JavaBinCodec().marshal(original, sink);
    final JavaBinCodec reader = new JavaBinCodec().setReadStringAsCharSeq(true);
    final Map<?, ?> restored = (Map<?, ?>) reader.unmarshal(new ByteArrayInputStream(sink.toByteArray()));
    final ByteArrayUtf8CharSequence roundTripped = (ByteArrayUtf8CharSequence) restored.get("str");
    assertTrue(roundTripped.equals(utf8));
  }

  /** Wraps a whole byte array as a UTF-8 sequence. */
  private static ByteArrayUtf8CharSequence viewOf(byte[] bytes) {
    return new ByteArrayUtf8CharSequence(bytes, 0, bytes.length);
  }
}