improve writing text as utf8 when having a string if we can write directly the utf8 bytes
This commit is contained in:
parent
6e20056619
commit
a41477e0fa
|
@ -0,0 +1,333 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.common.io;
|
||||
|
||||
import java.io.CharConversionException;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.Writer;
|
||||
|
||||
|
||||
/**
|
||||
*/
|
||||
public final class UTF8StreamWriter extends Writer {
|
||||
|
||||
/**
|
||||
* Holds the current output stream or <code>null</code> if closed.
|
||||
*/
|
||||
private OutputStream _outputStream;
|
||||
|
||||
/**
|
||||
* Holds the bytes' buffer.
|
||||
*/
|
||||
private final byte[] _bytes;
|
||||
|
||||
/**
|
||||
* Holds the bytes buffer index.
|
||||
*/
|
||||
private int _index;
|
||||
|
||||
/**
|
||||
* Creates a UTF-8 writer having a byte buffer of moderate capacity (2048).
|
||||
*/
|
||||
public UTF8StreamWriter() {
|
||||
_bytes = new byte[2048];
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a UTF-8 writer having a byte buffer of specified capacity.
|
||||
*
|
||||
* @param capacity the capacity of the byte buffer.
|
||||
*/
|
||||
public UTF8StreamWriter(int capacity) {
|
||||
_bytes = new byte[capacity];
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the output stream to use for writing until this writer is closed.
|
||||
* For example:[code]
|
||||
* Writer writer = new UTF8StreamWriter().setOutputStream(out);
|
||||
* [/code] is equivalent but writes faster than [code]
|
||||
* Writer writer = new java.io.OutputStreamWriter(out, "UTF-8");
|
||||
* [/code]
|
||||
*
|
||||
* @param out the output stream.
|
||||
* @return this UTF-8 writer.
|
||||
* @throws IllegalStateException if this writer is being reused and
|
||||
* it has not been {@link #close closed} or {@link #reset reset}.
|
||||
*/
|
||||
public UTF8StreamWriter setOutput(OutputStream out) {
|
||||
if (_outputStream != null)
|
||||
throw new IllegalStateException("Writer not closed or reset");
|
||||
_outputStream = out;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a single character. This method supports 16-bits
|
||||
* character surrogates.
|
||||
*
|
||||
* @param c <code>char</code> the character to be written (possibly
|
||||
* a surrogate).
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write(char c) throws IOException {
|
||||
if ((c < 0xd800) || (c > 0xdfff)) {
|
||||
write((int) c);
|
||||
} else if (c < 0xdc00) { // High surrogate.
|
||||
_highSurrogate = c;
|
||||
} else { // Low surrogate.
|
||||
int code = ((_highSurrogate - 0xd800) << 10) + (c - 0xdc00)
|
||||
+ 0x10000;
|
||||
write(code);
|
||||
}
|
||||
}
|
||||
|
||||
private char _highSurrogate;
|
||||
|
||||
/**
|
||||
* Writes a character given its 31-bits Unicode.
|
||||
*
|
||||
* @param code the 31 bits Unicode of the character to be written.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write(int code) throws IOException {
|
||||
if ((code & 0xffffff80) == 0) {
|
||||
_bytes[_index] = (byte) code;
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else { // Writes more than one byte.
|
||||
write2(code);
|
||||
}
|
||||
}
|
||||
|
||||
private void write2(int c) throws IOException {
|
||||
if ((c & 0xfffff800) == 0) { // 2 bytes.
|
||||
_bytes[_index] = (byte) (0xc0 | (c >> 6));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | (c & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else if ((c & 0xffff0000) == 0) { // 3 bytes.
|
||||
_bytes[_index] = (byte) (0xe0 | (c >> 12));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 6) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | (c & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else if ((c & 0xff200000) == 0) { // 4 bytes.
|
||||
_bytes[_index] = (byte) (0xf0 | (c >> 18));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 12) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 6) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | (c & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else if ((c & 0xf4000000) == 0) { // 5 bytes.
|
||||
_bytes[_index] = (byte) (0xf8 | (c >> 24));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 18) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 12) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 6) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | (c & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else if ((c & 0x80000000) == 0) { // 6 bytes.
|
||||
_bytes[_index] = (byte) (0xfc | (c >> 30));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 24) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 18) & 0x3f));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 12) & 0x3F));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | ((c >> 6) & 0x3F));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
_bytes[_index] = (byte) (0x80 | (c & 0x3F));
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else {
|
||||
throw new CharConversionException("Illegal character U+"
|
||||
+ Integer.toHexString(c));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a portion of an array of characters.
|
||||
*
|
||||
* @param cbuf the array of characters.
|
||||
* @param off the offset from which to start writing characters.
|
||||
* @param len the number of characters to write.
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void write(char cbuf[], int off, int len) throws IOException {
|
||||
final int off_plus_len = off + len;
|
||||
for (int i = off; i < off_plus_len; ) {
|
||||
char c = cbuf[i++];
|
||||
if (c < 0x80) {
|
||||
_bytes[_index] = (byte) c;
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else {
|
||||
write(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a portion of a string.
|
||||
*
|
||||
* @param str a String.
|
||||
* @param off the offset from which to start writing characters.
|
||||
* @param len the number of characters to write.
|
||||
* @throws IOException if an I/O error occurs
|
||||
*/
|
||||
public void write(String str, int off, int len) throws IOException {
|
||||
final int off_plus_len = off + len;
|
||||
for (int i = off; i < off_plus_len; ) {
|
||||
char c = str.charAt(i++);
|
||||
if (c < 0x80) {
|
||||
_bytes[_index] = (byte) c;
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else {
|
||||
write(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the specified character sequence.
|
||||
*
|
||||
* @param csq the character sequence.
|
||||
* @throws IOException if an I/O error occurs
|
||||
*/
|
||||
public void write(CharSequence csq) throws IOException {
|
||||
final int length = csq.length();
|
||||
for (int i = 0; i < length; ) {
|
||||
char c = csq.charAt(i++);
|
||||
if (c < 0x80) {
|
||||
_bytes[_index] = (byte) c;
|
||||
if (++_index >= _bytes.length) {
|
||||
flushBuffer();
|
||||
}
|
||||
} else {
|
||||
write(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes the stream. If the stream has saved any characters from the
|
||||
* various write() methods in a buffer, write them immediately to their
|
||||
* intended destination. Then, if that destination is another character or
|
||||
* byte stream, flush it. Thus one flush() invocation will flush all the
|
||||
* buffers in a chain of Writers and OutputStreams.
|
||||
*
|
||||
* @throws IOException if an I/O error occurs.
|
||||
*/
|
||||
public void flush() throws IOException {
|
||||
flushBuffer();
|
||||
_outputStream.flush();
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes and {@link #reset resets} this writer for reuse.
|
||||
*
|
||||
* @throws IOException if an I/O error occurs
|
||||
*/
|
||||
public void close() throws IOException {
|
||||
if (_outputStream != null) {
|
||||
flushBuffer();
|
||||
_outputStream.close();
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flushes the internal bytes buffer.
|
||||
*
|
||||
* @throws IOException if an I/O error occurs
|
||||
*/
|
||||
private void flushBuffer() throws IOException {
|
||||
if (_outputStream == null)
|
||||
throw new IOException("Stream closed");
|
||||
_outputStream.write(_bytes, 0, _index);
|
||||
_index = 0;
|
||||
}
|
||||
|
||||
// Implements Reusable.
|
||||
public void reset() {
|
||||
_highSurrogate = 0;
|
||||
_index = 0;
|
||||
_outputStream = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Replaced by {@link #setOutput(OutputStream)}
|
||||
*/
|
||||
public UTF8StreamWriter setOutputStream(OutputStream out) {
|
||||
return this.setOutput(out);
|
||||
}
|
||||
}
|
|
@ -37,6 +37,11 @@ public abstract class AdapterStreamInput extends StreamInput {
|
|||
return in.readBytesReference();
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesReference readBytesReference(int length) throws IOException {
|
||||
return in.readBytesReference(length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
in.reset();
|
||||
|
|
|
@ -43,6 +43,21 @@ public class AdapterStreamOutput extends StreamOutput {
|
|||
return this.out;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean seekPositionSupported() {
|
||||
return out.seekPositionSupported();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return out.position();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(long position) throws IOException {
|
||||
out.seek(position);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeByte(byte b) throws IOException {
|
||||
out.writeByte(b);
|
||||
|
|
|
@ -60,13 +60,9 @@ public class BytesStreamInput extends StreamInput {
|
|||
}
|
||||
|
||||
@Override
|
||||
public BytesReference readBytesReference() throws IOException {
|
||||
public BytesReference readBytesReference(int length) throws IOException {
|
||||
if (unsafe) {
|
||||
return super.readBytesReference();
|
||||
}
|
||||
int length = readVInt();
|
||||
if (length == 0) {
|
||||
return BytesArray.EMPTY;
|
||||
return super.readBytesReference(length);
|
||||
}
|
||||
BytesArray bytes = new BytesArray(buf, pos, length);
|
||||
pos += length;
|
||||
|
|
|
@ -50,6 +50,24 @@ public class BytesStreamOutput extends StreamOutput implements BytesStream {
|
|||
this.buf = new byte[size];
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean seekPositionSupported() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long position() throws IOException {
|
||||
return count;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(long position) throws IOException {
|
||||
if (position > Integer.MAX_VALUE) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
count = (int) position;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeByte(byte b) throws IOException {
|
||||
int newcount = count + 1;
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.common.io.stream;
|
|||
|
||||
import jsr166y.LinkedTransferQueue;
|
||||
import org.elasticsearch.common.compress.Compressor;
|
||||
import org.elasticsearch.common.io.UTF8StreamWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.ref.SoftReference;
|
||||
|
@ -132,4 +133,17 @@ public class CachedStreamOutput {
|
|||
ref.add(entry);
|
||||
}
|
||||
}
|
||||
|
||||
private static ThreadLocal<SoftReference<UTF8StreamWriter>> utf8StreamWriter = new ThreadLocal<SoftReference<UTF8StreamWriter>>();
|
||||
|
||||
public static UTF8StreamWriter utf8StreamWriter() {
|
||||
SoftReference<UTF8StreamWriter> ref = utf8StreamWriter.get();
|
||||
UTF8StreamWriter writer = (ref == null) ? null : ref.get();
|
||||
if (writer == null) {
|
||||
writer = new UTF8StreamWriter(1024 * 4);
|
||||
utf8StreamWriter.set(new SoftReference<UTF8StreamWriter>(writer));
|
||||
}
|
||||
writer.reset();
|
||||
return writer;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,6 +56,14 @@ public abstract class StreamInput extends InputStream {
|
|||
*/
|
||||
public BytesReference readBytesReference() throws IOException {
|
||||
int length = readVInt();
|
||||
return readBytesReference(length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a bytes reference from this stream, might hold an actual reference to the underlying
|
||||
* bytes of the stream.
|
||||
*/
|
||||
public BytesReference readBytesReference(int length) throws IOException {
|
||||
if (length == 0) {
|
||||
return BytesArray.EMPTY;
|
||||
}
|
||||
|
@ -159,7 +167,8 @@ public abstract class StreamInput extends InputStream {
|
|||
|
||||
public Text readText() throws IOException {
|
||||
// use StringAndBytes so we can cache the string if its ever converted to it
|
||||
return new StringAndBytesText(readBytesReference());
|
||||
int length = readInt();
|
||||
return new StringAndBytesText(readBytesReference(length));
|
||||
}
|
||||
|
||||
@Nullable
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.common.io.stream;
|
|||
|
||||
import org.elasticsearch.common.Nullable;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.io.UTF8StreamWriter;
|
||||
import org.elasticsearch.common.text.Text;
|
||||
import org.joda.time.ReadableInstant;
|
||||
|
||||
|
@ -36,6 +37,18 @@ import java.util.Map;
|
|||
*/
|
||||
public abstract class StreamOutput extends OutputStream {
|
||||
|
||||
public boolean seekPositionSupported() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public long position() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public void seek(long position) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes a single byte.
|
||||
*/
|
||||
|
@ -150,9 +163,23 @@ public abstract class StreamOutput extends OutputStream {
|
|||
}
|
||||
|
||||
public void writeText(Text text) throws IOException {
|
||||
// always write the bytes...
|
||||
// TODO: TextBytesOptimization we could potentially optimize this, and write the bytes directly to the output stream converting to UTF8 in case its a string
|
||||
writeBytesReference(text.bytes());
|
||||
if (!text.hasBytes() && seekPositionSupported()) {
|
||||
long pos1 = position();
|
||||
// make room for the size
|
||||
seek(pos1 + 4);
|
||||
UTF8StreamWriter utf8StreamWriter = CachedStreamOutput.utf8StreamWriter();
|
||||
utf8StreamWriter.setOutput(this);
|
||||
utf8StreamWriter.write(text.string());
|
||||
utf8StreamWriter.close();
|
||||
long pos2 = position();
|
||||
seek(pos1);
|
||||
writeInt((int) (pos2 - pos1 - 4));
|
||||
seek(pos2);
|
||||
} else {
|
||||
BytesReference bytes = text.bytes();
|
||||
writeInt(bytes.length());
|
||||
bytes.writeTo(this);
|
||||
}
|
||||
}
|
||||
|
||||
public void writeString(String str) throws IOException {
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
|
||||
package org.elasticsearch.common.xcontent.support;
|
||||
|
||||
import org.elasticsearch.common.bytes.BytesArray;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.text.Text;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentGenerator;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
|
@ -175,6 +178,22 @@ public class XContentMapConverter {
|
|||
gen.writeBinary((byte[]) value);
|
||||
} else if (value instanceof Date) {
|
||||
gen.writeString(XContentBuilder.defaultDatePrinter.print(((Date) value).getTime()));
|
||||
} else if (value instanceof BytesReference) {
|
||||
BytesReference bytes = (BytesReference) value;
|
||||
if (!bytes.hasArray()) {
|
||||
bytes = bytes.toBytesArray();
|
||||
}
|
||||
gen.writeBinary(bytes.array(), bytes.arrayOffset(), bytes.length());
|
||||
} else if (value instanceof Text) {
|
||||
Text text = (Text) value;
|
||||
if (text.hasBytes() && text.bytes().hasArray()) {
|
||||
gen.writeUTF8String(text.bytes().array(), text.bytes().arrayOffset(), text.bytes().length());
|
||||
} else if (text.hasString()) {
|
||||
gen.writeString(text.string());
|
||||
} else {
|
||||
BytesArray bytesArray = text.bytes().toBytesArray();
|
||||
gen.writeUTF8String(bytesArray.array(), bytesArray.arrayOffset(), bytesArray.length());
|
||||
}
|
||||
} else {
|
||||
gen.writeString(value.toString());
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
|
||||
package org.elasticsearch.transport.netty;
|
||||
|
||||
import org.elasticsearch.common.bytes.BytesArray;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.bytes.ChannelBufferBytesReference;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
|
@ -52,11 +51,7 @@ public class ChannelBufferStreamInput extends StreamInput {
|
|||
}
|
||||
|
||||
@Override
|
||||
public BytesReference readBytesReference() throws IOException {
|
||||
int length = readVInt();
|
||||
if (length == 0) {
|
||||
return BytesArray.EMPTY;
|
||||
}
|
||||
public BytesReference readBytesReference(int length) throws IOException {
|
||||
ChannelBufferBytesReference ref = new ChannelBufferBytesReference(buffer.slice(buffer.readerIndex(), length));
|
||||
buffer.skipBytes(length);
|
||||
return ref;
|
||||
|
|
Loading…
Reference in New Issue