From 45f27fe96a1a52e1fb86c9a130aa5565a45809d9 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Sun, 20 Jan 2013 23:34:02 +0100 Subject: [PATCH] add packed bytes variant for strings/bytes --- .../fielddata/IndexFieldDataService.java | 1 + .../plain/PackedBytesAtomicFieldData.java | 750 ++++++++++++++++++ .../plain/PackedBytesIndexFieldData.java | 193 +++++ .../PackedBytesStringFieldDataTests.java | 35 + 4 files changed, 979 insertions(+) create mode 100644 src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesAtomicFieldData.java create mode 100644 src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesIndexFieldData.java create mode 100644 src/test/java/org/elasticsearch/test/unit/index/fielddata/PackedBytesStringFieldDataTests.java diff --git a/src/main/java/org/elasticsearch/index/fielddata/IndexFieldDataService.java b/src/main/java/org/elasticsearch/index/fielddata/IndexFieldDataService.java index c5e5ef492b3..2cb4d966f9b 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/IndexFieldDataService.java +++ b/src/main/java/org/elasticsearch/index/fielddata/IndexFieldDataService.java @@ -56,6 +56,7 @@ public class IndexFieldDataService extends AbstractIndexComponent { buildersByTypeAndFormat = MapBuilder., IndexFieldData.Builder>newMapBuilder() .put(Tuple.tuple("string", "concrete_bytes"), new ConcreteBytesRefIndexFieldData.Builder()) + .put(Tuple.tuple("string", "packed_bytes"), new PackedBytesIndexFieldData.Builder()) .put(Tuple.tuple("float", "array"), new FloatArrayIndexFieldData.Builder()) .put(Tuple.tuple("double", "array"), new DoubleArrayIndexFieldData.Builder()) .put(Tuple.tuple("byte", "array"), new ByteArrayIndexFieldData.Builder()) diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesAtomicFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesAtomicFieldData.java new file mode 100644 index 00000000000..0e251617e25 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesAtomicFieldData.java @@ -0,0 +1,750 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.fielddata.plain; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.common.RamUsage; +import org.elasticsearch.common.lucene.HashedBytesRef; +import org.elasticsearch.index.fielddata.*; +import org.elasticsearch.index.fielddata.ordinals.Ordinals; +import org.elasticsearch.index.fielddata.util.BytesRefArrayRef; +import org.elasticsearch.index.fielddata.util.IntArrayRef; +import org.elasticsearch.index.fielddata.util.StringArrayRef; + +/** + */ +public class PackedBytesAtomicFieldData implements AtomicOrdinalFieldData { + + // 0 ordinal in values means no value (its null) + private final PagedBytes.Reader bytes; + private final PackedInts.Reader termOrdToBytesOffset; + private final Ordinals ordinals; + + private int[] hashes; + private long size = -1; + + public PackedBytesAtomicFieldData(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals ordinals) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.ordinals = ordinals; + } + + @Override + public boolean isMultiValued() { + return ordinals.isMultiValued(); + } + + @Override + public int getNumDocs() { + return ordinals.getNumDocs(); + } + + @Override + public boolean isValuesOrdered() { + return true; + } + + @Override + public long getMemorySizeInBytes() { + if (size == -1) { + long size = ordinals.getMemorySizeInBytes(); + // PackedBytes + size += RamUsage.NUM_BYTES_ARRAY_HEADER + bytes.getBlocks().length; + for (byte[] b : bytes.getBlocks()) { + size += b.length; + } + // PackedInts + size += termOrdToBytesOffset.ramBytesUsed(); + this.size = size; + } + return size; + } + + @Override + public OrdinalsBytesValues getBytesValues() { + return ordinals.isMultiValued() ? new BytesValues.Multi(bytes, termOrdToBytesOffset, ordinals.ordinals()) : new BytesValues.Single(bytes, termOrdToBytesOffset, ordinals.ordinals()); + } + + @Override + public OrdinalsHashedBytesValues getHashedBytesValues() { + if (hashes == null) { + int numberOfValues = termOrdToBytesOffset.size(); + int[] hashes = new int[numberOfValues]; + BytesRef scratch = new BytesRef(); + for (int i = 0; i < numberOfValues; i++) { + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(i)); + hashes[i] = value == null ? 0 : value.hashCode(); + } + this.hashes = hashes; + } + return ordinals.isMultiValued() ? new HashedBytesValues.Multi(bytes, termOrdToBytesOffset, hashes, ordinals.ordinals()) : new HashedBytesValues.Single(bytes, termOrdToBytesOffset, hashes, ordinals.ordinals()); + } + + @Override + public OrdinalsStringValues getStringValues() { + return ordinals.isMultiValued() ? new StringValues.Multi(bytes, termOrdToBytesOffset, ordinals.ordinals()) : new StringValues.Single(bytes, termOrdToBytesOffset, ordinals.ordinals()); + } + + @Override + public ScriptDocValues.Strings getScriptValues() { + return new ScriptDocValues.Strings(getStringValues()); + } + + static abstract class BytesValues implements org.elasticsearch.index.fielddata.OrdinalsBytesValues { + + protected final PagedBytes.Reader bytes; + protected final PackedInts.Reader termOrdToBytesOffset; + protected final Ordinals.Docs ordinals; + + protected final BytesRef scratch = new BytesRef(); + + BytesValues(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.ordinals = ordinals; + } + + @Override + public Ordinals.Docs ordinals() { + return this.ordinals; + } + + @Override + public BytesRef getValueByOrd(int ord) { + return bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + } + + @Override + public BytesRef getValueScratchByOrd(int ord, BytesRef ret) { + return bytes.fill(ret, termOrdToBytesOffset.get(ord)); + } + + @Override + public BytesRef getSafeValueByOrd(int ord) { + return bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)); + } + + @Override + public boolean hasValue(int docId) { + return ordinals.getOrd(docId) != 0; + } + + @Override + public BytesRef makeSafe(BytesRef bytes) { + return BytesRef.deepCopyOf(bytes); + } + + @Override + public BytesRef getValue(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return null; + return bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + } + + @Override + public BytesRef getValueScratch(int docId, BytesRef ret) { + return bytes.fill(ret, termOrdToBytesOffset.get(ordinals.getOrd(docId))); + } + + @Override + public BytesRef getValueSafe(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return null; + return bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)); + } + + static class Single extends BytesValues { + + private final BytesRefArrayRef arrayScratch = new BytesRefArrayRef(new BytesRef[1], 1); + private final Iter.Single iter = new Iter.Single(); + + Single(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { + super(bytes, termOrdToBytesOffset, ordinals); + } + + @Override + public boolean isMultiValued() { + return false; + } + + @Override + public BytesRefArrayRef getValues(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return BytesRefArrayRef.EMPTY; + arrayScratch.values[0] = bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)); + return arrayScratch; + } + + @Override + public Iter getIter(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return Iter.Empty.INSTANCE; + return iter.reset(bytes.fill(scratch, termOrdToBytesOffset.get(ord))); + } + + @Override + public Iter getIterSafe(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return Iter.Empty.INSTANCE; + return iter.reset(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord))); + } + + @Override + public void forEachValueInDoc(int docId, ValueInDocProc proc) { + int ord = ordinals.getOrd(docId); + if (ord == 0) { + proc.onMissing(docId); + } else { + proc.onValue(docId, bytes.fill(scratch, termOrdToBytesOffset.get(ord))); + } + } + + @Override + public void forEachSafeValueInDoc(int docId, ValueInDocProc proc) { + int ord = ordinals.getOrd(docId); + if (ord == 0) { + proc.onMissing(docId); + } else { + proc.onValue(docId, bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord))); + } + } + } + + static class Multi extends BytesValues { + + private final BytesRefArrayRef arrayScratch = new BytesRefArrayRef(new BytesRef[10], 0); + private final ValuesIter iter; + private final SafeValuesIter safeIter; + + Multi(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { + super(bytes, termOrdToBytesOffset, ordinals); + this.iter = new ValuesIter(bytes, termOrdToBytesOffset); + this.safeIter = new SafeValuesIter(bytes, termOrdToBytesOffset); + } + + @Override + public boolean isMultiValued() { + return true; + } + + @Override + public BytesRefArrayRef getValues(int docId) { + IntArrayRef ords = ordinals.getOrds(docId); + int size = ords.size(); + if (size == 0) return BytesRefArrayRef.EMPTY; + + arrayScratch.reset(size); + for (int i = ords.start; i < ords.end; i++) { + arrayScratch.values[arrayScratch.end++] = bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ords.values[i])); + } + return arrayScratch; + } + + @Override + public Iter getIter(int docId) { + return iter.reset(ordinals.getIter(docId)); + } + + @Override + public Iter getIterSafe(int docId) { + return safeIter.reset(ordinals.getIter(docId)); + } + + @Override + public void forEachValueInDoc(int docId, ValueInDocProc proc) { + Ordinals.Docs.Iter iter = ordinals.getIter(docId); + int ord = iter.next(); + if (ord == 0) { + proc.onMissing(docId); + return; + } + do { + proc.onValue(docId, bytes.fill(scratch, termOrdToBytesOffset.get(ord))); + } while ((ord = iter.next()) != 0); + } + + @Override + public void forEachSafeValueInDoc(int docId, ValueInDocProc proc) { + Ordinals.Docs.Iter iter = ordinals.getIter(docId); + int ord = iter.next(); + if (ord == 0) { + proc.onMissing(docId); + return; + } + do { + proc.onValue(docId, bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord))); + } while ((ord = iter.next()) != 0); + } + + static class ValuesIter implements Iter { + + private final PagedBytes.Reader bytes; + private final PackedInts.Reader termOrdToBytesOffset; + private final BytesRef scratch = new BytesRef(); + private Ordinals.Docs.Iter ordsIter; + private int ord; + + ValuesIter(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + } + + public ValuesIter reset(Ordinals.Docs.Iter ordsIter) { + this.ordsIter = ordsIter; + this.ord = ordsIter.next(); + return this; + } + + @Override + public boolean hasNext() { + return ord != 0; + } + + @Override + public BytesRef next() { + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + ord = ordsIter.next(); + return value; + } + } + + static class SafeValuesIter implements Iter { + + private final PagedBytes.Reader bytes; + private final PackedInts.Reader termOrdToBytesOffset; + private Ordinals.Docs.Iter ordsIter; + private int ord; + + SafeValuesIter(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + } + + public SafeValuesIter reset(Ordinals.Docs.Iter ordsIter) { + this.ordsIter = ordsIter; + this.ord = ordsIter.next(); + return this; + } + + @Override + public boolean hasNext() { + return ord != 0; + } + + @Override + public BytesRef next() { + BytesRef value = bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)); + ord = ordsIter.next(); + return value; + } + } + } + } + + static abstract class HashedBytesValues implements org.elasticsearch.index.fielddata.OrdinalsHashedBytesValues { + + protected final PagedBytes.Reader bytes; + protected final PackedInts.Reader termOrdToBytesOffset; + protected final int[] hashes; + protected final Ordinals.Docs ordinals; + + protected final BytesRef scratch1 = new BytesRef(); + protected final HashedBytesRef scratch = new HashedBytesRef(); + + HashedBytesValues(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, int[] hashes, Ordinals.Docs ordinals) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.hashes = hashes; + this.ordinals = ordinals; + } + + @Override + public Ordinals.Docs ordinals() { + return this.ordinals; + } + + @Override + public HashedBytesRef getValueByOrd(int ord) { + return scratch.reset(bytes.fill(scratch1, termOrdToBytesOffset.get(ord)), hashes[ord]); + } + + @Override + public HashedBytesRef getSafeValueByOrd(int ord) { + return new HashedBytesRef(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)), hashes[ord]); + } + + @Override + public boolean hasValue(int docId) { + return ordinals.getOrd(docId) != 0; + } + + @Override + public HashedBytesRef makeSafe(HashedBytesRef bytes) { + return new HashedBytesRef(BytesRef.deepCopyOf(bytes.bytes), bytes.hash); + } + + @Override + public HashedBytesRef getValue(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return null; + return scratch.reset(bytes.fill(scratch1, termOrdToBytesOffset.get(ord)), hashes[ord]); + } + + @Override + public HashedBytesRef getValueSafe(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return null; + return new HashedBytesRef(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)), hashes[ord]); + } + + static class Single extends HashedBytesValues { + + private final Iter.Single iter = new Iter.Single(); + + Single(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, int[] hashes, Ordinals.Docs ordinals) { + super(bytes, termOrdToBytesOffset, hashes, ordinals); + } + + @Override + public boolean isMultiValued() { + return false; + } + + @Override + public Iter getIter(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return Iter.Empty.INSTANCE; + return iter.reset(scratch.reset(bytes.fill(scratch1, termOrdToBytesOffset.get(ord)), hashes[ord])); + } + + @Override + public Iter getIterSafe(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return Iter.Empty.INSTANCE; + return iter.reset(new HashedBytesRef(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)), hashes[ord])); + } + + @Override + public void forEachValueInDoc(int docId, ValueInDocProc proc) { + int ord = ordinals.getOrd(docId); + if (ord == 0) { + proc.onMissing(docId); + } else { + proc.onValue(docId, scratch.reset(bytes.fill(scratch1, termOrdToBytesOffset.get(ord)), hashes[ord])); + } + } + + @Override + public void forEachSafeValueInDoc(int docId, ValueInDocProc proc) { + int ord = ordinals.getOrd(docId); + if (ord == 0) { + proc.onMissing(docId); + } else { + proc.onValue(docId, new HashedBytesRef(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)), hashes[ord])); + } + } + } + + static class Multi extends HashedBytesValues { + + private final ValuesIter iter; + private final SafeValuesIter safeIter; + + Multi(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, int[] hashes, Ordinals.Docs ordinals) { + super(bytes, termOrdToBytesOffset, hashes, ordinals); + this.iter = new ValuesIter(bytes, termOrdToBytesOffset, hashes); + this.safeIter = new SafeValuesIter(bytes, termOrdToBytesOffset, hashes); + } + + @Override + public boolean isMultiValued() { + return true; + } + + @Override + public Iter getIter(int docId) { + return iter.reset(ordinals.getIter(docId)); + } + + @Override + public Iter getIterSafe(int docId) { + return safeIter.reset(ordinals.getIter(docId)); + } + + @Override + public void forEachValueInDoc(int docId, ValueInDocProc proc) { + Ordinals.Docs.Iter iter = ordinals.getIter(docId); + int ord = iter.next(); + if (ord == 0) { + proc.onMissing(docId); + return; + } + do { + proc.onValue(docId, scratch.reset(bytes.fill(scratch1, termOrdToBytesOffset.get(ord)), hashes[ord])); + } while ((ord = iter.next()) != 0); + } + + @Override + public void forEachSafeValueInDoc(int docId, ValueInDocProc proc) { + Ordinals.Docs.Iter iter = ordinals.getIter(docId); + int ord = iter.next(); + if (ord == 0) { + proc.onMissing(docId); + return; + } + do { + proc.onValue(docId, new HashedBytesRef(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)), hashes[ord])); + } while ((ord = iter.next()) != 0); + } + + static class ValuesIter implements Iter { + + private final PagedBytes.Reader bytes; + private final PackedInts.Reader termOrdToBytesOffset; + private final int[] hashes; + private Ordinals.Docs.Iter ordsIter; + private int ord; + + private final BytesRef scratch1 = new BytesRef(); + private final HashedBytesRef scratch = new HashedBytesRef(); + + ValuesIter(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, int[] hashes) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.hashes = hashes; + } + + public ValuesIter reset(Ordinals.Docs.Iter ordsIter) { + this.ordsIter = ordsIter; + this.ord = ordsIter.next(); + return this; + } + + @Override + public boolean hasNext() { + return ord != 0; + } + + @Override + public HashedBytesRef next() { + HashedBytesRef value = scratch.reset(bytes.fill(scratch1, termOrdToBytesOffset.get(ord)), hashes[ord]); + ord = ordsIter.next(); + return value; + } + } + + static class SafeValuesIter implements Iter { + + private final PagedBytes.Reader bytes; + private final PackedInts.Reader termOrdToBytesOffset; + private final int[] hashes; + private Ordinals.Docs.Iter ordsIter; + private int ord; + + SafeValuesIter(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, int[] hashes) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.hashes = hashes; + } + + public SafeValuesIter reset(Ordinals.Docs.Iter ordsIter) { + this.ordsIter = ordsIter; + this.ord = ordsIter.next(); + return this; + } + + @Override + public boolean hasNext() { + return ord != 0; + } + + @Override + public HashedBytesRef next() { + HashedBytesRef value = new HashedBytesRef(bytes.fill(new BytesRef(), termOrdToBytesOffset.get(ord)), hashes[ord]); + ord = ordsIter.next(); + return value; + } + } + } + } + + static abstract class StringValues implements OrdinalsStringValues { + + protected final PagedBytes.Reader bytes; + protected final PackedInts.Reader termOrdToBytesOffset; + protected final Ordinals.Docs ordinals; + + protected final BytesRef scratch = new BytesRef(); + + protected StringValues(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + this.ordinals = ordinals; + } + + @Override + public Ordinals.Docs ordinals() { + return ordinals; + } + + @Override + public String getValueByOrd(int ord) { + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + return value.utf8ToString(); + } + + @Override + public boolean hasValue(int docId) { + return ordinals.getOrd(docId) != 0; + } + + @Override + public String getValue(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return null; + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + return value.utf8ToString(); + } + + static class Single extends StringValues { + + private final StringArrayRef arrayScratch = new StringArrayRef(new String[1], 1); + private final Iter.Single iter = new Iter.Single(); + + Single(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { + super(bytes, termOrdToBytesOffset, ordinals); + } + + @Override + public boolean isMultiValued() { + return false; + } + + @Override + public StringArrayRef getValues(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return StringArrayRef.EMPTY; + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + arrayScratch.values[0] = value == null ? null : value.utf8ToString(); + return arrayScratch; + } + + @Override + public Iter getIter(int docId) { + int ord = ordinals.getOrd(docId); + if (ord == 0) return Iter.Empty.INSTANCE; + return iter.reset(bytes.fill(scratch, termOrdToBytesOffset.get(ord)).utf8ToString()); + } + + @Override + public void forEachValueInDoc(int docId, ValueInDocProc proc) { + int ord = ordinals.getOrd(docId); + if (ord == 0) { + proc.onMissing(docId); + return; + } + proc.onValue(docId, bytes.fill(scratch, termOrdToBytesOffset.get(ord)).utf8ToString()); + } + } + + static class Multi extends StringValues { + + private final StringArrayRef arrayScratch = new StringArrayRef(new String[10], 0); + private final ValuesIter iter; + + Multi(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, Ordinals.Docs ordinals) { + super(bytes, termOrdToBytesOffset, ordinals); + iter = new ValuesIter(bytes, termOrdToBytesOffset); + } + + @Override + public boolean isMultiValued() { + return true; + } + + @Override + public StringArrayRef getValues(int docId) { + IntArrayRef ords = ordinals.getOrds(docId); + int size = ords.size(); + if (size == 0) return StringArrayRef.EMPTY; + + arrayScratch.reset(size); + for (int i = ords.start; i < ords.end; i++) { + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ords.values[i])); + arrayScratch.values[arrayScratch.end++] = value == null ? null : value.utf8ToString(); + } + return arrayScratch; + } + + @Override + public Iter getIter(int docId) { + return iter.reset(ordinals.getIter(docId)); + } + + @Override + public void forEachValueInDoc(int docId, ValueInDocProc proc) { + Ordinals.Docs.Iter iter = ordinals.getIter(docId); + int ord = iter.next(); + if (ord == 0) { + proc.onMissing(docId); + return; + } + do { + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + proc.onValue(docId, value == null ? null : value.utf8ToString()); + } while ((ord = iter.next()) != 0); + } + + static class ValuesIter implements StringValues.Iter { + + private final PagedBytes.Reader bytes; + private final PackedInts.Reader termOrdToBytesOffset; + private final BytesRef scratch = new BytesRef(); + private Ordinals.Docs.Iter ordsIter; + private int ord; + + ValuesIter(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset) { + this.bytes = bytes; + this.termOrdToBytesOffset = termOrdToBytesOffset; + } + + public ValuesIter reset(Ordinals.Docs.Iter ordsIter) { + this.ordsIter = ordsIter; + this.ord = ordsIter.next(); + return this; + } + + @Override + public boolean hasNext() { + return ord != 0; + } + + @Override + public String next() { + BytesRef value = bytes.fill(scratch, termOrdToBytesOffset.get(ord)); + ord = ordsIter.next(); + return value == null ? null : value.utf8ToString(); + } + } + } + } +} diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesIndexFieldData.java new file mode 100644 index 00000000000..fb70a1f000f --- /dev/null +++ b/src/main/java/org/elasticsearch/index/fielddata/plain/PackedBytesIndexFieldData.java @@ -0,0 +1,193 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.fielddata.plain; + +import org.apache.lucene.index.*; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.packed.GrowableWriter; +import org.apache.lucene.util.packed.PackedInts; +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.fielddata.*; +import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource; +import org.elasticsearch.index.fielddata.ordinals.EmptyOrdinals; +import org.elasticsearch.index.fielddata.ordinals.MultiFlatArrayOrdinals; +import org.elasticsearch.index.fielddata.ordinals.SingleArrayOrdinals; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.settings.IndexSettings; + +import java.util.ArrayList; + +/** + */ +public class PackedBytesIndexFieldData extends AbstractIndexFieldData implements IndexOrdinalFieldData { + + public static class Builder implements IndexFieldData.Builder { + + @Override + public IndexFieldData build(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType type, IndexFieldDataCache cache) { + return new PackedBytesIndexFieldData(index, indexSettings, fieldNames, type, cache); + } + } + + public PackedBytesIndexFieldData(Index index, @IndexSettings Settings indexSettings, FieldMapper.Names fieldNames, FieldDataType fieldDataType, IndexFieldDataCache cache) { + super(index, indexSettings, fieldNames, fieldDataType, cache); + } + + @Override + public boolean valuesOrdered() { + return true; + } + + @Override + public PackedBytesAtomicFieldData load(AtomicReaderContext context) { + try { + return cache.load(context, this); + } catch (Throwable e) { + if (e instanceof ElasticSearchException) { + throw (ElasticSearchException) e; + } else { + throw new ElasticSearchException(e.getMessage(), e); + } + } + } + + @Override + public PackedBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception { + AtomicReader reader = context.reader(); + + Terms terms = reader.terms(getFieldNames().indexName()); + if (terms == null) { + final PagedBytes bytes = new PagedBytes(1); + // 0 is reserved for "unset" + bytes.copyUsingLengthPrefix(new BytesRef()); + GrowableWriter termOrdToBytesOffset = new GrowableWriter(1, 2, PackedInts.FASTEST); + return new PackedBytesAtomicFieldData(bytes.freeze(true), termOrdToBytesOffset.getMutable(), new EmptyOrdinals(reader.maxDoc())); + } + + final PagedBytes bytes = new PagedBytes(15); + int startBytesBPV; + int startTermsBPV; + int startNumUniqueTerms; + + int maxDoc = reader.maxDoc(); + final int termCountHardLimit; + if (maxDoc == Integer.MAX_VALUE) { + termCountHardLimit = Integer.MAX_VALUE; + } else { + termCountHardLimit = maxDoc + 1; + } + + // Try for coarse estimate for number of bits; this + // should be an underestimate most of the time, which + // is fine -- GrowableWriter will reallocate as needed + long numUniqueTerms = terms.size(); + if (numUniqueTerms != -1L) { + if (numUniqueTerms > termCountHardLimit) { + // app is misusing the API (there is more than + // one term per doc); in this case we make best + // effort to load what we can (see LUCENE-2142) + numUniqueTerms = termCountHardLimit; + } + + startBytesBPV = PackedInts.bitsRequired(numUniqueTerms * 4); + startTermsBPV = PackedInts.bitsRequired(numUniqueTerms); + + startNumUniqueTerms = (int) numUniqueTerms; + } else { + startBytesBPV = 1; + startTermsBPV = 1; + startNumUniqueTerms = 1; + } + + // TODO: expose this as an option..., have a nice parser for it... + float acceptableOverheadRatio = PackedInts.FAST; + + GrowableWriter termOrdToBytesOffset = new GrowableWriter(startBytesBPV, 1 + startNumUniqueTerms, acceptableOverheadRatio); + + ArrayList ordinals = new ArrayList(); + int[] idx = new int[reader.maxDoc()]; + ordinals.add(new int[reader.maxDoc()]); + + // 0 is reserved for "unset" + bytes.copyUsingLengthPrefix(new BytesRef()); + int termOrd = 1; + + TermsEnum termsEnum = terms.iterator(null); + try + + { + DocsEnum docsEnum = null; + for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) { + if (termOrd == termOrdToBytesOffset.size()) { + // NOTE: this code only runs if the incoming + // reader impl doesn't implement + // size (which should be uncommon) + termOrdToBytesOffset = termOrdToBytesOffset.resize(ArrayUtil.oversize(1 + termOrd, 1)); + } + termOrdToBytesOffset.set(termOrd, bytes.copyUsingLengthPrefix(term)); + + docsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, 0); + for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) { + int[] ordinal; + if (idx[docId] >= ordinals.size()) { + ordinal = new int[reader.maxDoc()]; + ordinals.add(ordinal); + } else { + ordinal = ordinals.get(idx[docId]); + } + ordinal[docId] = termOrd; + idx[docId]++; + } + termOrd++; + } + } catch (RuntimeException e) { + if (e.getClass().getName().endsWith("StopFillCacheException")) { + // all is well, in case numeric parsers are used. + } else { + throw e; + } + } + + PagedBytes.Reader bytesReader = bytes.freeze(true); + PackedInts.Reader termOrdToBytesOffsetReader = termOrdToBytesOffset.getMutable(); + + if (ordinals.size() == 1) { + return new PackedBytesAtomicFieldData(bytesReader, termOrdToBytesOffsetReader, new SingleArrayOrdinals(ordinals.get(0), termOrd)); + } else { + int[][] nativeOrdinals = new int[ordinals.size()][]; + for (int i = 0; i < nativeOrdinals.length; i++) { + nativeOrdinals[i] = ordinals.get(i); + } + return new PackedBytesAtomicFieldData(bytesReader, termOrdToBytesOffsetReader, new MultiFlatArrayOrdinals(nativeOrdinals, termOrd)); + } + + } + + @Override + public XFieldComparatorSource comparatorSource(@Nullable Object missingValue) { + // TODO support "missingValue" for sortMissingValue options here... + return new BytesRefFieldComparatorSource(this); + } +} diff --git a/src/test/java/org/elasticsearch/test/unit/index/fielddata/PackedBytesStringFieldDataTests.java b/src/test/java/org/elasticsearch/test/unit/index/fielddata/PackedBytesStringFieldDataTests.java new file mode 100644 index 00000000000..8dfdd1030f1 --- /dev/null +++ b/src/test/java/org/elasticsearch/test/unit/index/fielddata/PackedBytesStringFieldDataTests.java @@ -0,0 +1,35 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.test.unit.index.fielddata; + +import com.google.common.collect.ImmutableMap; +import org.elasticsearch.index.fielddata.FieldDataType; +import org.testng.annotations.Test; + +/** + */ +@Test +public class PackedBytesStringFieldDataTests extends StringFieldDataTests { + + @Override + protected FieldDataType getFieldDataType() { + return new FieldDataType("string", "packed_bytes", ImmutableMap.of()); + } +}