mirror of https://github.com/apache/lucene.git
LUCENE-4764: add more-ram-consuming-but-faster DV format for facets
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1445278 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6c5b2f808e
commit
30e15a01c1
|
@ -159,6 +159,10 @@ New Features
|
|||
to Lucene42DocValuesConsumer) if you want to make this tradeoff.
|
||||
(Adrien Grand, Robert Muir)
|
||||
|
||||
* LUCENE-4764: A new Facet42Codec and Facet42DocValuesFormat provide
|
||||
faster but more RAM-consuming facet performance. (Shai Erera, Mike
|
||||
McCandless)
|
||||
|
||||
* LUCENE-4769: Added OrdinalsCache and CachedOrdsCountingFacetsAggregator
|
||||
which uses the cache to obtain a document's ordinals. This aggregator
|
||||
is faster than others, however consumes much more RAM.
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
<target name="run-encoding-benchmark" depends="compile-test">
|
||||
<java classname="org.apache.lucene.util.encoding.EncodingSpeed" fork="true" failonerror="true">
|
||||
<classpath refid="test.classpath" />
|
||||
<classpath path="${build.dir}/classes/test" />
|
||||
<classpath path="${build.dir}/classes/test" />
|
||||
</java>
|
||||
</target>
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.facet.codecs.facet42;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
class Facet42BinaryDocValues extends BinaryDocValues {
|
||||
|
||||
private final byte[] bytes;
|
||||
private final PackedInts.Reader addresses;
|
||||
|
||||
Facet42BinaryDocValues(DataInput in) throws IOException {
|
||||
int totBytes = in.readVInt();
|
||||
bytes = new byte[totBytes];
|
||||
in.readBytes(bytes, 0, totBytes);
|
||||
addresses = PackedInts.getReader(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(int docID, BytesRef ret) {
|
||||
int start = (int) addresses.get(docID);
|
||||
ret.bytes = bytes;
|
||||
ret.offset = start;
|
||||
ret.length = (int) (addresses.get(docID+1)-start);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package org.apache.lucene.facet.codecs.facet42;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
|
||||
import org.apache.lucene.facet.params.CategoryListParams;
|
||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||
|
||||
/**
|
||||
* Same as {@link Lucene42Codec} except it uses {@link Facet42DocValuesFormat}
|
||||
* for facet fields (faster-but-more-RAM-consuming doc values).
|
||||
*
|
||||
* <p>
|
||||
* <b>NOTE</b>: this codec does not support facet partitions (see
|
||||
* {@link FacetIndexingParams#getPartitionSize()}).
|
||||
*
|
||||
* <p>
|
||||
* <b>NOTE</b>: this format cannot handle more than 2 GB
|
||||
* of facet data in a single segment. If your usage may hit
|
||||
* this limit, you can either use Lucene's default
|
||||
* DocValuesFormat, limit the maximum segment size in your
|
||||
* MergePolicy, or send us a patch fixing the limitation.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class Facet42Codec extends Lucene42Codec {
|
||||
|
||||
private final Set<String> facetFields;
|
||||
private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName("Facet42");
|
||||
private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42");
|
||||
|
||||
// must have that for SPI purposes
|
||||
/** Default constructor, uses {@link FacetIndexingParams#ALL_PARENTS}. */
|
||||
public Facet42Codec() {
|
||||
this(FacetIndexingParams.ALL_PARENTS);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes with the given {@link FacetIndexingParams}. Returns the proper
|
||||
* {@link DocValuesFormat} for the fields that are returned by
|
||||
* {@link FacetIndexingParams#getAllCategoryListParams()}.
|
||||
*/
|
||||
public Facet42Codec(FacetIndexingParams fip) {
|
||||
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
|
||||
throw new IllegalArgumentException("this Codec does not support partitions");
|
||||
}
|
||||
this.facetFields = new HashSet<String>();
|
||||
for (CategoryListParams clp : fip.getAllCategoryListParams()) {
|
||||
facetFields.add(clp.field);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesFormat getDocValuesFormatForField(String field) {
|
||||
if (facetFields.contains(field)) {
|
||||
return facetsDVFormat;
|
||||
} else {
|
||||
return lucene42DVFormat;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
package org.apache.lucene.facet.codecs.facet42;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/** Writer for {@link Facet42DocValuesFormat}. */
|
||||
public class Facet42DocValuesConsumer extends DocValuesConsumer {
|
||||
|
||||
final IndexOutput out;
|
||||
final int maxDoc;
|
||||
final float acceptableOverheadRatio;
|
||||
|
||||
public Facet42DocValuesConsumer(SegmentWriteState state) throws IOException {
|
||||
this(state, PackedInts.DEFAULT);
|
||||
}
|
||||
|
||||
public Facet42DocValuesConsumer(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
|
||||
this.acceptableOverheadRatio = acceptableOverheadRatio;
|
||||
boolean success = false;
|
||||
try {
|
||||
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION);
|
||||
out = state.directory.createOutput(fileName, state.context);
|
||||
CodecUtil.writeHeader(out, Facet42DocValuesFormat.CODEC, Facet42DocValuesFormat.VERSION_CURRENT);
|
||||
maxDoc = state.segmentInfo.getDocCount();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(this);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
|
||||
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
|
||||
// write the byte[] data
|
||||
out.writeVInt(field.number);
|
||||
|
||||
long totBytes = 0;
|
||||
for (BytesRef v : values) {
|
||||
totBytes += v.length;
|
||||
}
|
||||
|
||||
if (totBytes > Integer.MAX_VALUE) {
|
||||
throw new IllegalStateException("too many facets in one segment: Facet42DocValues cannot handle more than 2 GB facet data per segment");
|
||||
}
|
||||
|
||||
out.writeVInt((int) totBytes);
|
||||
|
||||
for (BytesRef v : values) {
|
||||
out.writeBytes(v.bytes, v.offset, v.length);
|
||||
}
|
||||
|
||||
PackedInts.Writer w = PackedInts.getWriter(out, maxDoc+1, PackedInts.bitsRequired(totBytes+1), acceptableOverheadRatio);
|
||||
|
||||
int address = 0;
|
||||
for(BytesRef v : values) {
|
||||
w.add(address);
|
||||
address += v.length;
|
||||
}
|
||||
w.add(address);
|
||||
w.finish();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
|
||||
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
boolean success = false;
|
||||
try {
|
||||
out.writeVInt(-1); // write EOF marker
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(out);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package org.apache.lucene.facet.codecs.facet42;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
||||
/**
|
||||
* DocValues format that only handles binary doc values and
|
||||
* is optimized for usage with facets. It uses more RAM than other
|
||||
* formats in exchange for faster lookups.
|
||||
*
|
||||
* <p>
|
||||
* <b>NOTE</b>: this format cannot handle more than 2 GB
|
||||
* of facet data in a single segment. If your usage may hit
|
||||
* this limit, you can either use Lucene's default
|
||||
* DocValuesFormat, limit the maximum segment size in your
|
||||
* MergePolicy, or send us a patch fixing the limitation.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class Facet42DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
public static final String CODEC = "FacetsDocValues";
|
||||
public static final String EXTENSION = "fdv";
|
||||
public static final int VERSION_START = 0;
|
||||
public static final int VERSION_CURRENT = VERSION_START;
|
||||
|
||||
public Facet42DocValuesFormat() {
|
||||
super("Facet42");
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
|
||||
return new Facet42DocValuesConsumer(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
|
||||
return new Facet42DocValuesProducer(state);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
package org.apache.lucene.facet.codecs.facet42;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.DocValuesProducer;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SortedDocValues;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
class Facet42DocValuesProducer extends DocValuesProducer {
|
||||
|
||||
private final Map<Integer,Facet42BinaryDocValues> fields = new HashMap<Integer,Facet42BinaryDocValues>();
|
||||
|
||||
Facet42DocValuesProducer(SegmentReadState state) throws IOException {
|
||||
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION);
|
||||
IndexInput in = state.directory.openInput(fileName, state.context);
|
||||
boolean success = false;
|
||||
try {
|
||||
CodecUtil.checkHeader(in, Facet42DocValuesFormat.CODEC,
|
||||
Facet42DocValuesFormat.VERSION_START,
|
||||
Facet42DocValuesFormat.VERSION_START);
|
||||
int fieldNumber = in.readVInt();
|
||||
while (fieldNumber != -1) {
|
||||
fields.put(fieldNumber, new Facet42BinaryDocValues(in));
|
||||
fieldNumber = in.readVInt();
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.close(in);
|
||||
} else {
|
||||
IOUtils.closeWhileHandlingException(in);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
|
||||
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
|
||||
}
|
||||
|
||||
@Override
|
||||
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
|
||||
return fields.get(field.number);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SortedDocValues getSorted(FieldInfo field) throws IOException {
|
||||
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Codec + DocValuesFormat that are optimized for facets.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,16 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat
|
|
@ -2,6 +2,8 @@ package org.apache.lucene.facet;
|
|||
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.facet.codecs.facet42.Facet42Codec;
|
||||
import org.apache.lucene.facet.encoding.DGapIntEncoder;
|
||||
import org.apache.lucene.facet.encoding.DGapVInt8IntEncoder;
|
||||
import org.apache.lucene.facet.encoding.EightFlagsIntEncoder;
|
||||
|
@ -13,6 +15,8 @@ import org.apache.lucene.facet.encoding.UniqueValuesIntEncoder;
|
|||
import org.apache.lucene.facet.encoding.VInt8IntEncoder;
|
||||
import org.apache.lucene.facet.params.CategoryListParams;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -43,6 +47,24 @@ public class FacetTestCase extends LuceneTestCase {
|
|||
new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))),
|
||||
};
|
||||
|
||||
private static Codec savedDefault = null;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClassFacetTestCase() throws Exception {
|
||||
if (random().nextDouble() < 0.3) {
|
||||
savedDefault = Codec.getDefault(); // save to restore later
|
||||
Codec.setDefault(new Facet42Codec());
|
||||
}
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClassFacetTestCase() throws Exception {
|
||||
if (savedDefault != null) {
|
||||
Codec.setDefault(savedDefault);
|
||||
savedDefault = null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns a {@link CategoryListParams} with random {@link IntEncoder} and field. */
|
||||
public static CategoryListParams randomCategoryListParams() {
|
||||
final String field = CategoryListParams.DEFAULT_FIELD + "$" + random().nextInt();
|
||||
|
|
Loading…
Reference in New Issue