LUCENE-4764: add more-ram-consuming-but-faster DV format for facets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1445278 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-02-12 17:18:38 +00:00
parent 6c5b2f808e
commit 30e15a01c1
10 changed files with 454 additions and 1 deletions

View File

@ -159,6 +159,10 @@ New Features
to Lucene42DocValuesConsumer) if you want to make this tradeoff.
(Adrien Grand, Robert Muir)
* LUCENE-4764: A new Facet42Codec and Facet42DocValuesFormat provide
faster but more RAM-consuming facet performance. (Shai Erera, Mike
McCandless)
* LUCENE-4769: Added OrdinalsCache and CachedOrdsCountingFacetsAggregator
which uses the cache to obtain a document's ordinals. This aggregator
is faster than others, however consumes much more RAM.

View File

@ -31,7 +31,7 @@
<target name="run-encoding-benchmark" depends="compile-test">
<java classname="org.apache.lucene.util.encoding.EncodingSpeed" fork="true" failonerror="true">
<classpath refid="test.classpath" />
<classpath path="${build.dir}/classes/test" />
<classpath path="${build.dir}/classes/test" />
</java>
</target>

View File

@ -0,0 +1,47 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.PackedInts;
class Facet42BinaryDocValues extends BinaryDocValues {
private final byte[] bytes;
private final PackedInts.Reader addresses;
Facet42BinaryDocValues(DataInput in) throws IOException {
int totBytes = in.readVInt();
bytes = new byte[totBytes];
in.readBytes(bytes, 0, totBytes);
addresses = PackedInts.getReader(in);
}
@Override
public void get(int docID, BytesRef ret) {
int start = (int) addresses.get(docID);
ret.bytes = bytes;
ret.offset = start;
ret.length = (int) (addresses.get(docID+1)-start);
}
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
/**
* Same as {@link Lucene42Codec} except it uses {@link Facet42DocValuesFormat}
* for facet fields (faster-but-more-RAM-consuming doc values).
*
* <p>
* <b>NOTE</b>: this codec does not support facet partitions (see
* {@link FacetIndexingParams#getPartitionSize()}).
*
* <p>
* <b>NOTE</b>: this format cannot handle more than 2 GB
* of facet data in a single segment. If your usage may hit
* this limit, you can either use Lucene's default
* DocValuesFormat, limit the maximum segment size in your
* MergePolicy, or send us a patch fixing the limitation.
*
* @lucene.experimental
*/
public class Facet42Codec extends Lucene42Codec {
private final Set<String> facetFields;
private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName("Facet42");
private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42");
// must have that for SPI purposes
/** Default constructor, uses {@link FacetIndexingParams#ALL_PARENTS}. */
public Facet42Codec() {
this(FacetIndexingParams.ALL_PARENTS);
}
/**
* Initializes with the given {@link FacetIndexingParams}. Returns the proper
* {@link DocValuesFormat} for the fields that are returned by
* {@link FacetIndexingParams#getAllCategoryListParams()}.
*/
public Facet42Codec(FacetIndexingParams fip) {
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
throw new IllegalArgumentException("this Codec does not support partitions");
}
this.facetFields = new HashSet<String>();
for (CategoryListParams clp : fip.getAllCategoryListParams()) {
facetFields.add(clp.field);
}
}
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
if (facetFields.contains(field)) {
return facetsDVFormat;
} else {
return lucene42DVFormat;
}
}
}

View File

@ -0,0 +1,115 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/** Writer for {@link Facet42DocValuesFormat}. */
public class Facet42DocValuesConsumer extends DocValuesConsumer {
final IndexOutput out;
final int maxDoc;
final float acceptableOverheadRatio;
public Facet42DocValuesConsumer(SegmentWriteState state) throws IOException {
this(state, PackedInts.DEFAULT);
}
public Facet42DocValuesConsumer(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
this.acceptableOverheadRatio = acceptableOverheadRatio;
boolean success = false;
try {
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION);
out = state.directory.createOutput(fileName, state.context);
CodecUtil.writeHeader(out, Facet42DocValuesFormat.CODEC, Facet42DocValuesFormat.VERSION_CURRENT);
maxDoc = state.segmentInfo.getDocCount();
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
}
@Override
public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
// write the byte[] data
out.writeVInt(field.number);
long totBytes = 0;
for (BytesRef v : values) {
totBytes += v.length;
}
if (totBytes > Integer.MAX_VALUE) {
throw new IllegalStateException("too many facets in one segment: Facet42DocValues cannot handle more than 2 GB facet data per segment");
}
out.writeVInt((int) totBytes);
for (BytesRef v : values) {
out.writeBytes(v.bytes, v.offset, v.length);
}
PackedInts.Writer w = PackedInts.getWriter(out, maxDoc+1, PackedInts.bitsRequired(totBytes+1), acceptableOverheadRatio);
int address = 0;
for(BytesRef v : values) {
w.add(address);
address += v.length;
}
w.add(address);
w.finish();
}
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
}
@Override
public void close() throws IOException {
boolean success = false;
try {
out.writeVInt(-1); // write EOF marker
success = true;
} finally {
if (success) {
IOUtils.close(out);
} else {
IOUtils.closeWhileHandlingException(out);
}
}
}
}

View File

@ -0,0 +1,63 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* DocValues format that only handles binary doc values and
* is optimized for usage with facets. It uses more RAM than other
* formats in exchange for faster lookups.
*
* <p>
* <b>NOTE</b>: this format cannot handle more than 2 GB
* of facet data in a single segment. If your usage may hit
* this limit, you can either use Lucene's default
* DocValuesFormat, limit the maximum segment size in your
* MergePolicy, or send us a patch fixing the limitation.
*
* @lucene.experimental
*/
public final class Facet42DocValuesFormat extends DocValuesFormat {
public static final String CODEC = "FacetsDocValues";
public static final String EXTENSION = "fdv";
public static final int VERSION_START = 0;
public static final int VERSION_CURRENT = VERSION_START;
public Facet42DocValuesFormat() {
super("Facet42");
}
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Facet42DocValuesConsumer(state);
}
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
return new Facet42DocValuesProducer(state);
}
}

View File

@ -0,0 +1,80 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IOUtils;
class Facet42DocValuesProducer extends DocValuesProducer {
private final Map<Integer,Facet42BinaryDocValues> fields = new HashMap<Integer,Facet42BinaryDocValues>();
Facet42DocValuesProducer(SegmentReadState state) throws IOException {
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION);
IndexInput in = state.directory.openInput(fileName, state.context);
boolean success = false;
try {
CodecUtil.checkHeader(in, Facet42DocValuesFormat.CODEC,
Facet42DocValuesFormat.VERSION_START,
Facet42DocValuesFormat.VERSION_START);
int fieldNumber = in.readVInt();
while (fieldNumber != -1) {
fields.put(fieldNumber, new Facet42BinaryDocValues(in));
fieldNumber = in.readVInt();
}
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
}
}
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
return fields.get(field.number);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
}
@Override
public void close() throws IOException {
}
}

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Codec + DocValuesFormat that are optimized for facets.
</body>
</html>

View File

@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat

View File

@ -2,6 +2,8 @@ package org.apache.lucene.facet;
import java.util.Random;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.facet.codecs.facet42.Facet42Codec;
import org.apache.lucene.facet.encoding.DGapIntEncoder;
import org.apache.lucene.facet.encoding.DGapVInt8IntEncoder;
import org.apache.lucene.facet.encoding.EightFlagsIntEncoder;
@ -13,6 +15,8 @@ import org.apache.lucene.facet.encoding.UniqueValuesIntEncoder;
import org.apache.lucene.facet.encoding.VInt8IntEncoder;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -43,6 +47,24 @@ public class FacetTestCase extends LuceneTestCase {
new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))),
};
private static Codec savedDefault = null;
@BeforeClass
public static void beforeClassFacetTestCase() throws Exception {
if (random().nextDouble() < 0.3) {
savedDefault = Codec.getDefault(); // save to restore later
Codec.setDefault(new Facet42Codec());
}
}
@AfterClass
public static void afterClassFacetTestCase() throws Exception {
if (savedDefault != null) {
Codec.setDefault(savedDefault);
savedDefault = null;
}
}
/** Returns a {@link CategoryListParams} with random {@link IntEncoder} and field. */
public static CategoryListParams randomCategoryListParams() {
final String field = CategoryListParams.DEFAULT_FIELD + "$" + random().nextInt();