LUCENE-5321: remove Facet42DocValuesFormat and Facet46Codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1538245 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2013-11-02 18:36:52 +00:00
parent b4a343d6ba
commit 6befc56d8c
12 changed files with 14 additions and 532 deletions


@@ -205,6 +205,9 @@ API Changes:
* LUCENE-5313: Move preservePositionIncrements from setter to ctor in
  Analyzing/FuzzySuggester. (Areek Zillur via Mike McCandless)
* LUCENE-5321: Remove Facet42DocValuesFormat. Use DirectDocValuesFormat if you
  want to load the category list into memory. (Shai Erera, Mike McCandless)
Optimizations
* LUCENE-5225: The ToParentBlockJoinQuery only keeps tracks of the the child
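For users of the removed Facet46Codec/Facet42DocValuesFormat, the CHANGES entry above points to DirectDocValuesFormat as the way to keep the category list in RAM. The following is only a sketch, not part of this commit; it assumes the lucene-codecs module (org.apache.lucene.codecs.memory.DirectDocValuesFormat) is on the classpath and that facets are indexed under the default field "$facets" (CategoryListParams.DEFAULT_FIELD).

import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene46.Lucene46Codec;
import org.apache.lucene.codecs.memory.DirectDocValuesFormat;

/** Sketch only: route the facet field to an in-memory DocValuesFormat. */
public class DirectFacetsCodec extends Lucene46Codec {

  // DirectDocValuesFormat holds the doc values in RAM, roughly the trade-off
  // Facet42DocValuesFormat made (more RAM in exchange for faster lookups).
  private final DocValuesFormat facetsDVFormat = new DirectDocValuesFormat();

  @Override
  public DocValuesFormat getDocValuesFormatForField(String field) {
    // "$facets" is the default category list field; adjust this check if you
    // use custom FacetIndexingParams / CategoryListParams.
    if ("$facets".equals(field)) {
      return facetsDVFormat;
    }
    return super.getDocValuesFormatForField(field);
  }
}

Index-time wiring then mirrors the line this commit removes from TestDemoFacets: iwc.setCodec(new DirectFacetsCodec()) where the test previously called iwc.setCodec(new Facet46Codec()).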


@@ -1,53 +0,0 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
class Facet42BinaryDocValues extends BinaryDocValues {
private final byte[] bytes;
private final PackedInts.Reader addresses;
Facet42BinaryDocValues(DataInput in) throws IOException {
int totBytes = in.readVInt();
bytes = new byte[totBytes];
in.readBytes(bytes, 0, totBytes);
addresses = PackedInts.getReader(in);
}
@Override
public void get(int docID, BytesRef ret) {
int start = (int) addresses.get(docID);
ret.bytes = bytes;
ret.offset = start;
ret.length = (int) (addresses.get(docID+1)-start);
}
/** Returns approximate RAM bytes used */
public long ramBytesUsed() {
return RamUsageEstimator.sizeOf(bytes) + addresses.ramBytesUsed();
}
}


@@ -1,126 +0,0 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
/** Writer for {@link Facet42DocValuesFormat}. */
public class Facet42DocValuesConsumer extends DocValuesConsumer {
final IndexOutput out;
final int maxDoc;
final float acceptableOverheadRatio;
public Facet42DocValuesConsumer(SegmentWriteState state) throws IOException {
this(state, PackedInts.DEFAULT);
}
public Facet42DocValuesConsumer(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
this.acceptableOverheadRatio = acceptableOverheadRatio;
boolean success = false;
try {
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION);
out = state.directory.createOutput(fileName, state.context);
CodecUtil.writeHeader(out, Facet42DocValuesFormat.CODEC, Facet42DocValuesFormat.VERSION_CURRENT);
maxDoc = state.segmentInfo.getDocCount();
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}
@Override
public void addNumericField(FieldInfo field, Iterable<Number> values) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
}
@Override
public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
// write the byte[] data
out.writeVInt(field.number);
long totBytes = 0;
for (BytesRef v : values) {
if (v != null) {
totBytes += v.length;
}
}
if (totBytes > Integer.MAX_VALUE) {
throw new IllegalStateException("too many facets in one segment: Facet42DocValues cannot handle more than 2 GB facet data per segment");
}
out.writeVInt((int) totBytes);
for (BytesRef v : values) {
if (v != null) {
out.writeBytes(v.bytes, v.offset, v.length);
}
}
PackedInts.Writer w = PackedInts.getWriter(out, maxDoc+1, PackedInts.bitsRequired(totBytes+1), acceptableOverheadRatio);
int address = 0;
for(BytesRef v : values) {
w.add(address);
if (v != null) {
address += v.length;
}
}
w.add(address);
w.finish();
}
@Override
public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
}
@Override
public void addSortedSetField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrdCount, Iterable<Number> ords) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues can only handle binary fields");
}
@Override
public void close() throws IOException {
boolean success = false;
try {
out.writeVInt(-1); // write EOF marker
success = true;
} finally {
if (success) {
IOUtils.close(out);
} else {
IOUtils.closeWhileHandlingException(out);
}
}
}
}


@@ -1,63 +0,0 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* DocValues format that only handles binary doc values and
* is optimized for usage with facets. It uses more RAM than other
* formats in exchange for faster lookups.
*
* <p>
* <b>NOTE</b>: this format cannot handle more than 2 GB
* of facet data in a single segment. If your usage may hit
* this limit, you can either use Lucene's default
* DocValuesFormat, limit the maximum segment size in your
* MergePolicy, or send us a patch fixing the limitation.
*
* @lucene.experimental
*/
public final class Facet42DocValuesFormat extends DocValuesFormat {
public static final String CODEC = "FacetsDocValues";
public static final String EXTENSION = "fdv";
public static final int VERSION_START = 0;
public static final int VERSION_CURRENT = VERSION_START;
public Facet42DocValuesFormat() {
super("Facet42");
}
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Facet42DocValuesConsumer(state);
}
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
return new Facet42DocValuesProducer(state);
}
}


@@ -1,103 +0,0 @@
package org.apache.lucene.facet.codecs.facet42;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IOUtils;
class Facet42DocValuesProducer extends DocValuesProducer {
private final Map<Integer,Facet42BinaryDocValues> fields = new HashMap<Integer,Facet42BinaryDocValues>();
private final int maxDoc;
Facet42DocValuesProducer(SegmentReadState state) throws IOException {
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Facet42DocValuesFormat.EXTENSION);
IndexInput in = state.directory.openInput(fileName, state.context);
this.maxDoc = state.segmentInfo.getDocCount();
boolean success = false;
try {
CodecUtil.checkHeader(in, Facet42DocValuesFormat.CODEC,
Facet42DocValuesFormat.VERSION_START,
Facet42DocValuesFormat.VERSION_START);
int fieldNumber = in.readVInt();
while (fieldNumber != -1) {
fields.put(fieldNumber, new Facet42BinaryDocValues(in));
fieldNumber = in.readVInt();
}
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
}
}
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
}
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
return fields.get(field.number);
}
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
throw new UnsupportedOperationException("FacetsDocValues only implements binary");
}
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
return new Bits.MatchAllBits(maxDoc); // TODO: have codec impl this?
}
@Override
public void close() throws IOException {
}
@Override
public long ramBytesUsed() {
long size = 0;
for (Facet42BinaryDocValues entry: fields.values()) {
size += entry.ramBytesUsed() + Integer.SIZE;
}
return size;
}
}


@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Codec + DocValuesFormat that are optimized for facets.
</body>
</html>


@@ -1,79 +0,0 @@
package org.apache.lucene.facet.codecs.facet46;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene46.Lucene46Codec;
import org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
/**
* Same as {@link Lucene46Codec} except it uses {@link Facet42DocValuesFormat}
* for facet fields (faster-but-more-RAM-consuming doc values).
*
* <p>
* <b>NOTE</b>: this codec does not support facet partitions (see
* {@link FacetIndexingParams#getPartitionSize()}).
*
* <p>
* <b>NOTE</b>: this format cannot handle more than 2 GB
* of facet data in a single segment. If your usage may hit
* this limit, you can either use Lucene's default
* DocValuesFormat, limit the maximum segment size in your
* MergePolicy, or send us a patch fixing the limitation.
*
* @lucene.experimental
*/
public class Facet46Codec extends Lucene46Codec {
private final Set<String> facetFields;
private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName("Facet42");
/** Default constructor, uses {@link FacetIndexingParams#DEFAULT}. */
public Facet46Codec() {
this(FacetIndexingParams.DEFAULT);
}
/**
* Initializes with the given {@link FacetIndexingParams}. Returns the proper
* {@link DocValuesFormat} for the fields that are returned by
* {@link FacetIndexingParams#getAllCategoryListParams()}.
*/
public Facet46Codec(FacetIndexingParams fip) {
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
throw new IllegalArgumentException("this Codec does not support partitions");
}
this.facetFields = new HashSet<String>();
for (CategoryListParams clp : fip.getAllCategoryListParams()) {
facetFields.add(clp.field);
}
}
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
if (facetFields.contains(field)) {
return facetsDVFormat;
} else {
return super.getDocValuesFormatForField(field);
}
}
}


@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Codec + DocValuesFormat that are optimized for facets.
</body>
</html>


@@ -1,16 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat


@@ -2,8 +2,6 @@ package org.apache.lucene.facet;
import java.util.Random;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.facet.codecs.facet46.Facet46Codec;
import org.apache.lucene.facet.encoding.DGapIntEncoder;
import org.apache.lucene.facet.encoding.DGapVInt8IntEncoder;
import org.apache.lucene.facet.encoding.EightFlagsIntEncoder;
@@ -15,8 +13,6 @@ import org.apache.lucene.facet.encoding.UniqueValuesIntEncoder;
import org.apache.lucene.facet.encoding.VInt8IntEncoder;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -47,24 +43,6 @@ public abstract class FacetTestCase extends LuceneTestCase {
new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))),
};
private static Codec savedDefault = null;
@BeforeClass
public static void beforeClassFacetTestCase() throws Exception {
if (random().nextDouble() < 0.3) {
savedDefault = Codec.getDefault(); // save to restore later
Codec.setDefault(new Facet46Codec());
}
}
@AfterClass
public static void afterClassFacetTestCase() throws Exception {
if (savedDefault != null) {
Codec.setDefault(savedDefault);
savedDefault = null;
}
}
/** Returns a {@link CategoryListParams} with random {@link IntEncoder} and field. */
public static CategoryListParams randomCategoryListParams() {
final String field = CategoryListParams.DEFAULT_FIELD + "$" + random().nextInt();


@@ -31,7 +31,6 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetTestUtils;
import org.apache.lucene.facet.codecs.facet46.Facet46Codec;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
@@ -257,10 +256,10 @@ public class TestDemoFacets extends FacetTestCase {
// LUCENE-4583: make sure if we require > 32 KB for one
// document, we don't hit exc when using Facet42DocValuesFormat
public void testManyFacetsInOneDocument() throws Exception {
assumeTrue("default Codec doesn't support huge BinaryDocValues", _TestUtil.fieldSupportsHugeBinaryDocValues(CategoryListParams.DEFAULT_FIELD));
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setCodec(new Facet46Codec());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);


@@ -28,15 +28,11 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetTestUtils;
import org.apache.lucene.facet.codecs.facet42.Facet42DocValuesFormat;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
@@ -434,16 +430,6 @@ public class TestDrillSideways extends FacetTestCase {
boolean canUseDV = defaultCodecSupportsSortedSet();
// TestRuleSetupAndRestoreClassEnv can sometimes
// randomly pick the non-general Facet42DocValuesFormat:
DocValuesFormat dvf = Codec.getDefault().docValuesFormat();
if (dvf instanceof PerFieldDocValuesFormat) {
dvf = ((PerFieldDocValuesFormat) dvf).getDocValuesFormatForField("$facets");
}
if (dvf instanceof Facet42DocValuesFormat) {
canUseDV = false;
}
while (aChance == 0.0) {
aChance = random().nextDouble();
}