New addition: Lucene-4069 BloomFilterPostingsFormat for faster access to low-frequency terms such as primary keys.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1368567 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Harwood 2012-08-02 16:04:20 +00:00
parent 08209761d3
commit ad2cefee00
17 changed files with 1278 additions and 1 deletions

View File

@ -15,6 +15,9 @@ New features
underlying PayloadFunction's explanation as the explanation underlying PayloadFunction's explanation as the explanation
for the payload score. (Scott Smerchek via Robert Muir) for the payload score. (Scott Smerchek via Robert Muir)
* LUCENE-4069: Added BloomFilteringPostingsFormat for use with low-frequency terms
such as primary keys (Mark Harwood, Mike McCandless)
* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese * LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
iteration marks. (Robert Muir, Christian Moen) iteration marks. (Robert Muir, Christian Moen)

View File

@ -53,7 +53,13 @@ public abstract class PostingsFormat implements NamedSPILoader.NamedSPI {
/** Reads a segment. NOTE: by the time this call /** Reads a segment. NOTE: by the time this call
* returns, it must hold open any files it will need to * returns, it must hold open any files it will need to
* use; else, those files may be deleted. */ * use; else, those files may be deleted.
* Additionally, required files may be deleted during the execution of
* this call before there is a chance to open them. Under these
* circumstances an IOException should be thrown by the implementation.
* IOExceptions are expected and will automatically cause a retry of the
* segment opening logic with the newly revised segments.
* */
public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException; public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException;
@Override @Override

View File

@ -0,0 +1,63 @@
package org.apache.lucene.codecs.bloom;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.FuzzySet;
/**
* Class used to create index-time {@link FuzzySet} appropriately configured for
* each field. Also called to right-size bitsets for serialization.
* @lucene.experimental
*/
public abstract class BloomFilterFactory {
/**
*
* @param state The content to be indexed
* @param info
* the field requiring a BloomFilter
* @return An appropriately sized set or null if no BloomFiltering required
*/
public abstract FuzzySet getSetForField(SegmentWriteState state, FieldInfo info);
/**
* Called when downsizing bitsets for serialization
*
* @param fieldInfo
* The field with sparse set bits
* @param initialSet
* The bits accumulated
* @return null or a hopefully more densely packed, smaller bitset
*/
public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
// Aim for a bitset size that would have 10% of bits set (so 90% of searches
// would fail-fast)
float targetMaxSaturation = 0.1f;
return initialSet.downsize(targetMaxSaturation);
}
/**
* Used to determine if the given filter has reached saturation and should be retired i.e. not saved any more
* @param bloomFilter The bloomFilter being tested
* @param fieldInfo The field with which this filter is associated
* @return true if the set has reached saturation and should be retired
*/
public abstract boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo);
}

View File

@ -0,0 +1,514 @@
package org.apache.lucene.codecs.bloom;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FuzzySet;
import org.apache.lucene.util.FuzzySet.ContainsResult;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.hash.MurmurHash2;
/**
* <p>
* A {@link PostingsFormat} useful for low doc-frequency fields such as primary
* keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
* for reads in segments known to have no record of the key. A choice of
* delegate PostingsFormat is used to record all other Postings data.
* </p>
* <p>
* A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
* settings on a per-field basis. The default configuration is
* {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
* values using {@link MurmurHash2}. This should be suitable for most purposes.
* </p>
* <p>
* The format of the blm file is as follows:
* </p>
* <ul>
* <li>BloomFilter (.blm) --&gt; Header, DelegatePostingsFormatName,
* NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
* <li>Filter --&gt; FieldNumber, FuzzySet</li>
* <li>FuzzySet --&gt;See {@link FuzzySet#serialize(DataOutput)}</li>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>DelegatePostingsFormatName --&gt; {@link DataOutput#writeString(String)
* String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
* <li>NumFilteredFields --&gt; {@link DataOutput#writeInt Uint32}</li>
* <li>FieldNumber --&gt; {@link DataOutput#writeInt Uint32} The number of the
* field in this segment</li>
* </ul>
* @lucene.experimental
*/
public class BloomFilteringPostingsFormat extends PostingsFormat {
public static final String BLOOM_CODEC_NAME = "BloomFilter";
public static final int BLOOM_CODEC_VERSION = 1;
/** Extension of Bloom Filters file */
static final String BLOOM_EXTENSION = "blm";
BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
private PostingsFormat delegatePostingsFormat;
/**
* Creates Bloom filters for a selection of fields created in the index. This
* is recorded as a set of Bitsets held as a segment summary in an additional
* "blm" file. This PostingsFormat delegates to a choice of delegate
* PostingsFormat for encoding all other postings data.
*
* @param delegatePostingsFormat
* The PostingsFormat that records all the non-bloom filter data i.e.
* postings info.
* @param bloomFilterFactory
* The {@link BloomFilterFactory} responsible for sizing BloomFilters
* appropriately
*/
public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
BloomFilterFactory bloomFilterFactory) {
super(BLOOM_CODEC_NAME);
this.delegatePostingsFormat = delegatePostingsFormat;
this.bloomFilterFactory = bloomFilterFactory;
}
/**
* Creates Bloom filters for a selection of fields created in the index. This
* is recorded as a set of Bitsets held as a segment summary in an additional
* "blm" file. This PostingsFormat delegates to a choice of delegate
* PostingsFormat for encoding all other postings data. This choice of
* constructor defaults to the {@link DefaultBloomFilterFactory} for
* configuring per-field BloomFilters.
*
* @param delegatePostingsFormat
* The PostingsFormat that records all the non-bloom filter data i.e.
* postings info.
*/
public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
this(delegatePostingsFormat, new DefaultBloomFilterFactory());
}
// Used only by core Lucene at read-time via Service Provider instantiation -
// do not use at Write-time in application code.
public BloomFilteringPostingsFormat() {
super(BLOOM_CODEC_NAME);
}
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {
if (delegatePostingsFormat == null) {
throw new UnsupportedOperationException("Error - " + getClass().getName()
+ " has been constructed without a choice of PostingsFormat");
}
return new BloomFilteredFieldsConsumer(
delegatePostingsFormat.fieldsConsumer(state), state,
delegatePostingsFormat);
}
public FieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
return new BloomFilteredFieldsProducer(state);
}
public class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<String,FuzzySet>();
public BloomFilteredFieldsProducer(SegmentReadState state)
throws IOException {
String bloomFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
IndexInput bloomIn = null;
try {
bloomIn = state.dir.openInput(bloomFileName, state.context);
CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
BLOOM_CODEC_VERSION);
// // Load the hash function used in the BloomFilter
// hashFunction = HashFunction.forName(bloomIn.readString());
// Load the delegate postings format
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
.readString());
this.delegateFieldsProducer = delegatePostingsFormat
.fieldsProducer(state);
int numBlooms = bloomIn.readInt();
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
FuzzySet bloom = FuzzySet.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
} finally {
IOUtils.close(bloomIn);
}
}
public FieldsEnum iterator() throws IOException {
return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(),
bloomsByFieldName);
}
public void close() throws IOException {
delegateFieldsProducer.close();
}
public Terms terms(String field) throws IOException {
FuzzySet filter = bloomsByFieldName.get(field);
if (filter == null) {
return delegateFieldsProducer.terms(field);
} else {
Terms result = delegateFieldsProducer.terms(field);
if (result == null) {
return null;
}
return new BloomFilteredTerms(result, filter);
}
}
public int size() throws IOException {
return delegateFieldsProducer.size();
}
public long getUniqueTermCount() throws IOException {
return delegateFieldsProducer.getUniqueTermCount();
}
// Not all fields in a segment may be subject to a bloom filter. This class
// wraps Terms objects appropriately if a filtering request is present
class BloomFilteredFieldsEnum extends FieldsEnum {
private FieldsEnum delegateFieldsEnum;
private HashMap<String,FuzzySet> bloomsByFieldName;
private String currentFieldName;
public BloomFilteredFieldsEnum(FieldsEnum iterator,
HashMap<String,FuzzySet> bloomsByFieldName) {
this.delegateFieldsEnum = iterator;
this.bloomsByFieldName = bloomsByFieldName;
}
public AttributeSource attributes() {
return delegateFieldsEnum.attributes();
}
public String next() throws IOException {
currentFieldName = delegateFieldsEnum.next();
return currentFieldName;
}
public Terms terms() throws IOException {
FuzzySet filter = bloomsByFieldName.get(currentFieldName);
if (filter == null) {
return delegateFieldsEnum.terms();
} else {
Terms result = delegateFieldsEnum.terms();
if (result == null) {
return null;
}
// wrap the terms object with a bloom filter
return new BloomFilteredTerms(result, filter);
}
}
}
class BloomFilteredTerms extends Terms {
private Terms delegateTerms;
private FuzzySet filter;
public BloomFilteredTerms(Terms terms, FuzzySet filter) {
this.delegateTerms = terms;
this.filter = filter;
}
@Override
public TermsEnum intersect(CompiledAutomaton compiled,
final BytesRef startTerm) throws IOException {
return delegateTerms.intersect(compiled, startTerm);
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
TermsEnum result;
if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
// recycle the existing BloomFilteredTermsEnum by asking the delegate
// to recycle its contained TermsEnum
BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
if (bfte.filter == filter) {
bfte.delegateTermsEnum = delegateTerms
.iterator(bfte.delegateTermsEnum);
return bfte;
}
}
// We have been handed something we cannot reuse (either null, wrong
// class or wrong filter) so allocate a new object
result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse),
filter);
return result;
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return delegateTerms.getComparator();
}
@Override
public long size() throws IOException {
return delegateTerms.size();
}
@Override
public long getSumTotalTermFreq() throws IOException {
return delegateTerms.getSumTotalTermFreq();
}
@Override
public long getSumDocFreq() throws IOException {
return delegateTerms.getSumDocFreq();
}
@Override
public int getDocCount() throws IOException {
return delegateTerms.getDocCount();
}
}
class BloomFilteredTermsEnum extends TermsEnum {
TermsEnum delegateTermsEnum;
private FuzzySet filter;
public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) {
this.delegateTermsEnum = iterator;
this.filter = filter;
}
@Override
public final BytesRef next() throws IOException {
return delegateTermsEnum.next();
}
@Override
public final Comparator<BytesRef> getComparator() {
return delegateTermsEnum.getComparator();
}
@Override
public final boolean seekExact(BytesRef text, boolean useCache)
throws IOException {
// The magical fail-fast speed up that is the entire point of all of
// this code - save a disk seek if there is a match on an in-memory
// structure
// that may occasionally give a false positive but guaranteed no false
// negatives
if (filter.contains(text) == ContainsResult.NO) {
return false;
}
return delegateTermsEnum.seekExact(text, useCache);
}
@Override
public final SeekStatus seekCeil(BytesRef text, boolean useCache)
throws IOException {
return delegateTermsEnum.seekCeil(text, useCache);
}
@Override
public final void seekExact(long ord) throws IOException {
delegateTermsEnum.seekExact(ord);
}
@Override
public final BytesRef term() throws IOException {
return delegateTermsEnum.term();
}
@Override
public final long ord() throws IOException {
return delegateTermsEnum.ord();
}
@Override
public final int docFreq() throws IOException {
return delegateTermsEnum.docFreq();
}
@Override
public final long totalTermFreq() throws IOException {
return delegateTermsEnum.totalTermFreq();
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
DocsAndPositionsEnum reuse, int flags) throws IOException {
return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
throws IOException {
return delegateTermsEnum.docs(liveDocs, reuse, flags);
}
}
}
class BloomFilteredFieldsConsumer extends FieldsConsumer {
private FieldsConsumer delegateFieldsConsumer;
private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
private SegmentWriteState state;
// private PostingsFormat delegatePostingsFormat;
public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
this.delegateFieldsConsumer = fieldsConsumer;
// this.delegatePostingsFormat=delegatePostingsFormat;
this.state = state;
}
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
if (bloomFilter != null) {
assert bloomFilters.containsKey(field) == false;
bloomFilters.put(field, bloomFilter);
return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter);
} else {
// No, use the unfiltered fieldsConsumer - we are not interested in
// recording any term Bitsets.
return delegateFieldsConsumer.addField(field);
}
}
@Override
public void close() throws IOException {
delegateFieldsConsumer.close();
// Now we are done accumulating values for these fields
List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();
for (Entry<FieldInfo,FuzzySet> entry : bloomFilters.entrySet()) {
FuzzySet bloomFilter = entry.getValue();
if(!bloomFilterFactory.isSaturated(bloomFilter,entry.getKey())){
nonSaturatedBlooms.add(entry);
}
}
String bloomFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
IndexOutput bloomOutput = null;
try {
bloomOutput = state.directory
.createOutput(bloomFileName, state.context);
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
BLOOM_CODEC_VERSION);
// remember the name of the postings format we will delegate to
bloomOutput.writeString(delegatePostingsFormat.getName());
// First field in the output file is the number of fields+blooms saved
bloomOutput.writeInt(nonSaturatedBlooms.size());
for (Entry<FieldInfo,FuzzySet> entry : nonSaturatedBlooms) {
FieldInfo fieldInfo = entry.getKey();
FuzzySet bloomFilter = entry.getValue();
bloomOutput.writeInt(fieldInfo.number);
saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
}
} finally {
IOUtils.close(bloomOutput);
}
//We are done with large bitsets so no need to keep them hanging around
bloomFilters.clear();
}
private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {
FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
bloomFilter);
if (rightSizedSet == null) {
rightSizedSet = bloomFilter;
}
rightSizedSet.serialize(bloomOutput);
}
}
class WrappedTermsConsumer extends TermsConsumer {
private TermsConsumer delegateTermsConsumer;
private FuzzySet bloomFilter;
public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
this.delegateTermsConsumer = termsConsumer;
this.bloomFilter = bloomFilter;
}
public PostingsConsumer startTerm(BytesRef text) throws IOException {
return delegateTermsConsumer.startTerm(text);
}
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
// Record this term in our BloomFilter
if (stats.docFreq > 0) {
bloomFilter.addValue(text);
}
delegateTermsConsumer.finishTerm(text, stats);
}
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
throws IOException {
delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
}
public Comparator<BytesRef> getComparator() throws IOException {
return delegateTermsConsumer.getComparator();
}
}
}

View File

@ -0,0 +1,44 @@
package org.apache.lucene.codecs.bloom;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.FuzzySet;
import org.apache.lucene.util.hash.HashFunction;
import org.apache.lucene.util.hash.MurmurHash2;
/**
* Default policy is to allocate a bitset with 10% saturation given a unique term per document.
* Bits are set via MurmurHash2 hashing function.
* @lucene.experimental
*/
public class DefaultBloomFilterFactory extends BloomFilterFactory {
@Override
public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
//Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with 10% of bits set
return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), 0.10f, new MurmurHash2());
}
@Override
public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
// Don't bother saving bitsets if >90% of bits are set - we don't want to
// throw any more memory at this problem.
return bloomFilter.getSaturation() > 0.9f;
}
}

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Codec PostingsFormat for fast access to low-frequency terms such as primary key fields.
</body>
</html>

View File

@ -54,6 +54,11 @@ public final class FixedBitSet extends DocIdSet implements Bits {
bits = new long[bits2words(numBits)]; bits = new long[bits2words(numBits)];
} }
public FixedBitSet(long[]storedBits,int numBits) {
this.numBits = numBits;
this.bits = storedBits;
}
/** Makes full copy. */ /** Makes full copy. */
public FixedBitSet(FixedBitSet other) { public FixedBitSet(FixedBitSet other) {
bits = new long[other.bits.length]; bits = new long[other.bits.length];

View File

@ -0,0 +1,292 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.hash.HashFunction;
/**
* <p>
* A class used to represent a set of many, potentially large, values (e.g. many
* long strings such as URLs), using a significantly smaller amount of memory.
* </p>
* <p>
* The set is "lossy" in that it cannot definitively state that is does contain
* a value but it <em>can</em> definitively say if a value is <em>not</em> in
* the set. It can therefore be used as a Bloom Filter.
* </p>
* Another application of the set is that it can be used to perform fuzzy counting because
* it can estimate reasonably accurately how many unique values are contained in the set.
* </p>
* <p>This class is NOT threadsafe.</p>
* <p>
* Internally a Bitset is used to record values and once a client has finished recording
* a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that
* is sized appropriately for the number of values recorded and desired saturation levels.
*
* </p>
* @lucene.experimental
*/
public class FuzzySet {
public static final int FUZZY_SERIALIZATION_VERSION=1;
public enum ContainsResult {
MAYBE, NO
};
private HashFunction hashFunction;
private FixedBitSet filter;
private int bloomSize;
//The sizes of BitSet used are all numbers that, when expressed in binary form,
//are all ones. This is to enable fast downsizing from one bitset to another
//by simply ANDing each set index in one bitset with the size of the target bitset
// - this provides a fast modulo of the number. Values previously accumulated in
// a large bitset and then mapped to a smaller set can be looked up using a single
// AND operation of the query term's hash rather than needing to perform a 2-step
// translation of the query term that mirrors the stored content's reprojections.
static final int usableBitSetSizes[];
static
{
usableBitSetSizes=new int[30];
int mask=1;
int size=mask;
for (int i = 0; i < usableBitSetSizes.length; i++) {
size=(size<<1)|mask;
usableBitSetSizes[i]=size;
}
}
/**
* Rounds down required maxNumberOfBits to the nearest number that is made up
* of all ones as a binary number.
* Use this method where controlling memory use is paramount.
*/
public static int getNearestSetSize(int maxNumberOfBits)
{
int result=usableBitSetSizes[0];
for (int i = 0; i < usableBitSetSizes.length; i++) {
if(usableBitSetSizes[i]<=maxNumberOfBits)
{
result=usableBitSetSizes[i];
}
}
return result;
}
/**
* Use this method to choose a set size where accuracy (low content saturation) is more important
* than deciding how much memory to throw at the problem.
* @param maxNumberOfValuesExpected
* @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values have been recorded
* @return The size of the set nearest to the required size
*/
public static int getNearestSetSize(int maxNumberOfValuesExpected,
float desiredSaturation) {
// Iterate around the various scales of bitset from smallest to largest looking for the first that
// satisfies value volumes at the chosen saturation level
for (int i = 0; i < usableBitSetSizes.length; i++) {
int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation);
int estimatedNumUniqueValues = getEstimatedNumberUniqueValuesAllowingForCollisions(
usableBitSetSizes[i], numSetBitsAtDesiredSaturation);
if (estimatedNumUniqueValues > maxNumberOfValuesExpected) {
return usableBitSetSizes[i];
}
}
return -1;
}
public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes, HashFunction hashFunction)
{
int setSize=getNearestSetSize(maxNumBytes);
return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
}
public static FuzzySet createSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation, HashFunction hashFunction)
{
int setSize=getNearestSetSize(maxNumUniqueValues,desiredMaxSaturation);
return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
}
private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
super();
this.filter = filter;
this.bloomSize = bloomSize;
this.hashFunction=hashFunction;
}
/**
* The main method required for a Bloom filter which, given a value determines set membership.
* Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
* @param value
* @return NO or MAYBE
*/
public ContainsResult contains(BytesRef value) {
int hash = hashFunction.hash(value);
if (hash < 0) {
hash = hash * -1;
}
return mayContainValue(hash);
}
/**
* Serializes the data set to file using the following format:
* <ul>
* <li>FuzzySet --&gt;FuzzySetVersion,HashFunctionName,BloomSize,
* NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup></li>
* <li>HashFunctionName --&gt; {@link DataOutput#writeString(String) String} The
* name of a ServiceProvider registered {@link HashFunction}</li>
* <li>FuzzySetVersion --&gt; {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li>
* <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used
* to project hashes into the field's Bitset</li>
* <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of
* longs (as returned from {@link FixedBitSet#getBits})</li>
* <li>BitSetWord --&gt; {@link DataOutput#writeLong Long} A long from the array
* returned by {@link FixedBitSet#getBits}</li>
* </ul>
* @param out Data output stream
* @throws IOException
*/
public void serialize(DataOutput out) throws IOException
{
out.writeInt(FUZZY_SERIALIZATION_VERSION);
out.writeString(hashFunction.getName());
out.writeInt(bloomSize);
long[] bits = filter.getBits();
out.writeInt(bits.length);
for (int i = 0; i < bits.length; i++) {
// Can't used VLong encoding because cant cope with negative numbers
// output by FixedBitSet
out.writeLong(bits[i]);
}
}
public static FuzzySet deserialize(DataInput in) throws IOException
{
int version=in.readInt();
if(version!=FUZZY_SERIALIZATION_VERSION)
{
throw new IOException("Error deserializing: set version is not "+FUZZY_SERIALIZATION_VERSION);
}
HashFunction hashFunction=HashFunction.forName(in.readString());
int bloomSize=in.readInt();
int numLongs=in.readInt();
long[]longs=new long[numLongs];
for (int i = 0; i < numLongs; i++) {
longs[i]=in.readLong();
}
FixedBitSet bits = new FixedBitSet(longs,bloomSize+1);
return new FuzzySet(bits,bloomSize,hashFunction);
}
private ContainsResult mayContainValue(int positiveHash) {
assert positiveHash >= 0;
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo
int pos = positiveHash & bloomSize;
if (filter.get(pos)) {
// This term may be recorded in this index (but could be a collision)
return ContainsResult.MAYBE;
}
// definitely NOT in this segment
return ContainsResult.NO;
}
/**
* Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
* chosen size of the internal bitset.
* @param value the key value to be hashed
* @throws IOException
*/
public void addValue(BytesRef value) throws IOException {
int hash = hashFunction.hash(value);
if (hash < 0) {
hash = hash * -1;
}
// Bitmasking using bloomSize is effectively a modulo operation.
int bloomPos = hash & bloomSize;
filter.set(bloomPos);
}
/**
*
* @param targetMaxSaturation A number between 0 and 1 describing the % of bits that would ideally be set in the
* result. Lower values have better qccuracy but require more space.
* @return a smaller FuzzySet or null if the current set is already over-saturated
*/
public FuzzySet downsize(float targetMaxSaturation)
{
int numBitsSet = filter.cardinality();
FixedBitSet rightSizedBitSet = filter;
int rightSizedBitSetSize = bloomSize;
//Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level
for (int i = 0; i < usableBitSetSizes.length; i++) {
int candidateBitsetSize = usableBitSetSizes[i];
float candidateSaturation = (float) numBitsSet
/ (float) candidateBitsetSize;
if (candidateSaturation <= targetMaxSaturation) {
rightSizedBitSetSize = candidateBitsetSize;
break;
}
}
// Re-project the numbers to a smaller space if necessary
if (rightSizedBitSetSize < bloomSize) {
// Reset the choice of bitset to the smaller version
rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);
// Map across the bits from the large set to the smaller one
int bitIndex = 0;
do {
bitIndex = filter.nextSetBit(bitIndex);
if (bitIndex >= 0) {
// Project the larger number into a smaller one effectively
// modulo-ing by using the target bitset size as a mask
int downSizedBitIndex = bitIndex & rightSizedBitSetSize;
rightSizedBitSet.set(downSizedBitIndex);
bitIndex++;
}
} while ( (bitIndex >= 0)&&(bitIndex<=bloomSize));
} else {
return null;
}
return new FuzzySet(rightSizedBitSet,rightSizedBitSetSize, hashFunction);
}
public int getEstimatedUniqueValues()
{
return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality());
}
// Given a set size and a the number of set bits, produces an estimate of the number of unique values recorded
public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
int setSize, int numRecordedBits) {
double setSizeAsDouble = setSize;
double numRecordedBitsAsDouble = numRecordedBits;
double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
double logInverseSaturation = Math.log(1 - saturation) * -1;
return (int) (setSizeAsDouble * logInverseSaturation);
}
public float getSaturation() {
int numBitsSet = filter.cardinality();
return (float) numBitsSet / (float) bloomSize;
}
}

View File

@ -0,0 +1,70 @@
package org.apache.lucene.util.hash;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NamedSPILoader;
/**
* Base class for hashing functions that can be referred to by name.
* Subclasses are expected to provide threadsafe implementations of the hash function
* on the range of bytes referenced in the provided {@link BytesRef}
* @lucene.experimental
*/
public abstract class HashFunction implements NamedSPILoader.NamedSPI {
/**
* Hashes the contents of the referenced bytes
* @param bytes the data to be hashed
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
*/
public abstract int hash(BytesRef bytes);
private static final NamedSPILoader<HashFunction> loader =
new NamedSPILoader<HashFunction>(HashFunction.class);
private final String name;
public HashFunction(String name) {
NamedSPILoader.checkServiceName(name);
this.name = name;
}
/** Returns this codec's name */
@Override
public final String getName() {
return name;
}
/** looks up a hash function by name */
public static HashFunction forName(String name) {
return loader.lookup(name);
}
/** returns a list of all available hash function names */
public static Set<String> availableHashFunctionNames() {
return loader.availableServices();
}
@Override
public String toString() {
return name;
}
}

View File

@ -0,0 +1,105 @@
package org.apache.lucene.util.hash;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
/**
* This is a very fast, non-cryptographic hash suitable for general hash-based
* lookup. See http://murmurhash.googlepages.com/ for more details.
* <p>
* The C version of MurmurHash 2.0 found at that site was ported to Java by
* Andrzej Bialecki (ab at getopt org).
* </p>
* <p>
* The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of
* hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths
* rather than raw byte arrays.
* </p>
* @lucene.experimental
*/
public class MurmurHash2 extends HashFunction{
public static final String HASH_NAME="MurmurHash2";
public MurmurHash2() {
super(HASH_NAME);
}
public static int hash(byte[] data, int seed, int offset, int len) {
int m = 0x5bd1e995;
int r = 24;
int h = seed ^ len;
int len_4 = len >> 2;
for (int i = 0; i < len_4; i++) {
int i_4 = offset + (i << 2);
int k = data[i_4 + 3];
k = k << 8;
k = k | (data[i_4 + 2] & 0xff);
k = k << 8;
k = k | (data[i_4 + 1] & 0xff);
k = k << 8;
k = k | (data[i_4 + 0] & 0xff);
k *= m;
k ^= k >>> r;
k *= m;
h *= m;
h ^= k;
}
int len_m = len_4 << 2;
int left = len - len_m;
if (left != 0) {
if (left >= 3) {
h ^= data[offset + len - 3] << 16;
}
if (left >= 2) {
h ^= data[offset + len - 2] << 8;
}
if (left >= 1) {
h ^= data[offset + len - 1];
}
h *= m;
}
h ^= h >>> 13;
h *= m;
h ^= h >>> 15;
return h;
}
/**
* Generates 32 bit hash from byte array with default seed value.
*
* @param data
* byte array to hash
* @param offset
* the start position in the array to hash
* @param len
* length of the array elements to hash
* @return 32 bit hash of the given array
*/
public static final int hash32(final byte[] data, int offset, int len) {
return MurmurHash2.hash(data, 0x9747b28c, offset, len);
}
@Override
public final int hash(BytesRef br) {
return hash32(br.bytes, br.offset, br.length);
}
}

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Hashing functions load-able via SPI service
</body>
</html>

View File

@ -17,4 +17,5 @@ org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
org.apache.lucene.codecs.memory.MemoryPostingsFormat org.apache.lucene.codecs.memory.MemoryPostingsFormat
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
org.apache.lucene.codecs.memory.DirectPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat

View File

@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.lucene.util.hash.MurmurHash2

View File

@ -0,0 +1,77 @@
package org.apache.lucene.codecs.bloom;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.FuzzySet;
import org.apache.lucene.util.hash.MurmurHash2;
/**
* A class used for testing {@link BloomFilteringPostingsFormat} with a concrete
* delegate (Lucene40). Creates a Bloom filter on ALL fields and with tiny
* amounts of memory reserved for the filter. DO NOT USE IN A PRODUCTION
* APPLICATION This is not a realistic application of Bloom Filters as they
* ordinarily are larger and operate on only primary key type fields.
*/
public class TestBloomFilteredLucene40Postings extends PostingsFormat {
private BloomFilteringPostingsFormat delegate;
// Special class used to avoid OOM exceptions where Junit tests create many
// fields.
static class LowMemoryBloomFactory extends BloomFilterFactory {
@Override
public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
return FuzzySet.createSetBasedOnMaxMemory(1024, new MurmurHash2());
}
@Override
public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
// For test purposes always maintain the BloomFilter - even past the point
// of usefulness when all bits are set
return false;
}
}
public TestBloomFilteredLucene40Postings() {
super("TestBloomFilteredLucene40Postings");
delegate = new BloomFilteringPostingsFormat(new Lucene40PostingsFormat(),
new LowMemoryBloomFactory());
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {
return delegate.fieldsConsumer(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
return delegate.fieldsProducer(state);
}
}

View File

@ -0,0 +1,25 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Support for generating test indexes using the BloomFilteringPostingsFormat
</body>
</html>

View File

@ -29,6 +29,7 @@ import java.util.Set;
import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat; import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings;
import org.apache.lucene.codecs.lucene40.Lucene40Codec; import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds; import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds;
@ -98,6 +99,10 @@ public class RandomCodec extends Lucene40Codec {
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
// add pulsing again with (usually) different parameters // add pulsing again with (usually) different parameters
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock), new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
//TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucene40Postings to be constructed
//with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing
//with such "wrapper" classes?
new TestBloomFilteredLucene40Postings(),
new MockSepPostingsFormat(), new MockSepPostingsFormat(),
new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)), new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)), new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),

View File

@ -20,5 +20,6 @@ org.apache.lucene.codecs.mocksep.MockSepPostingsFormat
org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat
org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat
org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds
org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings
org.apache.lucene.codecs.asserting.AssertingPostingsFormat org.apache.lucene.codecs.asserting.AssertingPostingsFormat