mirror of https://github.com/apache/lucene.git
New addition: Lucene-4069 BloomFilterPostingsFormat for faster access to low-frequency terms such as primary keys.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1368567 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
08209761d3
commit
ad2cefee00
|
@ -15,6 +15,9 @@ New features
|
|||
underlying PayloadFunction's explanation as the explanation
|
||||
for the payload score. (Scott Smerchek via Robert Muir)
|
||||
|
||||
* LUCENE-4069: Added BloomFilteringPostingsFormat for use with low-frequency terms
|
||||
such as primary keys (Mark Harwood, Mike McCandless)
|
||||
|
||||
* LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
|
||||
iteration marks. (Robert Muir, Christian Moen)
|
||||
|
||||
|
|
|
@ -53,7 +53,13 @@ public abstract class PostingsFormat implements NamedSPILoader.NamedSPI {
|
|||
|
||||
/** Reads a segment. NOTE: by the time this call
|
||||
* returns, it must hold open any files it will need to
|
||||
* use; else, those files may be deleted. */
|
||||
* use; else, those files may be deleted.
|
||||
* Additionally, required files may be deleted during the execution of
|
||||
* this call before there is a chance to open them. Under these
|
||||
* circumstances an IOException should be thrown by the implementation.
|
||||
* IOExceptions are expected and will automatically cause a retry of the
|
||||
* segment opening logic with the newly revised segments.
|
||||
* */
|
||||
public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException;
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
package org.apache.lucene.codecs.bloom;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
|
||||
|
||||
/**
|
||||
* Class used to create index-time {@link FuzzySet} appropriately configured for
|
||||
* each field. Also called to right-size bitsets for serialization.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class BloomFilterFactory {
|
||||
|
||||
/**
|
||||
*
|
||||
* @param state The content to be indexed
|
||||
* @param info
|
||||
* the field requiring a BloomFilter
|
||||
* @return An appropriately sized set or null if no BloomFiltering required
|
||||
*/
|
||||
public abstract FuzzySet getSetForField(SegmentWriteState state, FieldInfo info);
|
||||
|
||||
/**
|
||||
* Called when downsizing bitsets for serialization
|
||||
*
|
||||
* @param fieldInfo
|
||||
* The field with sparse set bits
|
||||
* @param initialSet
|
||||
* The bits accumulated
|
||||
* @return null or a hopefully more densely packed, smaller bitset
|
||||
*/
|
||||
public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
|
||||
// Aim for a bitset size that would have 10% of bits set (so 90% of searches
|
||||
// would fail-fast)
|
||||
float targetMaxSaturation = 0.1f;
|
||||
return initialSet.downsize(targetMaxSaturation);
|
||||
}
|
||||
|
||||
/**
|
||||
* Used to determine if the given filter has reached saturation and should be retired i.e. not saved any more
|
||||
* @param bloomFilter The bloomFilter being tested
|
||||
* @param fieldInfo The field with which this filter is associated
|
||||
* @return true if the set has reached saturation and should be retired
|
||||
*/
|
||||
public abstract boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo);
|
||||
|
||||
}
|
|
@ -0,0 +1,514 @@
|
|||
package org.apache.lucene.codecs.bloom;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsConsumer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.codecs.TermsConsumer;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
import org.apache.lucene.util.FuzzySet.ContainsResult;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.hash.MurmurHash2;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A {@link PostingsFormat} useful for low doc-frequency fields such as primary
|
||||
* keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
|
||||
* for reads in segments known to have no record of the key. A choice of
|
||||
* delegate PostingsFormat is used to record all other Postings data.
|
||||
* </p>
|
||||
* <p>
|
||||
* A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
|
||||
* settings on a per-field basis. The default configuration is
|
||||
* {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
|
||||
* values using {@link MurmurHash2}. This should be suitable for most purposes.
|
||||
* </p>
|
||||
* <p>
|
||||
* The format of the blm file is as follows:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>BloomFilter (.blm) --> Header, DelegatePostingsFormatName,
|
||||
* NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
|
||||
* <li>Filter --> FieldNumber, FuzzySet</li>
|
||||
* <li>FuzzySet -->See {@link FuzzySet#serialize(DataOutput)}</li>
|
||||
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
|
||||
* <li>DelegatePostingsFormatName --> {@link DataOutput#writeString(String)
|
||||
* String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
|
||||
* <li>NumFilteredFields --> {@link DataOutput#writeInt Uint32}</li>
|
||||
* <li>FieldNumber --> {@link DataOutput#writeInt Uint32} The number of the
|
||||
* field in this segment</li>
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class BloomFilteringPostingsFormat extends PostingsFormat {
|
||||
|
||||
public static final String BLOOM_CODEC_NAME = "BloomFilter";
|
||||
public static final int BLOOM_CODEC_VERSION = 1;
|
||||
|
||||
/** Extension of Bloom Filters file */
|
||||
static final String BLOOM_EXTENSION = "blm";
|
||||
|
||||
BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
|
||||
private PostingsFormat delegatePostingsFormat;
|
||||
|
||||
/**
|
||||
* Creates Bloom filters for a selection of fields created in the index. This
|
||||
* is recorded as a set of Bitsets held as a segment summary in an additional
|
||||
* "blm" file. This PostingsFormat delegates to a choice of delegate
|
||||
* PostingsFormat for encoding all other postings data.
|
||||
*
|
||||
* @param delegatePostingsFormat
|
||||
* The PostingsFormat that records all the non-bloom filter data i.e.
|
||||
* postings info.
|
||||
* @param bloomFilterFactory
|
||||
* The {@link BloomFilterFactory} responsible for sizing BloomFilters
|
||||
* appropriately
|
||||
*/
|
||||
public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
|
||||
BloomFilterFactory bloomFilterFactory) {
|
||||
super(BLOOM_CODEC_NAME);
|
||||
this.delegatePostingsFormat = delegatePostingsFormat;
|
||||
this.bloomFilterFactory = bloomFilterFactory;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates Bloom filters for a selection of fields created in the index. This
|
||||
* is recorded as a set of Bitsets held as a segment summary in an additional
|
||||
* "blm" file. This PostingsFormat delegates to a choice of delegate
|
||||
* PostingsFormat for encoding all other postings data. This choice of
|
||||
* constructor defaults to the {@link DefaultBloomFilterFactory} for
|
||||
* configuring per-field BloomFilters.
|
||||
*
|
||||
* @param delegatePostingsFormat
|
||||
* The PostingsFormat that records all the non-bloom filter data i.e.
|
||||
* postings info.
|
||||
*/
|
||||
public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
|
||||
this(delegatePostingsFormat, new DefaultBloomFilterFactory());
|
||||
}
|
||||
|
||||
// Used only by core Lucene at read-time via Service Provider instantiation -
|
||||
// do not use at Write-time in application code.
|
||||
public BloomFilteringPostingsFormat() {
|
||||
super(BLOOM_CODEC_NAME);
|
||||
}
|
||||
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
|
||||
throws IOException {
|
||||
if (delegatePostingsFormat == null) {
|
||||
throw new UnsupportedOperationException("Error - " + getClass().getName()
|
||||
+ " has been constructed without a choice of PostingsFormat");
|
||||
}
|
||||
return new BloomFilteredFieldsConsumer(
|
||||
delegatePostingsFormat.fieldsConsumer(state), state,
|
||||
delegatePostingsFormat);
|
||||
}
|
||||
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state)
|
||||
throws IOException {
|
||||
return new BloomFilteredFieldsProducer(state);
|
||||
}
|
||||
|
||||
public class BloomFilteredFieldsProducer extends FieldsProducer {
|
||||
private FieldsProducer delegateFieldsProducer;
|
||||
HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<String,FuzzySet>();
|
||||
|
||||
public BloomFilteredFieldsProducer(SegmentReadState state)
|
||||
throws IOException {
|
||||
|
||||
String bloomFileName = IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
|
||||
IndexInput bloomIn = null;
|
||||
try {
|
||||
bloomIn = state.dir.openInput(bloomFileName, state.context);
|
||||
CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
|
||||
BLOOM_CODEC_VERSION);
|
||||
// // Load the hash function used in the BloomFilter
|
||||
// hashFunction = HashFunction.forName(bloomIn.readString());
|
||||
// Load the delegate postings format
|
||||
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
|
||||
.readString());
|
||||
|
||||
this.delegateFieldsProducer = delegatePostingsFormat
|
||||
.fieldsProducer(state);
|
||||
int numBlooms = bloomIn.readInt();
|
||||
for (int i = 0; i < numBlooms; i++) {
|
||||
int fieldNum = bloomIn.readInt();
|
||||
FuzzySet bloom = FuzzySet.deserialize(bloomIn);
|
||||
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
|
||||
bloomsByFieldName.put(fieldInfo.name, bloom);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(bloomIn);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public FieldsEnum iterator() throws IOException {
|
||||
return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(),
|
||||
bloomsByFieldName);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
delegateFieldsProducer.close();
|
||||
}
|
||||
|
||||
public Terms terms(String field) throws IOException {
|
||||
FuzzySet filter = bloomsByFieldName.get(field);
|
||||
if (filter == null) {
|
||||
return delegateFieldsProducer.terms(field);
|
||||
} else {
|
||||
Terms result = delegateFieldsProducer.terms(field);
|
||||
if (result == null) {
|
||||
return null;
|
||||
}
|
||||
return new BloomFilteredTerms(result, filter);
|
||||
}
|
||||
}
|
||||
|
||||
public int size() throws IOException {
|
||||
return delegateFieldsProducer.size();
|
||||
}
|
||||
|
||||
public long getUniqueTermCount() throws IOException {
|
||||
return delegateFieldsProducer.getUniqueTermCount();
|
||||
}
|
||||
|
||||
// Not all fields in a segment may be subject to a bloom filter. This class
|
||||
// wraps Terms objects appropriately if a filtering request is present
|
||||
class BloomFilteredFieldsEnum extends FieldsEnum {
|
||||
private FieldsEnum delegateFieldsEnum;
|
||||
private HashMap<String,FuzzySet> bloomsByFieldName;
|
||||
private String currentFieldName;
|
||||
|
||||
public BloomFilteredFieldsEnum(FieldsEnum iterator,
|
||||
HashMap<String,FuzzySet> bloomsByFieldName) {
|
||||
this.delegateFieldsEnum = iterator;
|
||||
this.bloomsByFieldName = bloomsByFieldName;
|
||||
}
|
||||
|
||||
public AttributeSource attributes() {
|
||||
return delegateFieldsEnum.attributes();
|
||||
}
|
||||
|
||||
public String next() throws IOException {
|
||||
currentFieldName = delegateFieldsEnum.next();
|
||||
return currentFieldName;
|
||||
}
|
||||
|
||||
public Terms terms() throws IOException {
|
||||
FuzzySet filter = bloomsByFieldName.get(currentFieldName);
|
||||
if (filter == null) {
|
||||
return delegateFieldsEnum.terms();
|
||||
} else {
|
||||
Terms result = delegateFieldsEnum.terms();
|
||||
if (result == null) {
|
||||
return null;
|
||||
}
|
||||
// wrap the terms object with a bloom filter
|
||||
return new BloomFilteredTerms(result, filter);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class BloomFilteredTerms extends Terms {
|
||||
private Terms delegateTerms;
|
||||
private FuzzySet filter;
|
||||
|
||||
public BloomFilteredTerms(Terms terms, FuzzySet filter) {
|
||||
this.delegateTerms = terms;
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermsEnum intersect(CompiledAutomaton compiled,
|
||||
final BytesRef startTerm) throws IOException {
|
||||
return delegateTerms.intersect(compiled, startTerm);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermsEnum iterator(TermsEnum reuse) throws IOException {
|
||||
TermsEnum result;
|
||||
if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
|
||||
// recycle the existing BloomFilteredTermsEnum by asking the delegate
|
||||
// to recycle its contained TermsEnum
|
||||
BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
|
||||
if (bfte.filter == filter) {
|
||||
bfte.delegateTermsEnum = delegateTerms
|
||||
.iterator(bfte.delegateTermsEnum);
|
||||
return bfte;
|
||||
}
|
||||
}
|
||||
// We have been handed something we cannot reuse (either null, wrong
|
||||
// class or wrong filter) so allocate a new object
|
||||
result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse),
|
||||
filter);
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() throws IOException {
|
||||
return delegateTerms.getComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() throws IOException {
|
||||
return delegateTerms.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() throws IOException {
|
||||
return delegateTerms.getSumTotalTermFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getSumDocFreq() throws IOException {
|
||||
return delegateTerms.getSumDocFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getDocCount() throws IOException {
|
||||
return delegateTerms.getDocCount();
|
||||
}
|
||||
}
|
||||
|
||||
class BloomFilteredTermsEnum extends TermsEnum {
|
||||
|
||||
TermsEnum delegateTermsEnum;
|
||||
private FuzzySet filter;
|
||||
|
||||
public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) {
|
||||
this.delegateTermsEnum = iterator;
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final BytesRef next() throws IOException {
|
||||
return delegateTermsEnum.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final Comparator<BytesRef> getComparator() {
|
||||
return delegateTermsEnum.getComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean seekExact(BytesRef text, boolean useCache)
|
||||
throws IOException {
|
||||
// The magical fail-fast speed up that is the entire point of all of
|
||||
// this code - save a disk seek if there is a match on an in-memory
|
||||
// structure
|
||||
// that may occasionally give a false positive but guaranteed no false
|
||||
// negatives
|
||||
if (filter.contains(text) == ContainsResult.NO) {
|
||||
return false;
|
||||
}
|
||||
return delegateTermsEnum.seekExact(text, useCache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final SeekStatus seekCeil(BytesRef text, boolean useCache)
|
||||
throws IOException {
|
||||
return delegateTermsEnum.seekCeil(text, useCache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void seekExact(long ord) throws IOException {
|
||||
delegateTermsEnum.seekExact(ord);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final BytesRef term() throws IOException {
|
||||
return delegateTermsEnum.term();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long ord() throws IOException {
|
||||
return delegateTermsEnum.ord();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int docFreq() throws IOException {
|
||||
return delegateTermsEnum.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long totalTermFreq() throws IOException {
|
||||
return delegateTermsEnum.totalTermFreq();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
|
||||
DocsAndPositionsEnum reuse, int flags) throws IOException {
|
||||
return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
|
||||
throws IOException {
|
||||
return delegateTermsEnum.docs(liveDocs, reuse, flags);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class BloomFilteredFieldsConsumer extends FieldsConsumer {
|
||||
private FieldsConsumer delegateFieldsConsumer;
|
||||
private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
|
||||
private SegmentWriteState state;
|
||||
|
||||
// private PostingsFormat delegatePostingsFormat;
|
||||
|
||||
public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
|
||||
SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
|
||||
this.delegateFieldsConsumer = fieldsConsumer;
|
||||
// this.delegatePostingsFormat=delegatePostingsFormat;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermsConsumer addField(FieldInfo field) throws IOException {
|
||||
FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
|
||||
if (bloomFilter != null) {
|
||||
assert bloomFilters.containsKey(field) == false;
|
||||
bloomFilters.put(field, bloomFilter);
|
||||
return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter);
|
||||
} else {
|
||||
// No, use the unfiltered fieldsConsumer - we are not interested in
|
||||
// recording any term Bitsets.
|
||||
return delegateFieldsConsumer.addField(field);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
delegateFieldsConsumer.close();
|
||||
// Now we are done accumulating values for these fields
|
||||
List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();
|
||||
|
||||
for (Entry<FieldInfo,FuzzySet> entry : bloomFilters.entrySet()) {
|
||||
FuzzySet bloomFilter = entry.getValue();
|
||||
if(!bloomFilterFactory.isSaturated(bloomFilter,entry.getKey())){
|
||||
nonSaturatedBlooms.add(entry);
|
||||
}
|
||||
}
|
||||
String bloomFileName = IndexFileNames.segmentFileName(
|
||||
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
|
||||
IndexOutput bloomOutput = null;
|
||||
try {
|
||||
bloomOutput = state.directory
|
||||
.createOutput(bloomFileName, state.context);
|
||||
CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
|
||||
BLOOM_CODEC_VERSION);
|
||||
// remember the name of the postings format we will delegate to
|
||||
bloomOutput.writeString(delegatePostingsFormat.getName());
|
||||
|
||||
// First field in the output file is the number of fields+blooms saved
|
||||
bloomOutput.writeInt(nonSaturatedBlooms.size());
|
||||
for (Entry<FieldInfo,FuzzySet> entry : nonSaturatedBlooms) {
|
||||
FieldInfo fieldInfo = entry.getKey();
|
||||
FuzzySet bloomFilter = entry.getValue();
|
||||
bloomOutput.writeInt(fieldInfo.number);
|
||||
saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.close(bloomOutput);
|
||||
}
|
||||
//We are done with large bitsets so no need to keep them hanging around
|
||||
bloomFilters.clear();
|
||||
}
|
||||
|
||||
private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
|
||||
FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {
|
||||
|
||||
FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
|
||||
bloomFilter);
|
||||
if (rightSizedSet == null) {
|
||||
rightSizedSet = bloomFilter;
|
||||
}
|
||||
rightSizedSet.serialize(bloomOutput);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class WrappedTermsConsumer extends TermsConsumer {
|
||||
private TermsConsumer delegateTermsConsumer;
|
||||
private FuzzySet bloomFilter;
|
||||
|
||||
public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
|
||||
this.delegateTermsConsumer = termsConsumer;
|
||||
this.bloomFilter = bloomFilter;
|
||||
}
|
||||
|
||||
public PostingsConsumer startTerm(BytesRef text) throws IOException {
|
||||
return delegateTermsConsumer.startTerm(text);
|
||||
}
|
||||
|
||||
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
|
||||
|
||||
// Record this term in our BloomFilter
|
||||
if (stats.docFreq > 0) {
|
||||
bloomFilter.addValue(text);
|
||||
}
|
||||
delegateTermsConsumer.finishTerm(text, stats);
|
||||
}
|
||||
|
||||
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
|
||||
throws IOException {
|
||||
delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
|
||||
}
|
||||
|
||||
public Comparator<BytesRef> getComparator() throws IOException {
|
||||
return delegateTermsConsumer.getComparator();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package org.apache.lucene.codecs.bloom;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
import org.apache.lucene.util.hash.HashFunction;
|
||||
import org.apache.lucene.util.hash.MurmurHash2;
|
||||
|
||||
/**
|
||||
* Default policy is to allocate a bitset with 10% saturation given a unique term per document.
|
||||
* Bits are set via MurmurHash2 hashing function.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DefaultBloomFilterFactory extends BloomFilterFactory {
|
||||
|
||||
@Override
|
||||
public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
|
||||
//Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with 10% of bits set
|
||||
return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), 0.10f, new MurmurHash2());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
|
||||
// Don't bother saving bitsets if >90% of bits are set - we don't want to
|
||||
// throw any more memory at this problem.
|
||||
return bloomFilter.getSaturation() > 0.9f;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Codec PostingsFormat for fast access to low-frequency terms such as primary key fields.
|
||||
</body>
|
||||
</html>
|
|
@ -54,6 +54,11 @@ public final class FixedBitSet extends DocIdSet implements Bits {
|
|||
bits = new long[bits2words(numBits)];
|
||||
}
|
||||
|
||||
public FixedBitSet(long[]storedBits,int numBits) {
|
||||
this.numBits = numBits;
|
||||
this.bits = storedBits;
|
||||
}
|
||||
|
||||
/** Makes full copy. */
|
||||
public FixedBitSet(FixedBitSet other) {
|
||||
bits = new long[other.bits.length];
|
||||
|
|
|
@ -0,0 +1,292 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.hash.HashFunction;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A class used to represent a set of many, potentially large, values (e.g. many
|
||||
* long strings such as URLs), using a significantly smaller amount of memory.
|
||||
* </p>
|
||||
* <p>
|
||||
* The set is "lossy" in that it cannot definitively state that is does contain
|
||||
* a value but it <em>can</em> definitively say if a value is <em>not</em> in
|
||||
* the set. It can therefore be used as a Bloom Filter.
|
||||
* </p>
|
||||
* Another application of the set is that it can be used to perform fuzzy counting because
|
||||
* it can estimate reasonably accurately how many unique values are contained in the set.
|
||||
* </p>
|
||||
* <p>This class is NOT threadsafe.</p>
|
||||
* <p>
|
||||
* Internally a Bitset is used to record values and once a client has finished recording
|
||||
* a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that
|
||||
* is sized appropriately for the number of values recorded and desired saturation levels.
|
||||
*
|
||||
* </p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class FuzzySet {
|
||||
|
||||
public static final int FUZZY_SERIALIZATION_VERSION=1;
|
||||
|
||||
public enum ContainsResult {
|
||||
MAYBE, NO
|
||||
};
|
||||
private HashFunction hashFunction;
|
||||
private FixedBitSet filter;
|
||||
private int bloomSize;
|
||||
|
||||
//The sizes of BitSet used are all numbers that, when expressed in binary form,
|
||||
//are all ones. This is to enable fast downsizing from one bitset to another
|
||||
//by simply ANDing each set index in one bitset with the size of the target bitset
|
||||
// - this provides a fast modulo of the number. Values previously accumulated in
|
||||
// a large bitset and then mapped to a smaller set can be looked up using a single
|
||||
// AND operation of the query term's hash rather than needing to perform a 2-step
|
||||
// translation of the query term that mirrors the stored content's reprojections.
|
||||
static final int usableBitSetSizes[];
|
||||
static
|
||||
{
|
||||
usableBitSetSizes=new int[30];
|
||||
int mask=1;
|
||||
int size=mask;
|
||||
for (int i = 0; i < usableBitSetSizes.length; i++) {
|
||||
size=(size<<1)|mask;
|
||||
usableBitSetSizes[i]=size;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Rounds down required maxNumberOfBits to the nearest number that is made up
|
||||
* of all ones as a binary number.
|
||||
* Use this method where controlling memory use is paramount.
|
||||
*/
|
||||
public static int getNearestSetSize(int maxNumberOfBits)
|
||||
{
|
||||
int result=usableBitSetSizes[0];
|
||||
for (int i = 0; i < usableBitSetSizes.length; i++) {
|
||||
if(usableBitSetSizes[i]<=maxNumberOfBits)
|
||||
{
|
||||
result=usableBitSetSizes[i];
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use this method to choose a set size where accuracy (low content saturation) is more important
|
||||
* than deciding how much memory to throw at the problem.
|
||||
* @param maxNumberOfValuesExpected
|
||||
* @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values have been recorded
|
||||
* @return The size of the set nearest to the required size
|
||||
*/
|
||||
public static int getNearestSetSize(int maxNumberOfValuesExpected,
|
||||
float desiredSaturation) {
|
||||
// Iterate around the various scales of bitset from smallest to largest looking for the first that
|
||||
// satisfies value volumes at the chosen saturation level
|
||||
for (int i = 0; i < usableBitSetSizes.length; i++) {
|
||||
int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation);
|
||||
int estimatedNumUniqueValues = getEstimatedNumberUniqueValuesAllowingForCollisions(
|
||||
usableBitSetSizes[i], numSetBitsAtDesiredSaturation);
|
||||
if (estimatedNumUniqueValues > maxNumberOfValuesExpected) {
|
||||
return usableBitSetSizes[i];
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes, HashFunction hashFunction)
|
||||
{
|
||||
int setSize=getNearestSetSize(maxNumBytes);
|
||||
return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
|
||||
}
|
||||
|
||||
public static FuzzySet createSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation, HashFunction hashFunction)
|
||||
{
|
||||
int setSize=getNearestSetSize(maxNumUniqueValues,desiredMaxSaturation);
|
||||
return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
|
||||
super();
|
||||
this.filter = filter;
|
||||
this.bloomSize = bloomSize;
|
||||
this.hashFunction=hashFunction;
|
||||
}
|
||||
|
||||
/**
|
||||
* The main method required for a Bloom filter which, given a value determines set membership.
|
||||
* Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
|
||||
* @param value
|
||||
* @return NO or MAYBE
|
||||
*/
|
||||
public ContainsResult contains(BytesRef value) {
|
||||
int hash = hashFunction.hash(value);
|
||||
if (hash < 0) {
|
||||
hash = hash * -1;
|
||||
}
|
||||
return mayContainValue(hash);
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializes the data set to file using the following format:
|
||||
* <ul>
|
||||
* <li>FuzzySet -->FuzzySetVersion,HashFunctionName,BloomSize,
|
||||
* NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup></li>
|
||||
* <li>HashFunctionName --> {@link DataOutput#writeString(String) String} The
|
||||
* name of a ServiceProvider registered {@link HashFunction}</li>
|
||||
* <li>FuzzySetVersion --> {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li>
|
||||
* <li>BloomSize --> {@link DataOutput#writeInt Uint32} The modulo value used
|
||||
* to project hashes into the field's Bitset</li>
|
||||
* <li>NumBitSetWords --> {@link DataOutput#writeInt Uint32} The number of
|
||||
* longs (as returned from {@link FixedBitSet#getBits})</li>
|
||||
* <li>BitSetWord --> {@link DataOutput#writeLong Long} A long from the array
|
||||
* returned by {@link FixedBitSet#getBits}</li>
|
||||
* </ul>
|
||||
* @param out Data output stream
|
||||
* @throws IOException
|
||||
*/
|
||||
public void serialize(DataOutput out) throws IOException
|
||||
{
|
||||
out.writeInt(FUZZY_SERIALIZATION_VERSION);
|
||||
out.writeString(hashFunction.getName());
|
||||
out.writeInt(bloomSize);
|
||||
long[] bits = filter.getBits();
|
||||
out.writeInt(bits.length);
|
||||
for (int i = 0; i < bits.length; i++) {
|
||||
// Can't used VLong encoding because cant cope with negative numbers
|
||||
// output by FixedBitSet
|
||||
out.writeLong(bits[i]);
|
||||
}
|
||||
}
|
||||
public static FuzzySet deserialize(DataInput in) throws IOException
|
||||
{
|
||||
int version=in.readInt();
|
||||
if(version!=FUZZY_SERIALIZATION_VERSION)
|
||||
{
|
||||
throw new IOException("Error deserializing: set version is not "+FUZZY_SERIALIZATION_VERSION);
|
||||
}
|
||||
HashFunction hashFunction=HashFunction.forName(in.readString());
|
||||
int bloomSize=in.readInt();
|
||||
int numLongs=in.readInt();
|
||||
long[]longs=new long[numLongs];
|
||||
for (int i = 0; i < numLongs; i++) {
|
||||
longs[i]=in.readLong();
|
||||
}
|
||||
FixedBitSet bits = new FixedBitSet(longs,bloomSize+1);
|
||||
return new FuzzySet(bits,bloomSize,hashFunction);
|
||||
}
|
||||
|
||||
private ContainsResult mayContainValue(int positiveHash) {
|
||||
assert positiveHash >= 0;
|
||||
// Bloom sizes are always base 2 and so can be ANDed for a fast modulo
|
||||
int pos = positiveHash & bloomSize;
|
||||
if (filter.get(pos)) {
|
||||
// This term may be recorded in this index (but could be a collision)
|
||||
return ContainsResult.MAYBE;
|
||||
}
|
||||
// definitely NOT in this segment
|
||||
return ContainsResult.NO;
|
||||
}
|
||||
|
||||
/**
|
||||
* Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
|
||||
* chosen size of the internal bitset.
|
||||
* @param value the key value to be hashed
|
||||
* @throws IOException
|
||||
*/
|
||||
public void addValue(BytesRef value) throws IOException {
|
||||
int hash = hashFunction.hash(value);
|
||||
if (hash < 0) {
|
||||
hash = hash * -1;
|
||||
}
|
||||
// Bitmasking using bloomSize is effectively a modulo operation.
|
||||
int bloomPos = hash & bloomSize;
|
||||
filter.set(bloomPos);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param targetMaxSaturation A number between 0 and 1 describing the % of bits that would ideally be set in the
|
||||
* result. Lower values have better qccuracy but require more space.
|
||||
* @return a smaller FuzzySet or null if the current set is already over-saturated
|
||||
*/
|
||||
public FuzzySet downsize(float targetMaxSaturation)
|
||||
{
|
||||
int numBitsSet = filter.cardinality();
|
||||
FixedBitSet rightSizedBitSet = filter;
|
||||
int rightSizedBitSetSize = bloomSize;
|
||||
//Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level
|
||||
for (int i = 0; i < usableBitSetSizes.length; i++) {
|
||||
int candidateBitsetSize = usableBitSetSizes[i];
|
||||
float candidateSaturation = (float) numBitsSet
|
||||
/ (float) candidateBitsetSize;
|
||||
if (candidateSaturation <= targetMaxSaturation) {
|
||||
rightSizedBitSetSize = candidateBitsetSize;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Re-project the numbers to a smaller space if necessary
|
||||
if (rightSizedBitSetSize < bloomSize) {
|
||||
// Reset the choice of bitset to the smaller version
|
||||
rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);
|
||||
// Map across the bits from the large set to the smaller one
|
||||
int bitIndex = 0;
|
||||
do {
|
||||
bitIndex = filter.nextSetBit(bitIndex);
|
||||
if (bitIndex >= 0) {
|
||||
// Project the larger number into a smaller one effectively
|
||||
// modulo-ing by using the target bitset size as a mask
|
||||
int downSizedBitIndex = bitIndex & rightSizedBitSetSize;
|
||||
rightSizedBitSet.set(downSizedBitIndex);
|
||||
bitIndex++;
|
||||
}
|
||||
} while ( (bitIndex >= 0)&&(bitIndex<=bloomSize));
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
return new FuzzySet(rightSizedBitSet,rightSizedBitSetSize, hashFunction);
|
||||
}
|
||||
|
||||
public int getEstimatedUniqueValues()
|
||||
{
|
||||
return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality());
|
||||
}
|
||||
|
||||
// Given a set size and a the number of set bits, produces an estimate of the number of unique values recorded
|
||||
public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
|
||||
int setSize, int numRecordedBits) {
|
||||
double setSizeAsDouble = setSize;
|
||||
double numRecordedBitsAsDouble = numRecordedBits;
|
||||
double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
|
||||
double logInverseSaturation = Math.log(1 - saturation) * -1;
|
||||
return (int) (setSizeAsDouble * logInverseSaturation);
|
||||
}
|
||||
|
||||
public float getSaturation() {
|
||||
int numBitsSet = filter.cardinality();
|
||||
return (float) numBitsSet / (float) bloomSize;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
package org.apache.lucene.util.hash;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.NamedSPILoader;
|
||||
|
||||
|
||||
/**
|
||||
* Base class for hashing functions that can be referred to by name.
|
||||
* Subclasses are expected to provide threadsafe implementations of the hash function
|
||||
* on the range of bytes referenced in the provided {@link BytesRef}
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class HashFunction implements NamedSPILoader.NamedSPI {
|
||||
|
||||
/**
|
||||
* Hashes the contents of the referenced bytes
|
||||
* @param bytes the data to be hashed
|
||||
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
|
||||
*/
|
||||
public abstract int hash(BytesRef bytes);
|
||||
|
||||
private static final NamedSPILoader<HashFunction> loader =
|
||||
new NamedSPILoader<HashFunction>(HashFunction.class);
|
||||
|
||||
private final String name;
|
||||
|
||||
public HashFunction(String name) {
|
||||
NamedSPILoader.checkServiceName(name);
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/** Returns this codec's name */
|
||||
@Override
|
||||
public final String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/** looks up a hash function by name */
|
||||
public static HashFunction forName(String name) {
|
||||
return loader.lookup(name);
|
||||
}
|
||||
|
||||
/** returns a list of all available hash function names */
|
||||
public static Set<String> availableHashFunctionNames() {
|
||||
return loader.availableServices();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return name;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,105 @@
|
|||
package org.apache.lucene.util.hash;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* This is a very fast, non-cryptographic hash suitable for general hash-based
|
||||
* lookup. See http://murmurhash.googlepages.com/ for more details.
|
||||
* <p>
|
||||
* The C version of MurmurHash 2.0 found at that site was ported to Java by
|
||||
* Andrzej Bialecki (ab at getopt org).
|
||||
* </p>
|
||||
* <p>
|
||||
* The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of
|
||||
* hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths
|
||||
* rather than raw byte arrays.
|
||||
* </p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class MurmurHash2 extends HashFunction{
|
||||
|
||||
|
||||
public static final String HASH_NAME="MurmurHash2";
|
||||
|
||||
public MurmurHash2() {
|
||||
super(HASH_NAME);
|
||||
}
|
||||
|
||||
public static int hash(byte[] data, int seed, int offset, int len) {
|
||||
int m = 0x5bd1e995;
|
||||
int r = 24;
|
||||
int h = seed ^ len;
|
||||
int len_4 = len >> 2;
|
||||
for (int i = 0; i < len_4; i++) {
|
||||
int i_4 = offset + (i << 2);
|
||||
int k = data[i_4 + 3];
|
||||
k = k << 8;
|
||||
k = k | (data[i_4 + 2] & 0xff);
|
||||
k = k << 8;
|
||||
k = k | (data[i_4 + 1] & 0xff);
|
||||
k = k << 8;
|
||||
k = k | (data[i_4 + 0] & 0xff);
|
||||
k *= m;
|
||||
k ^= k >>> r;
|
||||
k *= m;
|
||||
h *= m;
|
||||
h ^= k;
|
||||
}
|
||||
int len_m = len_4 << 2;
|
||||
int left = len - len_m;
|
||||
if (left != 0) {
|
||||
if (left >= 3) {
|
||||
h ^= data[offset + len - 3] << 16;
|
||||
}
|
||||
if (left >= 2) {
|
||||
h ^= data[offset + len - 2] << 8;
|
||||
}
|
||||
if (left >= 1) {
|
||||
h ^= data[offset + len - 1];
|
||||
}
|
||||
h *= m;
|
||||
}
|
||||
h ^= h >>> 13;
|
||||
h *= m;
|
||||
h ^= h >>> 15;
|
||||
return h;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates 32 bit hash from byte array with default seed value.
|
||||
*
|
||||
* @param data
|
||||
* byte array to hash
|
||||
* @param offset
|
||||
* the start position in the array to hash
|
||||
* @param len
|
||||
* length of the array elements to hash
|
||||
* @return 32 bit hash of the given array
|
||||
*/
|
||||
public static final int hash32(final byte[] data, int offset, int len) {
|
||||
return MurmurHash2.hash(data, 0x9747b28c, offset, len);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public final int hash(BytesRef br) {
|
||||
return hash32(br.bytes, br.offset, br.length);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Hashing functions load-able via SPI service
|
||||
</body>
|
||||
</html>
|
|
@ -17,4 +17,5 @@ org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
|
|||
org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
|
||||
org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
|
||||
org.apache.lucene.codecs.memory.MemoryPostingsFormat
|
||||
org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
|
||||
org.apache.lucene.codecs.memory.DirectPostingsFormat
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.util.hash.MurmurHash2
|
|
@ -0,0 +1,77 @@
|
|||
package org.apache.lucene.codecs.bloom;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
import org.apache.lucene.util.hash.MurmurHash2;
|
||||
|
||||
/**
|
||||
* A class used for testing {@link BloomFilteringPostingsFormat} with a concrete
|
||||
* delegate (Lucene40). Creates a Bloom filter on ALL fields and with tiny
|
||||
* amounts of memory reserved for the filter. DO NOT USE IN A PRODUCTION
|
||||
* APPLICATION This is not a realistic application of Bloom Filters as they
|
||||
* ordinarily are larger and operate on only primary key type fields.
|
||||
*/
|
||||
public class TestBloomFilteredLucene40Postings extends PostingsFormat {
|
||||
|
||||
private BloomFilteringPostingsFormat delegate;
|
||||
|
||||
// Special class used to avoid OOM exceptions where Junit tests create many
|
||||
// fields.
|
||||
static class LowMemoryBloomFactory extends BloomFilterFactory {
|
||||
@Override
|
||||
public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
|
||||
return FuzzySet.createSetBasedOnMaxMemory(1024, new MurmurHash2());
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
|
||||
// For test purposes always maintain the BloomFilter - even past the point
|
||||
// of usefulness when all bits are set
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public TestBloomFilteredLucene40Postings() {
|
||||
super("TestBloomFilteredLucene40Postings");
|
||||
delegate = new BloomFilteringPostingsFormat(new Lucene40PostingsFormat(),
|
||||
new LowMemoryBloomFactory());
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
|
||||
throws IOException {
|
||||
return delegate.fieldsConsumer(state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsProducer fieldsProducer(SegmentReadState state)
|
||||
throws IOException {
|
||||
return delegate.fieldsProducer(state);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Support for generating test indexes using the BloomFilteringPostingsFormat
|
||||
</body>
|
||||
</html>
|
|
@ -29,6 +29,7 @@ import java.util.Set;
|
|||
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
|
||||
import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds;
|
||||
|
@ -98,6 +99,10 @@ public class RandomCodec extends Lucene40Codec {
|
|||
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
|
||||
// add pulsing again with (usually) different parameters
|
||||
new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
|
||||
//TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucene40Postings to be constructed
|
||||
//with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing
|
||||
//with such "wrapper" classes?
|
||||
new TestBloomFilteredLucene40Postings(),
|
||||
new MockSepPostingsFormat(),
|
||||
new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
|
||||
new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),
|
||||
|
|
|
@ -20,5 +20,6 @@ org.apache.lucene.codecs.mocksep.MockSepPostingsFormat
|
|||
org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat
|
||||
org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat
|
||||
org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds
|
||||
org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings
|
||||
org.apache.lucene.codecs.asserting.AssertingPostingsFormat
|
||||
|
||||
|
|
Loading…
Reference in New Issue