mirror of https://github.com/apache/lucene.git
LUCENE-4340: Move bloom PF utilities to lucene/codecs.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1381504 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
db6badeb11
commit
3f9aa4dcbb
|
@ -17,7 +17,6 @@ package org.apache.lucene.codecs.bloom;
|
|||
*/
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
|
||||
|
||||
/**
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.codecs.PostingsConsumer;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.TermStats;
|
||||
import org.apache.lucene.codecs.TermsConsumer;
|
||||
import org.apache.lucene.codecs.bloom.FuzzySet.ContainsResult;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
|
@ -46,11 +47,8 @@ import org.apache.lucene.store.IndexInput;
|
|||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
import org.apache.lucene.util.FuzzySet.ContainsResult;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.hash.MurmurHash2;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
|
|
|
@ -17,9 +17,6 @@ package org.apache.lucene.codecs.bloom;
|
|||
*/
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
import org.apache.lucene.util.hash.HashFunction;
|
||||
import org.apache.lucene.util.hash.MurmurHash2;
|
||||
|
||||
/**
|
||||
* Default policy is to allocate a bitset with 10% saturation given a unique term per document.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.util;
|
||||
package org.apache.lucene.codecs.bloom;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -20,7 +20,8 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.hash.HashFunction;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
|
@ -45,9 +46,20 @@ import org.apache.lucene.util.hash.HashFunction;
|
|||
* @lucene.experimental
|
||||
*/
|
||||
public class FuzzySet {
|
||||
|
||||
public static final int FUZZY_SERIALIZATION_VERSION=1;
|
||||
|
||||
|
||||
public static final int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI
|
||||
public static final int VERSION_START = VERSION_SPI;
|
||||
public static final int VERSION_CURRENT = 2;
|
||||
|
||||
public static HashFunction hashFunctionForVersion(int version) {
|
||||
if (version < VERSION_START) {
|
||||
throw new IllegalArgumentException("Version " + version + " is too old, expected at least " + VERSION_START);
|
||||
} else if (version > VERSION_CURRENT) {
|
||||
throw new IllegalArgumentException("Version " + version + " is too new, expected at most " + VERSION_CURRENT);
|
||||
}
|
||||
return MurmurHash2.INSTANCE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Result from {@link FuzzySet#contains(BytesRef)}:
|
||||
* can never return definitively YES (always MAYBE),
|
||||
|
@ -174,8 +186,7 @@ public class FuzzySet {
|
|||
*/
|
||||
public void serialize(DataOutput out) throws IOException
|
||||
{
|
||||
out.writeInt(FUZZY_SERIALIZATION_VERSION);
|
||||
out.writeString(hashFunction.getName());
|
||||
out.writeInt(VERSION_CURRENT);
|
||||
out.writeInt(bloomSize);
|
||||
long[] bits = filter.getBits();
|
||||
out.writeInt(bits.length);
|
||||
|
@ -188,11 +199,10 @@ public class FuzzySet {
|
|||
public static FuzzySet deserialize(DataInput in) throws IOException
|
||||
{
|
||||
int version=in.readInt();
|
||||
if(version!=FUZZY_SERIALIZATION_VERSION)
|
||||
{
|
||||
throw new IOException("Error deserializing: set version is not "+FUZZY_SERIALIZATION_VERSION);
|
||||
if (version == VERSION_SPI) {
|
||||
in.readString();
|
||||
}
|
||||
HashFunction hashFunction=HashFunction.forName(in.readString());
|
||||
final HashFunction hashFunction = hashFunctionForVersion(version);
|
||||
int bloomSize=in.readInt();
|
||||
int numLongs=in.readInt();
|
||||
long[]longs=new long[numLongs];
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.lucene.codecs.bloom;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
|
||||
/**
|
||||
* Base class for hashing functions that can be referred to by name.
|
||||
* Subclasses are expected to provide threadsafe implementations of the hash function
|
||||
* on the range of bytes referenced in the provided {@link BytesRef}
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class HashFunction {
|
||||
|
||||
/**
|
||||
* Hashes the contents of the referenced bytes
|
||||
* @param bytes the data to be hashed
|
||||
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
|
||||
*/
|
||||
public abstract int hash(BytesRef bytes);
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.util.hash;
|
||||
package org.apache.lucene.codecs.bloom;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -32,14 +32,11 @@ import org.apache.lucene.util.BytesRef;
|
|||
* </p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class MurmurHash2 extends HashFunction{
|
||||
|
||||
|
||||
public static final String HASH_NAME="MurmurHash2";
|
||||
|
||||
public MurmurHash2() {
|
||||
super(HASH_NAME);
|
||||
}
|
||||
public final class MurmurHash2 extends HashFunction{
|
||||
|
||||
public static final MurmurHash2 INSTANCE = new MurmurHash2();
|
||||
|
||||
private MurmurHash2() {}
|
||||
|
||||
public static int hash(byte[] data, int seed, int offset, int len) {
|
||||
int m = 0x5bd1e995;
|
|
@ -1,84 +0,0 @@
|
|||
package org.apache.lucene.util.hash;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.NamedSPILoader;
|
||||
|
||||
|
||||
/**
|
||||
* Base class for hashing functions that can be referred to by name.
|
||||
* Subclasses are expected to provide threadsafe implementations of the hash function
|
||||
* on the range of bytes referenced in the provided {@link BytesRef}
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class HashFunction implements NamedSPILoader.NamedSPI {
|
||||
|
||||
/**
|
||||
* Hashes the contents of the referenced bytes
|
||||
* @param bytes the data to be hashed
|
||||
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
|
||||
*/
|
||||
public abstract int hash(BytesRef bytes);
|
||||
|
||||
private static final NamedSPILoader<HashFunction> loader =
|
||||
new NamedSPILoader<HashFunction>(HashFunction.class);
|
||||
|
||||
private final String name;
|
||||
|
||||
public HashFunction(String name) {
|
||||
NamedSPILoader.checkServiceName(name);
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/** Returns this codec's name */
|
||||
@Override
|
||||
public final String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/** looks up a hash function by name */
|
||||
public static HashFunction forName(String name) {
|
||||
return loader.lookup(name);
|
||||
}
|
||||
|
||||
/** returns a list of all available hash function names */
|
||||
public static Set<String> availableHashFunctionNames() {
|
||||
return loader.availableServices();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reloads the hash function list from the given {@link ClassLoader}.
|
||||
* Changes to the function list are visible after the method ends, all
|
||||
* iterators ({@link #availableHashFunctionNames()},...) stay consistent.
|
||||
*
|
||||
* <p><b>NOTE:</b> Only new functions are added, existing ones are
|
||||
* never removed or replaced.
|
||||
*
|
||||
* <p><em>This method is expensive and should only be called for discovery
|
||||
* of new functions on the given classpath/classloader!</em>
|
||||
*/
|
||||
public static void reloadHashFunctions(ClassLoader classloader) {
|
||||
loader.reload(classloader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return name;
|
||||
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Hashing functions load-able via SPI service
|
||||
</body>
|
||||
</html>
|
|
@ -1,16 +0,0 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.lucene.util.hash.MurmurHash2
|
|
@ -22,13 +22,10 @@ import java.io.IOException;
|
|||
import org.apache.lucene.codecs.FieldsConsumer;
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
import org.apache.lucene.util.hash.MurmurHash2;
|
||||
|
||||
/**
|
||||
* A class used for testing {@link BloomFilteringPostingsFormat} with a concrete
|
||||
|
|
|
@ -36,7 +36,6 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.util.hash.HashFunction;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.handler.admin.CoreAdminHandler;
|
||||
|
@ -176,8 +175,6 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
* this ResourceLoader.
|
||||
*/
|
||||
void reloadLuceneSPI() {
|
||||
// Hash functions:
|
||||
HashFunction.reloadHashFunctions(this.classLoader);
|
||||
// Codecs:
|
||||
PostingsFormat.reloadPostingsFormats(this.classLoader);
|
||||
Codec.reloadCodecs(this.classLoader);
|
||||
|
|
Loading…
Reference in New Issue