SOLR-799: Add support for hash based exact/near duplicate document handling

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@743163 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2009-02-10 23:21:57 +00:00
parent a1566dd585
commit 6555bb7bb3
15 changed files with 1004 additions and 4 deletions

View File

@@ -153,6 +153,8 @@ New Features
25. SOLR-850: Addition of timeouts for distributed searching. Configurable through 'shard-socket-timeout' and
'shard-connection-timeout' parameters in SearchHandler. (Patrick O'Leary via shalin)
26. SOLR-799: Add support for hash based exact/near duplicate document
handling. (Mark Miller, yonik)
Optimizations
----------------------

View File

@@ -354,7 +354,7 @@
-->
<requestDispatcher handleSelect="true" >
<!--Make sure your system has some authentication before enabling remote streaming! -->
<requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048000" />
<!-- Set HTTP caching related parameters (for proxy caches and clients).
@@ -758,6 +758,24 @@
</formatter>
</highlighting>
<!-- An example dedup update processor that creates the "id" field on the fly
based on the hash code of some other fields. This example has overwriteDupes
set to false since we are using the id field as the signatureField and Solr
will maintain uniqueness based on that anyway. -->
<!--
<updateRequestProcessorChain name="dedupe">
<processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
<bool name="enabled">true</bool>
<str name="signatureField">id</str>
<bool name="overwriteDupes">false</bool>
<str name="fields">name,features,cat</str>
<str name="signatureClass">org.apache.solr.update.processor.Lookup3Signature</str>
</processor>
<processor class="solr.LogUpdateProcessorFactory" />
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
-->
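<!-- Note: documents only pass through a named chain when an update request selects it; the
test added in this commit does so by setting the UpdateParams.UPDATE_PROCESSOR request
parameter ("update.processor") to "dedupe". -->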
<!-- queryResponseWriter plugins... query responses will be written using the
writer specified by the 'wt' request parameter matching the name of a registered

View File

@@ -0,0 +1,242 @@
package org.apache.solr.common.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* <p>Fast, well distributed, cross-platform hash functions.
* </p>
*
* <p>Development background: I was surprised to discover that there isn't a good cross-platform hash function defined for strings. MD5, SHA, FNV, etc., all define hash functions over bytes, meaning that they are under-specified for strings.
* </p>
*
* <p>So I set out to create a standard 32 bit string hash that would be well defined for implementation in all languages, have very high performance, and have very good hash properties such as distribution. After evaluating all the options, I settled on using Bob Jenkins' lookup3 as a base. It's a well studied and very fast hash function, and the hashword variant can work with 32 bits at a time (perfect for hashing unicode code points). It's also even faster on the latest JVMs which can translate pairs of shifts into native rotate instructions.
* </p>
* <p>The only problem with using lookup3 hashword is that it includes a length in the initial value. This would suck some performance out since directly hashing a UTF8 or UTF16 string (Java) would require a pre-scan to get the actual number of unicode code points. The solution was to simply remove the length factor, which is equivalent to biasing initVal by -(numCodePoints*4). This slightly modified lookup3 I define as lookup3ycs.
* </p>
* <p>So the definition of the cross-platform string hash lookup3ycs is as follows:
* </p>
* <p>The hash value of a character sequence (a string) is defined to be the hash of its unicode code points, according to lookup3 hashword, with the initval biased by -(length*4).
* </p>
*<p>So by definition
*</p>
* <pre>
* lookup3ycs(k,offset,length,initval) == lookup3(k,offset,length,initval-(length*4))
*
* AND
*
* lookup3ycs(k,offset,length,initval+(length*4)) == lookup3(k,offset,length,initval)
* </pre>
* <p>An obvious advantage of this relationship is that you can use lookup3 if you don't have an implementation of lookup3ycs.
* </p>
*
*
* @author yonik
*/
public class Hash {
/**
* A Java implementation of hashword from lookup3.c by Bob Jenkins
* (<a href="http://burtleburtle.net/bob/c/lookup3.c">original source</a>).
*
* @param k the key to hash
* @param offset offset of the start of the key
* @param length length of the key
* @param initval initial value to fold into the hash
* @return the 32 bit hash code
*/
@SuppressWarnings("fallthrough")
public static int lookup3(int[] k, int offset, int length, int initval) {
int a,b,c;
a = b = c = 0xdeadbeef + (length<<2) + initval;
int i=offset;
while (length > 3)
{
a += k[i];
b += k[i+1];
c += k[i+2];
// mix(a,b,c)... Java needs "out" parameters!!!
// Note: recent JVMs (Sun JDK6) turn pairs of shifts (needed to do a rotate)
// into real x86 rotate instructions.
{
a -= c; a ^= (c<<4)|(c>>>-4); c += b;
b -= a; b ^= (a<<6)|(a>>>-6); a += c;
c -= b; c ^= (b<<8)|(b>>>-8); b += a;
a -= c; a ^= (c<<16)|(c>>>-16); c += b;
b -= a; b ^= (a<<19)|(a>>>-19); a += c;
c -= b; c ^= (b<<4)|(b>>>-4); b += a;
}
length -= 3;
i += 3;
}
switch(length) {
case 3 : c+=k[i+2]; // fall through
case 2 : b+=k[i+1]; // fall through
case 1 : a+=k[i+0]; // fall through
// final(a,b,c);
{
c ^= b; c -= (b<<14)|(b>>>-14);
a ^= c; a -= (c<<11)|(c>>>-11);
b ^= a; b -= (a<<25)|(a>>>-25);
c ^= b; c -= (b<<16)|(b>>>-16);
a ^= c; a -= (c<<4)|(c>>>-4);
b ^= a; b -= (a<<14)|(a>>>-14);
c ^= b; c -= (b<<24)|(b>>>-24);
}
case 0:
break;
}
return c;
}
/**
* Identical to lookup3, except initval is biased by -(length&lt;&lt;2).
* This is equivalent to leaving out the length factor in the initial state.
* {@code lookup3ycs(k,offset,length,initval) == lookup3(k,offset,length,initval-(length<<2))}
* and
* {@code lookup3ycs(k,offset,length,initval+(length<<2)) == lookup3(k,offset,length,initval)}
*/
public static int lookup3ycs(int[] k, int offset, int length, int initval) {
return lookup3(k, offset, length, initval-(length<<2));
}
/**
* <p>The hash value of a character sequence is defined to be the hash of
* its unicode code points, according to {@link #lookup3ycs(int[], int, int, int)}
* </p>
* <p>If you know the number of code points in the {@code CharSequence}, you can
* generate the same hash as the original lookup3
* via {@code lookup3ycs(s, start, end, initval+(numCodePoints<<2))}
*/
public static int lookup3ycs(CharSequence s, int start, int end, int initval) {
int a,b,c;
a = b = c = 0xdeadbeef + initval;
// only difference from lookup3 is that "+ (length<<2)" is missing
// since we don't know the number of code points to start with,
// and don't want to have to pre-scan the string to find out.
int i=start;
boolean mixed=true; // have the 3 state variables been adequately mixed?
for(;;) {
if (i>= end) break;
mixed=false;
char ch;
ch = s.charAt(i++);
a += Character.isHighSurrogate(ch) && i< end ? Character.toCodePoint(ch, s.charAt(i++)) : ch;
if (i>= end) break;
ch = s.charAt(i++);
b += Character.isHighSurrogate(ch) && i< end ? Character.toCodePoint(ch, s.charAt(i++)) : ch;
if (i>= end) break;
ch = s.charAt(i++);
c += Character.isHighSurrogate(ch) && i< end ? Character.toCodePoint(ch, s.charAt(i++)) : ch;
if (i>= end) break;
// mix(a,b,c)... Java needs "out" parameters!!!
// Note: recent JVMs (Sun JDK6) turn pairs of shifts (needed to do a rotate)
// into real x86 rotate instructions.
{
a -= c; a ^= (c<<4)|(c>>>-4); c += b;
b -= a; b ^= (a<<6)|(a>>>-6); a += c;
c -= b; c ^= (b<<8)|(b>>>-8); b += a;
a -= c; a ^= (c<<16)|(c>>>-16); c += b;
b -= a; b ^= (a<<19)|(a>>>-19); a += c;
c -= b; c ^= (b<<4)|(b>>>-4); b += a;
}
mixed=true;
}
if (!mixed) {
// final(a,b,c)
c ^= b; c -= (b<<14)|(b>>>-14);
a ^= c; a -= (c<<11)|(c>>>-11);
b ^= a; b -= (a<<25)|(a>>>-25);
c ^= b; c -= (b<<16)|(b>>>-16);
a ^= c; a -= (c<<4)|(c>>>-4);
b ^= a; b -= (a<<14)|(a>>>-14);
c ^= b; c -= (b<<24)|(b>>>-24);
}
return c;
}
/** <p>This is the 64-bit version of lookup3ycs, corresponding to Bob Jenkins'
* lookup3 hashlittle2 with initval biased by -(numCodePoints<<2). It is equivalent
* to lookup3ycs in that if the high bits of initval==0, then the low bits of the
* result will be the same as lookup3ycs.
* </p>
*/
public static long lookup3ycs64(CharSequence s, int start, int end, long initval) {
int a,b,c;
a = b = c = 0xdeadbeef + (int)initval;
c += (int)(initval>>>32);
// only difference from lookup3 is that "+ (length<<2)" is missing
// since we don't know the number of code points to start with,
// and don't want to have to pre-scan the string to find out.
int i=start;
boolean mixed=true; // have the 3 state variables been adequately mixed?
for(;;) {
if (i>= end) break;
mixed=false;
char ch;
ch = s.charAt(i++);
a += Character.isHighSurrogate(ch) && i< end ? Character.toCodePoint(ch, s.charAt(i++)) : ch;
if (i>= end) break;
ch = s.charAt(i++);
b += Character.isHighSurrogate(ch) && i< end ? Character.toCodePoint(ch, s.charAt(i++)) : ch;
if (i>= end) break;
ch = s.charAt(i++);
c += Character.isHighSurrogate(ch) && i< end ? Character.toCodePoint(ch, s.charAt(i++)) : ch;
if (i>= end) break;
// mix(a,b,c)... Java needs "out" parameters!!!
// Note: recent JVMs (Sun JDK6) turn pairs of shifts (needed to do a rotate)
// into real x86 rotate instructions.
{
a -= c; a ^= (c<<4)|(c>>>-4); c += b;
b -= a; b ^= (a<<6)|(a>>>-6); a += c;
c -= b; c ^= (b<<8)|(b>>>-8); b += a;
a -= c; a ^= (c<<16)|(c>>>-16); c += b;
b -= a; b ^= (a<<19)|(a>>>-19); a += c;
c -= b; c ^= (b<<4)|(b>>>-4); b += a;
}
mixed=true;
}
if (!mixed) {
// final(a,b,c)
c ^= b; c -= (b<<14)|(b>>>-14);
a ^= c; a -= (c<<11)|(c>>>-11);
b ^= a; b -= (a<<25)|(a>>>-25);
c ^= b; c -= (b<<16)|(b>>>-16);
a ^= c; a -= (c<<4)|(c>>>-4);
b ^= a; b -= (a<<14)|(a>>>-14);
c ^= b; c -= (b<<24)|(b>>>-24);
}
return c + (((long)b) << 32);
}
}
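A quick standalone sketch (not part of this commit; the class and variable names are made up) illustrating the equivalence documented in the javadoc above: it hashes the same data through lookup3 and both lookup3ycs variants and prints true.
import org.apache.solr.common.util.Hash;
public class HashEquivalenceDemo {
  public static void main(String[] args) {
    String s = "hello world"; // BMP-only, so chars == code points
    int[] codePoints = new int[s.length()];
    for (int i = 0; i < s.length(); i++) {
      codePoints[i] = s.charAt(i);
    }
    int seed = 12345;
    int len = codePoints.length;
    // biasing initval by +(len<<2) makes lookup3ycs agree with plain lookup3
    int h1 = Hash.lookup3(codePoints, 0, len, seed);
    int h2 = Hash.lookup3ycs(codePoints, 0, len, seed + (len << 2));
    int h3 = Hash.lookup3ycs(s, 0, s.length(), seed + (len << 2));
    System.out.println(h1 == h2 && h2 == h3); // true
  }
}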

View File

@@ -26,6 +26,8 @@ import java.io.IOException;
* @version $Id$
*/
public class StrUtils {
public static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
'7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
/**
* Split a string based on a separator, but don't split if it's inside

View File

@@ -19,6 +19,7 @@ package org.apache.solr.update;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.schema.IndexSchema;
@@ -43,7 +44,7 @@ public class AddUpdateCommand extends UpdateCommand {
public boolean overwritePending;
public boolean overwriteCommitted;
public Term updateTerm;
public int commitWithin = -1;

View File

@@ -20,9 +20,15 @@
package org.apache.solr.update;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@@ -210,12 +216,28 @@ public class DirectUpdateHandler2 extends UpdateHandler {
// this is the only unsynchronized code in the iwAccess block, which
// should account for most of the time
Term updateTerm = null;
if (cmd.overwriteCommitted || cmd.overwritePending) {
if (cmd.indexedId == null) {
cmd.indexedId = getIndexedId(cmd.doc);
}
Term idTerm = this.idTerm.createTerm(cmd.indexedId);
boolean del = false;
if (cmd.updateTerm == null) {
updateTerm = idTerm;
} else {
del = true;
updateTerm = cmd.updateTerm;
}
writer.updateDocument(updateTerm, cmd.getLuceneDocument(schema));
if(del) { // ensure id remains unique
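// delete any other documents that share this uniqueKey but carry a different signature (update term)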
BooleanQuery bq = new BooleanQuery();
bq.add(new BooleanClause(new TermQuery(updateTerm), Occur.MUST_NOT));
bq.add(new BooleanClause(new TermQuery(idTerm), Occur.MUST));
writer.deleteDocuments(bq);
}
} else {
// allow duplicates
writer.addDocument(cmd.getLuceneDocument(schema));

View File

@@ -0,0 +1,25 @@
package org.apache.solr.update.processor;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.solr.common.util.Hash;
public class Lookup3Signature extends Signature {
protected long hash;
public Lookup3Signature() {
}
public void add(String content) {
hash = Hash.lookup3ycs64(content,0,content.length(),hash);
}
public byte[] getSignature() {
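// expose the 64-bit lookup3ycs64 value as 8 bytes, most significant byte first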
return new byte[]{(byte)(hash>>56),(byte)(hash>>48),(byte)(hash>>40),(byte)(hash>>32),(byte)(hash>>24),(byte)(hash>>16),(byte)(hash>>8),(byte)(hash>>0)};
}
}

View File

@@ -0,0 +1,41 @@
package org.apache.solr.update.processor;
import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class MD5Signature extends Signature {
protected final static Logger log = LoggerFactory.getLogger(MD5Signature.class);
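// MessageDigest is not thread-safe, so each thread gets its own instance via this ThreadLocal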
private static ThreadLocal<MessageDigest> DIGESTER_FACTORY = new ThreadLocal<MessageDigest>() {
protected MessageDigest initialValue() {
try {
return MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
}
};
private MessageDigest digester;
public MD5Signature() {
digester = DIGESTER_FACTORY.get();
digester.reset();
}
public void add(String content) {
try {
digester.update(content.getBytes("UTF-8"));
} catch (UnsupportedEncodingException e) {
// won't happen
log.error("UTF-8 not supported", e);
throw new RuntimeException(e);
}
}
public byte[] getSignature() {
return digester.digest();
}
}

View File

@@ -0,0 +1,12 @@
package org.apache.solr.update.processor;
import org.apache.solr.common.params.SolrParams;
public abstract class Signature {
public void init(SolrParams nl) {
}
abstract public void add(String content);
abstract public byte[] getSignature();
}
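The contract is small: init receives the processor configuration, add is called once per field name and once per string value, and getSignature returns the raw bytes that SignatureUpdateProcessorFactory (later in this patch) hex-encodes into the signature field. A hypothetical custom implementation (not part of this commit) might look like:
package org.apache.solr.update.processor;
import org.apache.solr.common.params.SolrParams;
/** Toy example: the signature is just the total length of all added values. */
public class LengthSignature extends Signature {
  private long total;
  @Override
  public void init(SolrParams params) {
    // no parameters needed for this toy example
  }
  @Override
  public void add(String content) {
    total += content.length();
  }
  @Override
  public byte[] getSignature() {
    byte[] out = new byte[8];
    for (int i = 0; i < 8; i++) {
      out[i] = (byte) (total >>> (56 - 8 * i)); // big-endian, like Lookup3Signature
    }
    return out;
  }
}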

View File

@@ -0,0 +1,195 @@
package org.apache.solr.update.processor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.DeleteUpdateCommand;
public class SignatureUpdateProcessorFactory extends
UpdateRequestProcessorFactory {
private List<String> sigFields;
private String signatureField;
private Term signatureTerm;
private boolean enabled = true;
private String signatureClass;
private boolean overwriteDupes;
private SolrParams params;
@Override
public void init(final NamedList args) {
if (args != null) {
SolrParams params = SolrParams.toSolrParams(args);
boolean enabled = params.getBool("enabled", true);
this.enabled = enabled;
overwriteDupes = params.getBool("overwriteDupes", true);
signatureField = params.get("signatureField", "signatureField");
signatureTerm = new Term(signatureField, "");
signatureClass = params.get("signatureClass",
"org.apache.solr.update.processor.Lookup3Signature");
this.params = params;
Object fields = args.get("fields");
sigFields = fields == null ? null: StrUtils.splitSmart((String)fields, ",", true);
if (sigFields != null) {
Collections.sort(sigFields);
}
}
}
public List<String> getSigFields() {
return sigFields;
}
public String getSignatureField() {
return signatureField;
}
public boolean isEnabled() {
return enabled;
}
public String getSignatureClass() {
return signatureClass;
}
public boolean getOverwriteDupes() {
return overwriteDupes;
}
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req,
SolrQueryResponse rsp, UpdateRequestProcessor next) {
return new SignatureUpdateProcessor(req, rsp, this, next);
}
class SignatureUpdateProcessor extends UpdateRequestProcessor {
public SignatureUpdateProcessor(SolrQueryRequest req,
SolrQueryResponse rsp, SignatureUpdateProcessorFactory factory,
UpdateRequestProcessor next) {
super(next);
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
if (enabled) {
SolrInputDocument doc = cmd.getSolrInputDocument();
if (sigFields == null || sigFields.size() == 0) {
Collection<String> docFields = doc.getFieldNames();
sigFields = new ArrayList<String>(docFields.size());
sigFields.addAll(docFields);
Collections.sort(sigFields);
}
Signature sig = (Signature) loadClass(signatureClass);
sig.init(params);
for (String field : sigFields) {
SolrInputField f = doc.getField(field);
if (f != null) {
sig.add(field);
Object o = f.getValue();
if (o instanceof String) {
sig.add((String)o);
} else if (o instanceof Collection) {
for (Object oo : (Collection)o) {
if (oo instanceof String) {
sig.add((String)oo);
}
}
}
}
}
byte[] signature = sig.getSignature();
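// hex-encode the signature bytes so the value can be stored and queried as a plain string field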
char[] arr = new char[signature.length<<1];
for (int i=0; i<signature.length; i++) {
int b = signature[i];
int idx = i<<1;
arr[idx]= StrUtils.HEX_DIGITS[(b >> 4) & 0xf];
arr[idx+1]= StrUtils.HEX_DIGITS[b & 0xf];
}
String sigString = new String(arr);
doc.addField(signatureField, sigString);
if (overwriteDupes) {
cmd.updateTerm = signatureTerm.createTerm(sigString);
}
}
if (next != null)
next.processAdd(cmd);
}
@Override
public void processDelete(DeleteUpdateCommand cmd) throws IOException {
if (next != null)
next.processDelete(cmd);
}
@Override
public void processCommit(CommitUpdateCommand cmd) throws IOException {
if (next != null)
next.processCommit(cmd);
}
@Override
public void finish() throws IOException {
if (next != null)
next.finish();
}
}
// for testing
void setEnabled(boolean enabled) {
this.enabled = enabled;
}
/**
* Utility method to dynamically load classes
*/
public static Object loadClass(final String clazz) {
Object loadedClass = null;
Class handlerClass = null;
try {
handlerClass = Class.forName(clazz);
} catch (final NoClassDefFoundError e) {
throw new RuntimeException("Cannot find class : " + clazz, e);
} catch (final ClassNotFoundException e) {
throw new RuntimeException("Cannot find class : " + clazz, e);
}
try {
loadedClass = handlerClass.newInstance();
} catch (final InstantiationException e) {
throw new RuntimeException("Cannot create instance of : " + clazz, e);
} catch (final IllegalAccessException e) {
throw new RuntimeException("Cannot create instance of : " + clazz, e);
}
return loadedClass;
}
}
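For reference, the following standalone sketch (not part of the patch; the field values are hypothetical) reproduces the signature string the processor stores, using the example field list name,features,cat from the solrconfig snippet above:
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.update.processor.Lookup3Signature;
public class SignatureDemo {
  public static void main(String[] args) {
    // the processor sorts the configured fields, then adds each field name followed by its value(s)
    Lookup3Signature sig = new Lookup3Signature();
    sig.add("cat");      sig.add("electronics");
    sig.add("features"); sig.add("wireless mouse");
    sig.add("name");     sig.add("Some Gadget");
    byte[] signature = sig.getSignature();
    char[] arr = new char[signature.length << 1];
    for (int i = 0; i < signature.length; i++) {
      int b = signature[i];
      arr[i << 1] = StrUtils.HEX_DIGITS[(b >> 4) & 0xf];
      arr[(i << 1) + 1] = StrUtils.HEX_DIGITS[b & 0xf];
    }
    // identical field values always yield the same 16-character string
    System.out.println(new String(arr));
  }
}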

View File

@@ -0,0 +1,142 @@
package org.apache.solr.update.processor;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.solr.common.params.SolrParams;
/**
* <p>This implementation is copied from Apache Nutch. </p>
* <p>An implementation of a page signature. It calculates an MD5 hash
* of a plain text "profile" of a page.</p>
* <p>The algorithm to calculate a page "profile" takes the plain text version of
* a page and performs the following steps:
* <ul>
* <li>remove all characters except letters and digits, and bring all characters
* to lower case,</li>
* <li>split the text into tokens (all consecutive non-whitespace characters),</li>
* <li>discard tokens equal to or shorter than MIN_TOKEN_LEN (default 2 characters),</li>
* <li>sort the list of tokens by decreasing frequency,</li>
* <li>round down the counts of tokens to the nearest multiple of QUANT
* (<code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is 0.01f
* by default, and <code>maxFreq</code> is the maximum token frequency). If
* <code>maxFreq</code> is higher than 1, then QUANT is always at least 2 (which
* means that tokens with frequency 1 are always discarded).</li>
* <li>tokens whose frequency after quantization falls below QUANT are discarded.</li>
* <li>create a list of tokens and their quantized frequency, separated by spaces,
* in the order of decreasing frequency.</li>
* </ul>
* This list is then submitted to an MD5 hash calculation.
*/
public class TextProfileSignature extends MD5Signature {
private float quantRate;
private float minTokenLen;
public void init(SolrParams params) {
quantRate = params.getFloat("quantRate", 0.01f);
minTokenLen = params.getInt("minTokenLen", 2);
}
public byte[] getSignature() {
return super.getSignature();
}
@Override
public void add(String content) {
HashMap<String, Token> tokens = new HashMap<String, Token>();
StringBuilder curToken = new StringBuilder();
int maxFreq = 0;
for (int i = 0; i < content.length(); i++) {
char c = content.charAt(i);
if (Character.isLetterOrDigit(c)) {
curToken.append(Character.toLowerCase(c));
} else {
if (curToken.length() > 0) {
if (curToken.length() > minTokenLen) {
// add it
String s = curToken.toString();
Token tok = tokens.get(s);
if (tok == null) {
tok = new Token(0, s);
tokens.put(s, tok);
}
tok.cnt++;
if (tok.cnt > maxFreq)
maxFreq = tok.cnt;
}
curToken.setLength(0);
}
}
}
// check the last token
if (curToken.length() > minTokenLen) {
// add it
String s = curToken.toString();
Token tok = tokens.get(s);
if (tok == null) {
tok = new Token(0, s);
tokens.put(s, tok);
}
tok.cnt++;
if (tok.cnt > maxFreq)
maxFreq = tok.cnt;
}
Iterator<Token> it = tokens.values().iterator();
ArrayList<Token> profile = new ArrayList<Token>();
// calculate the QUANT value
int quant = Math.round(maxFreq * quantRate);
if (quant < 2) {
if (maxFreq > 1)
quant = 2;
else
quant = 1;
}
while (it.hasNext()) {
Token t = it.next();
// round down to the nearest QUANT
t.cnt = (t.cnt / quant) * quant;
// discard the frequencies below the QUANT
if (t.cnt < quant) {
continue;
}
profile.add(t);
}
Collections.sort(profile, new TokenComparator());
StringBuilder newText = new StringBuilder();
it = profile.iterator();
while (it.hasNext()) {
Token t = it.next();
if (newText.length() > 0)
newText.append("\n");
newText.append(t.toString());
}
super.add(newText.toString());
}
private static class Token {
public int cnt;
public String val;
public Token(int cnt, String val) {
this.cnt = cnt;
this.val = val;
}
public String toString() {
return val + " " + cnt;
}
}
private static class TokenComparator implements Comparator<Token> {
public int compare(Token t1, Token t2) {
return t2.cnt - t1.cnt;
}
}
}
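To make the quantization step concrete, here is a small sketch (not from the patch; the counts are assumed) of how quant is derived and applied with the default quantRate of 0.01:
public class QuantDemo {
  public static void main(String[] args) {
    float quantRate = 0.01f; // default used by TextProfileSignature
    int maxFreq = 30;        // assumed highest token frequency in the document
    int quant = Math.round(maxFreq * quantRate); // round(0.3) == 0
    if (quant < 2) {
      quant = (maxFreq > 1) ? 2 : 1;             // bumped to 2 here
    }
    System.out.println(quant);                   // 2
    System.out.println((7 / quant) * quant);     // a count of 7 rounds down to 6
    System.out.println((1 / quant) * quant);     // a count of 1 rounds to 0 and is discarded
  }
}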

View File

@@ -0,0 +1,102 @@
package org.apache.solr.common.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import java.util.Random;
/** Tests for lookup3ycs hash functions
* @author yonik
*/
public class TestHash extends TestCase {
// Test that the java version produces the same output as the C version
public void testEqualsLOOKUP3() {
int[] hashes = new int[] {0xc4c20dd5,0x3ab04cc3,0xebe874a3,0x0e770ef3,0xec321498,0x73845e86,0x8a2db728,0x03c313bb,0xfe5b9199,0x95965125,0xcbc4e7c2};
/*** the hash values were generated by adding the following to lookup3.c
*
* char* s = "hello world";
* int len = strlen(s);
* uint32_t a[len];
* for (int i=0; i<len; i++) {
* a[i]=s[i];
* uint32_t result = hashword(a, i+1, i*12345);
* printf("0x%.8x\n", result);
* }
*
*/
String s = "hello world";
int[] a = new int[s.length()];
for (int i=0; i<s.length(); i++) {
a[i] = s.charAt(i);
int len = i+1;
int hash = Hash.lookup3(a, 0, len, i*12345);
assertEquals(hashes[i], hash);
int hash2 = Hash.lookup3ycs(a, 0, len, i*12345+(len<<2));
assertEquals(hashes[i], hash2);
int hash3 = Hash.lookup3ycs(s, 0, len, i*12345+(len<<2));
assertEquals(hashes[i], hash3);
}
}
// test that the hash of the UTF-16 encoded Java String is equal to the hash of the unicode code points
void tstEquiv(int[] utf32, int len) {
int seed=100;
StringBuilder sb = new StringBuilder();
for (int i=0; i<len; i++) sb.appendCodePoint(utf32[i]);
int hash = Hash.lookup3(utf32, 0, len, seed -(len<<2));
int hash2 = Hash.lookup3ycs(utf32, 0, len, seed);
assertEquals(hash, hash2);
int hash3 = Hash.lookup3ycs(sb, 0, sb.length(), seed);
assertEquals(hash, hash3);
long hash4 = Hash.lookup3ycs64(sb, 0, sb.length(), seed);
assertEquals((int)hash4, hash);
}
public void testHash() {
Random r = new Random(0);
int[] utf32 = new int[20];
tstEquiv(utf32,0);
utf32[0]=0x10000;
tstEquiv(utf32,1);
utf32[0]=0x8000;
tstEquiv(utf32,1);
utf32[0]=Character.MAX_CODE_POINT;
tstEquiv(utf32,1);
for (int iter=0; iter<10000; iter++) {
int len = r.nextInt(utf32.length+1);
for (int i=0; i<len; i++) {
int codePoint;
do {
codePoint = r.nextInt(Character.MAX_CODE_POINT+1);
} while((codePoint & 0xF800) == 0xD800); // avoid surrogate code points
utf32[i] = codePoint;
}
// System.out.println("len="+len + ","+utf32[0]+","+utf32[1]);
tstEquiv(utf32, len);
}
}
}

View File

@@ -0,0 +1,187 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.XmlUpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.util.AbstractSolrTestCase;
/**
*
*/
public class SignatureUpdateProcessorFactoryTest extends AbstractSolrTestCase {
@Override
public String getSchemaFile() {
return "schema.xml";
}
@Override
public String getSolrConfigFile() {
return "solrconfig.xml";
}
public void testDupeDetection() throws Exception {
SolrCore core = h.getCore();
UpdateRequestProcessorChain chained = core.getUpdateProcessingChain(
"dedupe");
SignatureUpdateProcessorFactory factory = ((SignatureUpdateProcessorFactory) chained
.getFactories()[0]);
factory.setEnabled(true);
assertNotNull(chained);
addDoc(adoc("id", "1a", "v_t", "Hello Dude man!", "name", "ali babi'"));
addDoc(adoc("id", "2a", "name", "ali babi", "v_t", "Hello Dude man . -"));
addDoc(commit());
addDoc(adoc("name", "ali babi'", "id", "3a", "v_t", "Hello Dude man!"));
addDoc(commit());
assertEquals(1l, core.getSearcher().get().getReader().numDocs());
addDoc(adoc("id", "3b", "v_t", "Hello Dude man!", "t_field",
"fake value galore"));
addDoc(commit());
assertEquals(2l, core.getSearcher().get().getReader().numDocs());
assertU(adoc("id", "5a", "name", "ali babi", "v_t", "MMMMM"));
addDoc(delI("5a"));
addDoc(adoc("id", "5a", "name", "ali babi", "v_t", "MMMMM"));
addDoc(commit());
assertEquals(3l, core.getSearcher().get().getReader().numDocs());
addDoc(adoc("id", "same", "name", "baryy white", "v_t", "random1"));
addDoc(adoc("id", "same", "name", "bishop black", "v_t", "random2"));
addDoc(commit());
assertEquals(4l, core.getSearcher().get().getReader().numDocs());
factory.setEnabled(false);
}
public void testMultiThreaded() throws Exception {
UpdateRequestProcessorChain chained = h.getCore().getUpdateProcessingChain(
"dedupe");
SignatureUpdateProcessorFactory factory = ((SignatureUpdateProcessorFactory) chained
.getFactories()[0]);
factory.setEnabled(true);
Thread[] threads = null;
Thread[] threads2 = null;
threads = new Thread[7];
for (int i = 0; i < threads.length; i++) {
threads[i] = new Thread() {
public void run() {
for (int i = 0; i < 30; i++) {
// h.update(adoc("id", Integer.toString(1+ i), "v_t",
// "Goodbye Dude girl!"));
try {
addDoc(adoc("id", Integer.toString(1 + i), "v_t",
"Goodbye Dude girl!"));
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
};
threads[i].setName("testThread-" + i);
}
threads2 = new Thread[3];
for (int i = 0; i < threads2.length; i++) {
threads2[i] = new Thread() {
public void run() {
for (int i = 0; i < 10; i++) {
// h.update(adoc("id" , Integer.toString(1+ i + 10000), "v_t",
// "Goodbye Dude girl"));
// h.update(commit());
try {
addDoc(adoc("id", Integer.toString(1 + i), "v_t",
"Goodbye Dude girl!"));
addDoc(commit());
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}
};
threads2[i].setName("testThread2-" + i);
}
for (int i = 0; i < threads.length; i++) {
threads[i].start();
}
for (int i = 0; i < threads2.length; i++) {
threads2[i].start();
}
for (int i = 0; i < threads.length; i++) {
threads[i].join();
}
for (int i = 0; i < threads2.length; i++) {
threads2[i].join();
}
SolrCore core = h.getCore();
assertU(commit());
assertEquals(1l, core.getSearcher().get().getReader().numDocs());
factory.setEnabled(false);
}
private void addDoc(String doc) throws Exception {
Map<String, String[]> params = new HashMap<String, String[]>();
MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
params.put(UpdateParams.UPDATE_PROCESSOR, new String[] { "dedupe" });
SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(),
(SolrParams) mmparams) {
};
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
handler.init(null);
ArrayList<ContentStream> streams = new ArrayList<ContentStream>(2);
streams.add(new ContentStreamBase.StringStream(doc));
req.setContentStreams(streams);
handler.handleRequestBody(req, new SolrQueryResponse());
}
}

View File

@@ -348,6 +348,7 @@
<fields>
<field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="signatureField" type="string" indexed="true" stored="false"/>
<field name="uuid" type="uuid" stored="true" />
<field name="name" type="nametext" indexed="true" stored="true"/>
<field name="text" type="text" indexed="true" stored="false"/>

View File

@@ -432,6 +432,14 @@
<queryParser name="foo" class="FooQParserPlugin"/>
<updateRequestProcessorChain name="dedupe">
<processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
<bool name="enabled">false</bool>
<bool name="overwriteDupes">true</bool>
<str name="fields">v_t,t_field</str>
<str name="signatureClass">org.apache.solr.update.processor.TextProfileSignature</str>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
</config>