LUCENE-6818: Add DFISimilarity implementing the divergence from independence model

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725205 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2016-01-18 09:24:22 +00:00
parent ddf13a54a3
commit 1a9f11ce45
11 changed files with 269 additions and 38 deletions

View File

@ -123,6 +123,9 @@ New Features
as alternative to their SPI name. This enables compile-time safety when
defining analyzer's components. (Uwe Schindler, Shai Erera)
* LUCENE-6818: Add DFISimilarity implementing the divergence from independence
model. (Ahmet Arslan via Robert Muir)
API Changes
* LUCENE-6908: GeoUtils static relational methods have been refactored to new

View File

@ -0,0 +1,65 @@
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Implements the <em>Divergence from Independence (DFI)</em> model based on Chi-square statistics
* (i.e., standardized Chi-squared distance from independence in term frequency tf).
* <p>
* DFI is both parameter-free and non-parametric:
* <ul>
* <li>parameter-free: it does not require any parameter tuning or training.</li>
* <li>non-parametric: it does not make any assumptions about word frequency distributions on document collections.</li>
* </ul>
* <p>
* It is highly recommended <b>not</b> to remove stopwords (very common terms: the, of, and, to, a, in, for, is, on, that, etc.) with this similarity.
* <p>
* For more information see: <a href="http://dx.doi.org/10.1007/s10791-013-9225-4">A nonparametric term weighting method for information retrieval based on measuring the divergence from independence</a>
*
* @lucene.experimental
* @see org.apache.lucene.search.similarities.DFRSimilarity
*/
public class DFISimilarity extends SimilarityBase {

  /**
   * Creates a DFI similarity; the model takes no configuration.
   */
  public DFISimilarity() {
  }

  /**
   * Scores a document by how far the observed term frequency exceeds the
   * frequency expected for a document of this length.
   * <p>
   * The expected frequency is
   * {@code (totalTermFreq + 1) * docLen / (numberOfFieldTokens + 1)}. When the
   * observed frequency does not exceed it, the score is zero; otherwise the score
   * is the boost multiplied by {@code log2(chiSquare + 1)}, where
   * {@code chiSquare = (freq - expected)^2 / expected}.
   *
   * @param stats  statistics supplying the total term frequency, the number of
   *               field tokens and the boost
   * @param freq   observed frequency of the term in the document
   * @param docLen length of the document
   * @return the score as described above
   */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    // The +1 on both counts is applied before the float arithmetic, exactly as in the formula above.
    final long termFreqPlusOne = stats.getTotalTermFreq() + 1;
    final long fieldTokensPlusOne = stats.getNumberOfFieldTokens() + 1;
    final float expectedFreq = termFreqPlusOne * docLen / fieldTokensPlusOne;

    // A term that occurs no more often than expected contributes nothing.
    if (freq <= expectedFreq) {
      return 0f;
    }

    final float deviation = freq - expectedFreq;
    final float chiSquare = deviation * deviation / expectedFreq;
    final float logMeasure = (float) log2(chiSquare + 1);
    return stats.getBoost() * logMeasure;
  }

  /**
   * Returns the short name of this model.
   */
  @Override
  public String toString() {
    return "DFI";
  }
}

View File

@ -72,6 +72,7 @@ public class TestSimilarity2 extends LuceneTestCase {
sims.add(new LMDirichletSimilarity());
sims.add(new LMJelinekMercerSimilarity(0.1f));
sims.add(new LMJelinekMercerSimilarity(0.7f));
sims.add(new DFISimilarity());
}
/** because of stupid things like querynorm, it's possible we computeStats on a field that doesn't exist at all

View File

@ -140,6 +140,7 @@ public class TestSimilarityBase extends LuceneTestCase {
sims.add(new LMDirichletSimilarity());
sims.add(new LMJelinekMercerSimilarity(0.1f));
sims.add(new LMJelinekMercerSimilarity(0.7f));
sims.add(new DFISimilarity());
}
// ------------------------------- Unit tests --------------------------------

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search;
package org.apache.lucene.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -24,37 +24,6 @@ import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.lucene.search.similarities.AfterEffect;
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.BasicModel;
import org.apache.lucene.search.similarities.BasicModelBE;
import org.apache.lucene.search.similarities.BasicModelD;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelIn;
import org.apache.lucene.search.similarities.BasicModelIne;
import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.Distribution;
import org.apache.lucene.search.similarities.DistributionLL;
import org.apache.lucene.search.similarities.DistributionSPL;
import org.apache.lucene.search.similarities.IBSimilarity;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.Lambda;
import org.apache.lucene.search.similarities.LambdaDF;
import org.apache.lucene.search.similarities.LambdaTTF;
import org.apache.lucene.search.similarities.Normalization;
import org.apache.lucene.search.similarities.NormalizationH1;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3;
import org.apache.lucene.search.similarities.NormalizationZ;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
/**
* Similarity implementation that randomizes Similarity implementations
* per-field.
@ -62,7 +31,7 @@ import org.apache.lucene.search.similarities.Similarity;
* The choices are 'sticky', so the selected algorithm is always used
* for the same field.
*/
public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
public class RandomSimilarity extends PerFieldSimilarityWrapper {
final ClassicSimilarity defaultSim = new ClassicSimilarity();
final List<Similarity> knownSims;
Map<String,Similarity> previousMappings = new HashMap<>();
@ -70,7 +39,7 @@ public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
final int coordType; // 0 = no coord, 1 = coord, 2 = crazy coord
final boolean shouldQueryNorm;
public RandomSimilarityProvider(Random random) {
public RandomSimilarity(Random random) {
perFieldSeed = random.nextInt();
coordType = random.nextInt(3);
shouldQueryNorm = random.nextBoolean();
@ -159,6 +128,7 @@ public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
allSims.add(new LMDirichletSimilarity()); */
allSims.add(new LMJelinekMercerSimilarity(0.1f));
allSims.add(new LMJelinekMercerSimilarity(0.7f));
allSims.add(new DFISimilarity());
}
@Override
@ -171,6 +141,6 @@ public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
} else {
coordMethod = "crazy";
}
return "RandomSimilarityProvider(queryNorm=" + shouldQueryNorm + ",coord=" + coordMethod + "): " + previousMappings.toString();
return "RandomSimilarity(queryNorm=" + shouldQueryNorm + ",coord=" + coordMethod + "): " + previousMappings.toString();
}
}

View File

@ -0,0 +1,32 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- not a package-info.java, because we already defined this package in core/ -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Support for testing similarities
<p>
The primary classes are:
</p>
<ul>
<li>{@link org.apache.lucene.search.similarities.RandomSimilarity}: Randomizes similarity per-field in tests.</li>
</ul>
</body>
</html>

View File

@ -37,8 +37,8 @@ import org.apache.lucene.codecs.lucene60.Lucene60Codec;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
import org.apache.lucene.search.RandomSimilarityProvider;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.RandomSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.junit.internal.AssumptionViolatedException;
@ -207,7 +207,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
TimeZone randomTimeZone = randomTimeZone(random());
timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone);
TimeZone.setDefault(timeZone);
similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarityProvider(random());
similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random());
// Check codec restrictions once at class level.
try {

View File

@ -1217,7 +1217,7 @@ public final class TestUtil {
int evilness = TestUtil.nextInt(random, 0, 20);
StringBuilder sb = new StringBuilder();
while (sb.length() < wordLength) {;
while (sb.length() < wordLength) {
if (simple) {
sb.append(random.nextBoolean() ? TestUtil.randomSimpleString(random, wordLength) : TestUtil.randomHtmlishString(random, wordLength));
} else {

View File

@ -0,0 +1,52 @@
package org.apache.solr.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.schema.SimilarityFactory;
/**
* Factory for {@link DFISimilarity}
* <p>
* Optional settings:
* <ul>
* <li>discountOverlaps (bool): Sets {@link org.apache.lucene.search.similarities.SimilarityBase#setDiscountOverlaps(boolean)}</li>
* </ul>
*
* @lucene.experimental
*/
public class DFISimilarityFactory extends SimilarityFactory {

  /** Value of the discountOverlaps setting, assigned in {@link #init(SolrParams)}. */
  private boolean discountOverlaps;

  /**
   * Initializes the factory and reads the optional discountOverlaps setting,
   * using {@code true} when the parameter is absent.
   *
   * @param params factory parameters
   */
  @Override
  public void init(SolrParams params) {
    super.init(params);
    this.discountOverlaps = params.getBool(ClassicSimilarityFactory.DISCOUNT_OVERLAPS, true);
  }

  /**
   * Builds a new {@link DFISimilarity} and applies the stored discountOverlaps setting to it.
   *
   * @return the newly created similarity
   */
  @Override
  public Similarity getSimilarity() {
    final DFISimilarity similarity = new DFISimilarity();
    similarity.setDiscountOverlaps(this.discountOverlaps);
    return similarity;
  }
}

View File

@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for DFISimilarityFactory -->
<schema name="dfi" version="1.5">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- default parameters -->
<fieldType name="text" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFISimilarityFactory"/>
</fieldType>
<!-- with discountOverlaps parameter -->
<fieldType name="text_params" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFISimilarityFactory">
<bool name="discountOverlaps">false</bool>
</similarity>
</fieldType>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="text_params" type="text_params" indexed="true" stored="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
<similarity class="solr.SchemaSimilarityFactory"/>
</schema>

View File

@ -0,0 +1,54 @@
package org.apache.solr.search.similarities;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.similarities.DFISimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.junit.BeforeClass;
/**
* Tests {@link DFISimilarityFactory}
*/
public class TestDFISimilarityFactory extends BaseSimilarityTestCase {

  /** Starts a core with the basic solrconfig and the DFI test schema. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-basic.xml", "schema-dfi.xml");
  }

  /**
   * The "text" field declares the factory without parameters: expects a
   * {@link DFISimilarity} with overlap discounting enabled.
   */
  public void test() throws Exception {
    final Similarity similarity = getSimilarity("text");
    assertEquals(DFISimilarity.class, similarity.getClass());
    assertTrue(((DFISimilarity) similarity).getDiscountOverlaps());
  }

  /**
   * The "text_params" field sets discountOverlaps to false: expects a
   * {@link DFISimilarity} with overlap discounting disabled.
   */
  public void testParameters() throws Exception {
    final Similarity similarity = getSimilarity("text_params");
    assertEquals(DFISimilarity.class, similarity.getClass());
    assertFalse(((DFISimilarity) similarity).getDiscountOverlaps());
  }
}