mirror of https://github.com/apache/lucene.git
LUCENE-6818: Add DFISimilarity implementing the divergence from independence model
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1725205 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ddf13a54a3
commit
1a9f11ce45
|
@ -123,6 +123,9 @@ New Features
|
|||
as alternative to their SPI name. This enables compile-time safety when
|
||||
defining analyzer's components. (Uwe Schindler, Shai Erera)
|
||||
|
||||
* LUCENE-6818: Add DFISimilarity implementing the divergence from independence
|
||||
model. (Ahmet Arslan via Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-6908: GeoUtils static relational methods have been refactored to new
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Implements the <em>Divergence from Independence (DFI)</em> model based on Chi-square statistics
|
||||
* (i.e., standardized Chi-squared distance from independence in term frequency tf).
|
||||
* <p>
|
||||
* DFI is both parameter-free and non-parametric:
|
||||
* <ul>
|
||||
* <li>parameter-free: it does not require any parameter tuning or training.</li>
|
||||
* <li>non-parametric: it does not make any assumptions about word frequency distributions on document collections.</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* It is highly recommended <b>not</b> to remove stopwords (very common terms: the, of, and, to, a, in, for, is, on, that, etc.) with this similarity.
|
||||
* <p>
|
||||
* For more information see: <a href="http://dx.doi.org/10.1007/s10791-013-9225-4">A nonparametric term weighting method for information retrieval based on measuring the divergence from independence</a>
|
||||
*
|
||||
* @lucene.experimental
|
||||
* @see org.apache.lucene.search.similarities.DFRSimilarity
|
||||
*/
|
||||
|
||||
|
||||
public class DFISimilarity extends SimilarityBase {
|
||||
|
||||
/**
|
||||
* Sole constructor: DFI is parameter-free.
|
||||
*/
|
||||
public DFISimilarity() {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(BasicStats stats, float freq, float docLen) {
|
||||
|
||||
final float expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1);
|
||||
|
||||
// if the observed frequency is less than or equal to the expected value, then return zero.
|
||||
if (freq <= expected) return 0;
|
||||
|
||||
final float chiSquare = (freq - expected) * (freq - expected) / expected;
|
||||
|
||||
return stats.getBoost() * (float) log2(chiSquare + 1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "DFI";
|
||||
}
|
||||
}
|
||||
|
|
@ -72,6 +72,7 @@ public class TestSimilarity2 extends LuceneTestCase {
|
|||
sims.add(new LMDirichletSimilarity());
|
||||
sims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
sims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
sims.add(new DFISimilarity());
|
||||
}
|
||||
|
||||
/** because of stupid things like querynorm, it's possible we computeStats on a field that doesn't exist at all
|
||||
|
|
|
@ -140,6 +140,7 @@ public class TestSimilarityBase extends LuceneTestCase {
|
|||
sims.add(new LMDirichletSimilarity());
|
||||
sims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
sims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
sims.add(new DFISimilarity());
|
||||
}
|
||||
|
||||
// ------------------------------- Unit tests --------------------------------
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search;
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -24,37 +24,6 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.search.similarities.AfterEffect;
|
||||
import org.apache.lucene.search.similarities.AfterEffectB;
|
||||
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||
import org.apache.lucene.search.similarities.BasicModel;
|
||||
import org.apache.lucene.search.similarities.BasicModelBE;
|
||||
import org.apache.lucene.search.similarities.BasicModelD;
|
||||
import org.apache.lucene.search.similarities.BasicModelG;
|
||||
import org.apache.lucene.search.similarities.BasicModelIF;
|
||||
import org.apache.lucene.search.similarities.BasicModelIn;
|
||||
import org.apache.lucene.search.similarities.BasicModelIne;
|
||||
import org.apache.lucene.search.similarities.BasicModelP;
|
||||
import org.apache.lucene.search.similarities.DFRSimilarity;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.Distribution;
|
||||
import org.apache.lucene.search.similarities.DistributionLL;
|
||||
import org.apache.lucene.search.similarities.DistributionSPL;
|
||||
import org.apache.lucene.search.similarities.IBSimilarity;
|
||||
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
|
||||
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
|
||||
import org.apache.lucene.search.similarities.Lambda;
|
||||
import org.apache.lucene.search.similarities.LambdaDF;
|
||||
import org.apache.lucene.search.similarities.LambdaTTF;
|
||||
import org.apache.lucene.search.similarities.Normalization;
|
||||
import org.apache.lucene.search.similarities.NormalizationH1;
|
||||
import org.apache.lucene.search.similarities.NormalizationH2;
|
||||
import org.apache.lucene.search.similarities.NormalizationH3;
|
||||
import org.apache.lucene.search.similarities.NormalizationZ;
|
||||
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
|
||||
/**
|
||||
* Similarity implementation that randomizes Similarity implementations
|
||||
* per-field.
|
||||
|
@ -62,7 +31,7 @@ import org.apache.lucene.search.similarities.Similarity;
|
|||
* The choices are 'sticky', so the selected algorithm is always used
|
||||
* for the same field.
|
||||
*/
|
||||
public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
|
||||
public class RandomSimilarity extends PerFieldSimilarityWrapper {
|
||||
final ClassicSimilarity defaultSim = new ClassicSimilarity();
|
||||
final List<Similarity> knownSims;
|
||||
Map<String,Similarity> previousMappings = new HashMap<>();
|
||||
|
@ -70,7 +39,7 @@ public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
|
|||
final int coordType; // 0 = no coord, 1 = coord, 2 = crazy coord
|
||||
final boolean shouldQueryNorm;
|
||||
|
||||
public RandomSimilarityProvider(Random random) {
|
||||
public RandomSimilarity(Random random) {
|
||||
perFieldSeed = random.nextInt();
|
||||
coordType = random.nextInt(3);
|
||||
shouldQueryNorm = random.nextBoolean();
|
||||
|
@ -159,6 +128,7 @@ public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
|
|||
allSims.add(new LMDirichletSimilarity()); */
|
||||
allSims.add(new LMJelinekMercerSimilarity(0.1f));
|
||||
allSims.add(new LMJelinekMercerSimilarity(0.7f));
|
||||
allSims.add(new DFISimilarity());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -171,6 +141,6 @@ public class RandomSimilarityProvider extends PerFieldSimilarityWrapper {
|
|||
} else {
|
||||
coordMethod = "crazy";
|
||||
}
|
||||
return "RandomSimilarityProvider(queryNorm=" + shouldQueryNorm + ",coord=" + coordMethod + "): " + previousMappings.toString();
|
||||
return "RandomSimilarity(queryNorm=" + shouldQueryNorm + ",coord=" + coordMethod + "): " + previousMappings.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<!-- not a package-info.java, because we already defined this package in core/ -->
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
Support for testing similarities
|
||||
<p>
|
||||
The primary classes are:
|
||||
<ul>
|
||||
<li>{@link org.apache.lucene.search.similarities.RandomSimilarity}: Randomizes similarity per-field in tests.
|
||||
</ul>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
|
@ -37,8 +37,8 @@ import org.apache.lucene.codecs.lucene60.Lucene60Codec;
|
|||
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
|
||||
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
|
||||
import org.apache.lucene.index.RandomCodec;
|
||||
import org.apache.lucene.search.RandomSimilarityProvider;
|
||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||
import org.apache.lucene.search.similarities.RandomSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.junit.internal.AssumptionViolatedException;
|
||||
|
@ -207,7 +207,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
|
|||
TimeZone randomTimeZone = randomTimeZone(random());
|
||||
timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone);
|
||||
TimeZone.setDefault(timeZone);
|
||||
similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarityProvider(random());
|
||||
similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random());
|
||||
|
||||
// Check codec restrictions once at class level.
|
||||
try {
|
||||
|
|
|
@ -1217,7 +1217,7 @@ public final class TestUtil {
|
|||
int evilness = TestUtil.nextInt(random, 0, 20);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
while (sb.length() < wordLength) {;
|
||||
while (sb.length() < wordLength) {
|
||||
if (simple) {
|
||||
sb.append(random.nextBoolean() ? TestUtil.randomSimpleString(random, wordLength) : TestUtil.randomHtmlishString(random, wordLength));
|
||||
} else {
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.solr.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.similarities.DFISimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.schema.SimilarityFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link DFISimilarity}
|
||||
* <p>
|
||||
* Optional settings:
|
||||
* <ul>
|
||||
* <li>discountOverlaps (bool): Sets {@link org.apache.lucene.search.similarities.SimilarityBase#setDiscountOverlaps(boolean)}</li>
|
||||
* </ul>
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DFISimilarityFactory extends SimilarityFactory {
|
||||
|
||||
private boolean discountOverlaps;
|
||||
|
||||
@Override
|
||||
public void init(SolrParams params) {
|
||||
super.init(params);
|
||||
discountOverlaps = params.getBool(ClassicSimilarityFactory.DISCOUNT_OVERLAPS, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Similarity getSimilarity() {
|
||||
DFISimilarity sim = new DFISimilarity();
|
||||
sim.setDiscountOverlaps(discountOverlaps);
|
||||
return sim;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Test schema file for DFISimilarityFactory -->
|
||||
|
||||
<schema name="dfi" version="1.5">
|
||||
<types>
|
||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||
|
||||
<!-- default parameters -->
|
||||
<fieldType name="text" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||
<similarity class="solr.DFISimilarityFactory"/>
|
||||
</fieldType>
|
||||
|
||||
<!-- with discountOverlaps parameter -->
|
||||
<fieldType name="text_params" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||
<similarity class="solr.DFISimilarityFactory">
|
||||
<bool name="discountOverlaps">false</bool>
|
||||
</similarity>
|
||||
</fieldType>
|
||||
|
||||
|
||||
</types>
|
||||
|
||||
<fields>
|
||||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="text_params" type="text_params" indexed="true" stored="false"/>
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<similarity class="solr.SchemaSimilarityFactory"/>
|
||||
</schema>
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
package org.apache.solr.search.similarities;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.search.similarities.DFISimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/**
|
||||
* Tests {@link DFISimilarityFactory}
|
||||
*/
|
||||
public class TestDFISimilarityFactory extends BaseSimilarityTestCase {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-basic.xml", "schema-dfi.xml");
|
||||
}
|
||||
|
||||
/**
|
||||
* dfi with no parameters
|
||||
*/
|
||||
public void test() throws Exception {
|
||||
Similarity sim = getSimilarity("text");
|
||||
assertEquals(DFISimilarity.class, sim.getClass());
|
||||
DFISimilarity dfi = (DFISimilarity) sim;
|
||||
assertTrue(dfi.getDiscountOverlaps());
|
||||
}
|
||||
|
||||
/**
|
||||
* dfi with discountOverlaps parameter set to false
|
||||
*/
|
||||
public void testParameters() throws Exception {
|
||||
Similarity sim = getSimilarity("text_params");
|
||||
assertEquals(DFISimilarity.class, sim.getClass());
|
||||
DFISimilarity dfr = (DFISimilarity) sim;
|
||||
assertFalse(dfr.getDiscountOverlaps());
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue