mirror of https://github.com/apache/lucene.git
LUCENE-496 - Command line tool for modifying the field norms of an existing index
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@409428 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f68744c32b
commit
dd3d979905
|
@ -18,6 +18,11 @@ API Changes
|
||||||
now throws an IllegalArgumentException
|
now throws an IllegalArgumentException
|
||||||
(Daniel Naber)
|
(Daniel Naber)
|
||||||
|
|
||||||
|
New features
|
||||||
|
|
||||||
|
1. LUCENE-496: Command line tool for modifying the field norms of an
|
||||||
|
existing index; added to contrib/miscellaneous. (Chris Hostetter)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
1. LUCENE-330: Fix issue of FilteredQuery not working properly within
|
1. LUCENE-330: Fix issue of FilteredQuery not working properly within
|
||||||
|
|
|
@ -0,0 +1,156 @@
|
||||||
|
package org.apache.lucene.misc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2006 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermEnum;
|
||||||
|
import org.apache.lucene.index.TermDocs;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Date;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a directory, a Similarity, and a list of fields, updates the
|
||||||
|
* fieldNorms in place for every document using the Similarity.lengthNorm.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* NOTE: This only works if you do <b>not</b> use field/document boosts in your
|
||||||
|
* index.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @version $Id:$
|
||||||
|
*/
|
||||||
|
public class LengthNormModifier {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Command Line Execution method
|
||||||
|
*
|
||||||
|
* <pre>
|
||||||
|
* Usage: LengthNormModifier /path/index package.SimilarityClassName field1 field2 ...
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
if (args.length < 3) {
|
||||||
|
System.err.println("Usage: LengthNormModifier <index> <package.SimilarityClassName> <field1> [field2] ...");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
Similarity s = null;
|
||||||
|
try {
|
||||||
|
Class simClass = Class.forName(args[1]);
|
||||||
|
s = (Similarity)simClass.newInstance();
|
||||||
|
} catch (Exception e) {
|
||||||
|
System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
|
||||||
|
e.printStackTrace(System.err);
|
||||||
|
}
|
||||||
|
|
||||||
|
File index = new File(args[0]);
|
||||||
|
Directory d = FSDirectory.getDirectory(index,false);
|
||||||
|
|
||||||
|
LengthNormModifier lnm = new LengthNormModifier(d, s);
|
||||||
|
|
||||||
|
for (int i = 2; i < args.length; i++) {
|
||||||
|
System.out.print("Updating field: " + args[i] + " " +
|
||||||
|
(new Date()).toString() + " ... ");
|
||||||
|
lnm.reSetNorms(args[i]);
|
||||||
|
System.out.println(new Date().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
d.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Directory dir;
|
||||||
|
private Similarity sim;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructor for code that wishes to use this class progromaticaly
|
||||||
|
*
|
||||||
|
* @param d The Directory to modify
|
||||||
|
* @param s The Similarity to use in <code>reSetNorms</code>
|
||||||
|
*/
|
||||||
|
public LengthNormModifier(Directory d, Similarity s) {
|
||||||
|
dir = d;
|
||||||
|
sim = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Re-Set the norms for the specified field.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Opens a new IndexReader on the Directory given to this instance,
|
||||||
|
* modifies the norms using the Similarity given to this instance,
|
||||||
|
* and closes the IndexReader.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param field the field whose norms
|
||||||
|
*/
|
||||||
|
public void reSetNorms(String field) throws IOException {
|
||||||
|
String fieldName = field.intern();
|
||||||
|
int[] termCounts = new int[0];
|
||||||
|
|
||||||
|
IndexReader reader = null;
|
||||||
|
TermEnum termEnum = null;
|
||||||
|
TermDocs termDocs = null;
|
||||||
|
try {
|
||||||
|
reader = IndexReader.open(dir);
|
||||||
|
termCounts = new int[reader.maxDoc()];
|
||||||
|
try {
|
||||||
|
termEnum = reader.terms(new Term(field,""));
|
||||||
|
try {
|
||||||
|
termDocs = reader.termDocs();
|
||||||
|
do {
|
||||||
|
Term term = termEnum.term();
|
||||||
|
if (term != null && term.field().equals(fieldName)) {
|
||||||
|
termDocs.seek(termEnum.term());
|
||||||
|
while (termDocs.next()) {
|
||||||
|
termCounts[termDocs.doc()] += termDocs.freq();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (termEnum.next());
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
if (null != termDocs) termDocs.close();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
if (null != termEnum) termEnum.close();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
if (null != reader) reader.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
reader = IndexReader.open(dir);
|
||||||
|
for (int d = 0; d < termCounts.length; d++) {
|
||||||
|
if (! reader.isDeleted(d)) {
|
||||||
|
byte norm = sim.encodeNorm
|
||||||
|
(sim.lengthNorm(fieldName, termCounts[d]));
|
||||||
|
reader.setNorm(d, fieldName, norm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
if (null != reader) reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,179 @@
|
||||||
|
package org.apache.lucene.misc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2006 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.search.DefaultSimilarity;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.HitCollector;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests changing the norms after changing the simularity
|
||||||
|
*
|
||||||
|
* @version $Id:$
|
||||||
|
*/
|
||||||
|
public class TestLengthNormModifier extends TestCase {
|
||||||
|
public TestLengthNormModifier(String name) {
|
||||||
|
super(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f);
|
||||||
|
|
||||||
|
public static int NUM_DOCS = 5;
|
||||||
|
|
||||||
|
public Directory store = new RAMDirectory();
|
||||||
|
|
||||||
|
/** inverts the normal notion of lengthNorm */
|
||||||
|
public static Similarity s = new DefaultSimilarity() {
|
||||||
|
public float lengthNorm(String fieldName, int numTokens) {
|
||||||
|
return (float)numTokens;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
IndexWriter writer = new
|
||||||
|
IndexWriter(store, new SimpleAnalyzer(), true);
|
||||||
|
|
||||||
|
IndexSearcher searcher;
|
||||||
|
|
||||||
|
for (int i = 0; i < NUM_DOCS; i++) {
|
||||||
|
Document d = new Document();
|
||||||
|
d.add(new Field("field", "word",
|
||||||
|
Field.Store.YES, Field.Index.TOKENIZED));
|
||||||
|
d.add(new Field("nonorm", "word",
|
||||||
|
Field.Store.YES, Field.Index.NO_NORMS));
|
||||||
|
|
||||||
|
for (int j = 1; j <= i; j++) {
|
||||||
|
d.add(new Field("field", "crap",
|
||||||
|
Field.Store.YES, Field.Index.TOKENIZED));
|
||||||
|
d.add(new Field("nonorm", "more words",
|
||||||
|
Field.Store.YES, Field.Index.NO_NORMS));
|
||||||
|
}
|
||||||
|
writer.addDocument(d);
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMissingField() {
|
||||||
|
LengthNormModifier lnm = new LengthNormModifier(store, s);
|
||||||
|
try {
|
||||||
|
lnm.reSetNorms("nobodyherebutuschickens");
|
||||||
|
} catch (Exception e) {
|
||||||
|
assertNull("caught something", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testFieldWithNoNorm() throws Exception {
|
||||||
|
|
||||||
|
IndexReader r = IndexReader.open(store);
|
||||||
|
byte[] norms = r.norms("nonorm");
|
||||||
|
|
||||||
|
// sanity check, norms should all be 1
|
||||||
|
assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
|
||||||
|
for (int i = 0; i< norms.length; i++) {
|
||||||
|
assertEquals(""+i, DEFAULT_NORM, norms[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
r.close();
|
||||||
|
|
||||||
|
LengthNormModifier lnm = new LengthNormModifier(store, s);
|
||||||
|
try {
|
||||||
|
lnm.reSetNorms("nonorm");
|
||||||
|
} catch (Exception e) {
|
||||||
|
assertNull("caught something", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
// nothing should have changed
|
||||||
|
r = IndexReader.open(store);
|
||||||
|
|
||||||
|
norms = r.norms("nonorm");
|
||||||
|
assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
|
||||||
|
for (int i = 0; i< norms.length; i++) {
|
||||||
|
assertEquals(""+i, DEFAULT_NORM, norms[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
r.close();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testGoodCases() throws Exception {
|
||||||
|
|
||||||
|
IndexSearcher searcher;
|
||||||
|
final float[] scores = new float[NUM_DOCS];
|
||||||
|
float lastScore = 0.0f;
|
||||||
|
|
||||||
|
// default similarity should put docs with shorter length first
|
||||||
|
searcher = new IndexSearcher(store);
|
||||||
|
searcher.search
|
||||||
|
(new TermQuery(new Term("field", "word")),
|
||||||
|
new HitCollector() {
|
||||||
|
public final void collect(int doc, float score) {
|
||||||
|
scores[doc] = score;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
searcher.close();
|
||||||
|
|
||||||
|
lastScore = Float.MAX_VALUE;
|
||||||
|
for (int i = 0; i < NUM_DOCS; i++) {
|
||||||
|
String msg = "i=" + i + ", "+scores[i]+" <= "+lastScore;
|
||||||
|
assertTrue(msg, scores[i] <= lastScore);
|
||||||
|
//System.out.println(msg);
|
||||||
|
lastScore = scores[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// override the norms to be inverted
|
||||||
|
Similarity s = new DefaultSimilarity() {
|
||||||
|
public float lengthNorm(String fieldName, int numTokens) {
|
||||||
|
return (float)numTokens;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
LengthNormModifier lnm = new LengthNormModifier(store, s);
|
||||||
|
lnm.reSetNorms("field");
|
||||||
|
|
||||||
|
// new norm (with default similarity) should put longer docs first
|
||||||
|
searcher = new IndexSearcher(store);
|
||||||
|
searcher.search
|
||||||
|
(new TermQuery(new Term("field", "word")),
|
||||||
|
new HitCollector() {
|
||||||
|
public final void collect(int doc, float score) {
|
||||||
|
scores[doc] = score;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
searcher.close();
|
||||||
|
|
||||||
|
lastScore = 0.0f;
|
||||||
|
for (int i = 0; i < NUM_DOCS; i++) {
|
||||||
|
String msg = "i=" + i + ", "+scores[i]+" >= "+lastScore;
|
||||||
|
assertTrue(msg, scores[i] >= lastScore);
|
||||||
|
//System.out.println(msg);
|
||||||
|
lastScore = scores[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue