LUCENE-496 - Command line tool for modifying the field norms of an existing index

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@409428 13f79535-47bb-0310-9956-ffa450edef68
Chris M. Hostetter 2006-05-25 18:49:04 +00:00
parent f68744c32b
commit dd3d979905
3 changed files with 340 additions and 0 deletions

CHANGES.txt

@@ -18,6 +18,11 @@ API Changes
    now throws an IllegalArgumentException
    (Daniel Naber)

New features

 1. LUCENE-496: Command line tool for modifying the field norms of an
    existing index; added to contrib/miscellaneous. (Chris Hostetter)

Bug fixes

 1. LUCENE-330: Fix issue of FilteredQuery not working properly within

LengthNormModifier.java

@@ -0,0 +1,156 @@
package org.apache.lucene.misc;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.IOException;
import java.util.Date;
/**
 * Given a directory, a Similarity, and a list of fields, recomputes the
 * field norms in place for every document using Similarity.lengthNorm.
*
* <p>
* NOTE: This only works if you do <b>not</b> use field/document boosts in your
* index.
* </p>
*
* @version $Id:$
*/
public class LengthNormModifier {
/**
* Command Line Execution method
*
* <pre>
* Usage: LengthNormModifier /path/index package.SimilarityClassName field1 field2 ...
* </pre>
 */
public static void main(String[] args) throws IOException {
if (args.length < 3) {
System.err.println("Usage: LengthNormModifier <index> <package.SimilarityClassName> <field1> [field2] ...");
System.exit(1);
}
Similarity s = null;
try {
Class simClass = Class.forName(args[1]);
s = (Similarity)simClass.newInstance();
} catch (Exception e) {
System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
      e.printStackTrace(System.err);
      System.exit(1);
    }
File index = new File(args[0]);
Directory d = FSDirectory.getDirectory(index,false);
LengthNormModifier lnm = new LengthNormModifier(d, s);
for (int i = 2; i < args.length; i++) {
System.out.print("Updating field: " + args[i] + " " +
(new Date()).toString() + " ... ");
lnm.reSetNorms(args[i]);
System.out.println(new Date().toString());
}
d.close();
}
private Directory dir;
private Similarity sim;
/**
 * Constructor for code that wishes to use this class programmatically
*
* @param d The Directory to modify
* @param s The Similarity to use in <code>reSetNorms</code>
*/
public LengthNormModifier(Directory d, Similarity s) {
dir = d;
sim = s;
}
/**
* Re-Set the norms for the specified field.
*
* <p>
* Opens a new IndexReader on the Directory given to this instance,
* modifies the norms using the Similarity given to this instance,
* and closes the IndexReader.
* </p>
*
 * @param field the field whose norms should be recomputed
*/
public void reSetNorms(String field) throws IOException {
String fieldName = field.intern();
int[] termCounts = new int[0];
IndexReader reader = null;
TermEnum termEnum = null;
TermDocs termDocs = null;
try {
reader = IndexReader.open(dir);
termCounts = new int[reader.maxDoc()];
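      // First pass: enumerate every term in this field and accumulate
      // termDocs.freq() per document, so termCounts[doc] ends up holding
      // the number of tokens that document has in the field.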
try {
termEnum = reader.terms(new Term(field,""));
try {
termDocs = reader.termDocs();
do {
Term term = termEnum.term();
if (term != null && term.field().equals(fieldName)) {
termDocs.seek(termEnum.term());
while (termDocs.next()) {
termCounts[termDocs.doc()] += termDocs.freq();
}
}
} while (termEnum.next());
} finally {
if (null != termDocs) termDocs.close();
}
} finally {
if (null != termEnum) termEnum.close();
}
} finally {
if (null != reader) reader.close();
}
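    // Second pass: re-open the reader and overwrite the norm of every
    // non-deleted document with a value recomputed from its token count
    // via the configured Similarity's lengthNorm/encodeNorm.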
try {
reader = IndexReader.open(dir);
for (int d = 0; d < termCounts.length; d++) {
if (! reader.isDeleted(d)) {
byte norm = sim.encodeNorm
(sim.lengthNorm(fieldName, termCounts[d]));
reader.setNorm(d, fieldName, norm);
}
}
} finally {
if (null != reader) reader.close();
}
}
}
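
For reference, a minimal sketch of driving the new class programmatically rather than through main(), assuming the imports shown in the file above plus org.apache.lucene.search.DefaultSimilarity; the index path and the "body" field name are made up for illustration, and any Similarity with a no-arg constructor could stand in for DefaultSimilarity:

  Directory dir = FSDirectory.getDirectory(new File("/path/to/index"), false);
  Similarity sim = new DefaultSimilarity();
  LengthNormModifier modifier = new LengthNormModifier(dir, sim);
  modifier.reSetNorms("body");   // recompute and store new norms for the "body" field in place
  dir.close();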

TestLengthNormModifier.java

@@ -0,0 +1,179 @@
package org.apache.lucene.misc;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
 * Tests changing the norms after changing the similarity
*
* @version $Id:$
*/
public class TestLengthNormModifier extends TestCase {
public TestLengthNormModifier(String name) {
super(name);
}
public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f);
public static int NUM_DOCS = 5;
public Directory store = new RAMDirectory();
/** inverts the normal notion of lengthNorm */
public static Similarity s = new DefaultSimilarity() {
public float lengthNorm(String fieldName, int numTokens) {
return (float)numTokens;
}
};
public void setUp() throws Exception {
IndexWriter writer = new
IndexWriter(store, new SimpleAnalyzer(), true);
IndexSearcher searcher;
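    // Doc i gets one "word" token plus i "crap" tokens in "field", so the
    // token count in "field" grows with the document id.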
for (int i = 0; i < NUM_DOCS; i++) {
Document d = new Document();
d.add(new Field("field", "word",
Field.Store.YES, Field.Index.TOKENIZED));
d.add(new Field("nonorm", "word",
Field.Store.YES, Field.Index.NO_NORMS));
for (int j = 1; j <= i; j++) {
d.add(new Field("field", "crap",
Field.Store.YES, Field.Index.TOKENIZED));
d.add(new Field("nonorm", "more words",
Field.Store.YES, Field.Index.NO_NORMS));
}
writer.addDocument(d);
}
writer.close();
}
public void testMissingField() {
LengthNormModifier lnm = new LengthNormModifier(store, s);
try {
lnm.reSetNorms("nobodyherebutuschickens");
} catch (Exception e) {
assertNull("caught something", e);
}
}
public void testFieldWithNoNorm() throws Exception {
IndexReader r = IndexReader.open(store);
byte[] norms = r.norms("nonorm");
// sanity check, norms should all be 1
assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
for (int i = 0; i< norms.length; i++) {
assertEquals(""+i, DEFAULT_NORM, norms[i]);
}
r.close();
LengthNormModifier lnm = new LengthNormModifier(store, s);
try {
lnm.reSetNorms("nonorm");
} catch (Exception e) {
assertNull("caught something", e);
}
// nothing should have changed
r = IndexReader.open(store);
norms = r.norms("nonorm");
assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
for (int i = 0; i< norms.length; i++) {
assertEquals(""+i, DEFAULT_NORM, norms[i]);
}
r.close();
}
public void testGoodCases() throws Exception {
IndexSearcher searcher;
final float[] scores = new float[NUM_DOCS];
float lastScore = 0.0f;
// default similarity should put docs with shorter length first
searcher = new IndexSearcher(store);
searcher.search
(new TermQuery(new Term("field", "word")),
new HitCollector() {
public final void collect(int doc, float score) {
scores[doc] = score;
}
});
searcher.close();
lastScore = Float.MAX_VALUE;
for (int i = 0; i < NUM_DOCS; i++) {
String msg = "i=" + i + ", "+scores[i]+" <= "+lastScore;
assertTrue(msg, scores[i] <= lastScore);
//System.out.println(msg);
lastScore = scores[i];
}
// override the norms to be inverted
Similarity s = new DefaultSimilarity() {
public float lengthNorm(String fieldName, int numTokens) {
return (float)numTokens;
}
};
LengthNormModifier lnm = new LengthNormModifier(store, s);
lnm.reSetNorms("field");
    // with the inverted norms now stored, searching with the default
    // similarity should put longer docs first
searcher = new IndexSearcher(store);
searcher.search
(new TermQuery(new Term("field", "word")),
new HitCollector() {
public final void collect(int doc, float score) {
scores[doc] = score;
}
});
searcher.close();
lastScore = 0.0f;
for (int i = 0; i < NUM_DOCS; i++) {
String msg = "i=" + i + ", "+scores[i]+" >= "+lastScore;
assertTrue(msg, scores[i] >= lastScore);
//System.out.println(msg);
lastScore = scores[i];
}
}
}