LUCENE-496 - Command line tool for modifying the field norms of an existing index

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@409428 13f79535-47bb-0310-9956-ffa450edef68
2006-05-25 18:49:04 +00:00 · 2006-05-25 18:49:04 +00:00 · dd3d979905
parent f68744c32b
commit dd3d979905
3 changed files with 340 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -18,6 +18,11 @@ API Changes
    now throws an IllegalArgumentException
    (Daniel Naber)

+New features
+
+ 1. LUCENE-496: Command line tool for modifying the field norms of an
+    existing index; added to contrib/miscellaneous.  (Chris Hostetter)
+
 Bug fixes

 1. LUCENE-330: Fix issue of FilteredQuery not working properly within
--- a/contrib/miscellaneous/src/java/org/apache/lucene/misc/LengthNormModifier.java
+++ b/contrib/miscellaneous/src/java/org/apache/lucene/misc/LengthNormModifier.java
@ -0,0 +1,156 @@
+package org.apache.lucene.misc;
+
+/**
+  * Copyright 2006 The Apache Software Foundation
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+
+/**
+ * Given a directory, a Similarity, and a list of fields, updates the
+ * fieldNorms in place for every document using the Similarity.lengthNorm.
+ *
+ * <p>
+ * NOTE: This only works if you do <b>not</b> use field/document boosts in your
+ * index.
+ * </p>
+ *
+ * @version $Id:$
+ */
+public class LengthNormModifier {
+
+    /**
+     * Command Line Execution method
+     *
+     * <pre>
+     * Usage: LengthNormModifier /path/index package.SimilarityClassName field1 field2 ...
+     * </pre>
+     * 
+    public static void main(String[] args) throws IOException {
+	if (args.length < 3) {
+	    System.err.println("Usage: LengthNormModifier <index> <package.SimilarityClassName> <field1> [field2] ...");
+	    System.exit(1);
+	}
+
+	Similarity s = null;
+	try {
+	    Class simClass = Class.forName(args[1]);
+	    s = (Similarity)simClass.newInstance();
+	} catch (Exception e) {
+	    System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
+	    e.printStackTrace(System.err);
+	}
+	
+	File index = new File(args[0]);
+	Directory d = FSDirectory.getDirectory(index,false);
+
+	LengthNormModifier lnm = new LengthNormModifier(d, s);
+	
+	for (int i = 2; i < args.length; i++) {
+	    System.out.print("Updating field: " + args[i] + " " +
+			     (new Date()).toString() + " ... ");
+	    lnm.reSetNorms(args[i]);
+	    System.out.println(new Date().toString());
+	}
+	
+	d.close();
+    }
+
+
+    private Directory dir;
+    private Similarity sim;
+    
+    /**
+     * Constructor for code that wishes to use this class progromaticaly
+     *
+     * @param d The Directory to modify
+     * @param s The Similarity to use in <code>reSetNorms</code>
+     */
+    public LengthNormModifier(Directory d, Similarity s) {
+	dir = d;
+	sim = s;
+    }
+
+    /**
+     * Re-Set the norms for the specified field.
+     *
+     * <p>
+     * Opens a new IndexReader on the Directory given to this instance,
+     * modifies the norms using the Similarity given to this instance,
+     * and closes the IndexReader.
+     * </p>
+     *
+     * @param field the field whose norms
+     */
+    public void reSetNorms(String field) throws IOException {
+	String fieldName = field.intern();
+	int[] termCounts = new int[0];
+
+	IndexReader reader = null;
+	TermEnum termEnum = null;
+	TermDocs termDocs = null;
+	try {
+	    reader = IndexReader.open(dir);
+	    termCounts = new int[reader.maxDoc()];
+	    try {
+		termEnum = reader.terms(new Term(field,""));
+		try {
+		    termDocs = reader.termDocs();
+		    do {
+			Term term = termEnum.term();
+			if (term != null && term.field().equals(fieldName)) {
+			    termDocs.seek(termEnum.term());
+			    while (termDocs.next()) {
+				termCounts[termDocs.doc()] += termDocs.freq();
+			    }
+			}
+		    } while (termEnum.next());
+	    
+		} finally {
+		    if (null != termDocs) termDocs.close();
+		}
+	    } finally {
+		if (null != termEnum) termEnum.close();
+	    }
+	} finally {
+	    if (null != reader) reader.close();
+	}
+
+	try {
+	    reader = IndexReader.open(dir); 
+	    for (int d = 0; d < termCounts.length; d++) {
+		if (! reader.isDeleted(d)) {
+		    byte norm = sim.encodeNorm
+			(sim.lengthNorm(fieldName, termCounts[d]));
+		    reader.setNorm(d, fieldName, norm);
+		}
+	    }
+	    
+	} finally {
+	    if (null != reader) reader.close();
+	}
+    }
+    
+}
--- a/contrib/miscellaneous/src/test/org/apache/lucene/misc/TestLengthNormModifier.java
+++ b/contrib/miscellaneous/src/test/org/apache/lucene/misc/TestLengthNormModifier.java
@ -0,0 +1,179 @@
+package org.apache.lucene.misc;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.HitCollector;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Tests changing the norms after changing the simularity
+ *
+ * @version $Id:$
+ */
+public class TestLengthNormModifier extends TestCase {
+    public TestLengthNormModifier(String name) {
+	super(name);
+    }
+
+    public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f);
+    
+    public static int NUM_DOCS = 5;
+
+    public Directory store = new RAMDirectory();
+
+    /** inverts the normal notion of lengthNorm */
+    public static Similarity s = new DefaultSimilarity() {
+	    public float lengthNorm(String fieldName, int numTokens) {
+		return (float)numTokens;
+	    }
+	};
+    
+    public void setUp() throws Exception {
+	IndexWriter writer = new
+	    IndexWriter(store, new SimpleAnalyzer(), true);
+	
+	IndexSearcher searcher;
+	
+	for (int i = 0; i < NUM_DOCS; i++) {
+	    Document d = new Document();
+	    d.add(new Field("field", "word",
+			    Field.Store.YES, Field.Index.TOKENIZED));
+	    d.add(new Field("nonorm", "word",
+			    Field.Store.YES, Field.Index.NO_NORMS));
+		
+	    for (int j = 1; j <= i; j++) {
+		d.add(new Field("field", "crap",
+				Field.Store.YES, Field.Index.TOKENIZED));
+		d.add(new Field("nonorm", "more words",
+				Field.Store.YES, Field.Index.NO_NORMS));
+	    }
+	    writer.addDocument(d);
+	}
+	writer.close();
+    }
+    
+    public void testMissingField() {
+	LengthNormModifier lnm = new LengthNormModifier(store, s);
+	try {
+	    lnm.reSetNorms("nobodyherebutuschickens");
+	} catch (Exception e) {
+	    assertNull("caught something", e);
+	}
+    }
+	
+    public void testFieldWithNoNorm() throws Exception {
+
+	IndexReader r = IndexReader.open(store);
+	byte[] norms = r.norms("nonorm");
+
+	// sanity check, norms should all be 1
+	assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
+	for (int i = 0; i< norms.length; i++) {
+	    assertEquals(""+i, DEFAULT_NORM, norms[i]);
+	}
+
+	r.close();
+	
+	LengthNormModifier lnm = new LengthNormModifier(store, s);
+	try {
+	    lnm.reSetNorms("nonorm");
+	} catch (Exception e) {
+	    assertNull("caught something", e);
+	}
+
+	// nothing should have changed
+	r = IndexReader.open(store);
+	
+	norms = r.norms("nonorm");
+	assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
+	for (int i = 0; i< norms.length; i++) {
+	    assertEquals(""+i, DEFAULT_NORM, norms[i]);
+	}
+
+	r.close();
+	
+    }
+	
+    
+    public void testGoodCases() throws Exception {
+	
+	IndexSearcher searcher;
+	final float[] scores = new float[NUM_DOCS];
+	float lastScore = 0.0f;
+	
+	// default similarity should put docs with shorter length first
+	searcher = new IndexSearcher(store);
+	searcher.search
+	    (new TermQuery(new Term("field", "word")),
+	     new HitCollector() {
+		 public final void collect(int doc, float score) {
+		     scores[doc] = score;
+		 }
+	     });
+	searcher.close();
+	
+	lastScore = Float.MAX_VALUE;
+	for (int i = 0; i < NUM_DOCS; i++) {
+	    String msg = "i=" + i + ", "+scores[i]+" <= "+lastScore;
+	    assertTrue(msg, scores[i] <= lastScore);
+	    //System.out.println(msg);
+	    lastScore = scores[i];
+	}
+
+	// override the norms to be inverted
+	Similarity s = new DefaultSimilarity() {
+		public float lengthNorm(String fieldName, int numTokens) {
+		    return (float)numTokens;
+		}
+	    };
+	LengthNormModifier lnm = new LengthNormModifier(store, s);
+	lnm.reSetNorms("field");
+
+	// new norm (with default similarity) should put longer docs first
+	searcher = new IndexSearcher(store);
+	searcher.search
+	    (new TermQuery(new Term("field", "word")),
+	     new HitCollector() {
+		 public final void collect(int doc, float score) {
+		     scores[doc] = score;
+		 }
+	     });
+	searcher.close();
+	
+	lastScore = 0.0f;
+	for (int i = 0; i < NUM_DOCS; i++) {
+	    String msg = "i=" + i + ", "+scores[i]+" >= "+lastScore;
+	    assertTrue(msg, scores[i] >= lastScore);
+	    //System.out.println(msg);
+	    lastScore = scores[i];
+	}
+	
+    }
+}