- LUCENE-741: command-line utility for modifying or removing field norms

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@489213 13f79535-47bb-0310-9956-ffa450edef68
2006-12-20 22:32:13 +00:00 · 2006-12-20 22:32:13 +00:00 · 17315a9be7
parent 42592aeb49
commit 17315a9be7
3 changed files with 364 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -86,6 +86,11 @@ New features
 10. LUCENE-723: QueryParser now parses *:* as MatchAllDocsQuery
    (Yonik Seeley)

+11. LUCENE-741: Command-line utility for modifying or removing norms
+    on fields in an existing index.  This is mostly based on LUCENE-496
+    and lives in contrib/miscellaneous.
+    (Chris Hostetter, Otis Gospodnetic)
+
 API Changes

 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
--- a/contrib/miscellaneous/src/java/org/apache/lucene/index/FieldNormModifier.java
+++ b/contrib/miscellaneous/src/java/org/apache/lucene/index/FieldNormModifier.java
@ -0,0 +1,158 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * Given a directory and a list of fields, updates the fieldNorms in place for every document.
+ * If Similarity class is specified, uses its lengthNorm method to set norms.
+ * If -n command line argument is used, removed field norms, as if {@link Field.Index.NO_NORMS} was used.
+ *
+ * <p>
+ * NOTE: This will overwrite any length normalization or field/document boosts.
+ * </p>
+ *
+ * @author Chris Hostetter
+ * @author Otis Gospodnetic
+ */
+public class FieldNormModifier {
+  
+  /**
+   * Command Line Execution method
+   *
+   * <pre>
+   * Usage: FieldNormModifier /path/index <package.SimilarityClassName | -n> field1 field2 ...
+   * </pre>
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 3) {
+      System.err.println("Usage: FieldNormModifier <index> <package.SimilarityClassName | -n> <field1> [field2] ...");
+      System.exit(1);
+    }
+
+    Similarity s = null;
+    if (!args[1].equals("-n")) {
+      try {
+        Class simClass = Class.forName(args[1]);
+        s = (Similarity)simClass.newInstance();
+      } catch (Exception e) {
+        System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
+        e.printStackTrace(System.err);
+        System.exit(1);
+      }
+    }
+
+    Directory d = FSDirectory.getDirectory(args[0], false);
+    FieldNormModifier fnm = new FieldNormModifier(d, s);
+
+    for (int i = 2; i < args.length; i++) {
+      System.out.print("Updating field: " + args[i] + " " + (new Date()).toString() + " ... ");
+      fnm.reSetNorms(args[i]);
+      System.out.println(new Date().toString());
+    }
+    
+    d.close();
+  }
+  
+  
+  private Directory dir;
+  private Similarity sim;
+  
+  /**
+   * Constructor for code that wishes to use this class programatically
+   * If Similarity is null, kill the field norms.
+   *
+   * @param d the Directory to modify
+   * @param s the Similiary to use (can be null)
+   */
+  public FieldNormModifier(Directory d, Similarity s) {
+    dir = d;
+    sim = s;
+  }
+
+  /**
+   * Resets the norms for the specified field.
+   *
+   * <p>
+   * Opens a new IndexReader on the Directory given to this instance,
+   * modifies the norms (either using the Similarity given to this instance, or by using fake norms,
+   * and closes the IndexReader.
+   * </p>
+   *
+   * @param field the field whose norms should be reset
+   */
+  public void reSetNorms(String field) throws IOException {
+    String fieldName = field.intern();
+    int[] termCounts = new int[0];
+    byte[] fakeNorms = new byte[0];
+    
+    IndexReader reader = null;
+    TermEnum termEnum = null;
+    TermDocs termDocs = null;
+    try {
+      reader = IndexReader.open(dir);
+      termCounts = new int[reader.maxDoc()];
+      // if we are killing norms, get fake ones
+      if (sim == null)
+        fakeNorms = SegmentReader.createFakeNorms(reader.maxDoc());
+      try {
+        termEnum = reader.terms(new Term(field,""));
+        try {
+          termDocs = reader.termDocs();
+          do {
+            Term term = termEnum.term();
+            if (term != null && term.field().equals(fieldName)) {
+              termDocs.seek(termEnum.term());
+              while (termDocs.next()) {
+                termCounts[termDocs.doc()] += termDocs.freq();
+              }
+            }
+          } while (termEnum.next());
+          
+        } finally {
+          if (null != termDocs) termDocs.close();
+        }
+      } finally {
+        if (null != termEnum) termEnum.close();
+      }
+    } finally {
+      if (null != reader) reader.close();
+    }
+    
+    try {
+      reader = IndexReader.open(dir); 
+      for (int d = 0; d < termCounts.length; d++) {
+        if (! reader.isDeleted(d)) {
+          if (sim == null)
+            reader.setNorm(d, fieldName, fakeNorms[0]);
+          else
+            reader.setNorm(d, fieldName, sim.encodeNorm(sim.lengthNorm(fieldName, termCounts[d])));
+        }
+      }
+      
+    } finally {
+      if (null != reader) reader.close();
+    }
+  }
+  
+}
--- a/contrib/miscellaneous/src/test/org/apache/lucene/index/TestFieldNormModifier.java
+++ b/contrib/miscellaneous/src/test/org/apache/lucene/index/TestFieldNormModifier.java
@ -0,0 +1,201 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.HitCollector;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Tests changing of field norms with a custom similarity and with fake norms.
+ *
+ * @version $Id$
+ */
+public class TestFieldNormModifier extends TestCase {
+  public TestFieldNormModifier(String name) {
+    super(name);
+  }
+  
+  public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f);
+  
+  public static int NUM_DOCS = 5;
+  
+  public Directory store = new RAMDirectory();
+  
+  /** inverts the normal notion of lengthNorm */
+  public static Similarity s = new DefaultSimilarity() {
+    public float lengthNorm(String fieldName, int numTokens) {
+      return (float)numTokens;
+    }
+  };
+  
+  public void setUp() throws Exception {
+    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
+    
+    for (int i = 0; i < NUM_DOCS; i++) {
+      Document d = new Document();
+      d.add(new Field("field", "word", Field.Store.YES, Field.Index.TOKENIZED));
+      d.add(new Field("nonorm", "word", Field.Store.YES, Field.Index.NO_NORMS));
+      d.add(new Field("untokfield", "20061212 20071212", Field.Store.YES, Field.Index.TOKENIZED));
+      
+      for (int j = 1; j <= i; j++) {
+        d.add(new Field("field", "crap", Field.Store.YES, Field.Index.TOKENIZED));
+        d.add(new Field("nonorm", "more words", Field.Store.YES, Field.Index.NO_NORMS));
+      }
+      writer.addDocument(d);
+    }
+    writer.close();
+  }
+  
+  public void testMissingField() {
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    try {
+      fnm.reSetNorms("nobodyherebutuschickens");
+    } catch (Exception e) {
+      assertNull("caught something", e);
+    }
+  }
+  
+  public void testFieldWithNoNorm() throws Exception {
+    
+    IndexReader r = IndexReader.open(store);
+    byte[] norms = r.norms("nonorm");
+    
+    // sanity check, norms should all be 1
+    assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
+    for (int i = 0; i< norms.length; i++) {
+      assertEquals(""+i, DEFAULT_NORM, norms[i]);
+    }
+    
+    r.close();
+    
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    try {
+      fnm.reSetNorms("nonorm");
+    } catch (Exception e) {
+      assertNull("caught something", e);
+    }
+    
+    // nothing should have changed
+    r = IndexReader.open(store);
+    
+    norms = r.norms("nonorm");
+    assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
+    for (int i = 0; i< norms.length; i++) {
+      assertEquals(""+i, DEFAULT_NORM, norms[i]);
+    }
+
+    r.close();
+  }
+  
+  
+  public void testGoodCases() throws Exception {
+    
+    IndexSearcher searcher = new IndexSearcher(store);
+    final float[] scores = new float[NUM_DOCS];
+    float lastScore = 0.0f;
+    
+    // default similarity should put docs with shorter length first
+    searcher.search(new TermQuery(new Term("field", "word")), new HitCollector() {
+      public final void collect(int doc, float score) {
+        scores[doc] = score;
+      }
+    });
+    searcher.close();
+    
+    lastScore = Float.MAX_VALUE;
+    for (int i = 0; i < NUM_DOCS; i++) {
+      String msg = "i=" + i + ", " + scores[i] + " <= " + lastScore;
+      assertTrue(msg, scores[i] <= lastScore);
+      //System.out.println(msg);
+      lastScore = scores[i];
+    }
+
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    fnm.reSetNorms("field");
+    
+    // new norm (with default similarity) should put longer docs first
+    searcher = new IndexSearcher(store);
+    searcher.search(new TermQuery(new Term("field", "word")),  new HitCollector() {
+      public final void collect(int doc, float score) {
+        scores[doc] = score;
+      }
+    });
+    searcher.close();
+    
+    lastScore = 0.0f;
+    for (int i = 0; i < NUM_DOCS; i++) {
+      String msg = "i=" + i + ", " + scores[i] + " >= " + lastScore;
+      assertTrue(msg, scores[i] >= lastScore);
+      //System.out.println(msg);
+      lastScore = scores[i];
+    }
+  }
+
+  public void testNormKiller() throws IOException {
+
+    IndexReader r = IndexReader.open(store);
+    byte[] oldNorms = r.norms("untokfield");    
+    r.close();
+    
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    fnm.reSetNorms("untokfield");
+
+    r = IndexReader.open(store);
+    byte[] newNorms = r.norms("untokfield");
+    r.close();
+    assertFalse(Arrays.equals(oldNorms, newNorms));    
+
+    
+    // verify that we still get documents in the same order as originally
+    IndexSearcher searcher = new IndexSearcher(store);
+    final float[] scores = new float[NUM_DOCS];
+    float lastScore = 0.0f;
+    
+    // default similarity should return the same score for all documents for this query
+    searcher.search(new TermQuery(new Term("untokfield", "20061212")), new HitCollector() {
+      public final void collect(int doc, float score) {
+        scores[doc] = score;
+      }
+    });
+    searcher.close();
+    
+    lastScore = scores[0];
+    for (int i = 0; i < NUM_DOCS; i++) {
+      String msg = "i=" + i + ", " + scores[i] + " == " + lastScore;
+      assertTrue(msg, scores[i] == lastScore);
+      //System.out.println(msg);
+      lastScore = scores[i];
+    }
+  }
+}