From dd3d97990547ab18cb8cef9548cc8697138baf4d Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Thu, 25 May 2006 18:49:04 +0000 Subject: [PATCH] LUCENE-496 - Command line tool for modifying the field norms of an existing index git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@409428 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 + .../lucene/misc/LengthNormModifier.java | 156 +++++++++++++++ .../lucene/misc/TestLengthNormModifier.java | 179 ++++++++++++++++++ 3 files changed, 340 insertions(+) create mode 100644 contrib/miscellaneous/src/java/org/apache/lucene/misc/LengthNormModifier.java create mode 100644 contrib/miscellaneous/src/test/org/apache/lucene/misc/TestLengthNormModifier.java diff --git a/CHANGES.txt b/CHANGES.txt index d9ca00ce5ff..814a6168663 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -18,6 +18,11 @@ API Changes now throws an IllegalArgumentException (Daniel Naber) +New features + + 1. LUCENE-496: Command line tool for modifying the field norms of an + existing index; added to contrib/miscellaneous. (Chris Hostetter) + Bug fixes 1. LUCENE-330: Fix issue of FilteredQuery not working properly within diff --git a/contrib/miscellaneous/src/java/org/apache/lucene/misc/LengthNormModifier.java b/contrib/miscellaneous/src/java/org/apache/lucene/misc/LengthNormModifier.java new file mode 100644 index 00000000000..e64af666071 --- /dev/null +++ b/contrib/miscellaneous/src/java/org/apache/lucene/misc/LengthNormModifier.java @@ -0,0 +1,156 @@ +package org.apache.lucene.misc; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; + +import java.io.File; +import java.io.IOException; +import java.util.Date; + +/** + * Given a directory, a Similarity, and a list of fields, updates the + * fieldNorms in place for every document using the Similarity.lengthNorm. + * + *

+ * NOTE: This only works if you do not use field/document boosts in your + * index. + *

+ * + * @version $Id:$ + */ +public class LengthNormModifier { + + /** + * Command Line Execution method + * + *
+     * Usage: LengthNormModifier /path/index package.SimilarityClassName field1 field2 ...
+     * 
+ * + public static void main(String[] args) throws IOException { + if (args.length < 3) { + System.err.println("Usage: LengthNormModifier [field2] ..."); + System.exit(1); + } + + Similarity s = null; + try { + Class simClass = Class.forName(args[1]); + s = (Similarity)simClass.newInstance(); + } catch (Exception e) { + System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]); + e.printStackTrace(System.err); + } + + File index = new File(args[0]); + Directory d = FSDirectory.getDirectory(index,false); + + LengthNormModifier lnm = new LengthNormModifier(d, s); + + for (int i = 2; i < args.length; i++) { + System.out.print("Updating field: " + args[i] + " " + + (new Date()).toString() + " ... "); + lnm.reSetNorms(args[i]); + System.out.println(new Date().toString()); + } + + d.close(); + } + + + private Directory dir; + private Similarity sim; + + /** + * Constructor for code that wishes to use this class progromaticaly + * + * @param d The Directory to modify + * @param s The Similarity to use in reSetNorms + */ + public LengthNormModifier(Directory d, Similarity s) { + dir = d; + sim = s; + } + + /** + * Re-Set the norms for the specified field. + * + *

+ * Opens a new IndexReader on the Directory given to this instance, + * modifies the norms using the Similarity given to this instance, + * and closes the IndexReader. + *

+ * + * @param field the field whose norms + */ + public void reSetNorms(String field) throws IOException { + String fieldName = field.intern(); + int[] termCounts = new int[0]; + + IndexReader reader = null; + TermEnum termEnum = null; + TermDocs termDocs = null; + try { + reader = IndexReader.open(dir); + termCounts = new int[reader.maxDoc()]; + try { + termEnum = reader.terms(new Term(field,"")); + try { + termDocs = reader.termDocs(); + do { + Term term = termEnum.term(); + if (term != null && term.field().equals(fieldName)) { + termDocs.seek(termEnum.term()); + while (termDocs.next()) { + termCounts[termDocs.doc()] += termDocs.freq(); + } + } + } while (termEnum.next()); + + } finally { + if (null != termDocs) termDocs.close(); + } + } finally { + if (null != termEnum) termEnum.close(); + } + } finally { + if (null != reader) reader.close(); + } + + try { + reader = IndexReader.open(dir); + for (int d = 0; d < termCounts.length; d++) { + if (! reader.isDeleted(d)) { + byte norm = sim.encodeNorm + (sim.lengthNorm(fieldName, termCounts[d])); + reader.setNorm(d, fieldName, norm); + } + } + + } finally { + if (null != reader) reader.close(); + } + } + +} diff --git a/contrib/miscellaneous/src/test/org/apache/lucene/misc/TestLengthNormModifier.java b/contrib/miscellaneous/src/test/org/apache/lucene/misc/TestLengthNormModifier.java new file mode 100644 index 00000000000..40ddf99094f --- /dev/null +++ b/contrib/miscellaneous/src/test/org/apache/lucene/misc/TestLengthNormModifier.java @@ -0,0 +1,179 @@ +package org.apache.lucene.misc; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.HitCollector; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +/** + * Tests changing the norms after changing the simularity + * + * @version $Id:$ + */ +public class TestLengthNormModifier extends TestCase { + public TestLengthNormModifier(String name) { + super(name); + } + + public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f); + + public static int NUM_DOCS = 5; + + public Directory store = new RAMDirectory(); + + /** inverts the normal notion of lengthNorm */ + public static Similarity s = new DefaultSimilarity() { + public float lengthNorm(String fieldName, int numTokens) { + return (float)numTokens; + } + }; + + public void setUp() throws Exception { + IndexWriter writer = new + IndexWriter(store, new SimpleAnalyzer(), true); + + IndexSearcher searcher; + + for (int i = 0; i < NUM_DOCS; i++) { + Document d = new Document(); + d.add(new Field("field", "word", + Field.Store.YES, Field.Index.TOKENIZED)); + d.add(new Field("nonorm", "word", + Field.Store.YES, Field.Index.NO_NORMS)); + + for (int j = 1; j <= i; j++) { + d.add(new Field("field", "crap", + Field.Store.YES, Field.Index.TOKENIZED)); + d.add(new Field("nonorm", "more words", + Field.Store.YES, Field.Index.NO_NORMS)); + } + writer.addDocument(d); + } + writer.close(); + } + + public void testMissingField() { + LengthNormModifier lnm = new LengthNormModifier(store, s); + try { + lnm.reSetNorms("nobodyherebutuschickens"); + } catch (Exception e) { + assertNull("caught something", e); + } + } + + public void testFieldWithNoNorm() throws Exception { + + IndexReader r = IndexReader.open(store); + byte[] norms = r.norms("nonorm"); + + // sanity check, norms should all be 1 + assertTrue("Whoops we have norms?", !r.hasNorms("nonorm")); + for (int i = 0; i< norms.length; i++) { + assertEquals(""+i, DEFAULT_NORM, norms[i]); + } + + r.close(); + + LengthNormModifier lnm = new LengthNormModifier(store, s); + try { + lnm.reSetNorms("nonorm"); + } catch (Exception e) { + assertNull("caught something", e); + } + + // nothing should have changed + r = IndexReader.open(store); + + norms = r.norms("nonorm"); + assertTrue("Whoops we have norms?", !r.hasNorms("nonorm")); + for (int i = 0; i< norms.length; i++) { + assertEquals(""+i, DEFAULT_NORM, norms[i]); + } + + r.close(); + + } + + + public void testGoodCases() throws Exception { + + IndexSearcher searcher; + final float[] scores = new float[NUM_DOCS]; + float lastScore = 0.0f; + + // default similarity should put docs with shorter length first + searcher = new IndexSearcher(store); + searcher.search + (new TermQuery(new Term("field", "word")), + new HitCollector() { + public final void collect(int doc, float score) { + scores[doc] = score; + } + }); + searcher.close(); + + lastScore = Float.MAX_VALUE; + for (int i = 0; i < NUM_DOCS; i++) { + String msg = "i=" + i + ", "+scores[i]+" <= "+lastScore; + assertTrue(msg, scores[i] <= lastScore); + //System.out.println(msg); + lastScore = scores[i]; + } + + // override the norms to be inverted + Similarity s = new DefaultSimilarity() { + public float lengthNorm(String fieldName, int numTokens) { + return (float)numTokens; + } + }; + LengthNormModifier lnm = new LengthNormModifier(store, s); + lnm.reSetNorms("field"); + + // new norm (with default similarity) should put longer docs first + searcher = new IndexSearcher(store); + searcher.search + (new TermQuery(new Term("field", "word")), + new HitCollector() { + public final void collect(int doc, float score) { + scores[doc] = score; + } + }); + searcher.close(); + + lastScore = 0.0f; + for (int i = 0; i < NUM_DOCS; i++) { + String msg = "i=" + i + ", "+scores[i]+" >= "+lastScore; + assertTrue(msg, scores[i] >= lastScore); + //System.out.println(msg); + lastScore = scores[i]; + } + + } +}