- LUCENE-741: command-line utility for modifying or removing field norms

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@489213 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2006-12-20 22:32:13 +00:00
parent 42592aeb49
commit 17315a9be7
3 changed files with 364 additions and 0 deletions

View File

@ -86,6 +86,11 @@ New features
10. LUCENE-723: QueryParser now parses *:* as MatchAllDocsQuery
(Yonik Seeley)
11. LUCENE-741: Command-line utility for modifying or removing norms
on fields in an existing index. This is mostly based on LUCENE-496
and lives in contrib/miscellaneous.
(Chris Hostetter, Otis Gospodnetic)
API Changes
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow

View File

@ -0,0 +1,158 @@
package org.apache.lucene.index;
/**
* Copyright 2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
* Given a directory and a list of fields, updates the fieldNorms in place for every document.
* If Similarity class is specified, uses its lengthNorm method to set norms.
* If -n command line argument is used, removed field norms, as if {@link Field.Index.NO_NORMS} was used.
*
* <p>
* NOTE: This will overwrite any length normalization or field/document boosts.
* </p>
*
* @author Chris Hostetter
* @author Otis Gospodnetic
*/
public class FieldNormModifier {
/**
* Command Line Execution method
*
* <pre>
* Usage: FieldNormModifier /path/index <package.SimilarityClassName | -n> field1 field2 ...
* </pre>
*/
public static void main(String[] args) throws IOException {
if (args.length < 3) {
System.err.println("Usage: FieldNormModifier <index> <package.SimilarityClassName | -n> <field1> [field2] ...");
System.exit(1);
}
Similarity s = null;
if (!args[1].equals("-n")) {
try {
Class simClass = Class.forName(args[1]);
s = (Similarity)simClass.newInstance();
} catch (Exception e) {
System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
e.printStackTrace(System.err);
System.exit(1);
}
}
Directory d = FSDirectory.getDirectory(args[0], false);
FieldNormModifier fnm = new FieldNormModifier(d, s);
for (int i = 2; i < args.length; i++) {
System.out.print("Updating field: " + args[i] + " " + (new Date()).toString() + " ... ");
fnm.reSetNorms(args[i]);
System.out.println(new Date().toString());
}
d.close();
}
private Directory dir;
private Similarity sim;
/**
* Constructor for code that wishes to use this class programatically
* If Similarity is null, kill the field norms.
*
* @param d the Directory to modify
* @param s the Similiary to use (can be null)
*/
public FieldNormModifier(Directory d, Similarity s) {
dir = d;
sim = s;
}
/**
* Resets the norms for the specified field.
*
* <p>
* Opens a new IndexReader on the Directory given to this instance,
* modifies the norms (either using the Similarity given to this instance, or by using fake norms,
* and closes the IndexReader.
* </p>
*
* @param field the field whose norms should be reset
*/
public void reSetNorms(String field) throws IOException {
String fieldName = field.intern();
int[] termCounts = new int[0];
byte[] fakeNorms = new byte[0];
IndexReader reader = null;
TermEnum termEnum = null;
TermDocs termDocs = null;
try {
reader = IndexReader.open(dir);
termCounts = new int[reader.maxDoc()];
// if we are killing norms, get fake ones
if (sim == null)
fakeNorms = SegmentReader.createFakeNorms(reader.maxDoc());
try {
termEnum = reader.terms(new Term(field,""));
try {
termDocs = reader.termDocs();
do {
Term term = termEnum.term();
if (term != null && term.field().equals(fieldName)) {
termDocs.seek(termEnum.term());
while (termDocs.next()) {
termCounts[termDocs.doc()] += termDocs.freq();
}
}
} while (termEnum.next());
} finally {
if (null != termDocs) termDocs.close();
}
} finally {
if (null != termEnum) termEnum.close();
}
} finally {
if (null != reader) reader.close();
}
try {
reader = IndexReader.open(dir);
for (int d = 0; d < termCounts.length; d++) {
if (! reader.isDeleted(d)) {
if (sim == null)
reader.setNorm(d, fieldName, fakeNorms[0]);
else
reader.setNorm(d, fieldName, sim.encodeNorm(sim.lengthNorm(fieldName, termCounts[d])));
}
}
} finally {
if (null != reader) reader.close();
}
}
}

View File

@ -0,0 +1,201 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import junit.framework.TestCase;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
* Tests changing of field norms with a custom similarity and with fake norms.
*
* @version $Id$
*/
public class TestFieldNormModifier extends TestCase {
public TestFieldNormModifier(String name) {
super(name);
}
public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f);
public static int NUM_DOCS = 5;
public Directory store = new RAMDirectory();
/** inverts the normal notion of lengthNorm */
public static Similarity s = new DefaultSimilarity() {
public float lengthNorm(String fieldName, int numTokens) {
return (float)numTokens;
}
};
public void setUp() throws Exception {
IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
for (int i = 0; i < NUM_DOCS; i++) {
Document d = new Document();
d.add(new Field("field", "word", Field.Store.YES, Field.Index.TOKENIZED));
d.add(new Field("nonorm", "word", Field.Store.YES, Field.Index.NO_NORMS));
d.add(new Field("untokfield", "20061212 20071212", Field.Store.YES, Field.Index.TOKENIZED));
for (int j = 1; j <= i; j++) {
d.add(new Field("field", "crap", Field.Store.YES, Field.Index.TOKENIZED));
d.add(new Field("nonorm", "more words", Field.Store.YES, Field.Index.NO_NORMS));
}
writer.addDocument(d);
}
writer.close();
}
public void testMissingField() {
FieldNormModifier fnm = new FieldNormModifier(store, s);
try {
fnm.reSetNorms("nobodyherebutuschickens");
} catch (Exception e) {
assertNull("caught something", e);
}
}
public void testFieldWithNoNorm() throws Exception {
IndexReader r = IndexReader.open(store);
byte[] norms = r.norms("nonorm");
// sanity check, norms should all be 1
assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
for (int i = 0; i< norms.length; i++) {
assertEquals(""+i, DEFAULT_NORM, norms[i]);
}
r.close();
FieldNormModifier fnm = new FieldNormModifier(store, s);
try {
fnm.reSetNorms("nonorm");
} catch (Exception e) {
assertNull("caught something", e);
}
// nothing should have changed
r = IndexReader.open(store);
norms = r.norms("nonorm");
assertTrue("Whoops we have norms?", !r.hasNorms("nonorm"));
for (int i = 0; i< norms.length; i++) {
assertEquals(""+i, DEFAULT_NORM, norms[i]);
}
r.close();
}
public void testGoodCases() throws Exception {
IndexSearcher searcher = new IndexSearcher(store);
final float[] scores = new float[NUM_DOCS];
float lastScore = 0.0f;
// default similarity should put docs with shorter length first
searcher.search(new TermQuery(new Term("field", "word")), new HitCollector() {
public final void collect(int doc, float score) {
scores[doc] = score;
}
});
searcher.close();
lastScore = Float.MAX_VALUE;
for (int i = 0; i < NUM_DOCS; i++) {
String msg = "i=" + i + ", " + scores[i] + " <= " + lastScore;
assertTrue(msg, scores[i] <= lastScore);
//System.out.println(msg);
lastScore = scores[i];
}
FieldNormModifier fnm = new FieldNormModifier(store, s);
fnm.reSetNorms("field");
// new norm (with default similarity) should put longer docs first
searcher = new IndexSearcher(store);
searcher.search(new TermQuery(new Term("field", "word")), new HitCollector() {
public final void collect(int doc, float score) {
scores[doc] = score;
}
});
searcher.close();
lastScore = 0.0f;
for (int i = 0; i < NUM_DOCS; i++) {
String msg = "i=" + i + ", " + scores[i] + " >= " + lastScore;
assertTrue(msg, scores[i] >= lastScore);
//System.out.println(msg);
lastScore = scores[i];
}
}
public void testNormKiller() throws IOException {
IndexReader r = IndexReader.open(store);
byte[] oldNorms = r.norms("untokfield");
r.close();
FieldNormModifier fnm = new FieldNormModifier(store, s);
fnm.reSetNorms("untokfield");
r = IndexReader.open(store);
byte[] newNorms = r.norms("untokfield");
r.close();
assertFalse(Arrays.equals(oldNorms, newNorms));
// verify that we still get documents in the same order as originally
IndexSearcher searcher = new IndexSearcher(store);
final float[] scores = new float[NUM_DOCS];
float lastScore = 0.0f;
// default similarity should return the same score for all documents for this query
searcher.search(new TermQuery(new Term("untokfield", "20061212")), new HitCollector() {
public final void collect(int doc, float score) {
scores[doc] = score;
}
});
searcher.close();
lastScore = scores[0];
for (int i = 0; i < NUM_DOCS; i++) {
String msg = "i=" + i + ", " + scores[i] + " == " + lastScore;
assertTrue(msg, scores[i] == lastScore);
//System.out.println(msg);
lastScore = scores[i];
}
}
}