HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@898093 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2010-01-11 22:29:29 +00:00
parent 9692e3d5da
commit 704b52fbe9
6 changed files with 94 additions and 14 deletions

View File

@@ -292,6 +292,7 @@ Release 0.21.0 - Unreleased
 HBASE-2095 TIF shuold support more confs for the scanner (Bassam Tabbara
            via Andrew Purtell)
+HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0.0 (Kay Kay via Stack)
 NEW FEATURES
 HBASE-1901 "General" partitioner for "hbase-48" bulk (behind the api, write

View File

@@ -26,7 +26,7 @@ hadoop-mapred.version=0.21.0-SNAPSHOT
 zookeeper.version=3.2.2
 thrift.version=r771587
-lucene.version=2.2.0
+lucene.version=3.0.0
 jsr311.version=1.1.1

View File

@@ -37,6 +37,7 @@ import javax.xml.transform.stream.StreamResult;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -53,7 +54,15 @@ public class IndexConfiguration extends Configuration {
   static final String HBASE_COLUMN_NAME = "hbase.column.name";
   static final String HBASE_COLUMN_STORE = "hbase.column.store";
   static final String HBASE_COLUMN_INDEX = "hbase.column.index";
+  /**
+   * Tokenize property terminology is deprecated in lucene / replaced by analyze.
+   * @see #HBASE_COLUMN_ANALYZE
+   * @deprecated
+   */
   static final String HBASE_COLUMN_TOKENIZE = "hbase.column.tokenize";
+  static final String HBASE_COLUMN_ANALYZE = "hbase.column.analyze";
   static final String HBASE_COLUMN_BOOST = "hbase.column.boost";
   static final String HBASE_COLUMN_OMIT_NORMS = "hbase.column.omit.norms";
   static final String HBASE_INDEX_ROWKEY_NAME = "hbase.index.rowkey.name";
@@ -131,14 +140,34 @@ public class IndexConfiguration extends Configuration {
     getColumn(columnName).setBoolean(HBASE_COLUMN_STORE, store);
   }
+  /**
+   * @deprecated
+   * @see Use #isAnalyze(String) for replacement.
+   * @param columnName
+   * @return
+   */
   public boolean isTokenize(String columnName) {
     return getColumn(columnName).getBoolean(HBASE_COLUMN_TOKENIZE, true);
   }
+  /**
+   * @deprecated
+   * @see Use #setAnalyze(String, boolean) for replacement.
+   * @param columnName
+   * @param tokenize
+   */
   public void setTokenize(String columnName, boolean tokenize) {
     getColumn(columnName).setBoolean(HBASE_COLUMN_TOKENIZE, tokenize);
   }
+  public boolean isAnalyze(String columnName) {
+    return getColumn(columnName).getBoolean(HBASE_COLUMN_ANALYZE, true);
+  }
+  public void setAnalyze(String columnName, boolean analyze) {
+    getColumn(columnName).setBoolean(HBASE_COLUMN_ANALYZE, analyze);
+  }
   public float getBoost(String columnName) {
     return getColumn(columnName).getFloat(HBASE_COLUMN_BOOST, 1.0f);
   }
@@ -166,7 +195,7 @@ public class IndexConfiguration extends Configuration {
   public String getAnalyzerName() {
     return get(HBASE_INDEX_ANALYZER_NAME,
-        "org.apache.lucene.analysis.standard.StandardAnalyzer");
+        StandardAnalyzer.class.getName());
   }
   public void setAnalyzerName(String analyzerName) {
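
As context for the hunk above: the patch renames the per-column "tokenize" switch to "analyze", keeping the old accessors only as deprecated shims. A minimal sketch of how a caller would move to the new accessors; the package name and column name below are assumptions for illustration, not part of the patch:

import org.apache.hadoop.hbase.mapreduce.IndexConfiguration;  // assumed package

public class AnalyzeMigrationSketch {
  public static void configure(IndexConfiguration indexConf) {
    String column = "info:title";            // hypothetical column name
    // Before this patch (now deprecated):
    // indexConf.setTokenize(column, true);
    // After this patch, mirroring Lucene's tokenize -> analyze rename:
    indexConf.setAnalyze(column, true);
    boolean analyzed = indexConf.isAnalyze(column);
    System.out.println(column + " analyzed? " + analyzed);
  }
}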

View File

@@ -19,6 +19,7 @@
  */
 package org.apache.hadoop.hbase.mapreduce;
+import java.io.File;
 import java.io.IOException;
 import java.util.Random;
@@ -32,7 +33,9 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.FSDirectory;
 /**
  * Create a local index, unwrap Lucene documents created by reduce, add them to
@@ -87,8 +90,8 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     }
     // build locally first
-    final IndexWriter writer = new IndexWriter(fs.startLocalOutput(perm, temp)
-      .toString(), analyzer, true);
+    final IndexWriter writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp)
+      .toString())), analyzer, true, MaxFieldLength.LIMITED);
     // no delete, so no need for maxBufferedDeleteTerms
     writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
@@ -98,11 +101,10 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     String similarityName = indexConf.getSimilarityName();
     if (similarityName != null) {
       try {
-        Class<?> similarityClass = Class.forName(similarityName);
-        Similarity similarity = (Similarity) similarityClass.newInstance();
+        Similarity similarity = Class.forName(similarityName).asSubclass(Similarity.class).newInstance();
         writer.setSimilarity(similarity);
       } catch (Exception e) {
-        throw new IOException("Error in creating a similarty object "
+        throw new IOException("Error in creating a similarity object "
            + similarityName);
       }
     }
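
For readers tracking the API change above: Lucene 3.0 drops the IndexWriter constructors that take a filesystem path String, so the writer is now opened over an FSDirectory with an explicit MaxFieldLength. A standalone sketch of the Lucene 3.0 construction; the directory argument and analyzer choice are illustrative, not taken from the patch:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Lucene30WriterSketch {
  public static IndexWriter openWriter(File indexDir) throws IOException {
    // Lucene 3.0: open a Directory explicitly instead of passing a path String.
    FSDirectory dir = FSDirectory.open(indexDir);
    // StandardAnalyzer now takes a Version, and the field-length policy is explicit.
    return new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
        true /* create */, MaxFieldLength.LIMITED);
  }
}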

View File

@@ -68,7 +68,7 @@ implements Configurable {
       // index and store row key, row key already UTF-8 encoded
       Field keyField = new Field(indexConf.getRowkeyName(),
         Bytes.toString(key.get(), key.getOffset(), key.getLength()),
-        Field.Store.YES, Field.Index.UN_TOKENIZED);
+        Field.Store.YES, Field.Index.NOT_ANALYZED);
       keyField.setOmitNorms(true);
       doc.add(keyField);
     }
@@ -82,7 +82,7 @@ implements Configurable {
           Field.Store.YES: Field.Store.NO;
         Field.Index index = indexConf.isIndex(column)?
           (indexConf.isTokenize(column)?
-            Field.Index.TOKENIZED: Field.Index.UN_TOKENIZED):
+            Field.Index.ANALYZED: Field.Index.NOT_ANALYZED):
           Field.Index.NO;
         // UTF-8 encode value
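
The constants touched above were renamed in Lucene 3.0: Field.Index.TOKENIZED became Field.Index.ANALYZED and Field.Index.UN_TOKENIZED became Field.Index.NOT_ANALYZED, with no change in behavior. A small self-contained sketch of the new names; the field names and values here are made up for illustration:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class FieldIndexRenameSketch {
  public static Document rowDocument(String rowKey, String columnValue, boolean analyze) {
    Document doc = new Document();
    // Row key: stored and indexed as a single token (UN_TOKENIZED in Lucene 2.x terms).
    Field keyField = new Field("rowkey", rowKey, Field.Store.YES, Field.Index.NOT_ANALYZED);
    keyField.setOmitNorms(true);
    doc.add(keyField);
    // Column value: run through the analyzer only when requested
    // (ANALYZED was called TOKENIZED in Lucene 2.x).
    doc.add(new Field("column", columnValue, Field.Store.NO,
        analyze ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED));
    return doc;
  }
}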

View File

@@ -45,12 +45,16 @@ import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.mapred.MiniMRCluster;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiSearcher;
+import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Searchable;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
 /**
  * Test Map/Reduce job to build index over HBase table
@@ -205,13 +209,13 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
     ResultScanner scanner = null;
     try {
       if (indexDirs.length == 1) {
-        searcher = new IndexSearcher((new File(indexDirs[0].getPath().
-          toUri())).getAbsolutePath());
+        searcher = new IndexSearcher(FSDirectory.open(new File(indexDirs[0].getPath().
+          toUri())));
       } else if (indexDirs.length > 1) {
         Searchable[] searchers = new Searchable[indexDirs.length];
         for (int i = 0; i < indexDirs.length; i++) {
-          searchers[i] = new IndexSearcher((new File(indexDirs[i].getPath().
-            toUri()).getAbsolutePath()));
+          searchers[i] = new IndexSearcher(FSDirectory.open(new File(indexDirs[i].getPath().
+            toUri())));
         }
         searcher = new MultiSearcher(searchers);
       } else {
@@ -235,7 +239,9 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       for (Result r : scanner) {
         String value = Bytes.toString(r.getRow());
         Term term = new Term(rowkeyName, value);
-        int hitCount = searcher.search(new TermQuery(term)).length();
+        CountCollector collector = new CountCollector();
+        searcher.search(new TermQuery(term), collector);
+        int hitCount = collector.getCount();
         assertEquals("check row " + value, 1, hitCount);
         count++;
       }
@@ -250,6 +256,48 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       scanner.close();
     }
   }
+  /**
+   * Collector that retrieves the count of the documents.
+   *
+   * @author Kay Kay
+   *
+   */
+  public static class CountCollector extends Collector {
+    private int count;
+    public CountCollector() {
+      count = 0;
+    }
+    public int getCount() {
+      return this.count;
+    }
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      // Make this accept docs out of order as some collectors can be efficient that way.
+      return true;
+    }
+    @Override
+    public void collect(int doc) throws IOException {
+      ++count;
+    }
+    @Override
+    public void setNextReader(IndexReader reader, int docBase)
+        throws IOException {
+      // Do nothing
+    }
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      // Nothing to do with scorer.
+    }
+  }
   /**
    * @param args unused
    */
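
The test change above is needed because Lucene 3.0 removes the Hits class and the search(Query) overload that returned it; hit counts now come from a Collector (as with the CountCollector added here) or from TopDocs. A minimal alternative sketch using TopDocs.totalHits; the index directory and field name are illustrative only:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HitCountSketch {
  public static int countHits(File indexDir, String field, String value) throws IOException {
    // Lucene 3.0: searchers are opened over a Directory, not a path String.
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(indexDir));
    try {
      // Ask for a single scored document; totalHits still reports the full match count.
      TopDocs topDocs = searcher.search(new TermQuery(new Term(field, value)), 1);
      return topDocs.totalHits;
    } finally {
      searcher.close();
    }
  }
}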