HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@898093 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2010-01-11 22:29:29 +00:00
parent 9692e3d5da
commit 704b52fbe9
6 changed files with 94 additions and 14 deletions

View File

@ -292,6 +292,7 @@ Release 0.21.0 - Unreleased
HBASE-2035 Binary values are formatted wrong in shell
HBASE-2095 TIF should support more confs for the scanner (Bassam Tabbara
via Andrew Purtell)
HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0.0 (Kay Kay via Stack)
NEW FEATURES
HBASE-1901 "General" partitioner for "hbase-48" bulk (behind the api, write

View File

@ -26,7 +26,7 @@ hadoop-mapred.version=0.21.0-SNAPSHOT
zookeeper.version=3.2.2
thrift.version=r771587
lucene.version=2.2.0
lucene.version=3.0.0
jsr311.version=1.1.1

View File

@ -37,6 +37,7 @@ import javax.xml.transform.stream.StreamResult;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@ -53,7 +54,15 @@ public class IndexConfiguration extends Configuration {
static final String HBASE_COLUMN_NAME = "hbase.column.name";
static final String HBASE_COLUMN_STORE = "hbase.column.store";
static final String HBASE_COLUMN_INDEX = "hbase.column.index";
/**
* The "tokenize" property terminology is deprecated in Lucene and replaced by "analyze".
* @see #HBASE_COLUMN_ANALYZE
* @deprecated Use {@link #HBASE_COLUMN_ANALYZE} instead.
*/
static final String HBASE_COLUMN_TOKENIZE = "hbase.column.tokenize";
static final String HBASE_COLUMN_ANALYZE = "hbase.column.analyze";
static final String HBASE_COLUMN_BOOST = "hbase.column.boost";
static final String HBASE_COLUMN_OMIT_NORMS = "hbase.column.omit.norms";
static final String HBASE_INDEX_ROWKEY_NAME = "hbase.index.rowkey.name";
@ -131,14 +140,34 @@ public class IndexConfiguration extends Configuration {
getColumn(columnName).setBoolean(HBASE_COLUMN_STORE, store);
}
/**
* @deprecated Use {@link #isAnalyze(String)} instead.
* @param columnName name of the column
* @return true if the column values are analyzed (tokenized)
*/
public boolean isTokenize(String columnName) {
return getColumn(columnName).getBoolean(HBASE_COLUMN_TOKENIZE, true);
}
/**
* @deprecated Use {@link #setAnalyze(String, boolean)} instead.
* @param columnName name of the column
* @param tokenize whether the column values should be analyzed (tokenized)
*/
public void setTokenize(String columnName, boolean tokenize) {
getColumn(columnName).setBoolean(HBASE_COLUMN_TOKENIZE, tokenize);
}
public boolean isAnalyze(String columnName) {
return getColumn(columnName).getBoolean(HBASE_COLUMN_ANALYZE, true);
}
public void setAnalyze(String columnName, boolean analyze) {
getColumn(columnName).setBoolean(HBASE_COLUMN_ANALYZE, analyze);
}
public float getBoost(String columnName) {
return getColumn(columnName).getFloat(HBASE_COLUMN_BOOST, 1.0f);
}
@ -166,7 +195,7 @@ public class IndexConfiguration extends Configuration {
public String getAnalyzerName() {
return get(HBASE_INDEX_ANALYZER_NAME,
"org.apache.lucene.analysis.standard.StandardAnalyzer");
StandardAnalyzer.class.getName());
}
public void setAnalyzerName(String analyzerName) {
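The property rename mirrors Lucene 3.0's renaming of Field.Index.TOKENIZED/UN_TOKENIZED to ANALYZED/NOT_ANALYZED. Below is a minimal usage sketch of the new methods, assuming a caller constructs an IndexConfiguration directly; the import path and the column name "info:title" are illustrative, not taken from this patch.

import org.apache.hadoop.hbase.mapreduce.IndexConfiguration;  // assumed package

public class AnalyzeConfigSketch {
  public static void main(String[] args) {
    IndexConfiguration indexConf = new IndexConfiguration();
    // Old style, still accepted but deprecated:
    // indexConf.setTokenize("info:title", true);
    // New style:
    indexConf.setAnalyze("info:title", true);
    boolean analyzed = indexConf.isAnalyze("info:title");   // defaults to true when unset
    // The analyzer name now defaults to StandardAnalyzer.class.getName():
    System.out.println(indexConf.getAnalyzerName() + " analyzed=" + analyzed);
  }
}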

View File

@ -19,6 +19,7 @@
*/
package org.apache.hadoop.hbase.mapreduce;
import java.io.File;
import java.io.IOException;
import java.util.Random;
@ -32,7 +33,9 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.FSDirectory;
/**
* Create a local index, unwrap Lucene documents created by reduce, add them to
@ -87,8 +90,8 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
}
// build locally first
final IndexWriter writer = new IndexWriter(fs.startLocalOutput(perm, temp)
.toString(), analyzer, true);
final IndexWriter writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp)
.toString())), analyzer, true, MaxFieldLength.LIMITED);
// no delete, so no need for maxBufferedDeleteTerms
writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
@ -98,11 +101,10 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
String similarityName = indexConf.getSimilarityName();
if (similarityName != null) {
try {
Class<?> similarityClass = Class.forName(similarityName);
Similarity similarity = (Similarity) similarityClass.newInstance();
Similarity similarity = Class.forName(similarityName).asSubclass(Similarity.class).newInstance();
writer.setSimilarity(similarity);
} catch (Exception e) {
throw new IOException("Error in creating a similarty object "
throw new IOException("Error in creating a similarity object "
+ similarityName);
}
}
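In Lucene 3.0 the IndexWriter constructors that took a filesystem path string are gone: the caller opens a Directory explicitly and must state a MaxFieldLength, and StandardAnalyzer now requires a Version argument since its no-arg constructor was removed. A standalone sketch of the new construction pattern, using a hypothetical local index path:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Lucene30WriterSketch {
  public static void main(String[] args) throws IOException {
    // Lucene 3.0: open a Directory instead of passing a path string, and make
    // the field-length policy explicit rather than relying on an implicit default.
    FSDirectory dir = FSDirectory.open(new File("/tmp/example-index"));
    IndexWriter writer = new IndexWriter(dir,
        new StandardAnalyzer(Version.LUCENE_30), true /* create */, MaxFieldLength.LIMITED);
    writer.close();
    dir.close();
  }
}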

View File

@ -68,7 +68,7 @@ implements Configurable {
// index and store row key, row key already UTF-8 encoded
Field keyField = new Field(indexConf.getRowkeyName(),
Bytes.toString(key.get(), key.getOffset(), key.getLength()),
Field.Store.YES, Field.Index.UN_TOKENIZED);
Field.Store.YES, Field.Index.NOT_ANALYZED);
keyField.setOmitNorms(true);
doc.add(keyField);
}
@ -82,7 +82,7 @@ implements Configurable {
Field.Store.YES: Field.Store.NO;
Field.Index index = indexConf.isIndex(column)?
(indexConf.isTokenize(column)?
Field.Index.TOKENIZED: Field.Index.UN_TOKENIZED):
Field.Index.ANALYZED: Field.Index.NOT_ANALYZED):
Field.Index.NO;
// UTF-8 encode value
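Field.Index.TOKENIZED and UN_TOKENIZED were renamed to ANALYZED and NOT_ANALYZED in Lucene 3.0 with unchanged semantics. A small sketch of the mapping; the field names and values are illustrative:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class FieldIndexRenameSketch {
  public static void main(String[] args) {
    Document doc = new Document();

    // Lucene 2.x Field.Index.UN_TOKENIZED  ->  Lucene 3.0 Field.Index.NOT_ANALYZED
    Field rowKey = new Field("ROWKEY", "row-1", Field.Store.YES, Field.Index.NOT_ANALYZED);
    rowKey.setOmitNorms(true);   // norms add nothing for an exact-match key field
    doc.add(rowKey);

    // Lucene 2.x Field.Index.TOKENIZED  ->  Lucene 3.0 Field.Index.ANALYZED
    doc.add(new Field("content", "text to be analyzed", Field.Store.NO, Field.Index.ANALYZED));
  }
}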

View File

@ -45,12 +45,16 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searchable;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
/**
* Test Map/Reduce job to build index over HBase table
@ -205,13 +209,13 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
ResultScanner scanner = null;
try {
if (indexDirs.length == 1) {
searcher = new IndexSearcher((new File(indexDirs[0].getPath().
toUri())).getAbsolutePath());
searcher = new IndexSearcher(FSDirectory.open(new File(indexDirs[0].getPath().
toUri())));
} else if (indexDirs.length > 1) {
Searchable[] searchers = new Searchable[indexDirs.length];
for (int i = 0; i < indexDirs.length; i++) {
searchers[i] = new IndexSearcher((new File(indexDirs[i].getPath().
toUri()).getAbsolutePath()));
searchers[i] = new IndexSearcher(FSDirectory.open(new File(indexDirs[i].getPath().
toUri())));
}
searcher = new MultiSearcher(searchers);
} else {
@ -235,7 +239,9 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
for (Result r : scanner) {
String value = Bytes.toString(r.getRow());
Term term = new Term(rowkeyName, value);
int hitCount = searcher.search(new TermQuery(term)).length();
CountCollector collector = new CountCollector();
searcher.search(new TermQuery(term), collector);
int hitCount = collector.getCount();
assertEquals("check row " + value, 1, hitCount);
count++;
}
@ -250,6 +256,48 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
scanner.close();
}
}
/**
* Collector that counts the documents matching the query.
*
* @author Kay Kay
*/
public static class CountCollector extends Collector {
private int count;
public CountCollector() {
count = 0;
}
public int getCount() {
return this.count;
}
@Override
public boolean acceptsDocsOutOfOrder() {
// Accept docs out of order; ordering is irrelevant when only counting hits.
return true;
}
@Override
public void collect(int doc) throws IOException {
++count;
}
@Override
public void setNextReader(IndexReader reader, int docBase)
throws IOException {
//Do nothing
}
@Override
public void setScorer(Scorer scorer) throws IOException {
//Nothing to do with scorer.
}
}
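Lucene 3.0 drops the Searcher.search(Query) overload that returned Hits, so the test now counts matches through this Collector. A usage sketch of the class above against a plain IndexSearcher; the index path and field name are hypothetical, and CountCollector stands for the nested class defined in the test:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;

public class CountCollectorUsageSketch {
  public static void main(String[] args) throws IOException {
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("/tmp/example-index")));
    try {
      CountCollector collector = new CountCollector();   // the nested Collector added above
      searcher.search(new TermQuery(new Term("ROWKEY", "row-1")), collector);
      System.out.println("hits: " + collector.getCount());
    } finally {
      searcher.close();
    }
  }
}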
/**
* @param args unused
*/