HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@898093 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2010-01-11 22:29:29 +00:00
parent 9692e3d5da
commit 704b52fbe9
6 changed files with 94 additions and 14 deletions

View File

@@ -292,6 +292,7 @@ Release 0.21.0 - Unreleased
 HBASE-2095 TIF shuold support more confs for the scanner (Bassam Tabbara
            via Andrew Purtell)
+HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0.0 (Kay Kay via Stack)
 NEW FEATURES
 HBASE-1901 "General" partitioner for "hbase-48" bulk (behind the api, write

View File

@@ -26,7 +26,7 @@ hadoop-mapred.version=0.21.0-SNAPSHOT
 zookeeper.version=3.2.2
 thrift.version=r771587
-lucene.version=2.2.0
+lucene.version=3.0.0
 jsr311.version=1.1.1

View File

@@ -37,6 +37,7 @@ import javax.xml.transform.stream.StreamResult;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -53,7 +54,15 @@ public class IndexConfiguration extends Configuration {
   static final String HBASE_COLUMN_NAME = "hbase.column.name";
   static final String HBASE_COLUMN_STORE = "hbase.column.store";
   static final String HBASE_COLUMN_INDEX = "hbase.column.index";
+  /**
+   * Tokenize property terminology is deprecated in lucene / replaced by analyze.
+   * @see #HBASE_COLUMN_ANALYZE
+   * @deprecated
+   */
   static final String HBASE_COLUMN_TOKENIZE = "hbase.column.tokenize";
+  static final String HBASE_COLUMN_ANALYZE = "hbase.column.analyze";
   static final String HBASE_COLUMN_BOOST = "hbase.column.boost";
   static final String HBASE_COLUMN_OMIT_NORMS = "hbase.column.omit.norms";
   static final String HBASE_INDEX_ROWKEY_NAME = "hbase.index.rowkey.name";
@@ -131,14 +140,34 @@ public class IndexConfiguration extends Configuration {
     getColumn(columnName).setBoolean(HBASE_COLUMN_STORE, store);
   }
+  /**
+   * @deprecated
+   * @see Use #isAnalyze(String) for replacement.
+   * @param columnName
+   * @return
+   */
   public boolean isTokenize(String columnName) {
     return getColumn(columnName).getBoolean(HBASE_COLUMN_TOKENIZE, true);
   }
+  /**
+   * @deprecated
+   * @see Use #setAnalyze(String, boolean) for replacement.
+   * @param columnName
+   * @param tokenize
+   */
   public void setTokenize(String columnName, boolean tokenize) {
     getColumn(columnName).setBoolean(HBASE_COLUMN_TOKENIZE, tokenize);
   }
+  public boolean isAnalyze(String columnName) {
+    return getColumn(columnName).getBoolean(HBASE_COLUMN_ANALYZE, true);
+  }
+  public void setAnalyze(String columnName, boolean analyze) {
+    getColumn(columnName).setBoolean(HBASE_COLUMN_ANALYZE, analyze);
+  }
   public float getBoost(String columnName) {
     return getColumn(columnName).getFloat(HBASE_COLUMN_BOOST, 1.0f);
   }
@@ -166,7 +195,7 @@ public class IndexConfiguration extends Configuration {
   public String getAnalyzerName() {
     return get(HBASE_INDEX_ANALYZER_NAME,
-        "org.apache.lucene.analysis.standard.StandardAnalyzer");
+        StandardAnalyzer.class.getName());
   }
   public void setAnalyzerName(String analyzerName) {
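
As context for the hunk above: the patch renames the per-column "tokenize" switch to "analyze", keeping the old accessors only as deprecated shims. A minimal sketch of how a caller would move to the new accessors; the package name and column name below are assumptions for illustration, not part of the patch:

import org.apache.hadoop.hbase.mapreduce.IndexConfiguration;  // assumed package

public class AnalyzeMigrationSketch {
  public static void configure(IndexConfiguration indexConf) {
    String column = "info:title";            // hypothetical column name
    // Before this patch (now deprecated):
    // indexConf.setTokenize(column, true);
    // After this patch, mirroring Lucene's tokenize -> analyze rename:
    indexConf.setAnalyze(column, true);
    boolean analyzed = indexConf.isAnalyze(column);
    System.out.println(column + " analyzed? " + analyzed);
  }
}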

View File

@@ -19,6 +19,7 @@
  */
 package org.apache.hadoop.hbase.mapreduce;
+import java.io.File;
 import java.io.IOException;
 import java.util.Random;
@@ -32,7 +33,9 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.FSDirectory;
 /**
  * Create a local index, unwrap Lucene documents created by reduce, add them to
@@ -87,8 +90,8 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     }
     // build locally first
-    final IndexWriter writer = new IndexWriter(fs.startLocalOutput(perm, temp)
-      .toString(), analyzer, true);
+    final IndexWriter writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp)
+      .toString())), analyzer, true, MaxFieldLength.LIMITED);
     // no delete, so no need for maxBufferedDeleteTerms
     writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
@@ -98,11 +101,10 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     String similarityName = indexConf.getSimilarityName();
     if (similarityName != null) {
       try {
-        Class<?> similarityClass = Class.forName(similarityName);
-        Similarity similarity = (Similarity) similarityClass.newInstance();
+        Similarity similarity = Class.forName(similarityName).asSubclass(Similarity.class).newInstance();
         writer.setSimilarity(similarity);
       } catch (Exception e) {
-        throw new IOException("Error in creating a similarty object "
+        throw new IOException("Error in creating a similarity object "
            + similarityName);
       }
     }
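
For readers tracking the API change above: Lucene 3.0 drops the IndexWriter constructors that take a filesystem path String, so the writer is now opened over an FSDirectory with an explicit MaxFieldLength. A standalone sketch of the Lucene 3.0 construction; the directory argument and analyzer choice are illustrative, not taken from the patch:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class Lucene30WriterSketch {
  public static IndexWriter openWriter(File indexDir) throws IOException {
    // Lucene 3.0: open a Directory explicitly instead of passing a path String.
    FSDirectory dir = FSDirectory.open(indexDir);
    // StandardAnalyzer now takes a Version, and the field-length policy is explicit.
    return new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
        true /* create */, MaxFieldLength.LIMITED);
  }
}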

View File

@@ -68,7 +68,7 @@ implements Configurable {
       // index and store row key, row key already UTF-8 encoded
       Field keyField = new Field(indexConf.getRowkeyName(),
         Bytes.toString(key.get(), key.getOffset(), key.getLength()),
-        Field.Store.YES, Field.Index.UN_TOKENIZED);
+        Field.Store.YES, Field.Index.NOT_ANALYZED);
       keyField.setOmitNorms(true);
       doc.add(keyField);
     }
@@ -82,7 +82,7 @@ implements Configurable {
           Field.Store.YES: Field.Store.NO;
         Field.Index index = indexConf.isIndex(column)?
           (indexConf.isTokenize(column)?
-            Field.Index.TOKENIZED: Field.Index.UN_TOKENIZED):
+            Field.Index.ANALYZED: Field.Index.NOT_ANALYZED):
           Field.Index.NO;
         // UTF-8 encode value
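
The constants touched above were renamed in Lucene 3.0: Field.Index.TOKENIZED became Field.Index.ANALYZED and Field.Index.UN_TOKENIZED became Field.Index.NOT_ANALYZED, with no change in behavior. A small self-contained sketch of the new names; the field names and values here are made up for illustration:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class FieldIndexRenameSketch {
  public static Document rowDocument(String rowKey, String columnValue, boolean analyze) {
    Document doc = new Document();
    // Row key: stored and indexed as a single token (UN_TOKENIZED in Lucene 2.x terms).
    Field keyField = new Field("rowkey", rowKey, Field.Store.YES, Field.Index.NOT_ANALYZED);
    keyField.setOmitNorms(true);
    doc.add(keyField);
    // Column value: run through the analyzer only when requested
    // (ANALYZED was called TOKENIZED in Lucene 2.x).
    doc.add(new Field("column", columnValue, Field.Store.NO,
        analyze ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED));
    return doc;
  }
}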

View File

@@ -45,12 +45,16 @@ import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.mapred.MiniMRCluster;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiSearcher;
+import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Searchable;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
 /**
  * Test Map/Reduce job to build index over HBase table
@@ -205,13 +209,13 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
     ResultScanner scanner = null;
     try {
       if (indexDirs.length == 1) {
-        searcher = new IndexSearcher((new File(indexDirs[0].getPath().
-          toUri())).getAbsolutePath());
+        searcher = new IndexSearcher(FSDirectory.open(new File(indexDirs[0].getPath().
+          toUri())));
       } else if (indexDirs.length > 1) {
         Searchable[] searchers = new Searchable[indexDirs.length];
         for (int i = 0; i < indexDirs.length; i++) {
-          searchers[i] = new IndexSearcher((new File(indexDirs[i].getPath().
-            toUri()).getAbsolutePath()));
+          searchers[i] = new IndexSearcher(FSDirectory.open(new File(indexDirs[i].getPath().
+            toUri())));
         }
         searcher = new MultiSearcher(searchers);
       } else {
@@ -235,7 +239,9 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       for (Result r : scanner) {
         String value = Bytes.toString(r.getRow());
         Term term = new Term(rowkeyName, value);
-        int hitCount = searcher.search(new TermQuery(term)).length();
+        CountCollector collector = new CountCollector();
+        searcher.search(new TermQuery(term), collector);
+        int hitCount = collector.getCount();
         assertEquals("check row " + value, 1, hitCount);
         count++;
       }
@@ -250,6 +256,48 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       scanner.close();
     }
   }
+  /**
+   * Collector that retrieves the count of the documents.
+   *
+   * @author Kay Kay
+   *
+   */
+  public static class CountCollector extends Collector {
+    private int count;
+    public CountCollector() {
+      count = 0;
+    }
+    public int getCount() {
+      return this.count;
+    }
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      // Make this accept docs out of order as some collectors can be efficient that way.
+      return true;
+    }
+    @Override
+    public void collect(int doc) throws IOException {
+      ++count;
+    }
+    @Override
+    public void setNextReader(IndexReader reader, int docBase)
+        throws IOException {
+      // Do nothing
+    }
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      // Nothing to do with scorer.
+    }
+  }
   /**
    * @param args unused
    */
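
The test change above is needed because Lucene 3.0 removes the Hits class and the search(Query) overload that returned it; hit counts now come from a Collector (as with the CountCollector added here) or from TopDocs. A minimal alternative sketch using TopDocs.totalHits; the index directory and field name are illustrative only:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HitCountSketch {
  public static int countHits(File indexDir, String field, String value) throws IOException {
    // Lucene 3.0: searchers are opened over a Directory, not a path String.
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(indexDir));
    try {
      // Ask for a single scored document; totalHits still reports the full match count.
      TopDocs topDocs = searcher.search(new TermQuery(new Term(field, value)), 1);
      return topDocs.totalHits;
    } finally {
      searcher.close();
    }
  }
}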