HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@898093 13f79535-47bb-0310-9956-ffa450edef68
parent 9692e3d5da
commit 704b52fbe9
@@ -292,6 +292,7 @@ Release 0.21.0 - Unreleased
    HBASE-2035  Binary values are formatted wrong in shell
    HBASE-2095  TIF shuold support more confs for the scanner (Bassam Tabbara
                via Andrew Purtell)
+   HBASE-2107  Upgrading Lucene 2.2 to Lucene 3.0.0 (Kay Kay via Stack)
 
 NEW FEATURES
    HBASE-1901  "General" partitioner for "hbase-48" bulk (behind the api, write
@@ -26,7 +26,7 @@ hadoop-mapred.version=0.21.0-SNAPSHOT
 zookeeper.version=3.2.2
 thrift.version=r771587
 
-lucene.version=2.2.0
+lucene.version=3.0.0
 
 jsr311.version=1.1.1
 
@@ -37,6 +37,7 @@ import javax.xml.transform.stream.StreamResult;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -53,7 +54,15 @@ public class IndexConfiguration extends Configuration {
   static final String HBASE_COLUMN_NAME = "hbase.column.name";
   static final String HBASE_COLUMN_STORE = "hbase.column.store";
   static final String HBASE_COLUMN_INDEX = "hbase.column.index";
+
+  /**
+   * The "tokenize" property terminology is deprecated in Lucene, replaced by "analyze".
+   * @see #HBASE_COLUMN_ANALYZE
+   * @deprecated
+   */
   static final String HBASE_COLUMN_TOKENIZE = "hbase.column.tokenize";
+  static final String HBASE_COLUMN_ANALYZE = "hbase.column.analyze";
+
   static final String HBASE_COLUMN_BOOST = "hbase.column.boost";
   static final String HBASE_COLUMN_OMIT_NORMS = "hbase.column.omit.norms";
   static final String HBASE_INDEX_ROWKEY_NAME = "hbase.index.rowkey.name";
@@ -131,14 +140,34 @@ public class IndexConfiguration extends Configuration {
     getColumn(columnName).setBoolean(HBASE_COLUMN_STORE, store);
   }
 
+  /**
+   * @deprecated
+   * @see #isAnalyze(String)
+   * @param columnName name of the column
+   * @return true if the column values are analyzed (tokenized)
+   */
   public boolean isTokenize(String columnName) {
     return getColumn(columnName).getBoolean(HBASE_COLUMN_TOKENIZE, true);
   }
 
+  /**
+   * @deprecated
+   * @see #setAnalyze(String, boolean)
+   * @param columnName name of the column
+   * @param tokenize whether the column values should be analyzed
+   */
   public void setTokenize(String columnName, boolean tokenize) {
     getColumn(columnName).setBoolean(HBASE_COLUMN_TOKENIZE, tokenize);
   }
 
+  public boolean isAnalyze(String columnName) {
+    return getColumn(columnName).getBoolean(HBASE_COLUMN_ANALYZE, true);
+  }
+
+  public void setAnalyze(String columnName, boolean analyze) {
+    getColumn(columnName).setBoolean(HBASE_COLUMN_ANALYZE, analyze);
+  }
+
   public float getBoost(String columnName) {
     return getColumn(columnName).getFloat(HBASE_COLUMN_BOOST, 1.0f);
   }
@@ -166,7 +195,7 @@ public class IndexConfiguration extends Configuration {
 
   public String getAnalyzerName() {
     return get(HBASE_INDEX_ANALYZER_NAME,
-        "org.apache.lucene.analysis.standard.StandardAnalyzer");
+        StandardAnalyzer.class.getName());
   }
 
   public void setAnalyzerName(String analyzerName) {
@@ -19,6 +19,7 @@
  */
 package org.apache.hadoop.hbase.mapreduce;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Random;
 
@@ -32,7 +33,9 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.FSDirectory;
 
 /**
  * Create a local index, unwrap Lucene documents created by reduce, add them to
@@ -87,8 +90,8 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     }
 
     // build locally first
-    final IndexWriter writer = new IndexWriter(fs.startLocalOutput(perm, temp)
-      .toString(), analyzer, true);
+    final IndexWriter writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp)
+      .toString())), analyzer, true, MaxFieldLength.LIMITED);
 
     // no delete, so no need for maxBufferedDeleteTerms
     writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
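For context (not part of the commit itself): Lucene 3.0 drops the IndexWriter constructors that take a path String, so the writer above is now opened over a Directory and given an explicit MaxFieldLength. A minimal stand-alone sketch of the new-style construction, assuming Lucene 3.0.0 on the classpath and a hypothetical local path /tmp/hbase-index:

import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class WriterSketch {
  public static void main(String[] args) throws IOException {
    // Lucene 3.0: open a Directory explicitly instead of passing a path String.
    FSDirectory dir = FSDirectory.open(new File("/tmp/hbase-index")); // hypothetical path
    IndexWriter writer = new IndexWriter(dir,
        new StandardAnalyzer(Version.LUCENE_30), // analyzer choice is an assumption
        true,                                    // create a fresh index
        MaxFieldLength.LIMITED);                 // cap the number of tokens indexed per field
    writer.close();
  }
}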
@@ -98,11 +101,10 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     String similarityName = indexConf.getSimilarityName();
     if (similarityName != null) {
       try {
-        Class<?> similarityClass = Class.forName(similarityName);
-        Similarity similarity = (Similarity) similarityClass.newInstance();
+        Similarity similarity = Class.forName(similarityName).asSubclass(Similarity.class).newInstance();
         writer.setSimilarity(similarity);
       } catch (Exception e) {
-        throw new IOException("Error in creating a similarty object "
+        throw new IOException("Error in creating a similarity object "
            + similarityName);
       }
     }
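As background on the reflection change above: Class.forName(name).asSubclass(Similarity.class) validates the loaded class against Similarity at lookup time, so the unchecked (Similarity) cast from the old code is no longer needed. A minimal sketch, using DefaultSimilarity as a hypothetical stand-in for the configured class name:

import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;

public class SimilaritySketch {
  public static void main(String[] args) throws Exception {
    // Stand-in for indexConf.getSimilarityName(); any Similarity subclass name works.
    String similarityName = DefaultSimilarity.class.getName();
    // asSubclass() throws ClassCastException here if the class is not a Similarity,
    // instead of failing later at an unchecked cast.
    Similarity similarity =
        Class.forName(similarityName).asSubclass(Similarity.class).newInstance();
    System.out.println("Loaded similarity: " + similarity.getClass().getName());
  }
}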
@@ -68,7 +68,7 @@ implements Configurable {
       // index and store row key, row key already UTF-8 encoded
       Field keyField = new Field(indexConf.getRowkeyName(),
         Bytes.toString(key.get(), key.getOffset(), key.getLength()),
-        Field.Store.YES, Field.Index.UN_TOKENIZED);
+        Field.Store.YES, Field.Index.NOT_ANALYZED);
       keyField.setOmitNorms(true);
       doc.add(keyField);
     }
@@ -82,7 +82,7 @@ implements Configurable {
         Field.Store.YES: Field.Store.NO;
       Field.Index index = indexConf.isIndex(column)?
         (indexConf.isTokenize(column)?
-          Field.Index.TOKENIZED: Field.Index.UN_TOKENIZED):
+          Field.Index.ANALYZED: Field.Index.NOT_ANALYZED):
         Field.Index.NO;
 
       // UTF-8 encode value
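For reference, Lucene 3.0 renames the Field.Index constants used above: TOKENIZED becomes ANALYZED and UN_TOKENIZED becomes NOT_ANALYZED. A minimal sketch of both cases, with hypothetical field names standing in for the HBase row key and column:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class FieldSketch {
  public static void main(String[] args) {
    Document doc = new Document();
    // Exact identifiers such as a row key: stored, not run through the analyzer.
    doc.add(new Field("rowkey", "row-0001",
        Field.Store.YES, Field.Index.NOT_ANALYZED));
    // Free-text cell values: analyzed (tokenized) for full-text search.
    doc.add(new Field("info:content", "some cell text to index",
        Field.Store.NO, Field.Index.ANALYZED));
    System.out.println(doc);
  }
}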
@@ -45,12 +45,16 @@ import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.mapred.MiniMRCluster;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiSearcher;
+import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Searchable;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
 
 /**
  * Test Map/Reduce job to build index over HBase table
@@ -205,13 +209,13 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
     ResultScanner scanner = null;
     try {
       if (indexDirs.length == 1) {
-        searcher = new IndexSearcher((new File(indexDirs[0].getPath().
-          toUri())).getAbsolutePath());
+        searcher = new IndexSearcher(FSDirectory.open(new File(indexDirs[0].getPath().
+          toUri())));
       } else if (indexDirs.length > 1) {
         Searchable[] searchers = new Searchable[indexDirs.length];
         for (int i = 0; i < indexDirs.length; i++) {
-          searchers[i] = new IndexSearcher((new File(indexDirs[i].getPath().
-            toUri()).getAbsolutePath()));
+          searchers[i] = new IndexSearcher(FSDirectory.open(new File(indexDirs[i].getPath().
+            toUri())));
         }
         searcher = new MultiSearcher(searchers);
       } else {
@@ -235,7 +239,9 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       for (Result r : scanner) {
         String value = Bytes.toString(r.getRow());
         Term term = new Term(rowkeyName, value);
-        int hitCount = searcher.search(new TermQuery(term)).length();
+        CountCollector collector = new CountCollector();
+        searcher.search(new TermQuery(term), collector);
+        int hitCount = collector.getCount();
         assertEquals("check row " + value, 1, hitCount);
         count++;
       }
@@ -250,6 +256,48 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       scanner.close();
     }
   }
+
+  /**
+   * Collector that counts the documents matching a query.
+   *
+   * @author Kay Kay
+   *
+   */
+  public static class CountCollector extends Collector {
+
+    private int count;
+
+    public CountCollector() {
+      count = 0;
+    }
+
+    public int getCount() {
+      return this.count;
+    }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      // Accept docs out of order, since some collectors are more efficient that way.
+      return true;
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      ++count;
+    }
+
+    @Override
+    public void setNextReader(IndexReader reader, int docBase)
+        throws IOException {
+      // Do nothing.
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      // Nothing to do with the scorer.
+    }
+  }
+
   /**
    * @param args unused
    */
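The custom CountCollector above replaces the Hits API removed in Lucene 3.0. Where fetching the top document is acceptable, the same count is also available from TopDocs.totalHits; a minimal sketch under an assumed index path and field name (not how the test itself does it):

import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HitCountSketch {
  public static void main(String[] args) throws IOException {
    // Hypothetical index location; the test builds its own index under the MR output dir.
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("/tmp/hbase-index")));
    try {
      // totalHits reports the full match count even though only the top 1 document is returned.
      TopDocs topDocs = searcher.search(new TermQuery(new Term("rowkey", "row-0001")), 1);
      System.out.println("hits = " + topDocs.totalHits);
    } finally {
      searcher.close();
    }
  }
}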