HBASE-2107 Upgrading Lucene 2.2 to Lucene 3.0
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@898093 13f79535-47bb-0310-9956-ffa450edef68
commit 704b52fbe9
parent 9692e3d5da
@@ -292,6 +292,7 @@ Release 0.21.0 - Unreleased
    HBASE-2035  Binary values are formatted wrong in shell
    HBASE-2095  TIF shuold support more confs for the scanner (Bassam Tabbara
                via Andrew Purtell)
+   HBASE-2107  Upgrading Lucene 2.2 to Lucene 3.0.0 (Kay Kay via Stack)
 
 NEW FEATURES
    HBASE-1901  "General" partitioner for "hbase-48" bulk (behind the api, write
@@ -26,7 +26,7 @@ hadoop-mapred.version=0.21.0-SNAPSHOT
 zookeeper.version=3.2.2
 thrift.version=r771587
 
-lucene.version=2.2.0
+lucene.version=3.0.0
 
 jsr311.version=1.1.1
 
@@ -37,6 +37,7 @@ import javax.xml.transform.stream.StreamResult;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -53,7 +54,15 @@ public class IndexConfiguration extends Configuration {
   static final String HBASE_COLUMN_NAME = "hbase.column.name";
   static final String HBASE_COLUMN_STORE = "hbase.column.store";
   static final String HBASE_COLUMN_INDEX = "hbase.column.index";
+
+  /**
+   * Tokenize property terminology is deprecated in lucene / replaced by analyze.
+   * @see #HBASE_COLUMN_ANALYZE
+   * @deprecated
+   */
   static final String HBASE_COLUMN_TOKENIZE = "hbase.column.tokenize";
+  static final String HBASE_COLUMN_ANALYZE = "hbase.column.analyze";
+
   static final String HBASE_COLUMN_BOOST = "hbase.column.boost";
   static final String HBASE_COLUMN_OMIT_NORMS = "hbase.column.omit.norms";
   static final String HBASE_INDEX_ROWKEY_NAME = "hbase.index.rowkey.name";
@@ -131,14 +140,34 @@ public class IndexConfiguration extends Configuration {
     getColumn(columnName).setBoolean(HBASE_COLUMN_STORE, store);
   }
 
+  /**
+   * @deprecated
+   * @see Use #isAnalyze(String) for replacement.
+   * @param columnName
+   * @return
+   */
   public boolean isTokenize(String columnName) {
     return getColumn(columnName).getBoolean(HBASE_COLUMN_TOKENIZE, true);
   }
 
+  /**
+   * @deprecated
+   * @see Use #setAnalyze(String, boolean) for replacement.
+   * @param columnName
+   * @param tokenize
+   */
   public void setTokenize(String columnName, boolean tokenize) {
     getColumn(columnName).setBoolean(HBASE_COLUMN_TOKENIZE, tokenize);
   }
 
+  public boolean isAnalyze(String columnName) {
+    return getColumn(columnName).getBoolean(HBASE_COLUMN_ANALYZE, true);
+  }
+
+  public void setAnalyze(String columnName, boolean analyze) {
+    getColumn(columnName).setBoolean(HBASE_COLUMN_ANALYZE, analyze);
+  }
+
   public float getBoost(String columnName) {
     return getColumn(columnName).getFloat(HBASE_COLUMN_BOOST, 1.0f);
   }
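The new isAnalyze/setAnalyze pair mirrors the deprecated tokenize accessors; only the underlying property key changes. A minimal usage sketch follows, assuming IndexConfiguration keeps its public no-arg construction and lives in org.apache.hadoop.hbase.mapreduce alongside IndexOutputFormat; the column key "info:description" is purely illustrative.

import org.apache.hadoop.hbase.mapreduce.IndexConfiguration;

public class AnalyzePropertySketch {
  public static void main(String[] args) {
    IndexConfiguration indexConf = new IndexConfiguration();
    // "info:description" is a hypothetical column key; the job uses whatever
    // names it registers through getColumn(columnName).
    indexConf.setAnalyze("info:description", true);
    // Defaults to true when the property was never set, matching isTokenize.
    System.out.println(indexConf.isAnalyze("info:description"));
  }
}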
@@ -166,7 +195,7 @@ public class IndexConfiguration extends Configuration {
 
   public String getAnalyzerName() {
     return get(HBASE_INDEX_ANALYZER_NAME,
-        "org.apache.lucene.analysis.standard.StandardAnalyzer");
+        StandardAnalyzer.class.getName());
   }
 
   public void setAnalyzerName(String analyzerName) {
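Using StandardAnalyzer.class.getName() keeps the default analyzer name in sync with the import added above. Note that in Lucene 3.0 StandardAnalyzer itself is constructed with a Version constant; a minimal sketch of instantiating the default analyzer (not part of the patch):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

public class DefaultAnalyzerSketch {
  public static void main(String[] args) {
    // Lucene 3.0 analyzers take a Version constant that pins tokenization behavior.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    System.out.println(analyzer.getClass().getName());
    analyzer.close();
  }
}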
@@ -19,6 +19,7 @@
  */
 package org.apache.hadoop.hbase.mapreduce;
 
+import java.io.File;
 import java.io.IOException;
 import java.util.Random;
 
@@ -32,7 +33,9 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.FSDirectory;
 
 /**
  * Create a local index, unwrap Lucene documents created by reduce, add them to
@@ -87,8 +90,8 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     }
 
     // build locally first
-    final IndexWriter writer = new IndexWriter(fs.startLocalOutput(perm, temp)
-      .toString(), analyzer, true);
+    final IndexWriter writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp)
+      .toString())), analyzer, true, MaxFieldLength.LIMITED);
 
     // no delete, so no need for maxBufferedDeleteTerms
     writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
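Lucene 3.0 dropped the IndexWriter constructors that took a filesystem path String; the writer is now opened on a Directory and requires an explicit MaxFieldLength, which is what the change above does via FSDirectory.open. A minimal standalone sketch of the new construction, where the /tmp path and analyzer stand in for the job's fs.startLocalOutput result and configured analyzer:

import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class WriterOpenSketch {
  public static void main(String[] args) throws Exception {
    File localIndexDir = new File("/tmp/hbase-index");           // placeholder path
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); // placeholder analyzer

    // Directory + MaxFieldLength replace the old (String, Analyzer, boolean) form.
    IndexWriter writer = new IndexWriter(FSDirectory.open(localIndexDir),
        analyzer, true, MaxFieldLength.LIMITED);
    writer.close();
  }
}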
@@ -98,11 +101,10 @@ extends FileOutputFormat<ImmutableBytesWritable, LuceneDocumentWrapper> {
     String similarityName = indexConf.getSimilarityName();
     if (similarityName != null) {
       try {
-        Class<?> similarityClass = Class.forName(similarityName);
-        Similarity similarity = (Similarity) similarityClass.newInstance();
+        Similarity similarity = Class.forName(similarityName).asSubclass(Similarity.class).newInstance();
         writer.setSimilarity(similarity);
       } catch (Exception e) {
-        throw new IOException("Error in creating a similarty object "
+        throw new IOException("Error in creating a similarity object "
            + similarityName);
       }
     }
@@ -68,7 +68,7 @@ implements Configurable {
     // index and store row key, row key already UTF-8 encoded
     Field keyField = new Field(indexConf.getRowkeyName(),
       Bytes.toString(key.get(), key.getOffset(), key.getLength()),
-      Field.Store.YES, Field.Index.UN_TOKENIZED);
+      Field.Store.YES, Field.Index.NOT_ANALYZED);
     keyField.setOmitNorms(true);
     doc.add(keyField);
   }
@@ -82,7 +82,7 @@ implements Configurable {
         Field.Store.YES: Field.Store.NO;
       Field.Index index = indexConf.isIndex(column)?
         (indexConf.isTokenize(column)?
-          Field.Index.TOKENIZED: Field.Index.UN_TOKENIZED):
+          Field.Index.ANALYZED: Field.Index.NOT_ANALYZED):
           Field.Index.NO;
 
       // UTF-8 encode value
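The constant renames are mechanical: Field.Index.TOKENIZED became ANALYZED and UN_TOKENIZED became NOT_ANALYZED in Lucene 3.0, with unchanged semantics. A small sketch of the same ternary decision written as a helper; the method and its arguments are illustrative, not part of the patch:

import org.apache.lucene.document.Field;

public class FieldIndexSketch {
  // Mirrors the logic above: not indexed -> NO, indexed and analyzed -> ANALYZED,
  // indexed but not analyzed -> NOT_ANALYZED.
  static Field.Index chooseIndexMode(boolean indexed, boolean analyzed) {
    if (!indexed) {
      return Field.Index.NO;
    }
    return analyzed ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED;
  }

  public static void main(String[] args) {
    System.out.println(chooseIndexMode(true, false)); // prints NOT_ANALYZED
  }
}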
@@ -45,12 +45,16 @@ import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.mapred.MiniMRCluster;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MultiSearcher;
+import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Searchable;
 import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.FSDirectory;
 
 /**
  * Test Map/Reduce job to build index over HBase table
@@ -205,13 +209,13 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
     ResultScanner scanner = null;
     try {
       if (indexDirs.length == 1) {
-        searcher = new IndexSearcher((new File(indexDirs[0].getPath().
-          toUri())).getAbsolutePath());
+        searcher = new IndexSearcher(FSDirectory.open(new File(indexDirs[0].getPath().
+          toUri())));
       } else if (indexDirs.length > 1) {
         Searchable[] searchers = new Searchable[indexDirs.length];
         for (int i = 0; i < indexDirs.length; i++) {
-          searchers[i] = new IndexSearcher((new File(indexDirs[i].getPath().
-            toUri()).getAbsolutePath()));
+          searchers[i] = new IndexSearcher(FSDirectory.open(new File(indexDirs[i].getPath().
+            toUri())));
         }
         searcher = new MultiSearcher(searchers);
       } else {
@@ -235,7 +239,9 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       for (Result r : scanner) {
         String value = Bytes.toString(r.getRow());
         Term term = new Term(rowkeyName, value);
-        int hitCount = searcher.search(new TermQuery(term)).length();
+        CountCollector collector = new CountCollector();
+        searcher.search(new TermQuery(term), collector);
+        int hitCount = collector.getCount();
         assertEquals("check row " + value, 1, hitCount);
         count++;
       }
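Lucene 3.0 removed Hits and the search(Query) overload that returned it, so the test switches to the Collector callback API; the CountCollector added below only counts matching documents. An alternative sketch using TopDocs.totalHits, which avoids a custom collector when only the count is needed; the index path and field/value are placeholders:

import java.io.File;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class HitCountSketch {
  public static void main(String[] args) throws Exception {
    IndexSearcher searcher =
        new IndexSearcher(FSDirectory.open(new File("/tmp/hbase-index"))); // placeholder path
    try {
      TermQuery query = new TermQuery(new Term("ROWKEY", "row-0001"));     // placeholder term
      TopDocs topDocs = searcher.search(query, 1);
      System.out.println("hits: " + topDocs.totalHits);
    } finally {
      searcher.close();
    }
  }
}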
@@ -250,6 +256,48 @@ public class DisabledBecauseVariableSubstTooLargeExceptionTestTableIndex extends
       scanner.close();
     }
   }
+
+  /**
+   * Collector that retrieves the count of the documents.
+   *
+   * @author Kay Kay
+   *
+   */
+  public static class CountCollector extends Collector {
+
+    private int count;
+
+    public CountCollector() {
+      count = 0;
+    }
+
+    public int getCount() {
+      return this.count;
+    }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      // Make this accept docs out of order as some collectors can be efficient that way.
+      return true;
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      ++count;
+    }
+
+    @Override
+    public void setNextReader(IndexReader reader, int docBase)
+        throws IOException {
+      // Do nothing
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      // Nothing to do with scorer.
+    }
+  }
+
   /**
    * @param args unused
    */