SOLR-1900: change terms component to use flex APIs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@948201 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2010-05-25 20:36:53 +00:00
parent 71b59ca566
commit 865002bcd8
10 changed files with 265 additions and 107 deletions

View File

@ -16,9 +16,10 @@ package org.apache.solr.handler.component;
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;
import org.apache.noggit.CharArr;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.*;
import org.apache.solr.common.util.NamedList;
@ -27,6 +28,7 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.StrField;
import org.apache.solr.request.SimpleFacets.CountPair;
import org.apache.solr.search.SolrIndexReader;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.client.solrj.response.TermsResponse;
@ -69,112 +71,156 @@ public class TermsComponent extends SearchComponent {
public void process(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (params.getBool(TermsParams.TERMS, false)) {
String lowerStr = params.get(TermsParams.TERMS_LOWER, null);
String[] fields = params.getParams(TermsParams.TERMS_FIELD);
if (fields != null && fields.length > 0) {
NamedList terms = new SimpleOrderedMap();
rb.rsp.add("terms", terms);
int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
if (limit < 0) {
limit = Integer.MAX_VALUE;
if (!params.getBool(TermsParams.TERMS, false)) return;
String[] fields = params.getParams(TermsParams.TERMS_FIELD);
NamedList termsResult = new SimpleOrderedMap();
rb.rsp.add("terms", termsResult);
if (fields == null || fields.length==0) return;
int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
if (limit < 0) {
limit = Integer.MAX_VALUE;
}
String lowerStr = params.get(TermsParams.TERMS_LOWER);
String upperStr = params.get(TermsParams.TERMS_UPPER);
boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(
params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
if (freqmax<0) {
freqmax = Integer.MAX_VALUE;
}
String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;
boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
SolrIndexReader sr = rb.req.getSearcher().getReader();
Fields lfields = MultiFields.getFields(sr);
for (String field : fields) {
NamedList fieldTerms = new NamedList();
termsResult.add(field, fieldTerms);
Terms terms = lfields.terms(field);
if (terms == null) {
// no terms for this field
continue;
}
FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
if (ft==null) ft = new StrField();
// prefix must currently be text
BytesRef prefixBytes = prefix==null ? null : new BytesRef(prefix);
BytesRef upperBytes = null;
if (upperStr != null) {
upperBytes = new BytesRef();
ft.readableToIndexed(upperStr, upperBytes);
}
BytesRef lowerBytes;
if (lowerStr == null) {
// If no lower bound was specified, use the prefix
lowerBytes = prefixBytes;
} else {
lowerBytes = new BytesRef();
if (raw) {
// TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
// perhaps we detect if the FieldType is non-character and expect hex if so?
lowerBytes = new BytesRef(lowerStr);
} else {
lowerBytes = new BytesRef();
ft.readableToIndexed(lowerStr, lowerBytes);
}
String upperStr = params.get(TermsParams.TERMS_UPPER);
boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(
params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1); // initialize freqmin
int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT); // initialize freqmax
if (freqmax<0) {
freqmax = Integer.MAX_VALUE;
}
String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;
}
boolean raw = params.getBool(TermsParams.TERMS_RAW, false);
for (int j = 0; j < fields.length; j++) {
String field = StringHelper.intern(fields[j]);
FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
if (ft==null) ft = new StrField();
// If no lower bound was specified, use the prefix
String lower = lowerStr==null ? prefix : (raw ? lowerStr : ft.toInternal(lowerStr));
if (lower == null) lower="";
String upper = upperStr==null ? null : (raw ? upperStr : ft.toInternal(upperStr));
Term lowerTerm = new Term(field, lower);
Term upperTerm = upper==null ? null : new Term(field, upper);
TermEnum termEnum = rb.req.getSearcher().getReader().terms(lowerTerm); //this will be positioned ready to go
int i = 0;
BoundedTreeSet<CountPair<String, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<String, Integer>>(limit) : null);
NamedList fieldTerms = new NamedList();
terms.add(field, fieldTerms);
Term lowerTestTerm = termEnum.term();
TermsEnum termsEnum = terms.iterator();
BytesRef term = null;
if (lowerBytes != null) {
if (termsEnum.seek(lowerBytes, true) == TermsEnum.SeekStatus.END) {
termsEnum = null;
} else {
term = termsEnum.term();
//Only advance the enum if we are excluding the lower bound and the lower Term actually matches
if (lowerTestTerm!=null && lowerIncl == false && lowerTestTerm.field() == field // intern'd comparison
&& lowerTestTerm.text().equals(lower)) {
termEnum.next();
}
while (i<limit || sort) {
Term theTerm = termEnum.term();
// check for a different field, or the end of the index.
if (theTerm==null || field != theTerm.field()) // intern'd comparison
break;
String indexedText = theTerm.text();
// stop if the prefix doesn't match
if (prefix != null && !indexedText.startsWith(prefix)) break;
if (pattern != null && !pattern.matcher(indexedText).matches()) {
termEnum.next();
continue;
}
if (upperTerm != null) {
int upperCmp = theTerm.compareTo(upperTerm);
// if we are past the upper term, or equal to it (when don't include upper) then stop.
if (upperCmp>0 || (upperCmp==0 && !upperIncl)) break;
}
// This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
int docFreq = termEnum.docFreq();
if (docFreq >= freqmin && docFreq <= freqmax) {
// add the term to the list
String label = raw ? indexedText : ft.indexedToReadable(indexedText);
if (sort) {
queue.add(new CountPair<String, Integer>(label, docFreq));
} else {
fieldTerms.add(label, docFreq);
i++;
}
}
termEnum.next();
}
termEnum.close();
if (sort) {
for (CountPair<String, Integer> item : queue) {
if (i < limit) {
fieldTerms.add(item.key, item.val);
i++;
} else {
break;
}
}
if (lowerIncl == false && term.equals(lowerBytes)) {
term = termsEnum.next();
}
}
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No terms.fl parameter specified");
// position termsEnum on first term
term = termsEnum.next();
}
int i = 0;
BoundedTreeSet<CountPair<String, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<String, Integer>>(limit) : null);
CharArr external = new CharArr();
while (term != null && (i<limit || sort)) {
boolean externalized = false; // did we fill in "external" yet for this term?
// stop if the prefix doesn't match
if (prefixBytes != null && !term.startsWith(prefixBytes)) break;
if (pattern != null) {
// indexed text or external text?
// TODO: support "raw" mode?
external.reset();
ft.indexedToReadable(term, external);
if (!pattern.matcher(external).matches()) {
term = termsEnum.next();
continue;
}
}
if (upperBytes != null) {
int upperCmp = term.compareTo(upperBytes);
// if we are past the upper term, or equal to it (when don't include upper) then stop.
if (upperCmp>0 || (upperCmp==0 && !upperIncl)) break;
}
// This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
int docFreq = termsEnum.docFreq();
if (docFreq >= freqmin && docFreq <= freqmax) {
// add the term to the list
// TODO: handle raw somehow
if (!externalized) {
external.reset();
ft.indexedToReadable(term, external);
}
String label = external.toString();
if (sort) {
// TODO: defer conversion to string until the end...
// using the label now is a bug since tiebreak will not be in index order
queue.add(new CountPair<String, Integer>(label, docFreq));
} else {
fieldTerms.add(label, docFreq);
i++;
}
}
term = termsEnum.next();
}
if (sort) {
for (CountPair<String, Integer> item : queue) {
if (i >= limit) break;
fieldTerms.add(item.key, item.val);
i++;
}
}
}
}

View File

@ -18,6 +18,8 @@
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.OrdFieldSource;
import org.apache.lucene.analysis.Analyzer;
@ -112,6 +114,15 @@ public class BoolField extends FieldType {
return ch=='T' ? "true" : "false";
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
if (input.length > 0 && input.bytes[input.offset] == 'T') {
out.write("true");
} else {
out.write("false");
}
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
xmlWriter.writeBool(name, f.stringValue().charAt(0) =='T');
}

View File

@ -22,6 +22,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.request.SolrQueryRequest;
@ -29,6 +31,7 @@ import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.response.XMLWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.function.*;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.DateMathParser;
import java.io.IOException;
@ -185,6 +188,12 @@ public class DateField extends FieldType {
return indexedForm + Z;
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
ByteUtils.UTF8toUTF16(input, out);
out.write(Z);
}
public String toExternal(Fieldable f) {
return indexedToReadable(f.stringValue());
}

View File

@ -28,6 +28,9 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.OrdFieldSource;
import org.apache.solr.search.Sorting;
@ -39,6 +42,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.util.ByteUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Map;
@ -347,6 +351,11 @@ public abstract class FieldType extends FieldProperties {
return indexedForm;
}
/** Given an indexed term, append the human readable representation to out */
public void indexedToReadable(BytesRef input, CharArr out) {
ByteUtils.UTF8toUTF16(input, out);
}
/** Given the stored field, return the human readable representation */
public String storedToReadable(Fieldable f) {
return toExternal(f);
@ -365,6 +374,12 @@ public abstract class FieldType extends FieldProperties {
return toInternal(val);
}
/** Given the readable value, return the term value that will match it. */
public void readableToIndexed(CharSequence val, BytesRef result) {
String internal = readableToIndexed(val.toString());
UnicodeUtil.UTF16toUTF8(internal, 0, internal.length(), result);
}
/**
* Default analyzer for types that only produce 1 verbatim token...
* A maximum size of chars to be read must be specified

View File

@ -18,12 +18,15 @@
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.FieldCacheSource;
import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.response.XMLWriter;
@ -62,6 +65,12 @@ public class SortableDoubleField extends FieldType {
return NumberUtils.SortableStr2doubleStr(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
String sval = f.stringValue();
xmlWriter.writeDouble(name, NumberUtils.SortableStr2double(sval));

View File

@ -18,12 +18,15 @@
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.FieldCacheSource;
import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.response.XMLWriter;
@ -62,6 +65,12 @@ public class SortableFloatField extends FieldType {
return NumberUtils.SortableStr2floatStr(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
}
public void write(XMLWriter xmlWriter, String name, Fieldable f) throws IOException {
String sval = f.stringValue();
xmlWriter.writeFloat(name, NumberUtils.SortableStr2float(sval));

View File

@ -18,12 +18,15 @@
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.FieldCacheSource;
import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.response.XMLWriter;
@ -60,6 +63,12 @@ public class SortableIntField extends FieldType {
return NumberUtils.SortableStr2int(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
}
@Override
public Integer toObject(Fieldable f) {
return NumberUtils.SortableStr2int(f.stringValue(), 0, 3);

View File

@ -18,12 +18,15 @@
package org.apache.solr.schema;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;
import org.apache.noggit.CharArr;
import org.apache.solr.search.function.ValueSource;
import org.apache.solr.search.function.FieldCacheSource;
import org.apache.solr.search.function.DocValues;
import org.apache.solr.search.function.StringIndexDocValues;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.util.ByteUtils;
import org.apache.solr.util.NumberUtils;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.response.XMLWriter;
@ -53,6 +56,12 @@ public class SortableLongField extends FieldType {
return NumberUtils.SortableStr2long(indexedForm);
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
// TODO: this could be more efficient, but the sortable types should be deprecated instead
out.write( indexedToReadable(ByteUtils.UTF8toUTF16(input)) );
}
public String toExternal(Fieldable f) {
return indexedToReadable(f.stringValue());
}

View File

@ -17,6 +17,7 @@
package org.apache.solr.schema;
import org.apache.noggit.CharArr;
import org.apache.solr.common.SolrException;
import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
@ -151,6 +152,12 @@ public class TrieDateField extends DateField {
return super.toExternal( new Date(NumericUtils.prefixCodedToLong(indexedForm)) );
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
String ext = super.toExternal( new Date(NumericUtils.prefixCodedToLong(input)) );
out.write(ext);
}
@Override
public String storedToIndexed(Fieldable f) {
// TODO: optimize to remove redundant string conversion

View File

@ -23,6 +23,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream;
import org.apache.noggit.CharArr;
import org.apache.solr.analysis.*;
import org.apache.solr.common.SolrException;
import org.apache.solr.response.TextResponseWriter;
@ -326,29 +327,34 @@ public class TrieField extends FieldType {
public String readableToIndexed(String val) {
// TODO: Numeric should never be handled as String, that may break in future lucene versions! Change to use BytesRef for term texts!
BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
readableToIndexed(val, bytes);
return bytes.utf8ToString();
}
@Override
public void readableToIndexed(CharSequence val, BytesRef result) {
String s = val.toString();
switch (type) {
case INTEGER:
NumericUtils.intToPrefixCoded(Integer.parseInt(val), 0, bytes);
NumericUtils.intToPrefixCoded(Integer.parseInt(s), 0, result);
break;
case FLOAT:
NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(val)), 0, bytes);
NumericUtils.intToPrefixCoded(NumericUtils.floatToSortableInt(Float.parseFloat(s)), 0, result);
break;
case LONG:
NumericUtils.longToPrefixCoded(Long.parseLong(val), 0, bytes);
NumericUtils.longToPrefixCoded(Long.parseLong(s), 0, result);
break;
case DOUBLE:
NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(val)), 0, bytes);
NumericUtils.longToPrefixCoded(NumericUtils.doubleToSortableLong(Double.parseDouble(s)), 0, result);
break;
case DATE:
NumericUtils.longToPrefixCoded(dateField.parseMath(null, val).getTime(), 0, bytes);
NumericUtils.longToPrefixCoded(dateField.parseMath(null, s).getTime(), 0, result);
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
}
return bytes.utf8ToString();
}
@Override
public String toInternal(String val) {
return readableToIndexed(val);
@ -399,6 +405,34 @@ public class TrieField extends FieldType {
}
}
@Override
public void indexedToReadable(BytesRef input, CharArr out) {
BytesRef indexedForm = input;
String s;
switch (type) {
case INTEGER:
s = Integer.toString( NumericUtils.prefixCodedToInt(indexedForm) );
break;
case FLOAT:
s = Float.toString( NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(indexedForm)) );
break;
case LONG:
s = Long.toString( NumericUtils.prefixCodedToLong(indexedForm) );
break;
case DOUBLE:
s = Double.toString( NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(indexedForm)) );
break;
case DATE:
s = dateField.formatDate( new Date(NumericUtils.prefixCodedToLong(indexedForm)) );
break;
default:
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown type for trie field: " + type);
}
out.write(s);
}
@Override
public String storedToIndexed(Fieldable f) {
// TODO: optimize to remove redundant string conversion