From c0959adbfcbcdc8363c420d3564bd34905757272 Mon Sep 17 00:00:00 2001
From: Erick Erickson
Date: Fri, 6 Jan 2012 01:25:12 +0000
Subject: [PATCH] SOLR-1931, speeding up the LukeRequestHandler

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1227924 13f79535-47bb-0310-9956-ffa450edef68
---
 .../handler/admin/LukeRequestHandler.java     | 476 +++++++++---------
 .../org/apache/solr/MinimalSchemaTest.java    |   1 -
 .../handler/admin/LukeRequestHandlerTest.java |  52 +-
 3 files changed, 255 insertions(+), 274 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java
index 0342243850e..740074a19b9 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java
@@ -20,33 +20,16 @@ package org.apache.solr.handler.admin;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.*;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
 import static org.apache.lucene.index.FieldInfo.IndexOptions.DOCS_ONLY;
 import static org.apache.lucene.index.FieldInfo.IndexOptions.DOCS_AND_FREQS;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexableField;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.search.TopDocs;
+
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@@ -72,6 +55,7 @@ import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.SolrIndexSearcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.apache.lucene.index.DocsEnum;
 
 /**
  * This handler exposes the internal lucene index. It is inspired by and
@@ -82,33 +66,45 @@ import org.slf4j.LoggerFactory;
  * that it works nicely with an XSLT transformation. Until we have a nice
  * XSLT front end for /admin, the format is still open to change.
  *
- *
+ *
  * For more documentation see:
  *  http://wiki.apache.org/solr/LukeRequestHandler
- *
+ *
  *
  * @since solr 1.2
  */
-public class LukeRequestHandler extends RequestHandlerBase
+public class LukeRequestHandler extends RequestHandlerBase
 {
   private static Logger log = LoggerFactory.getLogger(LukeRequestHandler.class);
-
+
   public static final String NUMTERMS = "numTerms";
   public static final String DOC_ID = "docId";
   public static final String ID = "id";
   public static final int DEFAULT_COUNT = 10;
 
+  static final int HIST_ARRAY_SIZE = 33;
+
   @Override
   public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
-  {
+  {
     IndexSchema schema = req.getSchema();
     SolrIndexSearcher searcher = req.getSearcher();
     IndexReader reader = searcher.getIndexReader();
     SolrParams params = req.getParams();
     int numTerms = params.getInt( NUMTERMS, DEFAULT_COUNT );
-
+
     // Always show the core lucene info
-    rsp.add("index", getIndexInfo(reader, numTerms>0 ) );
+    Map topTerms = new TreeMap();
+
+    // If no doc is given, show all fields and top terms
+    Set fields = null;
+    if( params.get( CommonParams.FL ) != null ) {
+      fields = new TreeSet(Arrays.asList(params.getParams( CommonParams.FL )));
+    }
+    if ( "schema".equals( params.get( "show" ))) {
+      numTerms = 0; // Abort any statistics gathering.
+    }
+    rsp.add("index", getIndexInfo(reader, numTerms, topTerms, fields ));
 
     Integer docId = params.getInt( DOC_ID );
     if( docId == null && params.get( ID ) != null ) {
@@ -121,7 +117,7 @@ public class LukeRequestHandler extends RequestHandlerBase
         throw new SolrException( SolrException.ErrorCode.NOT_FOUND, "Can't find document: "+params.get( ID ) );
       }
     }
-
+
     // Read the document from the index
     if( docId != null ) {
       Document doc = null;
@@ -132,9 +128,9 @@ public class LukeRequestHandler extends RequestHandlerBase
       if( doc == null ) {
         throw new SolrException( SolrException.ErrorCode.NOT_FOUND, "Can't find document: "+docId );
       }
-
+
       SimpleOrderedMap info = getDocumentFieldsInfo( doc, docId, reader, schema );
-
+
       SimpleOrderedMap docinfo = new SimpleOrderedMap();
       docinfo.add( "docId", docId );
       docinfo.add( "lucene", info );
@@ -145,26 +141,18 @@ public class LukeRequestHandler extends RequestHandlerBase
       rsp.add( "schema", getSchemaInfo( req.getSchema() ) );
     }
     else {
-      // If no doc is given, show all fields and top terms
-      Set fields = null;
-      if( params.get( CommonParams.FL ) != null ) {
-        fields = new HashSet();
-        for( String f : params.getParams( CommonParams.FL ) ) {
-          fields.add( f );
-        }
-      }
-      rsp.add( "fields", getIndexedFieldsInfo( searcher, fields, numTerms ) ) ;
+      rsp.add( "fields", getIndexedFieldsInfo( searcher, fields, numTerms, topTerms) ) ;
     }
 
     // Add some generally helpful information
     NamedList info = new SimpleOrderedMap();
     info.add( "key", getFieldFlagsKey() );
-    info.add( "NOTE", "Document Frequency (df) is not updated when a document is marked for deletion. df values include deleted documents." );
+    info.add( "NOTE", "Document Frequency (df) is not updated when a document is marked for deletion. df values include deleted documents." );
     rsp.add( "info", info );
     rsp.setHttpCaching(false);
   }
-
+
   /**
   * @return a string representing a IndexableField's flags.
@@ -184,11 +172,11 @@ public class LukeRequestHandler extends RequestHandlerBase
     flags.append( (f != null && f.fieldType().storeTermVectorPositions()) ? FieldFlag.TERM_VECTOR_POSITION.getAbbreviation() : '-' );
     flags.append( (f != null && f.fieldType().omitNorms()) ?
                  FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
-    flags.append( (f != null && DOCS_ONLY == opts ) ?
-                  FieldFlag.OMIT_TF.getAbbreviation() : '-' );
-
-    flags.append( (f != null && DOCS_AND_FREQS == opts ) ?
-                  FieldFlag.OMIT_POSITIONS.getAbbreviation() : '-' );
+    flags.append( (f != null && DOCS_ONLY == opts ) ?
+        FieldFlag.OMIT_TF.getAbbreviation() : '-' );
+
+    flags.append((f != null && DOCS_AND_FREQS == opts) ?
+        FieldFlag.OMIT_POSITIONS.getAbbreviation() : '-');
     flags.append( (f != null && f.getClass().getSimpleName().equals("LazyField")) ? FieldFlag.LAZY.getAbbreviation() : '-' );
     flags.append( (f != null && f.binaryValue()!=null) ? FieldFlag.BINARY.getAbbreviation() : '-' );
@@ -196,18 +184,18 @@ public class LukeRequestHandler extends RequestHandlerBase
     flags.append( (false) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' ); // SchemaField Specific
     return flags.toString();
   }
-
+
   /**
   * @return a string representing a SchemaField's flags.
   */
   private static String getFieldFlags( SchemaField f )
   {
     FieldType t = (f==null) ? null : f.getType();
-
+
     // see: http://www.nabble.com/schema-field-properties-tf3437753.html#a9585549
     boolean lazy = false; // "lazy" is purely a property of reading fields
     boolean binary = false; // Currently not possible
-
+
     StringBuilder flags = new StringBuilder();
     flags.append( (f != null && f.indexed()) ? FieldFlag.INDEXED.getAbbreviation() : '-' );
     flags.append( (t != null && t.isTokenized()) ? FieldFlag.TOKENIZED.getAbbreviation() : '-' );
@@ -217,8 +205,8 @@ public class LukeRequestHandler extends RequestHandlerBase
     flags.append( (f != null && f.storeTermOffsets() ) ? FieldFlag.TERM_VECTOR_OFFSET.getAbbreviation() : '-' );
     flags.append( (f != null && f.storeTermPositions() ) ? FieldFlag.TERM_VECTOR_POSITION.getAbbreviation() : '-' );
     flags.append( (f != null && f.omitNorms()) ? FieldFlag.OMIT_NORMS.getAbbreviation() : '-' );
-    flags.append( (f != null &&
-                   f.omitTermFreqAndPositions() ) ? FieldFlag.OMIT_TF.getAbbreviation() : '-' );
+    flags.append( (f != null &&
+        f.omitTermFreqAndPositions() ) ? FieldFlag.OMIT_TF.getAbbreviation() : '-' );
     flags.append( (f != null && f.omitPositions() ) ? FieldFlag.OMIT_POSITIONS.getAbbreviation() : '-' );
     flags.append( (lazy) ? FieldFlag.LAZY.getAbbreviation() : '-' );
     flags.append( (binary) ? FieldFlag.BINARY.getAbbreviation() : '-' );
@@ -226,7 +214,7 @@ public class LukeRequestHandler extends RequestHandlerBase
     flags.append( (f != null && f.sortMissingLast() ) ? FieldFlag.SORT_MISSING_LAST.getAbbreviation() : '-' );
     return flags.toString();
   }
-
+
   /**
   * @return a key to what each character means
   */
@@ -237,15 +225,15 @@ public class LukeRequestHandler extends RequestHandlerBase
     }
     return key;
   }
-
+
   private static SimpleOrderedMap getDocumentFieldsInfo( Document doc, int docId, IndexReader reader, IndexSchema schema ) throws IOException
-  {
+  {
     final CharsRef spare = new CharsRef();
     SimpleOrderedMap finfo = new SimpleOrderedMap();
     for( Object o : doc.getFields() ) {
       Field field = (Field)o;
       SimpleOrderedMap f = new SimpleOrderedMap();
-
+
       SchemaField sfield = schema.getFieldOrNull( field.name() );
       FieldType ftype = (sfield==null)?null:sfield.getType();
@@ -266,7 +254,7 @@ public class LukeRequestHandler extends RequestHandlerBase
       }
       f.add( "boost", field.boost() );
       f.add( "docFreq", t.text()==null ?
          0 : reader.docFreq( t ) ); // this can be 0 for non-indexed fields
-
+
       // If we have a term vector, return that
       if( field.fieldType().storeTermVectors() ) {
         try {
@@ -287,101 +275,118 @@ public class LukeRequestHandler extends RequestHandlerBase
           log.warn( "error writing term vector", ex );
         }
       }
-
+
       finfo.add( field.name(), f );
     }
     return finfo;
   }
-
+
   @SuppressWarnings("unchecked")
-  private static SimpleOrderedMap getIndexedFieldsInfo(
-    final SolrIndexSearcher searcher, final Set fields, final int numTerms )
-    throws Exception {
+  private static SimpleOrderedMap getIndexedFieldsInfo(
+      final SolrIndexSearcher searcher, final Set fields, final int numTerms, Map ttinfo)
+      throws Exception {
 
     IndexReader reader = searcher.getIndexReader();
     IndexSchema schema = searcher.getSchema();
-
+
     // Walk the term enum and keep a priority queue for each map in our set
-    Map ttinfo = null;
-    if( numTerms > 0 ) {
-      ttinfo = getTopTerms(reader, fields, numTerms, null );
-    }
     SimpleOrderedMap finfo = new SimpleOrderedMap();
-    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
+    Fields theFields = MultiFields.getFields(reader);
+
+    Set fieldNames = new TreeSet(reader.getFieldNames(IndexReader.FieldOption.ALL));
     for (String fieldName : fieldNames) {
-      if( fields != null && !fields.contains( fieldName ) ) {
-        continue; // if a field is specified, only them
+      if (fields != null && ! fields.contains(fieldName)) {
+        continue; // we're not interested in this field
       }
-
+
       SimpleOrderedMap f = new SimpleOrderedMap();
-
+
       SchemaField sfield = schema.getFieldOrNull( fieldName );
       FieldType ftype = (sfield==null)?null:sfield.getType();
       f.add( "type", (ftype==null)?null:ftype.getTypeName() );
       f.add( "schema", getFieldFlags( sfield ) );
       if (sfield != null && schema.isDynamicField(sfield.getName()) && schema.getDynamicPattern(sfield.getName()) != null) {
-        f.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
+        f.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
       }
+      Terms terms = theFields.terms(fieldName);
+      if (terms == null) { // Not indexed, so we need to report what we can (it made it through the fl param if specified)
+        finfo.add( fieldName, f );
+        continue;
+      }
+
+      TopTermQueue topTerms = ttinfo.get( fieldName ); // If numTerms==0, the call is just asking for a quick field list
       if( ttinfo != null && sfield != null && sfield.indexed() ) {
-        Query q = new TermRangeQuery(fieldName,null,null,false,false);
-        TopDocs top = searcher.search( q, 1 );
-        if( top.totalHits > 0 ) {
-          // Find a document with this field
-          try {
-            Document doc = searcher.doc( top.scoreDocs[0].doc );
-            IndexableField fld = doc.getField( fieldName );
-            if( fld != null ) {
-              f.add( "index", getFieldFlags( fld ) );
-            }
-            else {
-              // it is a non-stored field...
-              f.add( "index", "(unstored field)" );
+        if (numTerms > 0) { // Read the actual field from the index and report that too.
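+          // Rather than running a TermRangeQuery through the searcher to find a
+          // document containing this field (the old approach removed above), walk
+          // the postings of the field's top term and take the first live doc.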
+          Document doc = null;
+          if (topTerms != null && topTerms.getTopTermInfo() != null) {
+            Term term = topTerms.getTopTermInfo().term;
+            DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader,
+                MultiFields.getLiveDocs(reader),
+                term.field(),
+                new BytesRef(term.text()),
+                false);
+            if (docsEnum != null) {
+              int docId;
+              if ((docId = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
+                doc = reader.document(docId);
+              }
             }
           }
-          catch( Exception ex ) {
-            log.warn( "error reading field: "+fieldName );
+          if( doc != null ) {
+            // Found a document with this field
+            try {
+              IndexableField fld = doc.getField( fieldName );
+              if( fld != null ) {
+                f.add( "index", getFieldFlags( fld ) );
+              }
+              else {
+                // it is a non-stored field...
+                f.add( "index", "(unstored field)" );
+              }
+            }
+            catch( Exception ex ) {
+              log.warn( "error reading field: "+fieldName );
+            }
           }
+          f.add("docs", terms.getDocCount());
         }
-        f.add( "docs", top.totalHits );
-
-        TopTermQueue topTerms = ttinfo.get( fieldName );
         if( topTerms != null ) {
           f.add( "distinct", topTerms.distinctTerms );
-
+
           // Include top terms
           f.add( "topTerms", topTerms.toNamedList( searcher.getSchema() ) );
-
+
           // Add a histogram
           f.add( "histogram", topTerms.histogram.toNamedList() );
         }
       }
-
       // Add the field
       finfo.add( fieldName, f );
     }
     return finfo;
   }
-
+
   /**
   * Return info from the index
   */
   private static SimpleOrderedMap getSchemaInfo( IndexSchema schema ) {
-    Map<String, List<String>> typeusemap = new HashMap<String, List<String>>();
-    SimpleOrderedMap fields = new SimpleOrderedMap();
+    Map<String, List<String>> typeusemap = new TreeMap<String, List<String>>();
+    Map fields = new TreeMap();
     SchemaField uniqueField = schema.getUniqueKeyField();
     for( SchemaField f : schema.getFields().values() ) {
       populateFieldInfo(schema, typeusemap, fields, uniqueField, f);
     }
-
-    SimpleOrderedMap dynamicFields = new SimpleOrderedMap();
+
+    Map dynamicFields = new TreeMap();
     for (SchemaField f : schema.getDynamicFieldPrototypes()) {
-      populateFieldInfo(schema, typeusemap, dynamicFields, uniqueField, f);
+      populateFieldInfo(schema, typeusemap, dynamicFields, uniqueField, f);
     }
 
     SimpleOrderedMap types = new SimpleOrderedMap();
-    for( FieldType ft : schema.getFieldTypes().values() ) {
+    Map sortedTypes = new TreeMap(schema.getFieldTypes());
+    for( FieldType ft : sortedTypes.values() ) {
       SimpleOrderedMap field = new SimpleOrderedMap();
       field.add("fields", typeusemap.get( ft.getTypeName() ) );
       field.add("tokenized", ft.isTokenized() );
@@ -391,23 +396,35 @@ public class LukeRequestHandler extends RequestHandlerBase
       types.add( ft.getTypeName(), field );
     }
 
+    // Must go through this to maintain binary compatibility. Putting a TreeMap into a response leads to casting errors
     SimpleOrderedMap finfo = new SimpleOrderedMap();
-    finfo.add("fields", fields);
-    finfo.add("dynamicFields", dynamicFields);
-    finfo.add("uniqueKeyField",
-              null == uniqueField ? null : uniqueField.getName());
+
+    SimpleOrderedMap fieldsSimple = new SimpleOrderedMap();
+    for (Map.Entry ent : fields.entrySet()) {
+      fieldsSimple.add(ent.getKey(), ent.getValue());
+    }
+    finfo.add("fields", fieldsSimple);
+
+    SimpleOrderedMap dynamicSimple = new SimpleOrderedMap();
+    for (Map.Entry ent : dynamicFields.entrySet()) {
+      dynamicSimple.add(ent.getKey(), ent.getValue());
+    }
+    finfo.add("dynamicFields", dynamicSimple);
+
+    finfo.add("uniqueKeyField",
+        null == uniqueField ?
            null : uniqueField.getName());
     finfo.add("defaultSearchField", schema.getDefaultSearchFieldName());
     finfo.add("types", types);
     return finfo;
   }
-
-  private static SimpleOrderedMap getAnalyzerInfo(Analyzer analyzer) {
-    SimpleOrderedMap aninfo = new SimpleOrderedMap();
-    aninfo.add("className", analyzer.getClass().getName());
-    if (analyzer instanceof TokenizerChain) {
-      TokenizerChain tchain = (TokenizerChain)analyzer;
+  private static SimpleOrderedMap getAnalyzerInfo(Analyzer analyzer) {
+    SimpleOrderedMap aninfo = new SimpleOrderedMap();
+    aninfo.add("className", analyzer.getClass().getName());
+    if (analyzer instanceof TokenizerChain) {
+
+      TokenizerChain tchain = (TokenizerChain)analyzer;
       CharFilterFactory[] cfiltfacs = tchain.getCharFilterFactories();
       SimpleOrderedMap<Map<String, Object>> cfilters = new SimpleOrderedMap<Map<String, Object>>();
@@ -421,7 +438,7 @@ public class LukeRequestHandler extends RequestHandlerBase
       if (cfilters.size() > 0) {
         aninfo.add("charFilters", cfilters);
       }
-
+
       SimpleOrderedMap tokenizer = new SimpleOrderedMap();
       TokenizerFactory tfac = tchain.getTokenizerFactory();
       tokenizer.add("className", tfac.getClass().getName());
@@ -440,68 +457,102 @@ public class LukeRequestHandler extends RequestHandlerBase
       if (filters.size() > 0) {
         aninfo.add("filters", filters);
       }
-    }
-    return aninfo;
+    }
+    return aninfo;
   }
 
   private static void populateFieldInfo(IndexSchema schema,
-                                        Map<String, List<String>> typeusemap, SimpleOrderedMap fields,
-                                        SchemaField uniqueField, SchemaField f) {
-    FieldType ft = f.getType();
-    SimpleOrderedMap field = new SimpleOrderedMap();
-    field.add( "type", ft.getTypeName() );
-    field.add( "flags", getFieldFlags(f) );
-    if( f.isRequired() ) {
-      field.add( "required", f.isRequired() );
-    }
-    if( f.getDefaultValue() != null ) {
-      field.add( "default", f.getDefaultValue() );
-    }
-    if (f == uniqueField){
-      field.add("uniqueKey", true);
-    }
-    if (ft.getAnalyzer().getPositionIncrementGap(f.getName()) != 0) {
-      field.add("positionIncrementGap", ft.getAnalyzer().getPositionIncrementGap(f.getName()));
-    }
-    field.add("copyDests", schema.getCopyFieldsList(f.getName()));
-    field.add("copySources", schema.getCopySources(f.getName()));
+      Map<String, List<String>> typeusemap, Map fields,
+      SchemaField uniqueField, SchemaField f) {
+    FieldType ft = f.getType();
+    SimpleOrderedMap field = new SimpleOrderedMap();
+    field.add( "type", ft.getTypeName() );
+    field.add( "flags", getFieldFlags(f) );
+    if( f.isRequired() ) {
+      field.add( "required", f.isRequired() );
+    }
+    if( f.getDefaultValue() != null ) {
+      field.add( "default", f.getDefaultValue() );
+    }
+    if (f == uniqueField){
+      field.add("uniqueKey", true);
+    }
+    if (ft.getAnalyzer().getPositionIncrementGap(f.getName()) != 0) {
+      field.add("positionIncrementGap", ft.getAnalyzer().getPositionIncrementGap(f.getName()));
+    }
+    field.add("copyDests", schema.getCopyFieldsList(f.getName()));
+    field.add("copySources", schema.getCopySources(f.getName()));
 
-
-    fields.add( f.getName(), field );
-
-    List v = typeusemap.get( ft.getTypeName() );
-    if( v == null ) {
-      v = new ArrayList();
-    }
-    v.add( f.getName() );
-    typeusemap.put( ft.getTypeName(), v );
+
+    fields.put( f.getName(), field );
+
+    List v = typeusemap.get( ft.getTypeName() );
+    if( v == null ) {
+      v = new ArrayList();
+    }
+    v.add( f.getName() );
+    typeusemap.put( ft.getTypeName(), v );
   }
-
-  public static SimpleOrderedMap getIndexInfo( IndexReader reader, boolean countTerms ) throws IOException {
+  public static SimpleOrderedMap getIndexInfo(IndexReader reader, boolean countTerms) throws IOException {
+    return getIndexInfo(reader, countTerms
? 1 : 0, null, null);
+  }
+
+  public static SimpleOrderedMap getIndexInfo( IndexReader reader, int numTerms,
+      Map topTerms,
+      Set fieldList) throws IOException {
 
     Directory dir = reader.directory();
     SimpleOrderedMap indexInfo = new SimpleOrderedMap();
-
+
     indexInfo.add("numDocs", reader.numDocs());
     indexInfo.add("maxDoc", reader.maxDoc());
-
-    if( countTerms ) {
+    final CharsRef spare = new CharsRef();
+    if( numTerms > 0 ) {
       Fields fields = MultiFields.getFields(reader);
-      int numTerms = 0;
+      long totalTerms = 0;
       if (fields != null) {
         FieldsEnum fieldsEnum = fields.iterator();
-        while(fieldsEnum.next() != null) {
+        String field;
+        while ((field = fieldsEnum.next()) != null) {
           Terms terms = fieldsEnum.terms();
-          if (terms != null) {
-            TermsEnum termsEnum = terms.iterator(null);
-            while(termsEnum.next() != null) {
-              numTerms++;
+          if (terms == null) {
+            continue;
+          }
+          totalTerms += terms.getUniqueTermCount();
+
+          if (fieldList != null && !fieldList.contains(field)) {
+            continue;
+          }
+
+          TermsEnum termsEnum = terms.iterator(null);
+          BytesRef text;
+          int[] buckets = new int[HIST_ARRAY_SIZE];
+          TopTermQueue tiq = topTerms.get(field);
+          if (tiq == null) {
+            tiq = new TopTermQueue(numTerms + 1); // Allocating slots for the top N terms to collect freqs.
+            topTerms.put(field, tiq);
+          }
+          while ((text = termsEnum.next()) != null) {
+            int freq = termsEnum.docFreq();
+            // This calculation seems odd, but it gives the same results as it used to.
+            int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
+            buckets[slot] = buckets[slot] + 1;
+            if (freq > tiq.minFreq) {
+              UnicodeUtil.UTF8toUTF16(text, spare);
+              String t = spare.toString();
+              tiq.distinctTerms = new Long(fieldsEnum.terms().getUniqueTermCount()).intValue();
+
+              tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
+              if (tiq.size() > numTerms) { // if tiq full
                tiq.pop(); // remove lowest in tiq
+                tiq.minFreq = tiq.getTopTermInfo().docFreq;
+              }
            }
          }
+          tiq.histogram.add(buckets);
        }
      }
-      indexInfo.add("numTerms", numTerms );
-    }
+      // Narrow the long total back to an int so the type of the response entry is unchanged.
+      indexInfo.add("numTerms", (new Long(totalTerms)).intValue());
+    }
 
     indexInfo.add("version", reader.getVersion()); // TODO? Is this different than: IndexReader.getCurrentVersion( dir )?
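+    // Note that the counting block above makes a single pass per field: one
+    // TermsEnum walk feeds the per-field histogram, the distinct-term count,
+    // and the TopTermQueue, where the old code enumerated all terms once in
+    // getTopTerms() and again here just to count them.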
indexInfo.add("segmentCount", reader.getSequentialSubReaders().length); indexInfo.add("current", reader.isCurrent() ); @@ -510,7 +561,6 @@ public class LukeRequestHandler extends RequestHandlerBase indexInfo.add("lastModified", new Date(IndexReader.lastModified(dir)) ); return indexInfo; } - //////////////////////// SolrInfoMBeans methods ////////////////////// @Override @@ -542,51 +592,33 @@ public class LukeRequestHandler extends RequestHandlerBase } /////////////////////////////////////////////////////////////////////////////////////// - - static class TermHistogram + + static class TermHistogram { - int maxBucket = -1; - public Map hist = new HashMap(); - - public static int getPowerOfTwoBucket( int num ) - { - return Math.max(1, Integer.highestOneBit(num-1) << 1); - } - - public void add( int df ) - { - Integer bucket = getPowerOfTwoBucket( df ); - if( bucket > maxBucket ) { - maxBucket = bucket; + int _maxBucket = -1; + int _buckets[] = new int[HIST_ARRAY_SIZE]; + public void add(int[] buckets) { + for (int idx = 0; idx < buckets.length; ++idx) { + if (buckets[idx] != 0) _maxBucket = idx; } - Integer old = hist.get( bucket ); - if( old == null ) { - hist.put( bucket, 1 ); - } - else { - hist.put( bucket, old+1 ); + for (int idx = 0; idx <= _maxBucket; ++idx) { + _buckets[idx] = buckets[idx]; } } - // TODO? should this be a list or a map? public NamedList toNamedList() { NamedList nl = new NamedList(); - for( int bucket = 1; bucket <= maxBucket; bucket *= 2 ) { - Integer val = hist.get( bucket ); - if( val == null ) { - val = 0; - } - nl.add( ""+bucket, val ); + for( int bucket = 0; bucket <= _maxBucket; bucket++ ) { + nl.add( ""+ (1 << bucket), _buckets[bucket] ); } return nl; } } - /** * Private internal class that counts up frequent terms */ - private static class TopTermQueue extends PriorityQueue + private static class TopTermQueue extends PriorityQueue { static class TermInfo { TermInfo(Term t, int df) { @@ -596,23 +628,24 @@ public class LukeRequestHandler extends RequestHandlerBase int docFreq; Term term; } - + public int minFreq = 0; public int distinctTerms = 0; public TermHistogram histogram; - + + TopTermQueue(int size) { super(size); histogram = new TermHistogram(); } - + @Override protected final boolean lessThan(Object a, Object b) { TermInfo termInfoA = (TermInfo)a; TermInfo termInfoB = (TermInfo)b; return termInfoA.docFreq < termInfoB.docFreq; } - + /** * This is a destructive call... 
     * the queue is empty at the end
     */
@@ -623,7 +656,7 @@ public class LukeRequestHandler extends RequestHandlerBase
      while( size() > 0 ) {
        aslist.add( 0, (TermInfo)pop() );
      }
-
+
      NamedList list = new NamedList();
      for (TermInfo i : aslist) {
        String txt = i.term.text();
@@ -635,65 +668,8 @@ public class LukeRequestHandler extends RequestHandlerBase
      }
      return list;
    }
-  }
-
-  private static Map getTopTerms( IndexReader reader, Set fields, int numTerms, Set junkWords ) throws Exception
-  {
-    Map info = new HashMap();
-    final CharsRef spare = new CharsRef();
-    Fields fieldsC = MultiFields.getFields(reader);
-    if (fieldsC != null) {
-      FieldsEnum fieldsEnum = fieldsC.iterator();
-      String field;
-      while((field = fieldsEnum.next()) != null) {
-
-        Terms terms = fieldsEnum.terms();
-        if (terms == null) {
-          continue;
-        }
-        TermsEnum termsEnum = terms.iterator(null);
-        BytesRef text;
-        while((text = termsEnum.next()) != null) {
-          UnicodeUtil.UTF8toUTF16(text, spare);
-          String t = spare.toString();
-
-          // Compute distinct terms for every field
-          TopTermQueue tiq = info.get( field );
-          if( tiq == null ) {
-            tiq = new TopTermQueue( numTerms+1 );
-            info.put( field, tiq );
-          }
-
-          tiq.distinctTerms++;
-          tiq.histogram.add( termsEnum.docFreq() ); // add the term to the histogram
-
-          // Only save the distinct terms for fields we worry about
-          if (fields != null && fields.size() > 0) {
-            if( !fields.contains( field ) ) {
-              continue;
-            }
-          }
-          if( junkWords != null && junkWords.contains( t ) ) {
-            continue;
-          }
-
-          if( termsEnum.docFreq() > tiq.minFreq ) {
-            tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
-            if (tiq.size() > numTerms) { // if tiq full
-              tiq.pop(); // remove lowest in tiq
-              tiq.minFreq = ((TopTermQueue.TermInfo)tiq.top()).docFreq; // reset minFreq
-            }
-          }
-        }
-      }
+    public TermInfo getTopTermInfo() {
+      return (TermInfo)top();
    }
-    return info;
-  }
 }
-
-
-
-
-
-
diff --git a/solr/core/src/test/org/apache/solr/MinimalSchemaTest.java b/solr/core/src/test/org/apache/solr/MinimalSchemaTest.java
index 5b55e45fe34..052f7e93628 100644
--- a/solr/core/src/test/org/apache/solr/MinimalSchemaTest.java
+++ b/solr/core/src/test/org/apache/solr/MinimalSchemaTest.java
@@ -89,7 +89,6 @@ public class MinimalSchemaTest extends SolrTestCaseJ4 {
         req("qt", "/admin/luke", "show","schema")
         ,"//int[@name='numDocs'][.='2']"
-        ,"//int[@name='numTerms'][.='5']"
         ,"//null[@name='uniqueKeyField']"
         ,"//null[@name='defaultSearchField']"
         );
diff --git a/solr/core/src/test/org/apache/solr/handler/admin/LukeRequestHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/admin/LukeRequestHandlerTest.java
index a1274e73379..2feed50f252 100644
--- a/solr/core/src/test/org/apache/solr/handler/admin/LukeRequestHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/admin/LukeRequestHandlerTest.java
@@ -19,6 +19,7 @@ package org.apache.solr.handler.admin;
 
 import org.apache.solr.common.luke.FieldFlag;
 import org.apache.solr.util.AbstractSolrTestCase;
+import org.junit.Test;
 
 import java.util.EnumSet;
 import java.util.Arrays;
@@ -38,33 +39,31 @@ public class LukeRequestHandlerTest extends AbstractSolrTestCase {
     return "solrconfig.xml";
   }
 
-  /** tests some simple edge cases */
-  public void doTestHistogramPowerOfTwoBucket() {
-    assertHistoBucket(1, 1);
-    assertHistoBucket(2, 2);
-    assertHistoBucket(4, 3);
-    assertHistoBucket(4, 4);
-    assertHistoBucket(8, 5);
-    assertHistoBucket(8, 6);
-    assertHistoBucket(8, 7);
-    assertHistoBucket(8, 8);
-    assertHistoBucket(16, 9);
+  public void testHistogramBucket() {
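+    // Each case asserts the expected histogram slot (first argument) for a
+    // given docFreq (second argument). Slot s holds frequencies in
+    // (2^(s-1), 2^s]: df=1 -> slot 0, df=2 -> slot 1, df=3..4 -> slot 2, ...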
+    assertHistoBucket(0, 1);
+    assertHistoBucket(1, 2);
+    assertHistoBucket(2, 3);
+    assertHistoBucket(2, 4);
+    assertHistoBucket(3, 5);
+    assertHistoBucket(3, 6);
+    assertHistoBucket(3, 7);
+    assertHistoBucket(3, 8);
+    assertHistoBucket(4, 9);
 
     final int MAX_VALID = ((Integer.MAX_VALUE/2)+1)/2;
-
-    assertHistoBucket(MAX_VALID, MAX_VALID-1 );
-    assertHistoBucket(MAX_VALID, MAX_VALID );
-    assertHistoBucket(MAX_VALID*2, MAX_VALID+1 );
-
+
+    assertHistoBucket(29, MAX_VALID-1 );
+    assertHistoBucket(29, MAX_VALID );
+    assertHistoBucket(30, MAX_VALID+1 );
+
   }
 
-  private void assertHistoBucket(int expected, int in) {
-    assertEquals("histobucket: " + in, expected,
-                 LukeRequestHandler.TermHistogram.getPowerOfTwoBucket( in ));
+  private void assertHistoBucket(int slot, int in) {
+    assertEquals("histobucket: " + in, slot, 32 - Integer.numberOfLeadingZeros(Math.max(0, in - 1)));
   }
 
+  @Test
   public void testLuke() {
-    doTestHistogramPowerOfTwoBucket();
 
     assertU(adoc("id","SOLR1000",
         "name","Apache Solr",
         "solr_si", "10",
@@ -99,7 +98,7 @@ public class LukeRequestHandlerTest extends AbstractSolrTestCase {
     assertQ(req("qt","/admin/luke", "id","SOLR1000"));
 
     final int numFlags = EnumSet.allOf(FieldFlag.class).size();
-
+
     assertQ("Not all flags ("+numFlags+") mentioned in info->key",
         req("qt","/admin/luke"),
         numFlags+"=count(//lst[@name='info']/lst[@name='key']/str)");
@@ -116,7 +115,7 @@ public class LukeRequestHandlerTest extends AbstractSolrTestCase {
     }
 
-    // diff loop for checking 'index' flags,
+    // diff loop for checking 'index' flags,
     // only valid for fields that are indexed & stored
     for (String f : Arrays.asList("solr_t","solr_s","solr_ti",
         "solr_td","solr_pl","solr_dt","solr_b")) {
@@ -125,10 +124,17 @@ public class LukeRequestHandlerTest extends AbstractSolrTestCase {
       assertQ("Not as many index flags as expected ("+numFlags+") for " + f,
           req("qt","/admin/luke", "fl", f),
           numFlags+"=string-length("+xp+"[@name='index'])");
-    }
 
+      final String hxp = getFieldXPathHistogram(f);
+      assertQ("Histogram field should be present for field "+f,
+          req("qt", "/admin/luke", "fl", f),
+          hxp+"[@name='histogram']");
+    }
   }
 
+  private static String getFieldXPathHistogram(String field) {
+    return "//lst[@name='fields']/lst[@name='"+field+"']/lst";
+  }
+
   private static String getFieldXPathPrefix(String field) {
     return "//lst[@name='fields']/lst[@name='"+field+"']/str";
   }
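
The heart of this patch is a single pass over each field's TermsEnum that feeds a power-of-two histogram and a bounded priority queue of top terms at the same time, replacing the separate getTopTerms() walk and the per-field TermRangeQuery searches. The following minimal, self-contained sketch shows that accounting in isolation; it is plain Java with java.util.PriorityQueue standing in for Lucene's PriorityQueue, and the class, method, and variable names are illustrative, not part of the patch:

import java.util.PriorityQueue;

/** Minimal sketch of the patch's one-pass term accounting; illustrative only. */
public class TopTermSketch {
  static final int HIST_ARRAY_SIZE = 33;

  // Same bucket math as the patch: slot s holds docFreq values in (2^(s-1), 2^s],
  // e.g. df=1 -> slot 0, df=3..4 -> slot 2, df=17..32 -> slot 5.
  static int slot(int docFreq) {
    return 32 - Integer.numberOfLeadingZeros(Math.max(0, docFreq - 1));
  }

  public static void main(String[] args) {
    final int numTerms = 3;                        // like the numTerms request param
    int[] buckets = new int[HIST_ARRAY_SIZE];      // histogram, one slot per power of two
    PriorityQueue<Integer> top = new PriorityQueue<Integer>(); // min-heap of kept docFreqs
    int minFreq = 0;                               // smallest docFreq still worth keeping

    int[] docFreqs = {1, 2, 3, 4, 5, 17, 32, 33};  // stand-ins for termsEnum.docFreq()
    for (int df : docFreqs) {
      buckets[slot(df)]++;            // every term feeds the histogram...
      if (df > minFreq) {             // ...but only candidates enter the queue
        top.add(df);
        if (top.size() > numTerms) {  // queue full: evict the current minimum
          top.poll();
          minFreq = top.peek();       // raise the bar, as tiq.minFreq does
        }
      }
    }
    System.out.println("kept docFreqs: " + top);      // 17, 32, 33 (heap order may vary)
    System.out.println("slot of df=33: " + slot(33)); // 6, reported as bucket "64"
  }
}

Because the histogram buckets and the queue are both fed from the one docFreq already in hand, the handler no longer has to re-enumerate terms or run searches just to build statistics, which is where the speedup comes from.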