diff --git a/dev-tools/README.txt b/dev-tools/README.txt index 766cf2839fd..0bce3287c80 100644 --- a/dev-tools/README.txt +++ b/dev-tools/README.txt @@ -1,7 +1,11 @@ -The tools, documentation, etc. in this directory are intended to be helpful to developers of Lucene and Solr. They are not necessarily maintained in the same way that the core code is maintained and so your mileage may vary as to the usefulness of the tools. +The tools, documentation, etc. in this directory are intended to be helpful +to developers of Lucene and Solr. They are not necessarily maintained in +the same way that the core code is maintained and so your mileage may vary +as to the usefulness of the tools. -Description of directories: -./size-estimator-lucene-solr -- Spreadsheet for estimating memory and disk usage in Lucene/Solr +Description of dev-tools/ contents: + +./size-estimator-lucene-solr.xls -- Spreadsheet for estimating memory and disk usage in Lucene/Solr ./eclipse -- Used to generate project descriptors for the Eclipse IDE. ./idea -- Similar to Eclipse, but for IntelliJ's IDEA IDE. ./maven -- Mavenizes the Lucene/Solr packages diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template index a1b894f9912..58ffb12f869 100644 --- a/dev-tools/maven/pom.xml.template +++ b/dev-tools/maven/pom.xml.template @@ -391,11 +391,6 @@ jcl-over-slf4j ${slf4j.version} - - org.slf4j - log4j-over-slf4j - ${slf4j.version} - org.slf4j slf4j-api diff --git a/dev-tools/maven/solr/core/src/test/pom.xml.template b/dev-tools/maven/solr/core/src/test/pom.xml.template index 3d8b23691fa..17d77d58d30 100644 --- a/dev-tools/maven/solr/core/src/test/pom.xml.template +++ b/dev-tools/maven/solr/core/src/test/pom.xml.template @@ -57,11 +57,6 @@ easymock test - - org.slf4j - log4j-over-slf4j - test - diff --git a/dev-tools/maven/solr/solrj/src/java/pom.xml.template b/dev-tools/maven/solr/solrj/src/java/pom.xml.template index ad8ba2f71d9..2ef6014679d 100644 --- a/dev-tools/maven/solr/solrj/src/java/pom.xml.template +++ b/dev-tools/maven/solr/solrj/src/java/pom.xml.template @@ -76,12 +76,6 @@ - - - org.slf4j - log4j-over-slf4j - true - org.slf4j jcl-over-slf4j diff --git a/dev-tools/maven/solr/solrj/src/test/pom.xml.template b/dev-tools/maven/solr/solrj/src/test/pom.xml.template index 4905cdbe1db..de0a1017218 100644 --- a/dev-tools/maven/solr/solrj/src/test/pom.xml.template +++ b/dev-tools/maven/solr/solrj/src/test/pom.xml.template @@ -52,11 +52,6 @@ ${project.version} test - - org.slf4j - log4j-over-slf4j - test - diff --git a/dev-tools/maven/solr/webapp/pom.xml.template b/dev-tools/maven/solr/webapp/pom.xml.template index ce20f26b761..7d886b15bbc 100644 --- a/dev-tools/maven/solr/webapp/pom.xml.template +++ b/dev-tools/maven/solr/webapp/pom.xml.template @@ -62,10 +62,6 @@ slf4j-jdk14 runtime - - org.slf4j - log4j-over-slf4j - org.slf4j jcl-over-slf4j diff --git a/dev-tools/scripts/buildAndPushRelease.py b/dev-tools/scripts/buildAndPushRelease.py index 8a60b365342..8a4247d61e4 100644 --- a/dev-tools/scripts/buildAndPushRelease.py +++ b/dev-tools/scripts/buildAndPushRelease.py @@ -19,9 +19,9 @@ import shutil import os import sys -# Usage: python -u buildRelease.py [-sign gpgKey(eg: 6E68DA61)] [-prepare] [-push userName] [-pushLocal dirName] [-smoke tmpDir] /path/to/checkout version(eg: 3.4.0) rcNum(eg: 0) +# Usage: python3.2 -u buildAndPushRelease.py [-sign gpgKey(eg: 6E68DA61)] [-prepare] [-push userName] [-pushLocal dirName] [-smoke tmpDir] /path/to/checkout version(eg: 3.4.0) rcNum(eg: 0) # -# EG: python -u 
buildRelease.py -prepare -push -sign 6E68DA61 mikemccand /lucene/34x 3.4.0 0 +# EG: python3.2 -u buildAndPushRelease.py -prepare -push -sign 6E68DA61 mikemccand /lucene/34x 3.4.0 0 # NOTE: if you specify -sign, you have to type in your gpg password at # some point while this runs; it's VERY confusing because the output @@ -32,15 +32,15 @@ import sys LOG = '/tmp/release.log' def log(msg): - f = open(LOG, 'ab') - f.write(msg) + f = open(LOG, mode='ab') + f.write(msg.encode('utf-8')) f.close() def run(command): log('\n\n%s: RUN: %s\n' % (datetime.datetime.now(), command)) if os.system('%s >> %s 2>&1' % (command, LOG)): msg = ' FAILED: %s [see log %s]' % (command, LOG) - print msg + print(msg) raise RuntimeError(msg) def scrubCheckout(): @@ -53,7 +53,7 @@ def scrubCheckout(): if match: s = match.group(1) if os.path.exists(s): - print ' delete %s' % s + print(' delete %s' % s) if os.path.isdir(s) and not os.path.islink(s): shutil.rmtree(s) else: @@ -69,29 +69,29 @@ def getSVNRev(): def prepare(root, version, gpgKeyID, doTest): - print - print 'Prepare release...' + print() + print('Prepare release...') if os.path.exists(LOG): os.remove(LOG) os.chdir(root) - print ' svn up...' + print(' svn up...') run('svn up') rev = getSVNRev() - print ' svn rev: %s' % rev + print(' svn rev: %s' % rev) log('\nSVN rev: %s\n' % rev) if doTest: # Don't run tests if we are gonna smoke test after the release... - print ' ant clean test' + print(' ant clean test') run('ant clean test') - print ' clean checkout' + print(' clean checkout') scrubCheckout() - open('rev.txt', 'wb').write(rev) + open('rev.txt', mode='wb').write(rev.encode('UTF-8')) - print ' lucene prepare-release' + print(' lucene prepare-release') os.chdir('lucene') cmd = 'ant -Dversion=%s -Dspecversion=%s' % (version, version) if gpgKeyID is not None: @@ -100,7 +100,7 @@ def prepare(root, version, gpgKeyID, doTest): cmd += ' prepare-release-no-sign' run(cmd) - print ' solr prepare-release' + print(' solr prepare-release') os.chdir('../solr') cmd = 'ant -Dversion=%s -Dspecversion=%s' % (version, version) if gpgKeyID is not None: @@ -108,105 +108,94 @@ def prepare(root, version, gpgKeyID, doTest): else: cmd += ' prepare-release-no-sign' run(cmd) - print ' done!' - print + print(' done!') + print() return rev def push(version, root, rev, rcNum, username): - print 'Push...' + print('Push...') dir = 'lucene-solr-%s-RC%d-rev%s' % (version, rcNum, rev) - s = os.popen('ssh %s@people.apache.org "ls -ld public_html/staging_area/%s" 2>&1' % (username, dir)).read() - if s.lower().find('no such file or directory') == -1: - print ' Remove old dir...' + s = os.popen('ssh %s@people.apache.org "ls -ld public_html/staging_area/%s" 2>&1' % (username, dir)).read() + if 'no such file or directory' not in s.lower(): + print(' Remove old dir...') run('ssh %s@people.apache.org "chmod -R u+rwX public_html/staging_area/%s; rm -rf public_html/staging_area/%s"' % (username, dir, dir)) run('ssh %s@people.apache.org "mkdir -p public_html/staging_area/%s/lucene public_html/staging_area/%s/solr"' % \ (username, dir, dir)) - print ' Lucene' + print(' Lucene') os.chdir('%s/lucene/dist' % root) - print ' zip...' + print(' zip...') if os.path.exists('lucene.tar.bz2'): os.remove('lucene.tar.bz2') run('tar cjf lucene.tar.bz2 *') - print ' copy...' + print(' copy...') run('scp lucene.tar.bz2 %s@people.apache.org:public_html/staging_area/%s/lucene' % (username, dir)) - print ' unzip...' 
+ print(' unzip...') run('ssh %s@people.apache.org "cd public_html/staging_area/%s/lucene; tar xjf lucene.tar.bz2; rm -f lucene.tar.bz2"' % (username, dir)) os.remove('lucene.tar.bz2') - print ' copy changes...' - os.chdir('..') - run('scp -r build/docs/changes %s@people.apache.org:public_html/staging_area/%s/lucene/changes-%s' % (username, dir, version)) - print ' Solr' + print(' Solr') os.chdir('%s/solr/package' % root) - print ' zip...' + print(' zip...') if os.path.exists('solr.tar.bz2'): os.remove('solr.tar.bz2') run('tar cjf solr.tar.bz2 *') - print ' copy...' + print(' copy...') run('scp solr.tar.bz2 %s@people.apache.org:public_html/staging_area/%s/solr' % (username, dir)) - print ' unzip...' + print(' unzip...') run('ssh %s@people.apache.org "cd public_html/staging_area/%s/solr; tar xjf solr.tar.bz2; rm -f solr.tar.bz2"' % (username, dir)) os.remove('solr.tar.bz2') - print ' KEYS' - run('wget http://people.apache.org/keys/group/lucene.asc') - os.rename('lucene.asc', 'KEYS') - run('chmod a+r-w KEYS') - run('scp KEYS %s@people.apache.org:public_html/staging_area/%s/lucene' % (username, dir)) - run('scp KEYS %s@people.apache.org:public_html/staging_area/%s/solr' % (username, dir)) - os.remove('KEYS') - - print ' chmod...' + print(' chmod...') run('ssh %s@people.apache.org "chmod -R a+rX-w public_html/staging_area/%s"' % (username, dir)) - print ' done!' + print(' done!') url = 'https://people.apache.org/~%s/staging_area/%s' % (username, dir) return url def pushLocal(version, root, rev, rcNum, localDir): - print 'Push local [%s]...' % localDir + print('Push local [%s]...' % localDir) os.makedirs(localDir) dir = 'lucene-solr-%s-RC%d-rev%s' % (version, rcNum, rev) os.makedirs('%s/%s/lucene' % (localDir, dir)) os.makedirs('%s/%s/solr' % (localDir, dir)) - print ' Lucene' + print(' Lucene') os.chdir('%s/lucene/dist' % root) - print ' zip...' + print(' zip...') if os.path.exists('lucene.tar.bz2'): os.remove('lucene.tar.bz2') run('tar cjf lucene.tar.bz2 *') os.chdir('%s/%s/lucene' % (localDir, dir)) - print ' unzip...' + print(' unzip...') run('tar xjf "%s/lucene/dist/lucene.tar.bz2"' % root) os.remove('%s/lucene/dist/lucene.tar.bz2' % root) - print ' copy changes...' + print(' copy changes...') run('cp -r "%s/lucene/build/docs/changes" changes-%s' % (root, version)) - print ' Solr' + print(' Solr') os.chdir('%s/solr/package' % root) - print ' zip...' + print(' zip...') if os.path.exists('solr.tar.bz2'): os.remove('solr.tar.bz2') run('tar cjf solr.tar.bz2 *') - print ' unzip...' + print(' unzip...') os.chdir('%s/%s/solr' % (localDir, dir)) run('tar xjf "%s/solr/package/solr.tar.bz2"' % root) os.remove('%s/solr/package/solr.tar.bz2' % root) - print ' KEYS' + print(' KEYS') run('wget http://people.apache.org/keys/group/lucene.asc') os.rename('lucene.asc', 'KEYS') run('chmod a+r-w KEYS') run('cp KEYS ../lucene') - print ' chmod...' + print(' chmod...') os.chdir('..') run('chmod -R a+rX-w .') - print ' done!' 
+ print(' done!') return 'file://%s/%s' % (os.path.abspath(localDir), dir) def main(): @@ -231,9 +220,9 @@ def main(): smokeTmpDir = sys.argv[idx+1] del sys.argv[idx:idx+2] if os.path.exists(smokeTmpDir): - print - print 'ERROR: smoke tmpDir "%s" exists; please remove first' % smokeTmpDir - print + print() + print('ERROR: smoke tmpDir "%s" exists; please remove first' % smokeTmpDir) + print() sys.exit(1) try: @@ -245,15 +234,15 @@ def main(): localStagingDir = sys.argv[idx+1] del sys.argv[idx:idx+2] if os.path.exists(localStagingDir): - print - print 'ERROR: pushLocal dir "%s" exists; please remove first' % localStagingDir - print + print() + print('ERROR: pushLocal dir "%s" exists; please remove first' % localStagingDir) + print() sys.exit(1) if doPushRemote and doPushLocal: - print - print 'ERROR: specify at most one of -push or -pushLocal (got both)' - print + print() + print('ERROR: specify at most one of -push or -pushLocal (got both)') + print() sys.exit(1) try: @@ -263,7 +252,7 @@ def main(): else: gpgKeyID = sys.argv[idx+1] del sys.argv[idx:idx+2] - + root = os.path.abspath(sys.argv[1]) version = sys.argv[2] rcNum = int(sys.argv[3]) @@ -272,22 +261,26 @@ def main(): rev = prepare(root, version, gpgKeyID, smokeTmpDir is None) else: os.chdir(root) - rev = open('rev.txt').read() + rev = open('rev.txt', encoding='UTF-8').read() if doPushRemote: url = push(version, root, rev, rcNum, username) elif doPushLocal: url = pushLocal(version, root, rev, rcNum, localStagingDir) else: - url = NOne + url = None if url is not None: - print ' URL: %s' % url + print(' URL: %s' % url) if smokeTmpDir is not None: import smokeTestRelease smokeTestRelease.DEBUG = False smokeTestRelease.smokeTest(url, version, smokeTmpDir, gpgKeyID is not None) - + if __name__ == '__main__': - main() + try: + main() + except Exception: + import traceback + traceback.print_exc() diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 366b2482da4..2e896886b8b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -19,6 +19,21 @@ Changes in backwards compatibility policy (Nikola Tanković, Uwe Schindler, Chris Male, Mike McCandless, Robert Muir) +======================= Lucene 4.2.0 ======================= + +Changes in backwards compatibility policy + +* LUCENE-4602: FacetFields now stores facet ordinals in a DocValues field, + rather than a payload. This forces rebuilding existing indexes, or doing a + one-time migration using FacetsPayloadMigratingReader. Since DocValues + support in-memory caching, CategoryListCache was removed too. + (Shai Erera, Michael McCandless) + +Optimizations + +* LUCENE-4687: BloomFilterPostingsFormat now lazily initializes delegate + TermsEnum only if needed to do a seek or get a DocsEnum. (Simon Willnauer) + * LUCENE-4677, LUCENE-4682: unpacked FSTs now use vInt to encode the node target, to reduce their size (Mike McCandless) @@ -29,6 +44,11 @@ Changes in backwards compatibility policy * LUCENE-3298: FST can now be larger than 2.1 GB / 2.1 B nodes. (James Dyer, Mike McCandless) +New Features + +* LUCENE-4686: New specialized DGapVInt8IntEncoder for facets (now the + default). (Shai Erera) + ======================= Lucene 4.1.0 ======================= Changes in backwards compatibility policy @@ -150,7 +170,7 @@ New Features remove stop words then "ghost chr..." could suggest "The Ghost of Christmas Past"; if SynonymFilter is used to map wifi and wireless network to hotspot, then "wirele..." could suggest "wifi router"; - token normalization likes stemmers, accent removel, etc. 
would allow + token normalization like stemmers, accent removal, etc. would allow the suggester to ignore such variations. (Robert Muir, Sudarshan Gaikaiwari, Mike McCandless) @@ -202,7 +222,7 @@ API Changes information about the trigger of the merge ie. merge triggered due to a segment merge or a full flush etc. (Simon Willnauer) -* Lucene-4415: TermsFilter is now immutable. All terms need to be provided +* LUCENE-4415: TermsFilter is now immutable. All terms need to be provided as constructor argument. (Simon Willnauer) * LUCENE-4520: ValueSource.getSortField no longer throws IOExceptions @@ -227,6 +247,9 @@ API Changes * LUCENE-4663: Deprecate IndexSearcher.document(int, Set). This was not intended to be final, nor named document(). Use IndexSearcher.doc(int, Set) instead. (Robert Muir) + +* LUCENE-4684: Made DirectSpellChecker extendable. + (Martijn van Groningen) Bug Fixes @@ -343,6 +366,9 @@ Bug Fixes DEFAULT_ARTICLES list passed to ElisionFilter. (David Leunen via Steve Rowe) * LUCENE-4671: Fix CharsRef.subSequence method. (Tim Smith via Robert Muir) + +* LUCENE-4465: Let ConstantScoreQuery's Scorer return its child scorer. + (selckin via Uwe Schindler) Changes in Runtime Behavior diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java index e91dc36b417..5bd456bbc43 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java @@ -59,7 +59,7 @@ public class MappingCharFilter extends BaseCharFilter { cachedRootArcs = normMap.cachedRootArcs; if (map != null) { - fstReader = map.getBytesReader(0); + fstReader = map.getBytesReader(); } else { fstReader = null; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java index c22203a76a4..9203784101c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java @@ -49,7 +49,7 @@ public class NormalizeCharMap { try { // Pre-cache root arcs: final FST.Arc scratchArc = new FST.Arc(); - final FST.BytesReader fstReader = map.getBytesReader(0); + final FST.BytesReader fstReader = map.getBytesReader(); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java index bdb22a27f0f..909b3805e67 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java @@ -148,7 +148,7 @@ public class HyphenationCompoundWordTokenFilter extends */ public static HyphenationTree getHyphenationTree(File hyphenationFile) throws IOException { - return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm())); + return getHyphenationTree(new InputSource(hyphenationFile.toURI().toASCIIString())); } /** diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java index b988c462e7b..119794eaf05 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java @@ -111,7 +111,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer { * @throws IOException In case the parsing fails */ public void loadPatterns(File f) throws IOException { - InputSource src = new InputSource(f.toURL().toExternalForm()); + InputSource src = new InputSource(f.toURI().toASCIIString()); loadPatterns(src); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java index 448adfb582e..d658977241f 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java @@ -98,7 +98,7 @@ public class PatternParser extends DefaultHandler { * @throws IOException In case of an exception while parsing */ public void parse(File file) throws IOException { - InputSource src = new InputSource(file.toURL().toExternalForm()); + InputSource src = new InputSource(file.toURI().toASCIIString()); parse(src); } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java index 1411fdb1bb2..da0f1ea3a94 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java @@ -35,7 +35,7 @@ import java.util.Set; * <analyzer> * <tokenizer class="solr.StandardTokenizerFactory"/> * <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt" - * enablePositionIncrements="true" useWhiteList="false"/> + * enablePositionIncrements="true" useWhitelist="false"/> * </analyzer> * </fieldType> */ diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java index 2267ce2007c..bd871ea6646 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java @@ -263,7 +263,7 @@ public final class SynonymFilter extends TokenFilter { this.synonyms = synonyms; this.ignoreCase = ignoreCase; this.fst = synonyms.fst; - this.fstReader = fst.getBytesReader(0); + this.fstReader = fst.getBytesReader(); if (fst == null) { throw new IllegalArgumentException("fst must be non-null"); } diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java index b03c80effc8..de203b9c449 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java @@ -201,10 +201,10 @@ public final class 
JapaneseTokenizer extends Tokenizer { characterDefinition = unkDictionary.getCharacterDefinition(); this.userDictionary = userDictionary; costs = ConnectionCosts.getInstance(); - fstReader = fst.getBytesReader(0); + fstReader = fst.getBytesReader(); if (userDictionary != null) { userFST = userDictionary.getFST(); - userFSTReader = userFST.getBytesReader(0); + userFSTReader = userFST.getBytesReader(); } else { userFST = null; userFSTReader = null; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java index e7c20da60e8..c386910044c 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoFST.java @@ -54,7 +54,7 @@ public final class TokenInfoFST { FST.Arc firstArc = new FST.Arc(); fst.getFirstArc(firstArc); FST.Arc arc = new FST.Arc(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FST.BytesReader fstReader = fst.getBytesReader(); // TODO: jump to 3040, readNextRealArc to ceiling? (just be careful we don't add bugs) for (int i = 0; i < rootCache.length; i++) { if (fst.findTargetArc(0x3040 + i, firstArc, arc, fstReader) != null) { @@ -83,8 +83,8 @@ public final class TokenInfoFST { return fst.getFirstArc(arc); } - public FST.BytesReader getBytesReader(int pos) { - return fst.getBytesReader(pos); + public FST.BytesReader getBytesReader() { + return fst.getBytesReader(); } /** @lucene.internal for testing only */ diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index ba23f53c933..3ff5e64a01a 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -139,7 +139,7 @@ public final class UserDictionary implements Dictionary { TreeMap result = new TreeMap(); // index, [length, length...] 
boolean found = false; // true if we found any results - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FST.BytesReader fstReader = fst.getBytesReader(); FST.Arc arc = new FST.Arc(); int end = off + len; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java index 9a5f6325ebc..2ff143e6327 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java @@ -236,26 +236,22 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { @Override public TermsEnum iterator(TermsEnum reuse) throws IOException { - TermsEnum result; if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) { // recycle the existing BloomFilteredTermsEnum by asking the delegate // to recycle its contained TermsEnum BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse; if (bfte.filter == filter) { - bfte.delegateTermsEnum = delegateTerms - .iterator(bfte.delegateTermsEnum); + bfte.reset(delegateTerms, bfte.delegateTermsEnum); return bfte; } } // We have been handed something we cannot reuse (either null, wrong // class or wrong filter) so allocate a new object - result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse), - filter); - return result; + return new BloomFilteredTermsEnum(delegateTerms, reuse, filter); } @Override - public Comparator getComparator() throws IOException { + public Comparator getComparator() { return delegateTerms.getComparator(); } @@ -295,24 +291,43 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { } } - class BloomFilteredTermsEnum extends TermsEnum { + final class BloomFilteredTermsEnum extends TermsEnum { + private Terms delegateTerms; + private TermsEnum delegateTermsEnum; + private TermsEnum reuseDelegate; + private final FuzzySet filter; - TermsEnum delegateTermsEnum; - private FuzzySet filter; - - public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) { - this.delegateTermsEnum = iterator; + public BloomFilteredTermsEnum(Terms delegateTerms, TermsEnum reuseDelegate, FuzzySet filter) throws IOException { + this.delegateTerms = delegateTerms; + this.reuseDelegate = reuseDelegate; this.filter = filter; } + void reset(Terms delegateTerms, TermsEnum reuseDelegate) throws IOException { + this.delegateTerms = delegateTerms; + this.reuseDelegate = reuseDelegate; + this.delegateTermsEnum = null; + } + + private final TermsEnum delegate() throws IOException { + if (delegateTermsEnum == null) { + /* pull the iterator only if we really need it - + * this can be a relatively heavy operation depending on the + * delegate postings format and the underlying directory + * (clone IndexInput) */ + delegateTermsEnum = delegateTerms.iterator(reuseDelegate); + } + return delegateTermsEnum; + } + @Override public final BytesRef next() throws IOException { - return delegateTermsEnum.next(); + return delegate().next(); } @Override public final Comparator getComparator() { - return delegateTermsEnum.getComparator(); + return delegateTerms.getComparator(); } @Override @@ -326,51 +341,51 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { if (filter.contains(text) == ContainsResult.NO) { return false; } - return delegateTermsEnum.seekExact(text, useCache); + return delegate().seekExact(text, useCache); } @Override public final 
SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException { - return delegateTermsEnum.seekCeil(text, useCache); + return delegate().seekCeil(text, useCache); } @Override public final void seekExact(long ord) throws IOException { - delegateTermsEnum.seekExact(ord); + delegate().seekExact(ord); } @Override public final BytesRef term() throws IOException { - return delegateTermsEnum.term(); + return delegate().term(); } @Override public final long ord() throws IOException { - return delegateTermsEnum.ord(); + return delegate().ord(); } @Override public final int docFreq() throws IOException { - return delegateTermsEnum.docFreq(); + return delegate().docFreq(); } @Override public final long totalTermFreq() throws IOException { - return delegateTermsEnum.totalTermFreq(); + return delegate().totalTermFreq(); } @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { - return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags); + return delegate().docsAndPositions(liveDocs, reuse, flags); } @Override public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { - return delegateTermsEnum.docs(liveDocs, reuse, flags); + return delegate().docs(liveDocs, reuse, flags); } @@ -383,12 +398,10 @@ public final class BloomFilteringPostingsFormat extends PostingsFormat { private Map bloomFilters = new HashMap(); private SegmentWriteState state; - // private PostingsFormat delegatePostingsFormat; public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer, SegmentWriteState state, PostingsFormat delegatePostingsFormat) { this.delegateFieldsConsumer = fieldsConsumer; - // this.delegatePostingsFormat=delegatePostingsFormat; this.state = state; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java index 318421b6389..d36f5dae52a 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java @@ -70,10 +70,6 @@ import org.apache.lucene.util.packed.PackedInts; * queries that rely on advance will (AND BooleanQuery, * PhraseQuery) will be relatively slow! * - *

NOTE: this codec cannot address more than ~2.1 GB - * of postings, because the underlying FST uses an int - * to address the underlying byte[]. - * * @lucene.experimental */ // TODO: Maybe name this 'Cached' or something to reflect @@ -88,6 +84,13 @@ public final class MemoryPostingsFormat extends PostingsFormat { this(false, PackedInts.DEFAULT); } + /** + * Create MemoryPostingsFormat, specifying advanced FST options. + * @param doPackFST true if a packed FST should be built. + * NOTE: packed FSTs are limited to ~2.1 GB of postings. + * @param acceptableOverheadRatio allowable overhead for packed ints + * during FST construction. + */ public MemoryPostingsFormat(boolean doPackFST, float acceptableOverheadRatio) { super("Memory"); this.doPackFST = doPackFST; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java index 2b105de27d2..1a0cde63097 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java @@ -272,7 +272,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader { } @Override - public Comparator getComparator() throws IOException { + public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java index 5c8718e6937..32324bb0890 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/BlockTreeTermsReader.java @@ -833,7 +833,7 @@ public class BlockTreeTermsReader extends FieldsProducer { if (index == null) { fstReader = null; } else { - fstReader = index.getBytesReader(0); + fstReader = index.getBytesReader(); } // TODO: if the automaton is "smallish" we really @@ -1277,7 +1277,7 @@ public class BlockTreeTermsReader extends FieldsProducer { if (index == null) { fstReader = null; } else { - fstReader = index.getBytesReader(0); + fstReader = index.getBytesReader(); } // Init w/ root block; don't use index since it may diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41DocValuesProducer.java index 795d8701911..9318ae493c2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41DocValuesProducer.java @@ -218,7 +218,7 @@ class Lucene41DocValuesProducer extends DocValuesProducer { final FST fst = instance; // per-thread resources - final BytesReader in = fst.getBytesReader(0); + final BytesReader in = fst.getBytesReader(); final Arc firstArc = new Arc(); final Arc scratchArc = new Arc(); final IntsRef scratchInts = new IntsRef(); diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java index 0247b6c5c8d..44775e17901 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterAtomicReader.java @@ -87,7 +87,7 @@ public class FilterAtomicReader extends AtomicReader { } @Override - public Comparator getComparator() throws IOException { + 
public Comparator getComparator() { return in.getComparator(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/Terms.java b/lucene/core/src/java/org/apache/lucene/index/Terms.java index 62925c30770..45924d40d0d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/Terms.java +++ b/lucene/core/src/java/org/apache/lucene/index/Terms.java @@ -80,7 +80,7 @@ public abstract class Terms { * if there are no terms. This method may be invoked * many times; it's best to cache a single instance & * reuse it. */ - public abstract Comparator getComparator() throws IOException; + public abstract Comparator getComparator(); /** Returns the number of terms for this field, or -1 if this * measure isn't stored by the codec. Note that, just like diff --git a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java index 71add6d0b48..f6fd987b979 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/ConstantScoreQuery.java @@ -24,6 +24,8 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; +import java.util.Collection; +import java.util.Collections; import java.util.Set; /** @@ -248,6 +250,14 @@ public class ConstantScoreQuery extends Query { return super.score(collector, max, firstDocID); } } + + @Override + public Collection getChildren() { + if (docIdSetIterator instanceof Scorer) + return Collections.singletonList(new ChildScorer((Scorer) docIdSetIterator, "constant")); + else + return Collections.emptyList(); + } } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java index eb4fd38b972..6e0025475c0 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -96,7 +96,7 @@ public class FuzzyQuery extends MultiTermQuery { /** * Calls {@link #FuzzyQuery(Term, int, int, int, boolean) - * FuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions, defaultTranspositions)}. + * FuzzyQuery(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions)}. 
*/ public FuzzyQuery(Term term, int maxEdits, int prefixLength) { this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions); diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java index 3b56addbe86..745146813d7 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java @@ -39,6 +39,7 @@ import org.apache.lucene.search.DocIdSetIterator; public final class FixedBitSet extends DocIdSet implements Bits { private final long[] bits; private final int numBits; + private final int wordLength; /** returns the number of 64 bit words it would take to hold numBits */ public static int bits2words(int numBits) { @@ -52,23 +53,29 @@ public final class FixedBitSet extends DocIdSet implements Bits { public FixedBitSet(int numBits) { this.numBits = numBits; bits = new long[bits2words(numBits)]; + wordLength = bits.length; } - public FixedBitSet(long[]storedBits,int numBits) { + public FixedBitSet(long[] storedBits, int numBits) { + this.wordLength = bits2words(numBits); + if (wordLength > storedBits.length) { + throw new IllegalArgumentException("The given long array is too small to hold " + numBits + " bits"); + } this.numBits = numBits; this.bits = storedBits; } /** Makes full copy. */ public FixedBitSet(FixedBitSet other) { - bits = new long[other.bits.length]; - System.arraycopy(other.bits, 0, bits, 0, bits.length); + bits = new long[other.wordLength]; + System.arraycopy(other.bits, 0, bits, 0, other.wordLength); numBits = other.numBits; + wordLength = other.wordLength; } @Override public DocIdSetIterator iterator() { - return new OpenBitSetIterator(bits, bits.length); + return new OpenBitSetIterator(bits, wordLength); } @Override @@ -159,7 +166,7 @@ public final class FixedBitSet extends DocIdSet implements Bits { return (i<<6) + subIndex + Long.numberOfTrailingZeros(word); } - while(++i < bits.length) { + while(++i < wordLength) { word = bits[i]; if (word != 0) { return (i<<6) + Long.numberOfTrailingZeros(word); @@ -211,12 +218,12 @@ public final class FixedBitSet extends DocIdSet implements Bits { /** this = this OR other */ public void or(FixedBitSet other) { - or(other.bits, other.bits.length); + or(other.bits, other.wordLength); } private void or(final long[] otherArr, final int otherLen) { final long[] thisArr = this.bits; - int pos = Math.min(thisArr.length, otherLen); + int pos = Math.min(wordLength, otherLen); while (--pos >= 0) { thisArr[pos] |= otherArr[pos]; } @@ -247,17 +254,17 @@ public final class FixedBitSet extends DocIdSet implements Bits { /** this = this AND other */ public void and(FixedBitSet other) { - and(other.bits, other.bits.length); + and(other.bits, other.wordLength); } private void and(final long[] otherArr, final int otherLen) { final long[] thisArr = this.bits; - int pos = Math.min(thisArr.length, otherLen); + int pos = Math.min(this.wordLength, otherLen); while(--pos >= 0) { thisArr[pos] &= otherArr[pos]; } - if (thisArr.length > otherLen) { - Arrays.fill(thisArr, otherLen, thisArr.length, 0L); + if (this.wordLength > otherLen) { + Arrays.fill(thisArr, otherLen, this.wordLength, 0L); } } @@ -285,7 +292,7 @@ public final class FixedBitSet extends DocIdSet implements Bits { private void andNot(final long[] otherArr, final int otherLen) { final long[] thisArr = this.bits; - int pos = Math.min(thisArr.length, otherLen); + int pos = Math.min(this.wordLength, otherLen); while(--pos >= 0) { 
thisArr[pos] &= ~otherArr[pos]; } @@ -418,7 +425,7 @@ public final class FixedBitSet extends DocIdSet implements Bits { @Override public int hashCode() { long h = 0; - for (int i = bits.length; --i>=0;) { + for (int i = wordLength; --i>=0;) { h ^= bits[i]; h = (h << 1) | (h >>> 63); // rotate left } diff --git a/lucene/core/src/java/org/apache/lucene/util/Version.java b/lucene/core/src/java/org/apache/lucene/util/Version.java index fc8ff5a6339..09f600a1fc4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Version.java +++ b/lucene/core/src/java/org/apache/lucene/util/Version.java @@ -47,7 +47,14 @@ public enum Version { @Deprecated LUCENE_41, - /** Match settings and bugs in Lucene's 5.0 release. + /** + * Match settings and bugs in Lucene's 4.2 release. + * @deprecated (5.0) Use latest + */ + @Deprecated + LUCENE_42, + + /** Match settings and bugs in Lucene's 5.0 release. *

* Use this to get the latest & greatest settings, bug * fixes, etc, for Lucene. diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java index 7e8f7b2087f..e5c45a97f6d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java @@ -417,7 +417,7 @@ public final class FST { cachedRootArcs = (Arc[]) new Arc[0x80]; final Arc arc = new Arc(); getFirstArc(arc); - final BytesReader in = getBytesReader(0); + final BytesReader in = getBytesReader(); if (targetHasArcs(arc)) { readFirstRealTargetArc(arc.target, arc, in); while(true) { @@ -1246,22 +1246,12 @@ public final class FST { /** Returns a {@link BytesReader} for this FST, positioned at * position 0. */ public BytesReader getBytesReader() { - return getBytesReader(0); - } - - /** Returns a {@link BytesReader} for this FST, positioned at - * the provided position. */ - public BytesReader getBytesReader(long pos) { - // TODO: maybe re-use via ThreadLocal? BytesReader in; if (packed) { in = bytes.getForwardReader(); } else { in = bytes.getReverseReader(); } - if (pos != 0) { - in.setPosition(pos); - } return in; } @@ -1448,7 +1438,7 @@ public final class FST { Arc arc = new Arc(); - final BytesReader r = getBytesReader(0); + final BytesReader r = getBytesReader(); final int topN = Math.min(maxDerefNodes, inCounts.size()); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java index 13be2f1c5ba..ea0c68d8f85 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTEnum.java @@ -46,7 +46,7 @@ abstract class FSTEnum { * term before target. */ protected FSTEnum(FST fst) { this.fst = fst; - fstReader = fst.getBytesReader(0); + fstReader = fst.getBytesReader(); NO_OUTPUT = fst.outputs.getNoOutput(); fst.getFirstArc(getArc(0)); output[0] = NO_OUTPUT; @@ -145,7 +145,7 @@ abstract class FSTEnum { // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -284,7 +284,7 @@ abstract class FSTEnum { // Arcs are fixed array -- use binary search to find // the target. - final FST.BytesReader in = fst.getBytesReader(0); + final FST.BytesReader in = fst.getBytesReader(); int low = arc.arcIdx; int high = arc.numArcs-1; int mid = 0; @@ -434,7 +434,7 @@ abstract class FSTEnum { FST.Arc arc = getArc(upto-1); int targetLabel = getTargetLabel(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FST.BytesReader fstReader = fst.getBytesReader(); while(true) { //System.out.println(" cycle target=" + (targetLabel == -1 ? 
"-1" : (char) targetLabel)); diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java index 38edfbdd019..452e72f8364 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/Util.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/Util.java @@ -39,7 +39,7 @@ public final class Util { // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); - final BytesReader fstReader = fst.getBytesReader(0); + final BytesReader fstReader = fst.getBytesReader(); // Accumulate output as we go T output = fst.outputs.getNoOutput(); @@ -64,7 +64,7 @@ public final class Util { public static T get(FST fst, BytesRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE1; - final BytesReader fstReader = fst.getBytesReader(0); + final BytesReader fstReader = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup final FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -101,7 +101,7 @@ public final class Util { * fit this. */ public static IntsRef getByOutput(FST fst, long targetOutput) throws IOException { - final BytesReader in = fst.getBytesReader(0); + final BytesReader in = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup FST.Arc arc = fst.getFirstArc(new FST.Arc()); @@ -291,7 +291,7 @@ public final class Util { public TopNSearcher(FST fst, int topN, int maxQueueDepth, Comparator comparator) { this.fst = fst; - this.bytesReader = fst.getBytesReader(0); + this.bytesReader = fst.getBytesReader(); this.topN = topN; this.maxQueueDepth = maxQueueDepth; this.comparator = comparator; @@ -380,7 +380,7 @@ public final class Util { //System.out.println("search topN=" + topN); - final BytesReader fstReader = fst.getBytesReader(0); + final BytesReader fstReader = fst.getBytesReader(); final T NO_OUTPUT = fst.outputs.getNoOutput(); // TODO: we could enable FST to sorting arcs by weight @@ -603,7 +603,7 @@ public final class Util { emitDotState(out, "initial", "point", "white", ""); final T NO_OUTPUT = fst.outputs.getNoOutput(); - final BytesReader r = fst.getBytesReader(0); + final BytesReader r = fst.getBytesReader(); // final FST.Arc scratchArc = new FST.Arc(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 8ff33897718..4a56af6deee 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -83,14 +83,22 @@ import org.junit.Ignore; @SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"}) public class TestBackwardsCompatibility extends LuceneTestCase { - // Uncomment these cases & run them on an older Lucene - // version, to generate an index to test backwards - // compatibility. Then, cd to build/test/index.cfs and - // run "zip index..cfs.zip *"; cd to - // build/test/index.nocfs and run "zip - // index..nocfs.zip *". Then move those 2 zip - // files to your trunk checkout and add them to the - // oldNames array. + // Uncomment these cases & run them on an older Lucene version, + // to generate indexes to test backwards compatibility. These + // indexes will be created under directory /tmp/idx/. 
+ // + // However, you must first disable the Lucene TestSecurityManager, + // which will otherwise disallow writing outside of the build/ + // directory - to do this, comment out the "java.security.manager" + // <sysproperty> under the "test-macro" <macrodef>. + // + // Zip up the generated indexes: + // + // cd /tmp/idx/index.cfs ; zip index.<VERSION>.cfs.zip * + // cd /tmp/idx/index.nocfs ; zip index.<VERSION>.nocfs.zip * + // + // Then move those 2 zip files to your trunk checkout and add them + // to the oldNames array. /* public void testCreateCFS() throws IOException { @@ -151,6 +159,8 @@ public class TestBackwardsCompatibility extends LuceneTestCase { final static String[] oldNames = {"40.cfs", "40.nocfs", + "41.cfs", + "41.nocfs", }; final String[] unsupportedNames = {"19.cfs", @@ -545,7 +555,7 @@ public class TestBackwardsCompatibility extends LuceneTestCase { public File createIndex(String dirName, boolean doCFS, boolean fullyMerged) throws IOException { // we use a real directory name that is not cleaned up, because this method is only used to create backwards indexes: - File indexDir = new File("/tmp/4x", dirName); + File indexDir = new File("/tmp/idx", dirName); _TestUtil.rmDir(indexDir); Directory dir = newFSDirectory(indexDir); LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java index 486096cbc6b..19f17a58ebe 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriterWithThreads.java @@ -355,16 +355,20 @@ public class TestIndexWriterWithThreads extends LuceneTestCase { StackTraceElement[] trace = new Exception().getStackTrace(); boolean sawAbortOrFlushDoc = false; boolean sawClose = false; + boolean sawMerge = false; for (int i = 0; i < trace.length; i++) { if ("abort".equals(trace[i].getMethodName()) || "finishDocument".equals(trace[i].getMethodName())) { sawAbortOrFlushDoc = true; } + if ("merge".equals(trace[i].getMethodName())) { + sawMerge = true; + } if ("close".equals(trace[i].getMethodName())) { sawClose = true; } } - if (sawAbortOrFlushDoc && !sawClose) { + if (sawAbortOrFlushDoc && !sawClose && !sawMerge) { if (onlyOnce) doFail = false; //System.out.println(Thread.currentThread().getName() + ": now fail"); diff --git a/lucene/core/src/test/org/apache/lucene/index/index.41.cfs.zip b/lucene/core/src/test/org/apache/lucene/index/index.41.cfs.zip new file mode 100644 index 00000000000..4d911a27005 Binary files /dev/null and b/lucene/core/src/test/org/apache/lucene/index/index.41.cfs.zip differ diff --git a/lucene/core/src/test/org/apache/lucene/index/index.41.nocfs.zip b/lucene/core/src/test/org/apache/lucene/index/index.41.nocfs.zip new file mode 100644 index 00000000000..7c18c451b66 Binary files /dev/null and b/lucene/core/src/test/org/apache/lucene/index/index.41.nocfs.zip differ diff --git a/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java b/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java index e5b729af6a2..38cc749051c 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestNRTManager.java @@ -32,6 +32,7 @@ import org.apache.lucene.index.IndexDocument; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.NoMergePolicy; import 
org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.ThreadedIndexingAndSearchingTestCase; @@ -294,6 +295,7 @@ public class TestNRTManager extends ThreadedIndexingAndSearchingTestCase { */ public void testThreadStarvationNoDeleteNRTReader() throws IOException, InterruptedException { IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + conf.setMergePolicy(random().nextBoolean() ? NoMergePolicy.COMPOUND_FILES : NoMergePolicy.NO_COMPOUND_FILES); Directory d = newDirectory(); final CountDownLatch latch = new CountDownLatch(1); final CountDownLatch signal = new CountDownLatch(1); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java index f4350cf9544..e4925b20551 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestFixedBitSet.java @@ -265,7 +265,18 @@ public class TestFixedBitSet extends LuceneTestCase { } private FixedBitSet makeFixedBitSet(int[] a, int numBits) { - FixedBitSet bs = new FixedBitSet(numBits); + FixedBitSet bs; + if (random().nextBoolean()) { + int bits2words = FixedBitSet.bits2words(numBits); + long[] words = new long[bits2words + random().nextInt(100)]; + for (int i = bits2words; i < words.length; i++) { + words[i] = random().nextLong(); + } + bs = new FixedBitSet(words, numBits); + + } else { + bs = new FixedBitSet(numBits); + } for (int e: a) { bs.set(e); } @@ -291,6 +302,23 @@ public class TestFixedBitSet extends LuceneTestCase { checkPrevSetBitArray(new int[] {0}, 1); checkPrevSetBitArray(new int[] {0,2}, 3); } + + + private void checkNextSetBitArray(int [] a, int numBits) { + FixedBitSet obs = makeFixedBitSet(a, numBits); + BitSet bs = makeBitSet(a); + doNextSetBit(bs, obs); + } + + public void testNextBitSet() { + int[] setBits = new int[0+random().nextInt(1000)]; + for (int i = 0; i < setBits.length; i++) { + setBits[i] = random().nextInt(setBits.length); + } + checkNextSetBitArray(setBits, setBits.length + random().nextInt(10)); + + checkNextSetBitArray(new int[0], setBits.length + random().nextInt(10)); + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index b81b15b1a86..5e041b26a6a 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -1033,7 +1033,7 @@ public class TestFSTs extends LuceneTestCase { throws IOException { if (FST.targetHasArcs(arc)) { int childCount = 0; - BytesReader fstReader = fst.getBytesReader(0); + BytesReader fstReader = fst.getBytesReader(); for (arc = fst.readFirstTargetArc(arc, arc, fstReader);; arc = fst.readNextArc(arc, fstReader), childCount++) { @@ -1168,12 +1168,12 @@ public class TestFSTs extends LuceneTestCase { assertEquals(nothing, startArc.nextFinalOutput); FST.Arc arc = fst.readFirstTargetArc(startArc, new FST.Arc(), - fst.getBytesReader(0)); + fst.getBytesReader()); assertEquals('a', arc.label); assertEquals(17, arc.nextFinalOutput.longValue()); assertTrue(arc.isFinal()); - arc = fst.readNextArc(arc, fst.getBytesReader(0)); + arc = fst.readNextArc(arc, fst.getBytesReader()); assertEquals('b', arc.label); assertFalse(arc.isFinal()); assertEquals(42, arc.output.longValue()); @@ -1303,7 +1303,7 @@ public class TestFSTs extends LuceneTestCase { //Util.toDot(fst, w, false, false); 
//w.close(); - BytesReader reader = fst.getBytesReader(0); + BytesReader reader = fst.getBytesReader(); //System.out.println("testing: " + allPrefixes.size() + " prefixes"); for (String prefix : allPrefixes) { @@ -1424,7 +1424,7 @@ public class TestFSTs extends LuceneTestCase { //Util.toDot(fst, w, false, false); //w.close(); - BytesReader reader = fst.getBytesReader(0); + BytesReader reader = fst.getBytesReader(); //System.out.println("testing: " + allPrefixes.size() + " prefixes"); for (String prefix : allPrefixes) { diff --git a/lucene/facet/build.xml b/lucene/facet/build.xml index 9cdd5e995bb..7536e6c4825 100644 --- a/lucene/facet/build.xml +++ b/lucene/facet/build.xml @@ -30,19 +30,13 @@ - - - - - - - + diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java index 55d1e67a487..b5c53a11efb 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java @@ -90,8 +90,9 @@ public class TaxonomyMergeUtils { DirectoryReader reader = DirectoryReader.open(srcIndexDir, -1); List leaves = reader.leaves(); - AtomicReader wrappedLeaves[] = new AtomicReader[leaves.size()]; - for (int i = 0; i < leaves.size(); i++) { + int numReaders = leaves.size(); + AtomicReader wrappedLeaves[] = new AtomicReader[numReaders]; + for (int i = 0; i < numReaders; i++) { wrappedLeaves[i] = new OrdinalMappingAtomicReader(leaves.get(i).reader(), ordinalMap, params); } try { diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java index 96d273a10e6..5925e2327a0 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java @@ -20,7 +20,6 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; @@ -80,12 +79,12 @@ public class MultiCLIndexer { // Initialize PerDimensionIndexingParams static { Map paramsMap = new HashMap(); - paramsMap.put(new CategoryPath("0"), new CategoryListParams(new Term("$Digits", "Zero"))); - paramsMap.put(new CategoryPath("1"), new CategoryListParams(new Term("$Digits", "One"))); - paramsMap.put(new CategoryPath("2"), new CategoryListParams(new Term("$Digits", "Two"))); - paramsMap.put(new CategoryPath("3"), new CategoryListParams(new Term("$Digits", "Three"))); - paramsMap.put(new CategoryPath("4"), new CategoryListParams(new Term("$Digits", "Four"))); - paramsMap.put(new CategoryPath("5"), new CategoryListParams(new Term("$Digits", "Five"))); + paramsMap.put(new CategoryPath("0"), new CategoryListParams("$Digits$Zero")); + paramsMap.put(new CategoryPath("1"), new CategoryListParams("$Digits$One")); + paramsMap.put(new CategoryPath("2"), new CategoryListParams("$Digits$Two")); + paramsMap.put(new CategoryPath("3"), new CategoryListParams("$Digits$Three")); + paramsMap.put(new CategoryPath("4"), new CategoryListParams("$Digits$Four")); + paramsMap.put(new CategoryPath("5"), new 
CategoryListParams("$Digits$Five")); MULTI_IPARAMS = new PerDimensionIndexingParams(paramsMap); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java index 1ed3985afef..0e611176b7b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java @@ -114,7 +114,7 @@ public class AssociationsFacetFields extends FacetFields { } @Override - protected FieldType fieldType() { + protected FieldType drillDownFieldType() { return DRILL_DOWN_TYPE; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsIterator.java similarity index 75% rename from lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java rename to lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsIterator.java index ff2663302ef..b764afda507 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsIterator.java @@ -2,9 +2,8 @@ package org.apache.lucene.facet.associations; import java.io.IOException; -import org.apache.lucene.facet.search.PayloadIterator; import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; @@ -30,25 +29,23 @@ import org.apache.lucene.util.BytesRef; * * @lucene.experimental */ -public abstract class AssociationsPayloadIterator { +public abstract class AssociationsIterator { - private final PayloadIterator pi; private final T association; + private final String dvField; + private final BytesRef bytes = new BytesRef(32); + + private BinaryDocValues current; - /** - * Marking whether there are associations (at all) in the given index - */ - private boolean hasAssociations = false; - /** * Construct a new associations iterator. The given * {@link CategoryAssociation} is used to deserialize the association values. * It is assumed that all association values can be deserialized with the * given {@link CategoryAssociation}. */ - public AssociationsPayloadIterator(String field, T association) throws IOException { - pi = new PayloadIterator(new Term(field, association.getCategoryListID())); + public AssociationsIterator(String field, T association) throws IOException { this.association = association; + this.dvField = field + association.getCategoryListID(); } /** @@ -57,8 +54,8 @@ public abstract class AssociationsPayloadIterator * of the documents belonging to the association given to the constructor. */ public final boolean setNextReader(AtomicReaderContext context) throws IOException { - hasAssociations = pi.setNextReader(context); - return hasAssociations; + current = context.reader().getBinaryDocValues(dvField); + return current != null; } /** @@ -68,15 +65,11 @@ public abstract class AssociationsPayloadIterator * extending classes. 
*/ protected final boolean setNextDoc(int docID) throws IOException { - if (!hasAssociations) { // there are no associations at all - return false; + current.get(docID, bytes); + if (bytes.length == 0) { + return false; // no associations for the requested document } - BytesRef bytes = pi.getPayload(docID); - if (bytes == null) { // no associations for the requested document - return false; - } - ByteArrayDataInput in = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length); while (!in.eof()) { int ordinal = in.readInt(); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsIterator.java similarity index 82% rename from lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java rename to lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsIterator.java index 4ddb077a546..daa791e35f6 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsIterator.java @@ -22,15 +22,18 @@ import org.apache.lucene.util.collections.IntToFloatMap; */ /** - * An {@link AssociationsPayloadIterator} over integer association values. + * An {@link AssociationsIterator} over integer association values. * * @lucene.experimental */ -public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterator { +public class FloatAssociationsIterator extends AssociationsIterator { private final IntToFloatMap ordinalAssociations = new IntToFloatMap(); - - public FloatAssociationsPayloadIterator(String field, CategoryFloatAssociation association) throws IOException { + + /** + * Constructs a new {@link FloatAssociationsIterator}. + */ + public FloatAssociationsIterator(String field, CategoryFloatAssociation association) throws IOException { super(field, association); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsIterator.java similarity index 83% rename from lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java rename to lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsIterator.java index c400c6d8768..0a7b74b52bc 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsIterator.java @@ -22,15 +22,18 @@ import org.apache.lucene.util.collections.IntToIntMap; */ /** - * An {@link AssociationsPayloadIterator} over integer association values. + * An {@link AssociationsIterator} over integer association values. * * @lucene.experimental */ -public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator { +public class IntAssociationsIterator extends AssociationsIterator { private final IntToIntMap ordinalAssociations = new IntToIntMap(); - public IntAssociationsPayloadIterator(String field, CategoryIntAssociation association) throws IOException { + /** + * Constructs a new {@link IntAssociationsIterator}. 
+ */ + public IntAssociationsIterator(String field, CategoryIntAssociation association) throws IOException { super(field, association); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java index 29b87b3f7b2..9077698fdc4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java @@ -56,10 +56,9 @@ public class CountingListBuilder implements CategoryListBuilder { private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder { private final IntEncoder encoder; - private final String name; + private final String name = ""; NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) { - name = categoryListParams.getTerm().text(); encoder = categoryListParams.createEncoder(); } @@ -91,7 +90,7 @@ public class CountingListBuilder implements CategoryListBuilder { final HashMap partitionOrdinals = new HashMap(); for (int i = 0; i < ordinals.length; i++) { int ordinal = ordinals.ints[i]; - final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal); + final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, ordinal); IntsRef partitionOrds = partitionOrdinals.get(name); if (partitionOrds == null) { partitionOrds = new IntsRef(32); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java index 9fda2396ee5..db6c5951a00 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java @@ -4,17 +4,14 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StraightBytesDocValuesField; import org.apache.lucene.document.TextField; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; @@ -51,32 +48,6 @@ import org.apache.lucene.util.IntsRef; */ public class FacetFields { - // a TokenStream for writing the counting list payload - private static final class CountingListStream extends TokenStream { - private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private Iterator> categoriesData; - - CountingListStream() {} - - @Override - public boolean incrementToken() throws IOException { - if (!categoriesData.hasNext()) { - return false; - } - - Entry entry = categoriesData.next(); - termAtt.setEmpty().append(entry.getKey()); - payloadAtt.setPayload(entry.getValue()); - return true; - } - - void setCategoriesData(Map categoriesData) { - this.categoriesData = categoriesData.entrySet().iterator(); - } - - } - // The counting list is written in a payload, but we don't store it // nor need norms. 
private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType(); @@ -94,9 +65,7 @@ public class FacetFields { // Therefore we set its IndexOptions to DOCS_ONLY. private static final FieldType DRILL_DOWN_TYPE = new FieldType(TextField.TYPE_NOT_STORED); static { - // TODO: once we cutover to DocValues, we can set it to DOCS_ONLY for this - // FacetFields (not associations) - DRILL_DOWN_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + DRILL_DOWN_TYPE.setIndexOptions(IndexOptions.DOCS_ONLY); DRILL_DOWN_TYPE.freeze(); } @@ -175,10 +144,20 @@ public class FacetFields { * Returns the {@link FieldType} with which the drill-down terms should be * indexed. The default is {@link IndexOptions#DOCS_ONLY}. */ - protected FieldType fieldType() { + protected FieldType drillDownFieldType() { return DRILL_DOWN_TYPE; } + /** + * Add the counting list data to the document under the given field. Note that + * the field is determined by the {@link CategoryListParams}. + */ + protected void addCountingListData(Document doc, Map categoriesData, String field) { + for (Entry entry : categoriesData.entrySet()) { + doc.add(new StraightBytesDocValuesField(field + entry.getKey(), entry.getValue())); + } + } + /** Adds the needed facet fields to the document. */ public void addFields(Document doc, Iterable categories) throws IOException { if (categories == null) { @@ -198,7 +177,7 @@ public class FacetFields { IntsRef ordinals = new IntsRef(32); // should be enough for most common applications for (Entry> e : categoryLists.entrySet()) { final CategoryListParams clp = e.getKey(); - final String field = clp.getTerm().field(); + final String field = clp.field; // build category list data ordinals.length = 0; // reset @@ -214,13 +193,11 @@ public class FacetFields { Map categoriesData = getCategoryListData(clp, ordinals, e.getValue()); // add the counting list data - CountingListStream ts = new CountingListStream(); - ts.setCategoriesData(categoriesData); - doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE)); + addCountingListData(doc, categoriesData, field); // add the drill-down field DrillDownStream drillDownStream = getDrillDownStream(e.getValue()); - Field drillDown = new Field(field, drillDownStream, fieldType()); + Field drillDown = new Field(field, drillDownStream, drillDownFieldType()); doc.add(drillDown); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetsPayloadMigrationReader.java b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetsPayloadMigrationReader.java new file mode 100644 index 00000000000..c63476247c8 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetsPayloadMigrationReader.java @@ -0,0 +1,258 @@ +package org.apache.lucene.facet.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValues.Source; +import org.apache.lucene.index.DocValues.Type; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterAtomicReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; + +/** + * A {@link FilterAtomicReader} for migrating a facets index which encodes + * category ordinals in a payload to {@link DocValues}. To migrate the index, + * you should build a mapping from a field (String) to term ({@link Term}), + * which denotes under which DocValues field to put the data encoded in the + * matching term's payload. You can follow the code example below to migrate an + * existing index: + * + *
+ * <pre>
+ * // Add the index and migrate payload to DocValues on the go
+ * DirectoryReader reader = DirectoryReader.open(oldDir);
+ * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
+ * IndexWriter writer = new IndexWriter(newDir, conf);
+ * List<AtomicReaderContext> leaves = reader.leaves();
+ * AtomicReader wrappedLeaves[] = new AtomicReader[leaves.size()];
+ * for (int i = 0; i < leaves.size(); i++) {
+ *   wrappedLeaves[i] = new FacetsPayloadMigrationReader(leaves.get(i).reader(),
+ *       fieldTerms);
+ * }
+ * writer.addIndexes(new MultiReader(wrappedLeaves));
+ * writer.commit();
+ * </pre>
+ * <p>
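The javadoc example above becomes a complete routine once it is paired with the buildFieldTermsMap utility defined further down in this class. A minimal sketch, assuming oldDir and newDir are Directory instances, conf is a suitable IndexWriterConfig, and fip holds the FacetIndexingParams the old index was built with (all four names are placeholders, not part of this patch):

import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.apache.lucene.facet.index.FacetsPayloadMigrationReader;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;

public class MigrateFacetPayloads {

  // One-shot migration: rewrite oldDir into newDir, converting the ordinals
  // payload of each segment into BinaryDocValues fields on the fly.
  public static void migrate(Directory oldDir, Directory newDir,
      IndexWriterConfig conf, FacetIndexingParams fip) throws IOException {
    // which DocValues field receives which term's payload
    Map<String,Term> fieldTerms =
        FacetsPayloadMigrationReader.buildFieldTermsMap(oldDir, fip);
    DirectoryReader reader = DirectoryReader.open(oldDir);
    try {
      List<AtomicReaderContext> leaves = reader.leaves();
      AtomicReader[] wrappedLeaves = new AtomicReader[leaves.size()];
      for (int i = 0; i < leaves.size(); i++) {
        wrappedLeaves[i] =
            new FacetsPayloadMigrationReader(leaves.get(i).reader(), fieldTerms);
      }
      IndexWriter writer = new IndexWriter(newDir, conf);
      writer.addIndexes(new MultiReader(wrappedLeaves)); // payloads become DocValues here
      writer.close(); // commits newDir
    } finally {
      reader.close();
    }
  }
}

Closing the writer commits the rewritten index; the source index in oldDir is left untouched.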
+ * NOTE: to build the field-to-term map you can use + * {@link #buildFieldTermsMap(Directory, FacetIndexingParams)}, as long as the + * index to migrate contains the ordinals payload under + * {@link #PAYLOAD_TERM_TEXT}. + * + * @lucene.experimental + */ +public class FacetsPayloadMigrationReader extends FilterAtomicReader { + + private class PayloadMigratingDocValues extends DocValues { + + private final DocsAndPositionsEnum dpe; + + public PayloadMigratingDocValues(DocsAndPositionsEnum dpe) { + this.dpe = dpe; + } + + @Override + protected Source loadDirectSource() throws IOException { + return new PayloadMigratingSource(getType(), dpe); + } + + @Override + protected Source loadSource() throws IOException { + throw new UnsupportedOperationException("in-memory Source is not supported by this reader"); + } + + @Override + public Type getType() { + return Type.BYTES_VAR_STRAIGHT; + } + + } + + private class PayloadMigratingSource extends Source { + + private final DocsAndPositionsEnum dpe; + private int curDocID; + + protected PayloadMigratingSource(Type type, DocsAndPositionsEnum dpe) { + super(type); + this.dpe = dpe; + if (dpe == null) { + curDocID = DocIdSetIterator.NO_MORE_DOCS; + } else { + try { + curDocID = dpe.nextDoc(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public BytesRef getBytes(int docID, BytesRef ref) { + if (curDocID > docID) { + // document does not exist + ref.length = 0; + return ref; + } + + try { + if (curDocID < docID) { + curDocID = dpe.advance(docID); + if (curDocID != docID) { // requested document does not have a payload + ref.length = 0; + return ref; + } + } + + // we're on the document + dpe.nextPosition(); + ref.copyBytes(dpe.getPayload()); + return ref; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + } + + /** The {@link Term} text of the ordinals payload. */ + public static final String PAYLOAD_TERM_TEXT = "$fulltree$"; + + /** + * A utility method for building the field-to-Term map, given the + * {@link FacetIndexingParams} and the directory of the index to migrate. The + * map that will be built will correspond to partitions as well as multiple + * {@link CategoryListParams}. + *

+ * NOTE: since {@link CategoryListParams} no longer define a + * {@link Term}, this method assumes that the term used by the different + * {@link CategoryListParams} is {@link #PAYLOAD_TERM_TEXT}. If this is not + * the case, then you should build the map yourself, using the terms in your + * index. + */ + public static Map buildFieldTermsMap(Directory dir, FacetIndexingParams fip) throws IOException { + // only add field-Term mapping that will actually have DocValues in the end. + // therefore traverse the index terms and add what exists. this pertains to + // multiple CLPs, as well as partitions + DirectoryReader reader = DirectoryReader.open(dir); + final Map fieldTerms = new HashMap(); + for (AtomicReaderContext context : reader.leaves()) { + for (CategoryListParams clp : fip.getAllCategoryListParams()) { + Terms terms = context.reader().terms(clp.field); + if (terms != null) { + TermsEnum te = terms.iterator(null); + BytesRef termBytes = null; + while ((termBytes = te.next()) != null) { + String term = termBytes.utf8ToString(); + if (term.startsWith(PAYLOAD_TERM_TEXT )) { + if (term.equals(PAYLOAD_TERM_TEXT)) { + fieldTerms.put(clp.field, new Term(clp.field, term)); + } else { + fieldTerms.put(clp.field + term.substring(PAYLOAD_TERM_TEXT.length()), new Term(clp.field, term)); + } + } + } + } + } + } + reader.close(); + return fieldTerms; + } + + private final Map fieldTerms; + + /** + * Wraps an {@link AtomicReader} and migrates the payload to {@link DocValues} + * fields by using the given mapping. + */ + public FacetsPayloadMigrationReader(AtomicReader in, Map fieldTerms) { + super(in); + this.fieldTerms = fieldTerms; + } + + @Override + public DocValues docValues(String field) throws IOException { + Term term = fieldTerms.get(field); + if (term == null) { + return super.docValues(field); + } else { + DocsAndPositionsEnum dpe = null; + Fields fields = fields(); + if (fields != null) { + Terms terms = fields.terms(term.field()); + if (terms != null) { + TermsEnum te = terms.iterator(null); // no use for reusing + if (te.seekExact(term.bytes(), true)) { + // we're not expected to be called for deleted documents + dpe = te.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS); + } + } + } + // we shouldn't return null, even if the term does not exist or has no + // payloads, since we already marked the field as having DocValues. + return new PayloadMigratingDocValues(dpe); + } + } + + @Override + public FieldInfos getFieldInfos() { + FieldInfos innerInfos = super.getFieldInfos(); + ArrayList infos = new ArrayList(innerInfos.size()); + // if there are partitions, then the source index contains one field for all their terms + // while with DocValues, we simulate that by multiple fields. 
+ HashSet leftoverFields = new HashSet(fieldTerms.keySet()); + int number = -1; + for (FieldInfo info : innerInfos) { + if (fieldTerms.containsKey(info.name)) { + // mark this field as having a DocValues + infos.add(new FieldInfo(info.name, true, info.number, + info.hasVectors(), info.omitsNorms(), info.hasPayloads(), + info.getIndexOptions(), Type.BYTES_VAR_STRAIGHT, + info.getNormType(), info.attributes())); + leftoverFields.remove(info.name); + } else { + infos.add(info); + } + number = Math.max(number, info.number); + } + for (String field : leftoverFields) { + infos.add(new FieldInfo(field, false, ++number, false, false, false, + null, Type.BYTES_VAR_STRAIGHT, null, null)); + } + return new FieldInfos(infos.toArray(new FieldInfo[infos.size()])); + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java index 522f383e4d4..d892ab0c8c5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java @@ -25,13 +25,10 @@ import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.OrdinalMap; import org.apache.lucene.index.AtomicReader; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.Fields; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValues.Source; +import org.apache.lucene.index.DocValues.Type; import org.apache.lucene.index.FilterAtomicReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.encoding.IntDecoder; @@ -41,8 +38,8 @@ import org.apache.lucene.util.encoding.IntEncoder; * A {@link FilterAtomicReader} for updating facets ordinal references, * based on an ordinal map. You should use this code in conjunction with merging * taxonomies - after you merge taxonomies, you receive an {@link OrdinalMap} - * which maps the 'old' payloads to the 'new' ones. You can use that map to - * re-map the payloads which contain the facets information (ordinals) either + * which maps the 'old' ordinals to the 'new' ones. You can use that map to + * re-map the doc values which contain the facets information (ordinals) either * before or while merging the indexes. *

* For re-mapping the ordinals during index merge, do the following: @@ -70,9 +67,8 @@ import org.apache.lucene.util.encoding.IntEncoder; public class OrdinalMappingAtomicReader extends FilterAtomicReader { private final int[] ordinalMap; - // a little obtuse: but we dont need to create Term objects this way - private final Map> termMap = - new HashMap>(1); + + private final Map dvFieldMap = new HashMap(); /** * Wraps an AtomicReader, mapping ordinals according to the ordinalMap. @@ -91,125 +87,85 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader { super(in); this.ordinalMap = ordinalMap; for (CategoryListParams params: indexingParams.getAllCategoryListParams()) { - Term term = params.getTerm(); - Map fieldMap = termMap.get(term.field()); - if (fieldMap == null) { - fieldMap = new HashMap(1); - termMap.put(term.field(), fieldMap); - } - fieldMap.put(term.bytes(), params); + dvFieldMap.put(params.field, params); } } @Override - public Fields getTermVectors(int docID) throws IOException { - Fields fields = super.getTermVectors(docID); - if (fields == null) { - return null; - } else { - return new OrdinalMappingFields(fields); + public DocValues docValues(String field) throws IOException { + DocValues inner = super.docValues(field); + if (inner == null) { + return inner; } - } - - @Override - public Fields fields() throws IOException { - Fields fields = super.fields(); - if (fields == null) { - return null; - } else { - return new OrdinalMappingFields(fields); - } - } - - private class OrdinalMappingFields extends FilterFields { - - public OrdinalMappingFields(Fields in) { - super(in); - } - - @Override - public Terms terms(String field) throws IOException { - Terms terms = super.terms(field); - if (terms == null) { - return terms; - } - Map termsMap = termMap.get(field); - if (termsMap == null) { - return terms; - } else { - return new OrdinalMappingTerms(terms, termsMap); - } - } - } - - private class OrdinalMappingTerms extends FilterTerms { - private final Map termsMap; - public OrdinalMappingTerms(Terms in, Map termsMap) { - super(in); - this.termsMap = termsMap; - } - - @Override - public TermsEnum iterator(TermsEnum reuse) throws IOException { - // TODO: should we reuse the inner termsenum? 
- return new OrdinalMappingTermsEnum(super.iterator(reuse), termsMap); + CategoryListParams clp = dvFieldMap.get(field); + if (clp == null) { + return inner; + } else { + return new OrdinalMappingDocValues(inner, clp); } } - private class OrdinalMappingTermsEnum extends FilterTermsEnum { - private final Map termsMap; + private class OrdinalMappingDocValues extends DocValues { + + private final CategoryListParams clp; + private final DocValues delegate; - public OrdinalMappingTermsEnum(TermsEnum in, Map termsMap) { - super(in); - this.termsMap = termsMap; + public OrdinalMappingDocValues(DocValues delegate, CategoryListParams clp) { + this.delegate = delegate; + this.clp = clp; } @Override - public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { - // TODO: we could reuse our D&P enum if we need - DocsAndPositionsEnum inner = super.docsAndPositions(liveDocs, reuse, flags); - if (inner == null) { - return inner; - } - - CategoryListParams params = termsMap.get(term()); - if (params == null) { - return inner; - } - - return new OrdinalMappingDocsAndPositionsEnum(inner, params); + protected Source loadSource() throws IOException { + return new OrdinalMappingSource(getType(), clp, delegate.getSource()); } + + @Override + protected Source loadDirectSource() throws IOException { + return new OrdinalMappingSource(getType(), clp, delegate.getDirectSource()); + } + + @Override + public Type getType() { + return Type.BYTES_VAR_STRAIGHT; + } + } - private class OrdinalMappingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum { + private class OrdinalMappingSource extends Source { + private final IntEncoder encoder; private final IntDecoder decoder; private final IntsRef ordinals = new IntsRef(32); - private final BytesRef payloadOut = new BytesRef(); - - public OrdinalMappingDocsAndPositionsEnum(DocsAndPositionsEnum in, CategoryListParams params) { - super(in); - encoder = params.createEncoder(); + private final Source delegate; + + protected OrdinalMappingSource(Type type, CategoryListParams clp, Source delegate) { + super(type); + this.delegate = delegate; + encoder = clp.createEncoder(); decoder = encoder.createMatchingDecoder(); } - + + @SuppressWarnings("synthetic-access") @Override - public BytesRef getPayload() throws IOException { - BytesRef payload = super.getPayload(); - if (payload == null) { - return payload; + public BytesRef getBytes(int docID, BytesRef ref) { + ref = delegate.getBytes(docID, ref); + if (ref == null || ref.length == 0) { + return ref; } else { - decoder.decode(payload, ordinals); + decoder.decode(ref, ordinals); // map the ordinals for (int i = 0; i < ordinals.length; i++) { ordinals.ints[i] = ordinalMap[ordinals.ints[i]]; } - encoder.encode(ordinals, payloadOut); - return payloadOut; + encoder.encode(ordinals, ref); + return ref; } } + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java index 847da005d73..1db4efbe3f6 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java @@ -4,15 +4,13 @@ import java.io.IOException; import java.io.Serializable; import org.apache.lucene.facet.search.CategoryListIterator; -import org.apache.lucene.facet.search.PayloadCategoryListIteraor; +import org.apache.lucene.facet.search.DocValuesCategoryListIterator; 
import org.apache.lucene.facet.util.PartitionsUtils; -import org.apache.lucene.index.Term; -import org.apache.lucene.util.encoding.DGapIntEncoder; +import org.apache.lucene.util.encoding.DGapVInt8IntEncoder; import org.apache.lucene.util.encoding.IntDecoder; import org.apache.lucene.util.encoding.IntEncoder; import org.apache.lucene.util.encoding.SortingIntEncoder; import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; -import org.apache.lucene.util.encoding.VInt8IntEncoder; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -38,39 +36,26 @@ import org.apache.lucene.util.encoding.VInt8IntEncoder; */ public class CategoryListParams implements Serializable { - /** The default term used to store the facets information. */ - public static final Term DEFAULT_TERM = new Term("$facets", "$fulltree$"); + /** The default field used to store the facets information. */ + public static final String DEFAULT_FIELD = "$facets"; - private final Term term; + public final String field; private final int hashCode; - /** - * Constructs a default category list parameters object, using - * {@link #DEFAULT_TERM}. - */ + /** Constructs a default category list parameters object, using {@link #DEFAULT_FIELD}. */ public CategoryListParams() { - this(DEFAULT_TERM); + this(DEFAULT_FIELD); } - /** - * Constructs a category list parameters object, using the given {@link Term}. - * @param term who's payload hold the category-list. - */ - public CategoryListParams(Term term) { - this.term = term; + /** Constructs a category list parameters object, using the given field. */ + public CategoryListParams(String field) { + this.field = field; // Pre-compute the hashCode because these objects are immutable. Saves // some time on the comparisons later. - this.hashCode = term.hashCode(); + this.hashCode = field.hashCode(); } - /** - * A {@link Term} who's payload holds the category-list. - */ - public final Term getTerm() { - return term; - } - /** * Allows to override how categories are encoded and decoded. A matching * {@link IntDecoder} is provided by the {@link IntEncoder}. @@ -92,7 +77,7 @@ public class CategoryListParams implements Serializable { * counting facets. */ public IntEncoder createEncoder() { - return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); + return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapVInt8IntEncoder())); } @Override @@ -110,7 +95,7 @@ public class CategoryListParams implements Serializable { // The above hashcodes might equal each other in the case of a collision, // so at this point only directly term equality testing will settle // the equality test. - return this.term.equals(other.term); + return field.equals(other.field); } @Override @@ -120,9 +105,9 @@ public class CategoryListParams implements Serializable { /** Create the {@link CategoryListIterator} for the specified partition. 
*/ public CategoryListIterator createCategoryListIterator(int partition) throws IOException { - String categoryListTermStr = PartitionsUtils.partitionName(this, partition); - Term payloadTerm = new Term(term.field(), categoryListTermStr); - return new PayloadCategoryListIteraor(payloadTerm, createEncoder().createMatchingDecoder()); + String categoryListTermStr = PartitionsUtils.partitionName(partition); + String docValuesField = field + categoryListTermStr; + return new DocValuesCategoryListIterator(docValuesField, createEncoder().createMatchingDecoder()); } } \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java b/lucene/facet/src/java/org/apache/lucene/facet/search/DocValuesCategoryListIterator.java similarity index 60% rename from lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java rename to lucene/facet/src/java/org/apache/lucene/facet/search/DocValuesCategoryListIterator.java index b8639c3cc0d..f0d86d02880 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/DocValuesCategoryListIterator.java @@ -3,7 +3,7 @@ package org.apache.lucene.facet.search; import java.io.IOException; import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.Term; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.encoding.IntDecoder; @@ -25,57 +25,62 @@ import org.apache.lucene.util.encoding.IntDecoder; * limitations under the License. */ -/** - * A {@link CategoryListIterator} which reads the category ordinals from a - * payload. - * - * @lucene.experimental - */ -public class PayloadCategoryListIteraor implements CategoryListIterator { - - private final IntDecoder decoder; - private final Term term; - private final PayloadIterator pi; - private final int hashCode; +/** A {@link CategoryListIterator} which reads the ordinals from a {@link BinaryDocValues}. */ +public class DocValuesCategoryListIterator implements CategoryListIterator { - public PayloadCategoryListIteraor(Term term, IntDecoder decoder) throws IOException { - pi = new PayloadIterator(term); + private final IntDecoder decoder; + private final String field; + private final int hashCode; + private final BytesRef bytes = new BytesRef(32); + + private BinaryDocValues current; + + /** + * Constructs a new {@link DocValuesCategoryListIterator}. + */ + public DocValuesCategoryListIterator(String field, IntDecoder decoder) { + this.field = field; this.decoder = decoder; - hashCode = term.hashCode(); - this.term = term; + this.hashCode = field.hashCode(); } - - @Override - public boolean equals(Object other) { - if (!(other instanceof PayloadCategoryListIteraor)) { - return false; - } - PayloadCategoryListIteraor that = (PayloadCategoryListIteraor) other; - if (hashCode != that.hashCode) { - return false; - } - - // Hash codes are the same, check equals() to avoid cases of hash-collisions. - return term.equals(that.term); - } - + @Override public int hashCode() { return hashCode; } - + + @Override + public boolean equals(Object o) { + if (!(o instanceof DocValuesCategoryListIterator)) { + return false; + } + DocValuesCategoryListIterator other = (DocValuesCategoryListIterator) o; + if (hashCode != other.hashCode) { + return false; + } + + // Hash codes are the same, check equals() to avoid cases of hash-collisions. 
+ return field.equals(other.field); + } + @Override public boolean setNextReader(AtomicReaderContext context) throws IOException { - return pi.setNextReader(context); + current = context.reader().getBinaryDocValues(field); + return current != null; } @Override public void getOrdinals(int docID, IntsRef ints) throws IOException { + current.get(docID, bytes); ints.length = 0; - BytesRef payload = pi.getPayload(docID); - if (payload != null) { - decoder.decode(payload, ints); + if (bytes.length > 0) { + decoder.decode(bytes, ints); } } - + + @Override + public String toString() { + return field; + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/DrillDown.java b/lucene/facet/src/java/org/apache/lucene/facet/search/DrillDown.java index b6cdf738b47..38dc02a94d7 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/DrillDown.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/DrillDown.java @@ -55,7 +55,7 @@ public final class DrillDown { CategoryListParams clp = iParams.getCategoryListParams(path); char[] buffer = new char[path.fullPathLength()]; iParams.drillDownTermText(path, buffer); - return new Term(clp.getTerm().field(), String.valueOf(buffer)); + return new Term(clp.field, String.valueOf(buffer)); } /** diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java deleted file mode 100644 index 7d956ac608e..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java +++ /dev/null @@ -1,114 +0,0 @@ -package org.apache.lucene.facet.search; - -import java.io.IOException; - -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DocsAndPositionsEnum; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * A utility class for iterating through a posting list of a given term and - * retrieving the payload of the first position in every document. For - * efficiency, this class does not check if documents passed to - * {@link #getPayload(int)} are deleted, since it is usually used to iterate on - * payloads of documents that matched a query. If you need to skip over deleted - * documents, you should do so before calling {@link #getPayload(int)}. 
- * - * @lucene.experimental - */ -public class PayloadIterator { - - private TermsEnum reuseTE; - private DocsAndPositionsEnum dpe; - private boolean hasMore; - private int curDocID; - - private final Term term; - - public PayloadIterator(Term term) throws IOException { - this.term = term; - } - - /** - * Sets the {@link AtomicReaderContext} for which {@link #getPayload(int)} - * calls will be made. Returns true iff this reader has payload for any of the - * documents belonging to the {@link Term} given to the constructor. - */ - public boolean setNextReader(AtomicReaderContext context) throws IOException { - hasMore = false; - Fields fields = context.reader().fields(); - if (fields != null) { - Terms terms = fields.terms(term.field()); - if (terms != null) { - reuseTE = terms.iterator(reuseTE); - if (reuseTE.seekExact(term.bytes(), true)) { - // this class is usually used to iterate on whatever a Query matched - // if it didn't match deleted documents, we won't receive them. if it - // did, we should iterate on them too, therefore we pass liveDocs=null - dpe = reuseTE.docsAndPositions(null, dpe, DocsAndPositionsEnum.FLAG_PAYLOADS); - if (dpe != null && (curDocID = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - hasMore = true; - } - } - } - } - return hasMore; - } - - /** - * Returns the {@link BytesRef payload} of the given document, or {@code null} - * if the document does not exist, there are no more documents in the posting - * list, or the document exists but has not payload. The given document IDs - * are treated as local to the reader given to - * {@link #setNextReader(AtomicReaderContext)}. - */ - public BytesRef getPayload(int docID) throws IOException { - if (!hasMore) { - return null; - } - - if (curDocID > docID) { - // document does not exist - return null; - } - - if (curDocID < docID) { - curDocID = dpe.advance(docID); - if (curDocID != docID) { // requested document does not have a payload - if (curDocID == DocIdSetIterator.NO_MORE_DOCS) { // no more docs in this reader - hasMore = false; - } - return null; - } - } - - // we're on the document - assert dpe.freq() == 1 : "expecting freq=1 (got " + dpe.freq() + ") term=" + term + " doc=" + curDocID; - int pos = dpe.nextPosition(); - assert pos != -1 : "no positions for term=" + term + " doc=" + curDocID; - return dpe.getPayload(); - } - -} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java index 7373e4dba34..15e195ffd88 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java @@ -10,6 +10,7 @@ import java.util.Map.Entry; import java.util.logging.Level; import java.util.logging.Logger; +import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.params.FacetRequest; import org.apache.lucene.facet.search.params.FacetSearchParams; @@ -110,9 +111,8 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { if (isUsingComplements) { try { - totalFacetCounts = TotalFacetCountsCache.getSingleton() - .getTotalCounts(indexReader, taxonomyReader, - searchParams.getFacetIndexingParams(), searchParams.getCategoryListCache()); + totalFacetCounts = TotalFacetCountsCache.getSingleton().getTotalCounts(indexReader, taxonomyReader, + 
searchParams.getFacetIndexingParams()); if (totalFacetCounts != null) { docids = ScoredDocIdsUtils.getComplementSet(docids, indexReader); } else { @@ -242,20 +242,29 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { int maxDoc = -1; while (iterator.next()) { int docID = iterator.getDocID(); - while (docID >= maxDoc) { // find the segment which contains this document - if (!contexts.hasNext()) { - throw new RuntimeException("ScoredDocIDs contains documents outside this reader's segments !?"); - } - current = contexts.next(); - maxDoc = current.docBase + current.reader().maxDoc(); - if (docID < maxDoc) { // segment has docs, check if it has categories - boolean validSegment = categoryListIter.setNextReader(current); - validSegment &= aggregator.setNextReader(current); - if (!validSegment) { // if categoryList or aggregtor say it's an invalid segment, skip all docs - while (docID < maxDoc && iterator.next()) { - docID = iterator.getDocID(); + if (docID >= maxDoc) { + boolean iteratorDone = false; + do { // find the segment which contains this document + if (!contexts.hasNext()) { + throw new RuntimeException("ScoredDocIDs contains documents outside this reader's segments !?"); + } + current = contexts.next(); + maxDoc = current.docBase + current.reader().maxDoc(); + if (docID < maxDoc) { // segment has docs, check if it has categories + boolean validSegment = categoryListIter.setNextReader(current); + validSegment &= aggregator.setNextReader(current); + if (!validSegment) { // if categoryList or aggregator say it's an invalid segment, skip all docs + while (docID < maxDoc && iterator.next()) { + docID = iterator.getDocID(); + } + if (docID < maxDoc) { + iteratorDone = true; + } } } + } while (docID >= maxDoc); + if (iteratorDone) { // iterator finished, terminate the loop + break; } } docID -= current.docBase; @@ -312,19 +321,17 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { HashMap categoryLists = new HashMap(); + FacetIndexingParams indexingParams = searchParams.getFacetIndexingParams(); for (FacetRequest facetRequest : searchParams.getFacetRequests()) { - Aggregator categoryAggregator = facetRequest.createAggregator( - isUsingComplements, facetArrays, taxonomyReader); + Aggregator categoryAggregator = facetRequest.createAggregator(isUsingComplements, facetArrays, taxonomyReader); - CategoryListIterator cli = facetRequest.createCategoryListIterator(taxonomyReader, searchParams, partition); + CategoryListIterator cli = indexingParams.getCategoryListParams(facetRequest.categoryPath).createCategoryListIterator(partition); // get the aggregator Aggregator old = categoryLists.put(cli, categoryAggregator); if (old != null && !old.equals(categoryAggregator)) { - // TODO (Facet): create a more meaningful RE class, and throw it. - throw new RuntimeException( - "Overriding existing category list with different aggregator.
THAT'S A NO NO!"); + throw new RuntimeException("Overriding existing category list with different aggregator"); } // if the aggregator is the same we're covered } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java b/lucene/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java index 20feeb03c45..b8689517553 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java @@ -55,7 +55,7 @@ public class TopKFacetResultsHandler extends FacetResultsHandler { public IntermediateFacetResult fetchPartitionResult(FacetArrays facetArrays, int offset) throws IOException { TopKFacetResult res = null; - int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath()); + int ordinal = taxonomyReader.getOrdinal(facetRequest.categoryPath); if (ordinal != TaxonomyReader.INVALID_ORDINAL) { double value = 0; if (isSelfPartition(ordinal, facetArrays, offset)) { @@ -79,7 +79,7 @@ public class TopKFacetResultsHandler extends FacetResultsHandler { @Override public IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults) throws IOException { - int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath()); + int ordinal = taxonomyReader.getOrdinal(facetRequest.categoryPath); MutableFacetResultNode resNode = new MutableFacetResultNode(ordinal, 0); int totalFacets = 0; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java b/lucene/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java index dff7feec142..7aee99d0220 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java @@ -34,35 +34,37 @@ import org.apache.lucene.util.collections.IntToObjectMap; */ /** - * Generates {@link FacetResult} from the count arrays aggregated for a particular - * {@link FacetRequest}. - * The generated {@link FacetResult} is a subtree of the taxonomy tree. - * Its root node, {@link FacetResult#getFacetResultNode()}, - * is the facet specified by {@link FacetRequest#getCategoryPath()}, - * and the enumerated children, {@link FacetResultNode#getSubResults()}, of each node in that - * {@link FacetResult} are the top K ( = {@link FacetRequest#getNumResults()}) among its children - * in the taxonomy. - * Top in the sense {@link FacetRequest#getSortBy()}, - * which can be by the values aggregated in the count arrays, or by ordinal numbers; - * also specified is the sort order, {@link FacetRequest#getSortOrder()}, - * ascending or descending, of these values or ordinals before their top K are selected. - * The depth (number of levels excluding the root) of the - * {@link FacetResult} tree is specified by {@link FacetRequest#getDepth()}. + * Generates {@link FacetResult} from the count arrays aggregated for a + * particular {@link FacetRequest}. The generated {@link FacetResult} is a + * subtree of the taxonomy tree. Its root node, + * {@link FacetResult#getFacetResultNode()}, is the facet specified by + * {@link FacetRequest#categoryPath}, and the enumerated children, + * {@link FacetResultNode#getSubResults()}, of each node in that + * {@link FacetResult} are the top K ( = {@link FacetRequest#getNumResults()}) + * among its children in the taxonomy. 
Top in the sense of + * {@link FacetRequest#getSortBy()}, which can be by the values aggregated in + * the count arrays, or by ordinal numbers; also specified is the sort order, + * {@link FacetRequest#getSortOrder()}, ascending or descending, of these values + * or ordinals before their top K are selected. The depth (number of levels + * excluding the root) of the {@link FacetResult} tree is specified by + * {@link FacetRequest#getDepth()}. *

- * Because the number of selected children of each node is restricted, - * and not the overall number of nodes in the {@link FacetResult}, facets not selected + * Because the number of selected children of each node is restricted, and not + * the overall number of nodes in the {@link FacetResult}, facets not selected * into {@link FacetResult} might have better values, or ordinals, (typically, * higher counts), than facets that are selected into the {@link FacetResult}. *

- * The generated {@link FacetResult} also provides with - * {@link FacetResult#getNumValidDescendants()}, which returns the total number of facets - * that are descendants of the root node, no deeper than {@link FacetRequest#getDepth()}, and - * which have valid value. The rootnode itself is not counted here. - * Valid value is determined by the {@link FacetResultsHandler}. - * {@link TopKInEachNodeHandler} defines valid as != 0. + * The generated {@link FacetResult} also provides + * {@link FacetResult#getNumValidDescendants()}, which returns the total number + * of facets that are descendants of the root node, no deeper than + * {@link FacetRequest#getDepth()}, and which have a valid value. The root node + * itself is not counted here. What counts as a valid value is determined by + * the {@link FacetResultsHandler}; {@link TopKInEachNodeHandler} defines valid + * as != 0. *

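To make the tree shape described above concrete, here is a hedged sketch of building such a request and walking its result. CountFacetRequest, FacetSearchParams-style request setup and the getSubResults()/getNumResults()/getNumValidDescendants() calls are the ones referenced in this javadoc; the setDepth(int) mutator and the getLabel()/getValue() accessors on FacetResultNode, as well as the package locations, are assumptions for this version of the module:

import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;

public class ResultTreeSketch {

  // Top 10 children per node, two levels below the "Author" dimension
  // ("Author" is a placeholder; setDepth(int) is an assumed mutator).
  public static FacetRequest authorRequest() {
    FacetRequest request = new CountFacetRequest(new CategoryPath("Author"), 10);
    request.setDepth(2); // depth excludes the root node
    return request;
  }

  // Recursive walk: each node carries at most K = getNumResults() children,
  // ordered according to getSortBy()/getSortOrder(). getLabel()/getValue()
  // are assumed FacetResultNode accessors.
  public static void print(FacetResultNode node, int indent) {
    for (int i = 0; i < indent; i++) {
      System.out.print("  ");
    }
    System.out.println(node.getLabel() + " = " + node.getValue());
    for (FacetResultNode child : node.getSubResults()) {
      print(child, indent + 1);
    }
  }
}

Given a FacetResult res, res.getFacetResultNode() is the root to pass to print(), and res.getNumValidDescendants() reports how many walked descendants carried a valid (here, non-zero) value.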
- * NOTE: this code relies on the assumption that {@link TaxonomyReader#INVALID_ORDINAL} == -1, a smaller - * value than any valid ordinal. + * NOTE: this code relies on the assumption that + * {@link TaxonomyReader#INVALID_ORDINAL} == -1, a smaller value than any valid + * ordinal. * * @lucene.experimental */ @@ -109,7 +111,7 @@ public class TopKInEachNodeHandler extends FacetResultsHandler { // get the root of the result tree to be returned, and the depth of that result tree // (depth means number of node levels excluding the root). - int rootNode = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath()); + int rootNode = this.taxonomyReader.getOrdinal(facetRequest.categoryPath); if (rootNode == TaxonomyReader.INVALID_ORDINAL) { return null; } @@ -767,7 +769,7 @@ public class TopKInEachNodeHandler extends FacetResultsHandler { @Override public FacetResult renderFacetResult(IntermediateFacetResult tmpResult) throws IOException { IntermediateFacetResultWithHash tmp = (IntermediateFacetResultWithHash) tmpResult; - int ordinal = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath()); + int ordinal = this.taxonomyReader.getOrdinal(this.facetRequest.categoryPath); if ((tmp == null) || (ordinal == TaxonomyReader.INVALID_ORDINAL)) { return null; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java index 84c3b2fc05a..1ab291d94f6 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java @@ -17,8 +17,6 @@ import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.aggregator.CountingAggregator; -import org.apache.lucene.facet.search.cache.CategoryListCache; -import org.apache.lucene.facet.search.cache.CategoryListData; import org.apache.lucene.facet.search.params.CountFacetRequest; import org.apache.lucene.facet.search.params.FacetRequest; import org.apache.lucene.facet.search.params.FacetSearchParams; @@ -155,9 +153,8 @@ public class TotalFacetCounts { private static final List DUMMY_REQ = Arrays.asList( new FacetRequest[] { new CountFacetRequest(CategoryPath.EMPTY, 1) }); - static TotalFacetCounts compute(final IndexReader indexReader, - final TaxonomyReader taxonomy, final FacetIndexingParams facetIndexingParams, - final CategoryListCache clCache) throws IOException { + static TotalFacetCounts compute(final IndexReader indexReader, final TaxonomyReader taxonomy, + final FacetIndexingParams facetIndexingParams) throws IOException { int partitionSize = PartitionsUtils.partitionSize(facetIndexingParams, taxonomy); final int[][] counts = new int[(int) Math.ceil(taxonomy.getSize() /(float) partitionSize)][partitionSize]; FacetSearchParams newSearchParams = new FacetSearchParams(DUMMY_REQ, facetIndexingParams); @@ -170,8 +167,7 @@ public class TotalFacetCounts { Aggregator aggregator = new CountingAggregator(counts[partition]); HashMap map = new HashMap(); for (CategoryListParams clp: facetIndexingParams.getAllCategoryListParams()) { - final CategoryListIterator cli = clIteraor(clCache, clp, partition); - map.put(cli, aggregator); + map.put(clp.createCategoryListIterator(partition), aggregator); } return map; } @@ -181,14 +177,4 @@ public class TotalFacetCounts { return new 
TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Computed); } - static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp, int partition) - throws IOException { - if (clCache != null) { - CategoryListData cld = clCache.get(clp); - if (cld != null) { - return cld.iterator(partition); - } - } - return clp.createCategoryListIterator(partition); - } } \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java b/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java index edb4b50fc72..95ba9603c4f 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java @@ -7,12 +7,10 @@ import java.util.LinkedHashMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.search.cache.CategoryListCache; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.IndexReader; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -80,16 +78,20 @@ public final class TotalFacetCountsCache { } /** - * Get the total facet counts for a reader/taxonomy pair and facet indexing parameters. - * If not in cache, computed here and added to the cache for later use. - * @param indexReader the documents index - * @param taxonomy the taxonomy index - * @param facetIndexingParams facet indexing parameters - * @param clCache category list cache for faster computation, can be null + * Get the total facet counts for a reader/taxonomy pair and facet indexing + * parameters. If not in cache, computed here and added to the cache for later + * use. + * + * @param indexReader + * the documents index + * @param taxonomy + * the taxonomy index + * @param facetIndexingParams + * facet indexing parameters * @return the total facet counts. */ public TotalFacetCounts getTotalCounts(IndexReader indexReader, TaxonomyReader taxonomy, - FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException { + FacetIndexingParams facetIndexingParams) throws IOException { // create the key TFCKey key = new TFCKey(indexReader, taxonomy, facetIndexingParams); // it is important that this call is not synchronized, so that available TFC @@ -99,7 +101,7 @@ public final class TotalFacetCountsCache { markRecentlyUsed(key); return tfc; } - return computeAndCache(key, clCache); + return computeAndCache(key); } /** @@ -149,10 +151,10 @@ public final class TotalFacetCountsCache { * matter this method is synchronized, which is not too bad, because there is * lots of work done in the computations. 
*/ - private synchronized TotalFacetCounts computeAndCache(TFCKey key, CategoryListCache clCache) throws IOException { + private synchronized TotalFacetCounts computeAndCache(TFCKey key) throws IOException { TotalFacetCounts tfc = cache.get(key); if (tfc == null) { - tfc = TotalFacetCounts.compute(key.indexReader, key.taxonomy, key.facetIndexingParams, clCache); + tfc = TotalFacetCounts.compute(key.indexReader, key.taxonomy, key.facetIndexingParams); lruKeys.add(key); cache.put(key,tfc); trimCache(); @@ -161,16 +163,22 @@ public final class TotalFacetCountsCache { } /** - * Load {@link TotalFacetCounts} matching input parameters from the provided outputFile - * and add them into the cache for the provided indexReader, taxonomy, and facetIndexingParams. - * If a {@link TotalFacetCounts} for these parameters already exists in the cache, it will be - * replaced by the loaded one. - * @param inputFile file from which to read the data - * @param indexReader the documents index - * @param taxonomy the taxonomy index - * @param facetIndexingParams the facet indexing parameters - * @throws IOException on error - * @see #store(File, IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache) + * Load {@link TotalFacetCounts} matching input parameters from the provided + * outputFile and add them into the cache for the provided indexReader, + * taxonomy, and facetIndexingParams. If a {@link TotalFacetCounts} for these + * parameters already exists in the cache, it will be replaced by the loaded + * one. + * + * @param inputFile + * file from which to read the data + * @param indexReader + * the documents index + * @param taxonomy + * the taxonomy index + * @param facetIndexingParams + * the facet indexing parameters + * @throws IOException + * on error */ public synchronized void load(File inputFile, IndexReader indexReader, TaxonomyReader taxonomy, FacetIndexingParams facetIndexingParams) throws IOException { @@ -185,21 +193,27 @@ public final class TotalFacetCountsCache { } /** - * Store the {@link TotalFacetCounts} matching input parameters into the provided outputFile, - * making them available for a later call to {@link #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)}. - * If these {@link TotalFacetCounts} are available in the cache, they are used. But if they are - * not in the cache, this call will first compute them (which will also add them to the cache). - * @param outputFile file to store in. - * @param indexReader the documents index - * @param taxonomy the taxonomy index - * @param facetIndexingParams the facet indexing parameters - * @param clCache category list cache for faster computation, can be null - * @throws IOException on error + * Store the {@link TotalFacetCounts} matching input parameters into the + * provided outputFile, making them available for a later call to + * {@link #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)}. If + * these {@link TotalFacetCounts} are available in the cache, they are used. + * But if they are not in the cache, this call will first compute them (which + * will also add them to the cache). + * + * @param outputFile + * file to store in. 
+ * @param indexReader + * the documents index + * @param taxonomy + * the taxonomy index + * @param facetIndexingParams + * the facet indexing parameters + * @throws IOException + * on error * @see #load(File, IndexReader, TaxonomyReader, FacetIndexingParams) - * @see #getTotalCounts(IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache) */ public void store(File outputFile, IndexReader indexReader, TaxonomyReader taxonomy, - FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException { + FacetIndexingParams facetIndexingParams) throws IOException { File parentFile = outputFile.getParentFile(); if ( ( outputFile.exists() && (!outputFile.isFile() || !outputFile.canWrite())) || @@ -207,7 +221,7 @@ public final class TotalFacetCountsCache { ) { throw new IllegalArgumentException("Exepecting a writable file: "+outputFile); } - TotalFacetCounts tfc = getTotalCounts(indexReader, taxonomy, facetIndexingParams, clCache); + TotalFacetCounts tfc = getTotalCounts(indexReader, taxonomy, facetIndexingParams); TotalFacetCounts.storeToFile(outputFile, tfc); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java index 5aa38af2d33..d35428f5b64 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java @@ -3,7 +3,7 @@ package org.apache.lucene.facet.search.aggregator.associations; import java.io.IOException; import org.apache.lucene.facet.associations.CategoryFloatAssociation; -import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator; +import org.apache.lucene.facet.associations.FloatAssociationsIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.index.AtomicReaderContext; @@ -37,15 +37,15 @@ public class AssociationFloatSumAggregator implements Aggregator { protected final String field; protected final float[] sumArray; - protected final FloatAssociationsPayloadIterator associations; + protected final FloatAssociationsIterator associations; public AssociationFloatSumAggregator(float[] sumArray) throws IOException { - this(CategoryListParams.DEFAULT_TERM.field(), sumArray); + this(CategoryListParams.DEFAULT_FIELD, sumArray); } public AssociationFloatSumAggregator(String field, float[] sumArray) throws IOException { this.field = field; - associations = new FloatAssociationsPayloadIterator(field, new CategoryFloatAssociation()); + associations = new FloatAssociationsIterator(field, new CategoryFloatAssociation()); this.sumArray = sumArray; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java index 52baf368aa7..ec20dc37232 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java @@ -3,7 +3,7 @@ package org.apache.lucene.facet.search.aggregator.associations; import java.io.IOException; import 
org.apache.lucene.facet.associations.CategoryIntAssociation; -import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator; +import org.apache.lucene.facet.associations.IntAssociationsIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.index.AtomicReaderContext; @@ -37,15 +37,15 @@ public class AssociationIntSumAggregator implements Aggregator { protected final String field; protected final int[] sumArray; - protected final IntAssociationsPayloadIterator associations; + protected final IntAssociationsIterator associations; public AssociationIntSumAggregator(int[] sumArray) throws IOException { - this(CategoryListParams.DEFAULT_TERM.field(), sumArray); + this(CategoryListParams.DEFAULT_FIELD, sumArray); } public AssociationIntSumAggregator(String field, int[] sumArray) throws IOException { this.field = field; - associations = new IntAssociationsPayloadIterator(field, new CategoryIntAssociation()); + associations = new IntAssociationsIterator(field, new CategoryIntAssociation()); this.sumArray = sumArray; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java deleted file mode 100644 index d87356fdd8b..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java +++ /dev/null @@ -1,61 +0,0 @@ -package org.apache.lucene.facet.search.cache; - -import java.io.IOException; -import java.util.HashMap; - -import org.apache.lucene.index.IndexReader; - -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.taxonomy.TaxonomyReader; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Cache for {@link CategoryListData}, per {@link CategoryListParams}. - * - * @lucene.experimental - */ -public class CategoryListCache { - - private HashMap - cldMap = new HashMap(); - - /** - * Fetch the cached {@link CategoryListData} for a given {@link CategoryListParams}. - */ - public CategoryListData get(CategoryListParams clp) { - return cldMap.get(clp); - } - - /** - * Register a pre-computed {@link CategoryListData}. - */ - public void register(CategoryListParams clp, CategoryListData clData) { - cldMap.put(clp,clData); - } - - /** - * Load and register {@link CategoryListData}. 
- */ - public void loadAndRegister(CategoryListParams clp, - IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams) throws IOException { - CategoryListData clData = new CategoryListData(reader, taxo, iparams, clp); - register(clp,clData); - } -} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java deleted file mode 100644 index db4769c3d43..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java +++ /dev/null @@ -1,133 +0,0 @@ -package org.apache.lucene.facet.search.cache; - -import java.io.IOException; - -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.search.CategoryListIterator; -import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.util.IntsRef; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Category list data maintained in RAM. - *

- * Speeds up facets accumulation when more RAM is available. - *

- * Note that this will consume more memory: one int (4 bytes) for each category - * of each document. - *

- * Note: at the moment this class is insensitive to updates of the index, and, - * in particular, does not make use of Lucene's ability to refresh a single - * segment. - *

- * See {@link CategoryListCache#register(CategoryListParams, CategoryListData)} - * and - * {@link CategoryListCache#loadAndRegister(CategoryListParams, IndexReader, TaxonomyReader, FacetIndexingParams)}. - * - * @lucene.experimental - */ -public class CategoryListData { - - // TODO (Facet): experiment with different orders - p-d-c vs. current d-p-c. - private transient volatile int[][][] docPartitionCategories; - - /** - * Empty constructor for extensions with modified computation of the data. - */ - protected CategoryListData() { - } - - /** Compute category list data for caching for faster iteration. */ - CategoryListData(IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams, CategoryListParams clp) - throws IOException { - - int[][][]dpf = new int[reader.maxDoc()][][]; - int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize()); - IntsRef ordinals = new IntsRef(32); - for (int part = 0; part < numPartitions; part++) { - for (AtomicReaderContext context : reader.leaves()) { - CategoryListIterator cli = clp.createCategoryListIterator(part); - if (cli.setNextReader(context)) { - final int maxDoc = context.reader().maxDoc(); - for (int i = 0; i < maxDoc; i++) { - cli.getOrdinals(i, ordinals); - if (ordinals.length > 0) { - int doc = i + context.docBase; - if (dpf[doc] == null) { - dpf[doc] = new int[numPartitions][]; - } - if (dpf[doc][part] == null) { - dpf[doc][part] = new int[ordinals.length]; - } - for (int j = 0; j < ordinals.length; j++) { - dpf[doc][part][j] = ordinals.ints[j]; - } - } - } - } - } - } - docPartitionCategories = dpf; - } - - /** - * Iterate on the category list data for the specified partition. - */ - public CategoryListIterator iterator(int partition) throws IOException { - return new RAMCategoryListIterator(partition, docPartitionCategories); - } - - /** Internal: category list iterator over uncompressed category info in RAM */ - private static class RAMCategoryListIterator implements CategoryListIterator { - - private int docBase; - private final int part; - private final int[][][] dpc; - - RAMCategoryListIterator(int part, int[][][] docPartitionCategories) { - this.part = part; - dpc = docPartitionCategories; - } - - @Override - public boolean setNextReader(AtomicReaderContext context) throws IOException { - docBase = context.docBase; - return dpc != null && dpc.length > part; - } - - @Override - public void getOrdinals(int docID, IntsRef ints) throws IOException { - ints.length = 0; - docID += docBase; - if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) { - if (ints.ints.length < dpc[docID][part].length) { - ints.grow(dpc[docID][part].length); - } - ints.length = 0; - for (int i = 0; i < dpc[docID][part].length; i++) { - ints.ints[ints.length++] = dpc[docID][part][i]; - } - } - } - } - -} \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/package.html b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/package.html deleted file mode 100644 index 1fc371e7f91..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/package.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - -Caching to speed up facets accumulation. 
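With CategoryListCache and CategoryListData removed, there is no per-params cache left to consult: consumers now obtain a CategoryListIterator directly from the CategoryListParams. A minimal sketch of the simplified call path, assuming CategoryListParams#createCategoryListIterator(int) keeps the signature used in the deleted code above; the helper class and method name are hypothetical:

import java.io.IOException;

import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.CategoryListIterator;

class CategoryListIterators {

  // Replaces the removed cache lookup: no CategoryListCache is consulted,
  // a fresh iterator is created for the requested partition every time.
  static CategoryListIterator get(CategoryListParams clp, int partition) throws IOException {
    return clp.createCategoryListIterator(partition);
  }
}

The trade-off is the one the deleted javadoc spells out: the RAM cache saved iteration cost at roughly one int (4 bytes) per category per document and was insensitive to index updates, so dropping it simplifies the API at the price of re-decoding category lists on each accumulation.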
- - diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java index 851f3a3f3a2..83872539260 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java @@ -2,15 +2,11 @@ package org.apache.lucene.facet.search.params; import java.io.IOException; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.search.FacetArrays; import org.apache.lucene.facet.search.FacetResultsHandler; import org.apache.lucene.facet.search.TopKFacetResultsHandler; import org.apache.lucene.facet.search.TopKInEachNodeHandler; import org.apache.lucene.facet.search.aggregator.Aggregator; -import org.apache.lucene.facet.search.cache.CategoryListCache; -import org.apache.lucene.facet.search.cache.CategoryListData; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -64,7 +60,7 @@ public abstract class FacetRequest implements Cloneable { */ public static final ResultMode DEFAULT_RESULT_MODE = ResultMode.PER_NODE_IN_TREE; - private final CategoryPath categoryPath; + public final CategoryPath categoryPath; private final int numResults; private int numLabel; private int depth; @@ -133,17 +129,6 @@ public abstract class FacetRequest implements Cloneable { this.sortBy = sortBy; } - /** - * The root category of this facet request. The categories that are returned - * as a result of this request will all be descendants of this root. - *

- * NOTE: you should not modify the returned {@link CategoryPath}, or - * otherwise some methonds may not work properly, e.g. {@link #hashCode()}. - */ - public final CategoryPath getCategoryPath() { - return categoryPath; - } - /** * How deeply to look under the given category. If the depth is 0, * only the category itself is counted. If the depth is 1, its immediate @@ -160,24 +145,22 @@ public abstract class FacetRequest implements Cloneable { * will have their category paths calculated, and the rest will only be * available as ordinals (category numbers) and will have null paths. *

- * If Integer.MAX_VALUE is specified, all - * results are labled. + * If Integer.MAX_VALUE is specified, all results are labeled. *

- * The purpose of this parameter is to avoid having to run the whole - * faceted search again when the user asks for more values for the facet; - * The application can ask (getNumResults()) for more values than it needs - * to show, but keep getNumLabel() only the number it wants to immediately - * show. The slow-down caused by finding more values is negligible, because - * the slowest part - finding the categories' paths, is avoided. + * The purpose of this parameter is to avoid having to run the whole faceted + * search again when the user asks for more values for the facet; The + * application can ask (getNumResults()) for more values than it needs to + * show, but keep getNumLabel() only the number it wants to immediately show. + * The slow-down caused by finding more values is negligible, because the + * slowest part - finding the categories' paths, is avoided. *

- * Depending on the {@link #getResultMode() LimitsMode}, - * this limit is applied globally or per results node. - * In the global mode, if this limit is 3, - * only 3 top results would be labeled. - * In the per-node mode, if this limit is 3, - * 3 top children of {@link #getCategoryPath() the target category} would be labeled, - * as well as 3 top children of each of them, and so forth, until the depth defined - * by {@link #getDepth()}. + * Depending on the {@link #getResultMode() LimitsMode}, this limit is applied + * globally or per results node. In the global mode, if this limit is 3, only + * 3 top results would be labeled. In the per-node mode, if this limit is 3, 3 + * top children of {@link #categoryPath the target category} would be labeled, + * as well as 3 top children of each of them, and so forth, until the depth + * defined by {@link #getDepth()}. + * + * @see #getResultMode() */ public final int getNumLabel() { @@ -185,20 +168,17 @@ } /** - * The number of sub-categories to return (at most). - * If the sub-categories are returned. + * The number of sub-categories to return (at most). *

- * If Integer.MAX_VALUE is specified, all - * sub-categories are returned. + * If Integer.MAX_VALUE is specified, all sub-categories are returned. *

- * Depending on the {@link #getResultMode() LimitsMode}, - * this limit is applied globally or per results node. - * In the global mode, if this limit is 3, - * only 3 top results would be computed. - * In the per-node mode, if this limit is 3, - * 3 top children of {@link #getCategoryPath() the target category} would be returned, - * as well as 3 top children of each of them, and so forth, until the depth defined - * by {@link #getDepth()}. + * Depending on the {@link #getResultMode() LimitsMode}, this limit is applied + * globally or per results node. In the global mode, if this limit is 3, only + * 3 top results would be computed. In the per-node mode, if this limit is 3, + * 3 top children of {@link #categoryPath the target category} would be + * returned, as well as 3 top children of each of them, and so forth, until + * the depth defined by {@link #getDepth()}. + * * @see #getResultMode() */ public final int getNumResults() { @@ -319,24 +300,6 @@ public abstract class FacetRequest implements Cloneable { public abstract Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) throws IOException; - /** - * Create the category list iterator for the specified partition. If a non - * null cache is provided which contains the required data, use it for the - * iteration. - */ - public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams, int partition) - throws IOException { - CategoryListCache clCache = sParams.getCategoryListCache(); - CategoryListParams clParams = sParams.getFacetIndexingParams().getCategoryListParams(categoryPath); - if (clCache != null) { - CategoryListData clData = clCache.get(clParams); - if (clData != null) { - return clData.iterator(partition); - } - } - return clParams.createCategoryListIterator(partition); - } - /** * Return the value of a category used for facets computations for this * request. For a count request this would be the count for that facet, i.e. diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java index 78662436309..711406a35ae 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java @@ -4,7 +4,6 @@ import java.util.Arrays; import java.util.List; import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.search.cache.CategoryListCache; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -71,14 +70,6 @@ public class FacetSearchParams { this.facetRequests = facetRequests; } - /** - * Returns the {@link CategoryListCache}. By default returns {@code null}, you - * should override if you want to use a cache. - */ - public CategoryListCache getCategoryListCache() { - return null; - } - /** * Returns the {@link FacetIndexingParams} that were passed to the * constructor. 
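With getCategoryPath() removed, callers read the root category of a request from the now-public final categoryPath field, as the updated javadoc above already does. A hypothetical caller-side sketch, assuming the CountFacetRequest subclass and the varargs CategoryPath constructor from this module:

import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.taxonomy.CategoryPath;

class CategoryPathAccess {

  static CategoryPath rootOf(FacetRequest request) {
    // before this patch: request.getCategoryPath()
    return request.categoryPath;
  }

  public static void main(String[] args) {
    // a count request for the top-10 children of Author/Herman Melville
    FacetRequest request = new CountFacetRequest(new CategoryPath("Author", "Herman Melville"), 10);
    System.out.println(rootOf(request));
  }
}

The field is final, so the reference cannot be reassigned; the removed getter's caveat about not mutating the returned CategoryPath presumably applies to the field just the same.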
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java index 5e5a258d726..80dedb71e5c 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationFloatSumFacetRequest.java @@ -28,7 +28,10 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; /** * A {@link FacetRequest} for weighting facets according to their float - * association by summing the association values. + * association by summing the association values. Note that this class caches + * the associations data in-memory by default. You can override + * {@link #createAggregator(boolean, FacetArrays, TaxonomyReader)} to return an + * {@link AssociationFloatSumAggregator} that does not cache. * * @lucene.experimental */ diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java index 0d291119d6a..bbf1b697dc6 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/associations/AssociationIntSumFacetRequest.java @@ -28,7 +28,10 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; /** * A {@link FacetRequest} for weighting facets according to their integer - * association by summing the association values. + * association by summing the association values. Note that this class caches + * the associations data in-memory by default. You can override + * {@link #createAggregator(boolean, FacetArrays, TaxonomyReader)} to return an + * {@link AssociationIntSumAggregator} that does not cache. * * @lucene.experimental */ diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java b/lucene/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java index 14078f98a46..85c053cc785 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java @@ -39,7 +39,7 @@ public class FacetResult { /** * Facet result node matching the root of the {@link #getFacetRequest() facet request}.
* @see #getFacetRequest() - * @see FacetRequest#getCategoryPath() + * @see FacetRequest#categoryPath */ public final FacetResultNode getFacetResultNode() { return this.rootNode; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java b/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java index 294e602b762..17760a0c6c2 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java @@ -4,9 +4,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.IndexReader; - -import org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.search.FacetArrays; import org.apache.lucene.facet.search.ScoredDocIDs; import org.apache.lucene.facet.search.aggregator.Aggregator; @@ -16,6 +13,7 @@ import org.apache.lucene.facet.search.results.FacetResult; import org.apache.lucene.facet.search.results.FacetResultNode; import org.apache.lucene.facet.search.results.MutableFacetResultNode; import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.index.IndexReader; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -205,7 +203,7 @@ public abstract class Sampler { private static class OverSampledFacetRequest extends FacetRequest { final FacetRequest orig; public OverSampledFacetRequest(FacetRequest orig, int num) { - super(orig.getCategoryPath(), num); + super(orig.categoryPath, num); this.orig = orig; setDepth(orig.getDepth()); setNumLabel(orig.getNumLabel()); @@ -214,12 +212,6 @@ public abstract class Sampler { setSortOrder(orig.getSortOrder()); } - @Override - public CategoryListIterator createCategoryListIterator(TaxonomyReader taxo, FacetSearchParams sParams, - int partition) throws IOException { - return orig.createCategoryListIterator(taxo, sParams, partition); - } - @Override public Aggregator createAggregator(boolean useComplements, FacetArrays arrays, TaxonomyReader taxonomy) throws IOException { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java index 4e14fcf23cc..f277c054d68 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java @@ -92,8 +92,8 @@ public class CategoryPath implements Comparable { */ @Override public int compareTo(CategoryPath other) { - int length = this.length < other.length ? this.length : other.length; - for (int i = 0, j = 0; i < length; i++, j++) { + final int len = length < other.length ? 
length : other.length; + for (int i = 0, j = 0; i < len; i++, j++) { int cmp = components[i].compareTo(other.components[j]); if (cmp < 0) return -1; // this is 'before' if (cmp > 0) return 1; // this is 'after' diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java b/lucene/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java index 7e583e13a46..a16a47294d8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java @@ -1,6 +1,5 @@ package org.apache.lucene.facet.util; -import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -28,15 +27,9 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; */ public final class PartitionsUtils { - /** - * Get the offset for a given partition. That is, what is the minimum number an - * ordinal could be for a particular partition. - */ - public final static int partitionOffset(FacetIndexingParams iParams, - int partitionNumber, final TaxonomyReader taxonomyReader) { - return partitionNumber * partitionSize(iParams, taxonomyReader); - } - + /** The prefix that is added to the name of the partition. */ + public static final String PART_NAME_PREFIX = "$part"; + /** * Get the partition size in this parameter, or return the size of the taxonomy, which * is smaller. (Guarantees usage of as little memory as possible at search time). @@ -58,21 +51,21 @@ public final class PartitionsUtils { /** * Partition name by category ordinal */ - public final static String partitionNameByOrdinal( - FacetIndexingParams iParams, CategoryListParams clParams, int ordinal) { + public final static String partitionNameByOrdinal(FacetIndexingParams iParams, int ordinal) { int partition = partitionNumber(iParams, ordinal); - return partitionName(clParams, partition); + return partitionName(partition); } - /** - * Partition name by its number - */ - public final static String partitionName(CategoryListParams clParams, int partition) { - String term = clParams.getTerm().text(); + /** Partition name by its number */ + public final static String partitionName(int partition) { + // TODO would be good if this method isn't called when partitions are not enabled. + // perhaps through some specialization code. if (partition == 0) { - return term; // for backwards compatibility we do not add a partition number in this case + // since regular faceted search code goes through this method too, + // return the same value for partition 0 and when there are no partitions + return ""; } - return term + partition; + return PART_NAME_PREFIX + Integer.toString(partition); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java index a19550c29e5..8829f2b9e57 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java @@ -49,20 +49,20 @@ import org.apache.lucene.util.IntsRef; * @lucene.experimental */ public abstract class ChunksIntEncoder extends IntEncoder { - + /** Holds the values which must be encoded, outside the indicator. */ protected final IntsRef encodeQueue; - + /** Represents bits flag byte. */ protected int indicator = 0; - + /** Counts the current ordinal of the encoded value. 
*/ protected byte ordinal = 0; - + protected ChunksIntEncoder(int chunkSize) { encodeQueue = new IntsRef(chunkSize); } - + /** * Encodes the values of the current chunk. First it writes the indicator, and * then it encodes the values outside the indicator. @@ -76,17 +76,40 @@ public abstract class ChunksIntEncoder extends IntEncoder { buf.bytes[buf.length++] = ((byte) indicator); for (int i = 0; i < encodeQueue.length; i++) { - VInt8.encode(encodeQueue.ints[i], buf); + // it is better if the encoding is inlined like so, and not e.g. + // in a utility method + int value = encodeQueue.ints[i]; + if ((value & ~0x7F) == 0) { + buf.bytes[buf.length] = (byte) value; + buf.length++; + } else if ((value & ~0x3FFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 1] = (byte) (value & 0x7F); + buf.length += 2; + } else if ((value & ~0x1FFFFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 2] = (byte) (value & 0x7F); + buf.length += 3; + } else if ((value & ~0xFFFFFFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 3] = (byte) (value & 0x7F); + buf.length += 4; + } else { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 4] = (byte) (value & 0x7F); + buf.length += 5; + } } - reset(); - } - - @Override - protected void reset() { ordinal = 0; indicator = 0; encodeQueue.length = 0; } - + } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java index a5b2fb3c28c..5e908206800 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java @@ -26,7 +26,7 @@ import org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class DGapIntDecoder extends IntDecoder { +public final class DGapIntDecoder extends IntDecoder { private final IntDecoder decoder; @@ -35,13 +35,8 @@ public class DGapIntDecoder extends IntDecoder { } @Override - protected void reset() { - decoder.reset(); - } - - @Override - protected void doDecode(BytesRef buf, IntsRef values, int upto) { - decoder.doDecode(buf, values, upto); + public void decode(BytesRef buf, IntsRef values) { + decoder.decode(buf, values); int prev = 0; for (int i = 0; i < values.length; i++) { values.ints[i] += prev; @@ -51,7 +46,7 @@ public class DGapIntDecoder extends IntDecoder { @Override public String toString() { - return "DGap (" + decoder.toString() + ")"; + return "DGap(" + decoder.toString() + ")"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java index 305f975c619..5e8ca5d293d 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java @@ -35,7 +35,7 @@ import 
org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class DGapIntEncoder extends IntEncoderFilter { +public final class DGapIntEncoder extends IntEncoderFilter { /** Initializes with the given encoder. */ public DGapIntEncoder(IntEncoder encoder) { @@ -43,14 +43,15 @@ } @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { + public void encode(IntsRef values, BytesRef buf) { int prev = 0; + int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { int tmp = values.ints[i]; values.ints[i] -= prev; prev = tmp; } - encoder.doEncode(values, buf, upto); + encoder.encode(values, buf); } @Override @@ -60,7 +61,7 @@ } @Override public String toString() { - return "DGap (" + encoder.toString() + ")"; + return "DGap(" + encoder.toString() + ")"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapVInt8IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapVInt8IntDecoder.java new file mode 100644 index 00000000000..d5a533234cd --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapVInt8IntDecoder.java @@ -0,0 +1,67 @@ +package org.apache.lucene.util.encoding; + +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.RamUsageEstimator; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Decodes values encoded by {@link DGapVInt8IntEncoder}. + * + * @lucene.experimental + */ +public final class DGapVInt8IntDecoder extends IntDecoder { + + @Override + public void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; + + // grow the buffer up front, even if by a large number of values (buf.length) + // that saves the need to check inside the loop for every decoded value if + // the buffer needs to grow. + if (values.ints.length < buf.length) { + values.ints = new int[ArrayUtil.oversize(buf.length, RamUsageEstimator.NUM_BYTES_INT)]; + } + + // it is better if the decoding is inlined like so, and not e.g.
+ // in a utility method + int upto = buf.offset + buf.length; + int value = 0; + int offset = buf.offset; + int prev = 0; + while (offset < upto) { + byte b = buf.bytes[offset++]; + if (b >= 0) { + values.ints[values.length] = ((value << 7) | b) + prev; + value = 0; + prev = values.ints[values.length]; + values.length++; + } else { + value = (value << 7) | (b & 0x7F); + } + } + } + + @Override + public String toString() { + return "DGapVInt8"; + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapVInt8IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapVInt8IntEncoder.java new file mode 100644 index 00000000000..a28328ea393 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapVInt8IntEncoder.java @@ -0,0 +1,89 @@ +package org.apache.lucene.util.encoding; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntEncoder} which implements variable length encoding for the gap + * between values. It's a specialized form of the combination of + * {@link DGapIntEncoder} and {@link VInt8IntEncoder}. + * + * @see VInt8IntEncoder + * @see DGapIntEncoder + * + * @lucene.experimental + */ +public final class DGapVInt8IntEncoder extends IntEncoder { + + @Override + public void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; + int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt + if (buf.bytes.length < maxBytesNeeded) { + buf.grow(maxBytesNeeded); + } + + int upto = values.offset + values.length; + int prev = 0; + for (int i = values.offset; i < upto; i++) { + // it is better if the encoding is inlined like so, and not e.g. 
+ // in a utility method + int value = values.ints[i] - prev; + if ((value & ~0x7F) == 0) { + buf.bytes[buf.length] = (byte) value; + buf.length++; + } else if ((value & ~0x3FFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 1] = (byte) (value & 0x7F); + buf.length += 2; + } else if ((value & ~0x1FFFFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 2] = (byte) (value & 0x7F); + buf.length += 3; + } else if ((value & ~0xFFFFFFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 3] = (byte) (value & 0x7F); + buf.length += 4; + } else { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 4] = (byte) (value & 0x7F); + buf.length += 5; + } + prev = values.ints[i]; + } + } + + @Override + public IntDecoder createMatchingDecoder() { + return new DGapVInt8IntDecoder(); + } + + @Override + public String toString() { + return "DGapVInt8"; + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java index 270fcffca52..317185f44e3 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java @@ -45,10 +45,13 @@ public class EightFlagsIntDecoder extends IntDecoder { } @Override - protected void doDecode(BytesRef buf, IntsRef values, int upto) { - while (buf.offset < upto) { + public void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; + int upto = buf.offset + buf.length; + int offset = buf.offset; + while (offset < upto) { // read indicator - int indicator = buf.bytes[buf.offset++] & 0xFF; + int indicator = buf.bytes[offset++] & 0xFF; int ordinal = 0; int capacityNeeded = values.length + 8; @@ -59,11 +62,21 @@ public class EightFlagsIntDecoder extends IntDecoder { // process indicator, until we read 8 values, or end-of-buffer while (ordinal != 8) { if (DECODE_TABLE[indicator][ordinal++] == 0) { - if (buf.offset == upto) { // end of buffer + if (offset == upto) { // end of buffer return; } - // decode the value from the stream. - values.ints[values.length++] = VInt8.decode(buf) + 2; + // it is better if the decoding is inlined like so, and not e.g. 
+ // in a utility method + int value = 0; + while (true) { + byte b = buf.bytes[offset++]; + if (b >= 0) { + values.ints[values.length++] = ((value << 7) | b) + 2; + break; + } else { + value = (value << 7) | (b & 0x7F); + } + } } else { values.ints[values.length++] = 1; } @@ -73,7 +86,7 @@ public class EightFlagsIntDecoder extends IntDecoder { @Override public String toString() { - return "EightFlags (VInt8)"; + return "EightFlags(VInt8)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java index 143660a4742..812d86d1003 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java @@ -59,7 +59,9 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder { } @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { + public void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; + int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { int value = values.ints[i]; if (value == 1) { @@ -88,7 +90,7 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder { @Override public String toString() { - return "EightFlags (VInt)"; + return "EightFlags(VInt)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java index ebc161ac9cd..5d1f957335e 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java @@ -45,10 +45,13 @@ public class FourFlagsIntDecoder extends IntDecoder { } @Override - protected void doDecode(BytesRef buf, IntsRef values, int upto) { - while (buf.offset < upto) { + public void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; + int upto = buf.offset + buf.length; + int offset = buf.offset; + while (offset < upto) { // read indicator - int indicator = buf.bytes[buf.offset++] & 0xFF; + int indicator = buf.bytes[offset++] & 0xFF; int ordinal = 0; int capacityNeeded = values.length + 4; @@ -59,11 +62,21 @@ public class FourFlagsIntDecoder extends IntDecoder { while (ordinal != 4) { byte decodeVal = DECODE_TABLE[indicator][ordinal++]; if (decodeVal == 0) { - if (buf.offset == upto) { // end of buffer + if (offset == upto) { // end of buffer return; } - // decode the value from the stream. - values.ints[values.length++] = VInt8.decode(buf) + 4; + // it is better if the decoding is inlined like so, and not e.g. 
+ // in a utility method + int value = 0; + while (true) { + byte b = buf.bytes[offset++]; + if (b >= 0) { + values.ints[values.length++] = ((value << 7) | b) + 4; + break; + } else { + value = (value << 7) | (b & 0x7F); + } + } } else { values.ints[values.length++] = decodeVal; } @@ -73,7 +86,7 @@ public class FourFlagsIntDecoder extends IntDecoder { @Override public String toString() { - return "FourFlags (VInt8)"; + return "FourFlags(VInt8)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java index 535a90fb60c..fd822651000 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java @@ -65,7 +65,9 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder { } @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { + public void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; + int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { int value = values.ints[i]; if (value <= 3) { @@ -94,7 +96,7 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder { @Override public String toString() { - return "FourFlags (VInt)"; + return "FourFlags(VInt)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java index acf83cc6158..e89c7557039 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java @@ -27,44 +27,10 @@ import org.apache.lucene.util.IntsRef; */ public abstract class IntDecoder { - /** - * Performs the actual decoding. Values should be read from - * {@link BytesRef#offset} up to {@code upto}. Also, {@code values} offset and - * length are set to 0 and the encoder is expected to update - * {@link IntsRef#length}, but not {@link IntsRef#offset}. - * - *

- * NOTE: it is ok to use the buffer's offset as the current position in - * the buffer (and modify it), it will be reset by - * {@link #decode(BytesRef, IntsRef)}. - */ - protected abstract void doDecode(BytesRef buf, IntsRef values, int upto); - - /** - * Called before {@link #doDecode(BytesRef, IntsRef, int)} so that decoders - * can reset their state. - */ - protected void reset() { - // do nothing by default - } - /** * Decodes the values from the buffer into the given {@link IntsRef}. Note * that {@code values.offset} and {@code values.length} are set to 0. */ - public final void decode(BytesRef buf, IntsRef values) { - values.offset = values.length = 0; // must do that because we cannot grow() them otherwise - - // some decoders may use the buffer's offset as a position index, so save - // current offset. - int bufOffset = buf.offset; - - reset(); - doDecode(buf, values, buf.offset + buf.length); - assert values.offset == 0 : "offset should not have been modified by the decoder."; - - // fix offset - buf.offset = bufOffset; - } + public abstract void decode(BytesRef buf, IntsRef values); } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java index 0a3197d6c6b..64e2878cbf7 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java @@ -31,32 +31,11 @@ public abstract class IntEncoder { public IntEncoder() {} - /** - * Performs the actual encoding. Values should be read from - * {@link IntsRef#offset} up to {@code upto}. Also, it is guaranteed that - * {@code buf's} offset and length are set to 0 and the encoder is expected to - * update {@link BytesRef#length}, but not {@link BytesRef#offset}. - */ - protected abstract void doEncode(IntsRef values, BytesRef buf, int upto); - - /** - * Called before {@link #doEncode(IntsRef, BytesRef, int)} so that encoders - * can reset their state. - */ - protected void reset() { - // do nothing by default - } - /** * Encodes the values to the given buffer. Note that the buffer's offset and * length are set to 0. 
*/ - public final void encode(IntsRef values, BytesRef buf) { - buf.offset = buf.length = 0; - reset(); - doEncode(values, buf, values.offset + values.length); - assert buf.offset == 0 : "offset should not have been modified by the encoder."; - } + public abstract void encode(IntsRef values, BytesRef buf); /** * Returns an {@link IntDecoder} which can decode the values that were encoded diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java index ee2e5db7e9e..c1fa04b5dc9 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java @@ -31,9 +31,4 @@ public abstract class IntEncoderFilter extends IntEncoder { this.encoder = encoder; } - @Override - public void reset() { - encoder.reset(); - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java index 1cf33857280..37e52e9a815 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java @@ -42,14 +42,10 @@ public class NOnesIntDecoder extends FourFlagsIntDecoder { } @Override - protected void reset() { + public void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; internalBuffer.length = 0; - super.reset(); - } - - @Override - protected void doDecode(BytesRef buf, IntsRef values, int upto) { - super.doDecode(buf, internalBuffer, upto); + super.decode(buf, internalBuffer); if (values.ints.length < internalBuffer.length) { // need space for internalBuffer.length to internalBuffer.length*N, // grow mildly at first @@ -84,7 +80,7 @@ public class NOnesIntDecoder extends FourFlagsIntDecoder { @Override public String toString() { - return "NOnes (" + n + ") (" + super.toString() + ")"; + return "NOnes(" + n + ") (" + super.toString() + ")"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java index 956eea253a3..5a705c5ecf7 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java @@ -65,19 +65,15 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { } @Override - protected void reset() { + public void encode(IntsRef values, BytesRef buf) { internalBuffer.length = 0; - super.reset(); - } - - @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { // make sure the internal buffer is large enough if (values.length > internalBuffer.ints.length) { internalBuffer.grow(values.length); } int onesCounter = 0; + int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { int value = values.ints[i]; if (value == 1) { @@ -102,7 +98,7 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { --onesCounter; internalBuffer.ints[internalBuffer.length++] = 1; } - super.doEncode(internalBuffer, buf, internalBuffer.length); + super.encode(internalBuffer, buf); } @Override @@ -112,7 +108,7 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { @Override public String toString() { - return "NOnes (" + n + ") (" + super.toString() + ")"; + return "NOnes(" + n + ") (" + super.toString() + ")"; } } diff --git 
a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java index af6fce26af7..fd7a79e38c4 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java @@ -1,7 +1,9 @@ package org.apache.lucene.util.encoding; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.RamUsageEstimator; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -25,19 +27,24 @@ import org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class SimpleIntDecoder extends IntDecoder { +public final class SimpleIntDecoder extends IntDecoder { @Override - protected void doDecode(BytesRef buf, IntsRef values, int upto) { - while (buf.offset < upto) { - if (values.length == values.ints.length) { - values.grow(values.length + 10); // grow by few items, however not too many - } + public void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; + int numValues = buf.length / 4; // every value is 4 bytes + if (values.ints.length < numValues) { // offset and length are 0 + values.ints = new int[ArrayUtil.oversize(numValues, RamUsageEstimator.NUM_BYTES_INT)]; + } + + int offset = buf.offset; + int upto = buf.offset + buf.length; + while (offset < upto) { values.ints[values.length++] = - ((buf.bytes[buf.offset++] & 0xFF) << 24) | - ((buf.bytes[buf.offset++] & 0xFF) << 16) | - ((buf.bytes[buf.offset++] & 0xFF) << 8) | - (buf.bytes[buf.offset++] & 0xFF); + ((buf.bytes[offset++] & 0xFF) << 24) | + ((buf.bytes[offset++] & 0xFF) << 16) | + ((buf.bytes[offset++] & 0xFF) << 8) | + (buf.bytes[offset++] & 0xFF); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java index fd6a0206117..ae0b2958b30 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java @@ -25,16 +25,18 @@ import org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class SimpleIntEncoder extends IntEncoder { +public final class SimpleIntEncoder extends IntEncoder { @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { + public void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; // ensure there's enough room in the buffer int bytesNeeded = values.length * 4; if (buf.bytes.length < bytesNeeded) { buf.grow(bytesNeeded); } + int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { int value = values.ints[i]; buf.bytes[buf.length++] = (byte) (value >>> 24); diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java index 0ebb06efa85..128542087b9 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class SortingIntEncoder extends IntEncoderFilter { +public final class SortingIntEncoder extends IntEncoderFilter { /** Initializes with the given encoder. 
*/ public SortingIntEncoder(IntEncoder encoder) { @@ -36,9 +36,9 @@ public class SortingIntEncoder extends IntEncoderFilter { } @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { - Arrays.sort(values.ints, values.offset, upto); - encoder.doEncode(values, buf, upto); + public void encode(IntsRef values, BytesRef buf) { + Arrays.sort(values.ints, values.offset, values.offset + values.length); + encoder.encode(values, buf); } @Override @@ -48,7 +48,7 @@ public class SortingIntEncoder extends IntEncoderFilter { @Override public String toString() { - return "Sorting (" + encoder.toString() + ")"; + return "Sorting(" + encoder.toString() + ")"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java index c9a6be5c848..2612c234c70 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java @@ -36,9 +36,10 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter { } @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { + public void encode(IntsRef values, BytesRef buf) { int prev = values.ints[values.offset]; int idx = values.offset + 1; + int upto = values.offset + values.length; for (int i = idx; i < upto; i++) { if (values.ints[i] != prev) { values.ints[idx++] = values.ints[i]; @@ -46,7 +47,7 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter { } } values.length = idx - values.offset; - encoder.doEncode(values, buf, idx); + encoder.encode(values, buf); } @Override @@ -56,7 +57,7 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter { @Override public String toString() { - return "Unique (" + encoder.toString() + ")"; + return "Unique(" + encoder.toString() + ")"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java deleted file mode 100644 index 267d52bae96..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java +++ /dev/null @@ -1,138 +0,0 @@ -package org.apache.lucene.util.encoding; - -import org.apache.lucene.util.BytesRef; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is - * encoded as follows: - *
- * <ul>
- * <li>If it is less than 127 and non-negative (i.e., if the number uses only 7
- * bits), it is encoded as a single byte: 0bbbbbbb.</li>
- * <li>If its highest nonzero bit is greater than bit 6 (0x40), it is
- * represented as a series of bytes, each byte's 7 LSB containing bits from the
- * original value, with the MSB set for all but the last byte. The first encoded
- * byte contains the highest nonzero bits from the original; the second byte
- * contains the next 7 MSB; and so on, with the last byte containing the 7 LSB
- * of the original.</li>
- * </ul>
- * Examples:
- * <ol>
- * <li>n = 117 = 1110101: This has fewer than 8 significant bits, and so is
- * encoded as 01110101 = 0x75.</li>
- * <li>n = 100000 = (binary) 11000011010100000. This has 17 significant bits,
- * and so needs three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits,
- * then split it into chunks of 7 and add an MSB, 0 for the last byte, 1 for the
- * others: 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20.</li>
- * </ol>
- * {@link #encode(int, BytesRef)} and {@link #decode(BytesRef)} will correctly - * handle any 32-bit integer, but for negative numbers, and positive numbers - * with more than 28 significant bits, encoding requires 5 bytes; this is not an - * efficient encoding scheme for large positive numbers or any negative number. - * - * @lucene.experimental - */ -public class VInt8 { - - /** The maximum number of bytes needed to encode an integer. */ - public static final int MAXIMUM_BYTES_NEEDED = 5; - - /** - * Decodes an int from the given bytes, starting at {@link BytesRef#offset}. - * Returns the decoded bytes and updates {@link BytesRef#offset}. - */ - public static int decode(BytesRef bytes) { - /* - This is the original code of this method, but a Hotspot bug - corrupted the for-loop of DataInput.readVInt() (see LUCENE-2975) - so the loop was unwounded here too, to be on the safe side - int value = 0; - while (true) { - byte first = bytes.bytes[bytes.offset++]; - value |= first & 0x7F; - if ((first & 0x80) == 0) { - return value; - } - value <<= 7; - } - */ - - // byte 1 - byte b = bytes.bytes[bytes.offset++]; - if (b >= 0) return b; - - // byte 2 - int value = b & 0x7F; - b = bytes.bytes[bytes.offset++]; - value = (value << 7) | b & 0x7F; - if (b >= 0) return value; - - // byte 3 - b = bytes.bytes[bytes.offset++]; - value = (value << 7) | b & 0x7F; - if (b >= 0) return value; - - // byte 4 - b = bytes.bytes[bytes.offset++]; - value = (value << 7) | b & 0x7F; - if (b >= 0) return value; - - // byte 5 - b = bytes.bytes[bytes.offset++]; - return (value << 7) | b & 0x7F; - } - - /** - * Encodes the given number into bytes, starting at {@link BytesRef#length}. - * Assumes that the array is large enough. - */ - public static void encode(int value, BytesRef bytes) { - if ((value & ~0x7F) == 0) { - bytes.bytes[bytes.length] = (byte) value; - bytes.length++; - } else if ((value & ~0x3FFF) == 0) { - bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x3F80) >> 7)); - bytes.bytes[bytes.length + 1] = (byte) (value & 0x7F); - bytes.length += 2; - } else if ((value & ~0x1FFFFF) == 0) { - bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); - bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7)); - bytes.bytes[bytes.length + 2] = (byte) (value & 0x7F); - bytes.length += 3; - } else if ((value & ~0xFFFFFFF) == 0) { - bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); - bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); - bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7)); - bytes.bytes[bytes.length + 3] = (byte) (value & 0x7F); - bytes.length += 4; - } else { - bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28)); - bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); - bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); - bytes.bytes[bytes.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7)); - bytes.bytes[bytes.length + 4] = (byte) (value & 0x7F); - bytes.length += 5; - } - } - - private VInt8() { - // Just making it impossible to instantiate. 
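
Since VInt8.java is removed by this patch and its logic is inlined into the encoder/decoder below, the scheme in the javadoc above is worth seeing end-to-end. A minimal, self-contained sketch of the same encoding (illustrative only; VInt8Demo and its methods are hypothetical names, not part of this patch):

    public class VInt8Demo {
      // Encode one int into at most 5 bytes, most-significant chunk first;
      // every byte except the last has its MSB set, per the javadoc above.
      static int encode(int value, byte[] out, int off) {
        int start = off;
        boolean started = false;
        for (int shift = 28; shift > 0; shift -= 7) {
          int bits = (value >>> shift) & 0x7F;
          if (started || bits != 0) {
            out[off++] = (byte) (0x80 | bits);
            started = true;
          }
        }
        out[off++] = (byte) (value & 0x7F); // last byte: MSB clear
        return off - start;
      }

      // Streaming decode, the same shape as the loop this patch inlines into
      // VInt8IntDecoder: continuation bytes are negative (MSB set).
      static int decode(byte[] in, int off) {
        int value = 0;
        byte b;
        do {
          b = in[off++];
          value = (value << 7) | (b & 0x7F);
        } while (b < 0);
        return value;
      }

      public static void main(String[] args) {
        byte[] buf = new byte[5];
        int len = encode(100000, buf, 0);
        // prints "3 bytes: 86 8D 20", matching example 2 in the javadoc
        System.out.printf("%d bytes: %02X %02X %02X%n", len, buf[0], buf[1], buf[2]);
        System.out.println(decode(buf, 0)); // prints 100000
      }
    }
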
- } - -} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java index e9fe5600c9a..65a122ac7de 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java @@ -1,7 +1,9 @@ package org.apache.lucene.util.encoding; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.RamUsageEstimator; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -25,15 +27,32 @@ import org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class VInt8IntDecoder extends IntDecoder { +public final class VInt8IntDecoder extends IntDecoder { @Override - protected void doDecode(BytesRef buf, IntsRef values, int upto) { - while (buf.offset < upto) { - if (values.length == values.ints.length) { - values.grow(values.length + 10); // grow by few items, however not too many + public void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; + + // grow the buffer up front, even if by a large number of values (buf.length) + // that saves the need to check inside the loop for every decoded value if + // the buffer needs to grow. + if (values.ints.length < buf.length) { + values.ints = new int[ArrayUtil.oversize(buf.length, RamUsageEstimator.NUM_BYTES_INT)]; + } + + // it is better if the decoding is inlined like so, and not e.g. + // in a utility method + int upto = buf.offset + buf.length; + int value = 0; + int offset = buf.offset; + while (offset < upto) { + byte b = buf.bytes[offset++]; + if (b >= 0) { + values.ints[values.length++] = (value << 7) | b; + value = 0; + } else { + value = (value << 7) | (b & 0x7F); } - values.ints[values.length++] = VInt8.decode(buf); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java index 7c62bf3e035..cc4bc1332af 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java @@ -47,17 +47,47 @@ import org.apache.lucene.util.IntsRef; * * @lucene.experimental */ -public class VInt8IntEncoder extends IntEncoder { +public final class VInt8IntEncoder extends IntEncoder { @Override - protected void doEncode(IntsRef values, BytesRef buf, int upto) { + public void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt if (buf.bytes.length < maxBytesNeeded) { buf.grow(maxBytesNeeded); } + int upto = values.offset + values.length; for (int i = values.offset; i < upto; i++) { - VInt8.encode(values.ints[i], buf); + // it is better if the encoding is inlined like so, and not e.g. 
+ // in a utility method + int value = values.ints[i]; + if ((value & ~0x7F) == 0) { + buf.bytes[buf.length] = (byte) value; + buf.length++; + } else if ((value & ~0x3FFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 1] = (byte) (value & 0x7F); + buf.length += 2; + } else if ((value & ~0x1FFFFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 2] = (byte) (value & 0x7F); + buf.length += 3; + } else if ((value & ~0xFFFFFFF) == 0) { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 3] = (byte) (value & 0x7F); + buf.length += 4; + } else { + buf.bytes[buf.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28)); + buf.bytes[buf.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + buf.bytes[buf.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + buf.bytes[buf.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + buf.bytes[buf.length + 4] = (byte) (value & 0x7F); + buf.length += 5; + } } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java index 709bb3c16f5..251faff37f8 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestBase.java @@ -43,6 +43,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util._TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -64,7 +65,7 @@ import org.junit.BeforeClass; * limitations under the License. */ -/** Base faceted search test. */ +@SuppressCodecs({"SimpleText"}) public abstract class FacetTestBase extends LuceneTestCase { /** Holds a search and taxonomy Directories pair. 
*/ @@ -266,13 +267,12 @@ public abstract class FacetTestBase extends LuceneTestCase { FacetIndexingParams iParams = getFacetIndexingParams(Integer.MAX_VALUE); String delim = String.valueOf(iParams.getFacetDelimChar()); Map res = new HashMap(); - HashSet handledTerms = new HashSet(); + HashSet handledTerms = new HashSet(); for (CategoryListParams clp : iParams.getAllCategoryListParams()) { - Term baseTerm = new Term(clp.getTerm().field()); - if (!handledTerms.add(baseTerm)) { + if (!handledTerms.add(clp.field)) { continue; // already handled this term (for another list) } - Terms terms = MultiFields.getTerms(indexReader, baseTerm.field()); + Terms terms = MultiFields.getTerms(indexReader, clp.field); if (terms == null) { continue; } @@ -297,7 +297,7 @@ public abstract class FacetTestBase extends LuceneTestCase { FacetResultNode topResNode = fr.getFacetResultNode(); FacetRequest freq = fr.getFacetRequest(); if (VERBOSE) { - System.out.println(freq.getCategoryPath().toString()+ "\t\t" + topResNode); + System.out.println(freq.categoryPath.toString()+ "\t\t" + topResNode); } assertCountsAndCardinality(facetCountsTruth, topResNode, freq.getNumResults()); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java index d2ac98ac148..708e7448b42 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java @@ -2,14 +2,9 @@ package org.apache.lucene.facet; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; import java.util.List; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.TextField; -import org.apache.lucene.facet.index.FacetFields; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.search.FacetsCollector; import org.apache.lucene.facet.search.params.CountFacetRequest; @@ -23,7 +18,6 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.search.Collector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; @@ -116,15 +110,6 @@ public class FacetTestUtils { return collectors; } - public static void add(FacetIndexingParams iParams, RandomIndexWriter iw, - TaxonomyWriter tw, String... 
strings) throws IOException { - Document d = new Document(); - FacetFields facetFields = new FacetFields(tw, iParams); - facetFields.addFields(d, Collections.singletonList(new CategoryPath(strings))); - d.add(new TextField("content", "alpha", Field.Store.YES)); - iw.addDocument(d); - } - public static class IndexTaxonomyReaderPair { public DirectoryReader indexReader; public DirectoryTaxonomyReader taxReader; diff --git a/lucene/facet/src/test/org/apache/lucene/facet/index/TestFacetsPayloadMigrationReader.java b/lucene/facet/src/test/org/apache/lucene/facet/index/TestFacetsPayloadMigrationReader.java new file mode 100644 index 00000000000..c7ed773f04a --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/index/TestFacetsPayloadMigrationReader.java @@ -0,0 +1,398 @@ +package org.apache.lucene.facet.index; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Random; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import 
org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Tests facets index migration from payload to DocValues.*/ +public class TestFacetsPayloadMigrationReader extends LuceneTestCase { + + private static class PayloadFacetFields extends FacetFields { + + private static final class CountingListStream extends TokenStream { + private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final Iterator> categoriesData; + + CountingListStream(Map categoriesData) { + this.categoriesData = categoriesData.entrySet().iterator(); + } + + @Override + public boolean incrementToken() throws IOException { + if (!categoriesData.hasNext()) { + return false; + } + + Entry entry = categoriesData.next(); + termAtt.setEmpty().append(FacetsPayloadMigrationReader.PAYLOAD_TERM_TEXT + entry.getKey()); + payloadAtt.setPayload(entry.getValue()); + return true; + } + + } + + private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType(); + static { + COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true); + COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true); + COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + COUNTING_LIST_PAYLOAD_TYPE.setStored(false); + COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true); + COUNTING_LIST_PAYLOAD_TYPE.freeze(); + } + + public PayloadFacetFields(TaxonomyWriter taxonomyWriter, FacetIndexingParams params) { + super(taxonomyWriter, params); + } + + @Override + protected FieldType drillDownFieldType() { + // Since the payload is indexed in the same field as the drill-down terms, + // we must set IndexOptions to DOCS_AND_FREQS_AND_POSITIONS + final FieldType type = new FieldType(TextField.TYPE_NOT_STORED); + type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + type.freeze(); + return type; + } + + @Override + protected void addCountingListData(Document doc, Map categoriesData, String field) { + CountingListStream ts = new CountingListStream(categoriesData); + doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE)); + } + } + + private static final String[] DIMENSIONS = new String[] { "dim1", "dim2", "dim3.1", "dim3.2" }; + + private HashMap createIndex(Directory indexDir, Directory taxoDir, FacetIndexingParams fip) + throws Exception { + Random random = random(); + IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)); + conf.setMaxBufferedDocs(2); // force few 
segments + conf.setMergePolicy(NoMergePolicy.COMPOUND_FILES); // avoid merges so that we're left with few segments + IndexWriter indexWriter = new IndexWriter(indexDir, conf); + TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + + FacetFields facetFields = new PayloadFacetFields(taxoWriter, fip); + + HashMap expectedCounts = new HashMap(DIMENSIONS.length); + int numDocs = atLeast(10); + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + int numCategories = random.nextInt(3) + 1; + ArrayList categories = new ArrayList(numCategories); + HashSet docDimensions = new HashSet(); + while (numCategories-- > 0) { + String dim = DIMENSIONS[random.nextInt(DIMENSIONS.length)]; + // we should only increment the expected count by 1 per document + docDimensions.add(dim); + categories.add(new CategoryPath(dim, Integer.toString(i), Integer.toString(numCategories))); + } + facetFields.addFields(doc, categories); + doc.add(new StringField("docid", Integer.toString(i), Store.YES)); + doc.add(new TextField("foo", "content" + i, Store.YES)); + indexWriter.addDocument(doc); + + // update expected count per dimension + for (String dim : docDimensions) { + Integer val = expectedCounts.get(dim); + if (val == null) { + expectedCounts.put(dim, Integer.valueOf(1)); + } else { + expectedCounts.put(dim, Integer.valueOf(val.intValue() + 1)); + } + } + + if (random.nextDouble() < 0.2) { // add some documents that will be deleted + doc = new Document(); + doc.add(new StringField("del", "key", Store.NO)); + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dummy"))); + indexWriter.addDocument(doc); + } + } + + indexWriter.commit(); + taxoWriter.commit(); + + // delete the docs that were marked for deletion. note that the 'dummy' + // category is not removed from the taxonomy, so must account for it when we + // verify the migrated index. 
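
A note before the deleteDocuments call that follows: the taxonomy is append-only, so removing the 'del'-tagged documents from the index never removes the 'dummy' category from the taxonomy, which is exactly why verifyIndexOrdinals() below must special-case its ordinal. A hedged sketch of that invariant (assumes the taxoDir fixture built above):

    // Even after the tagged documents are deleted and the change committed,
    // the category is still registered in the taxonomy:
    TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);
    int dummyOrdinal = taxo.getOrdinal(new CategoryPath("dummy"));
    assertTrue("category should survive document deletion", dummyOrdinal > 0);
    taxo.close();
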
+ indexWriter.deleteDocuments(new Term("del", "key")); + indexWriter.commit(); + + IOUtils.close(indexWriter, taxoWriter); + + return expectedCounts; + } + + private void migrateIndex(Directory indexDir, FacetIndexingParams fip) throws Exception { + final Map fieldTerms = FacetsPayloadMigrationReader.buildFieldTermsMap(indexDir, fip); + DirectoryReader reader = DirectoryReader.open(indexDir); + List leaves = reader.leaves(); + int numReaders = leaves.size(); + AtomicReader wrappedLeaves[] = new AtomicReader[numReaders]; + for (int i = 0; i < numReaders; i++) { + wrappedLeaves[i] = new FacetsPayloadMigrationReader(leaves.get(i).reader(), fieldTerms); + } + + IndexWriter writer = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, null)); + writer.deleteAll(); + try { + writer.addIndexes(new MultiReader(wrappedLeaves)); + writer.commit(); + } finally { + reader.close(); + writer.close(); + } + } + + private void verifyMigratedIndex(Directory indexDir, Directory taxoDir, HashMap expectedCounts, + FacetIndexingParams fip) throws Exception { + DirectoryReader indexReader = DirectoryReader.open(indexDir); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + + assertFalse("index should not have deletions", indexReader.hasDeletions()); + + verifyNotFacetsData(indexReader, searcher); + verifyFacetedSearch(expectedCounts, fip, indexReader, taxoReader, searcher); + verifyDrillDown(expectedCounts, fip, indexReader, taxoReader, searcher); + verifyIndexOrdinals(indexReader, taxoReader, fip); + + IOUtils.close(indexReader, taxoReader); + } + + private void verifyNotFacetsData(DirectoryReader indexReader, IndexSearcher searcher) throws IOException { + // verify that non facets data was not damaged + TotalHitCountCollector total = new TotalHitCountCollector(); + searcher.search(new PrefixQuery(new Term("foo", "content")), total); + assertEquals("invalid number of results for content query", total.getTotalHits(), indexReader.maxDoc()); + + int numDocIDs = 0; + for (AtomicReaderContext context : indexReader.leaves()) { + Terms docIDs = context.reader().terms("docid"); + assertNotNull(docIDs); + TermsEnum te = docIDs.iterator(null); + while (te.next() != null) { + ++numDocIDs; + } + } + assertEquals("invalid number of docid terms", indexReader.maxDoc(), numDocIDs); + } + + private void verifyFacetedSearch(Map expectedCounts, FacetIndexingParams fip, + DirectoryReader indexReader, TaxonomyReader taxoReader, IndexSearcher searcher) throws IOException { + // run faceted search and assert expected counts + ArrayList requests = new ArrayList(expectedCounts.size()); + for (String dim : expectedCounts.keySet()) { + requests.add(new CountFacetRequest(new CategoryPath(dim), 5)); + } + FacetSearchParams fsp = new FacetSearchParams(requests, fip); + FacetsCollector fc = new FacetsCollector(fsp, indexReader, taxoReader); + MatchAllDocsQuery base = new MatchAllDocsQuery(); + searcher.search(base, fc); + List facetResults = fc.getFacetResults(); + assertEquals(requests.size(), facetResults.size()); + for (FacetResult res : facetResults) { + FacetResultNode node = res.getFacetResultNode(); + String dim = node.getLabel().components[0]; + assertEquals("wrong count for " + dim, expectedCounts.get(dim).intValue(), (int) node.getValue()); + } + } + + private void verifyDrillDown(Map expectedCounts, FacetIndexingParams fip, DirectoryReader indexReader, + TaxonomyReader taxoReader, IndexSearcher searcher) throws IOException { + // verify 
drill-down + for (String dim : expectedCounts.keySet()) { + CategoryPath drillDownCP = new CategoryPath(dim); + ArrayList request = new ArrayList(1); + request.add(new CountFacetRequest(drillDownCP, 10)); + FacetSearchParams fsp = new FacetSearchParams(request, fip); + Query drillDown = DrillDown.query(fsp, new MatchAllDocsQuery(), drillDownCP); + TotalHitCountCollector total = new TotalHitCountCollector(); + FacetsCollector fc = new FacetsCollector(fsp, indexReader, taxoReader); + searcher.search(drillDown, MultiCollector.wrap(fc, total)); + assertTrue("no results for drill-down query " + drillDown, total.getTotalHits() > 0); + List facetResults = fc.getFacetResults(); + assertEquals(1, facetResults.size()); + FacetResultNode rootNode = facetResults.get(0).getFacetResultNode(); + assertEquals("wrong count for " + dim, expectedCounts.get(dim).intValue(), (int) rootNode.getValue()); + } + } + + private void verifyIndexOrdinals(DirectoryReader indexReader, TaxonomyReader taxoReader, FacetIndexingParams fip) + throws IOException { + // verify that the ordinals in the index match the ones in the taxonomy, and vice versa + + // collect all fields which have DocValues, to assert later that all were + // visited i.e. that during migration we didn't add FieldInfos with no + // DocValues + HashSet docValuesFields = new HashSet(); + for (AtomicReaderContext context : indexReader.leaves()) { + FieldInfos infos = context.reader().getFieldInfos(); + for (FieldInfo info : infos) { + if (info.hasDocValues()) { + docValuesFields.add(info.name); + } + } + } + + // check that all visited ordinals are found in the taxonomy and vice versa + boolean[] foundOrdinals = new boolean[taxoReader.getSize()]; + for (int i = 0; i < foundOrdinals.length; i++) { + foundOrdinals[i] = false; // init to be on the safe side + } + foundOrdinals[0] = true; // ROOT ordinals isn't indexed + // mark 'dummy' category ordinal as seen + int dummyOrdinal = taxoReader.getOrdinal(new CategoryPath("dummy")); + if (dummyOrdinal > 0) { + foundOrdinals[dummyOrdinal] = true; + } + + int partitionSize = fip.getPartitionSize(); + int numPartitions = (int) Math.ceil(taxoReader.getSize() / (double) partitionSize); + final IntsRef ordinals = new IntsRef(32); + for (String dim : DIMENSIONS) { + CategoryListParams clp = fip.getCategoryListParams(new CategoryPath(dim)); + int partitionOffset = 0; + for (int partition = 0; partition < numPartitions; partition++, partitionOffset += partitionSize) { + final CategoryListIterator cli = clp.createCategoryListIterator(partition); + for (AtomicReaderContext context : indexReader.leaves()) { + if (cli.setNextReader(context)) { // not all fields may exist in all segments + // remove that field from the list of DocValues fields + docValuesFields.remove(clp.field + PartitionsUtils.partitionName(partition)); + int maxDoc = context.reader().maxDoc(); + for (int doc = 0; doc < maxDoc; doc++) { + cli.getOrdinals(doc, ordinals); + for (int j = 0; j < ordinals.length; j++) { + // verify that the ordinal is recognized by the taxonomy + int ordinal = ordinals.ints[j] + partitionOffset; + assertTrue("should not have received dummy ordinal (" + dummyOrdinal + ")", dummyOrdinal != ordinal); + assertNotNull("missing category for ordinal " + ordinal, taxoReader.getPath(ordinal)); + foundOrdinals[ordinal] = true; + } + } + } + } + } + } + + assertTrue("some fields which have docValues were not visited: " + docValuesFields, docValuesFields.isEmpty()); + + for (int i = 0; i < foundOrdinals.length; i++) { + assertTrue("ordinal 
" + i + " not visited", foundOrdinals[i]); + } + } + + private void doTestMigration(final int partitionSize) throws Exception { + // create a facets index with PayloadFacetFields and check it after migration + Directory indexDir = newDirectory(); + Directory taxoDir = newDirectory(); + + // set custom CLP fields for two dimensions and use the default ($facets) for the other two + HashMap params = new HashMap(); + params.put(new CategoryPath(DIMENSIONS[0]), new CategoryListParams(DIMENSIONS[0])); + params.put(new CategoryPath(DIMENSIONS[1]), new CategoryListParams(DIMENSIONS[1])); + FacetIndexingParams fip = new PerDimensionIndexingParams(params) { + @Override + public int getPartitionSize() { + return partitionSize; + } + }; + + HashMap expectedCounts = createIndex(indexDir, taxoDir, fip); + migrateIndex(indexDir, fip); + verifyMigratedIndex(indexDir, taxoDir, expectedCounts, fip); + + IOUtils.close(indexDir, taxoDir); + } + + @Test + public void testMigration() throws Exception { + doTestMigration(Integer.MAX_VALUE); + } + + @Test + public void testMigrationWithPartitions() throws Exception { + doTestMigration(2); + } + +} diff --git a/lucene/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java b/lucene/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java index 4ea2aead894..4534b89ecba 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java @@ -1,10 +1,12 @@ package org.apache.lucene.facet.index.params; -import org.apache.lucene.index.Term; -import org.junit.Test; - import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.util.encoding.DGapVInt8IntEncoder; +import org.apache.lucene.util.encoding.IntDecoder; +import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.encoding.SortingIntEncoder; +import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; +import org.junit.Test; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -28,9 +30,11 @@ public class CategoryListParamsTest extends LuceneTestCase { @Test public void testDefaultSettings() { CategoryListParams clp = new CategoryListParams(); - assertEquals("wrong default term", new Term("$facets", "$fulltree$"), clp.getTerm()); - assertEquals("unexpected default encoder", "Sorting (Unique (DGap (VInt8)))", clp.createEncoder().toString()); - assertEquals("unexpected default decoder", "DGap (VInt8)", clp.createEncoder().createMatchingDecoder().toString()); + assertEquals("wrong default field", "$facets", clp.field); + IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapVInt8IntEncoder())); + IntDecoder decoder = encoder.createMatchingDecoder(); + assertEquals("unexpected default encoder", encoder.toString(), clp.createEncoder().toString()); + assertEquals("unexpected default decoder", decoder.toString(), clp.createEncoder().createMatchingDecoder().toString()); } /** @@ -64,8 +68,8 @@ public class CategoryListParamsTest extends LuceneTestCase { clParams1.hashCode(), clParams2.hashCode()); // Test 2 CategoryListParams with the same specified Term - clParams1 = new CategoryListParams(new Term("test")); - clParams2 = new CategoryListParams(new Term("test")); + clParams1 = new CategoryListParams("test"); + clParams2 = new CategoryListParams("test"); assertEquals( "2 CategoryListParams with the same term 
should equal each other.", clParams1, clParams2); @@ -73,8 +77,8 @@ public class CategoryListParamsTest extends LuceneTestCase { clParams1.hashCode(), clParams2.hashCode()); // Test 2 CategoryListParams with DIFFERENT terms - clParams1 = new CategoryListParams(new Term("test1")); - clParams2 = new CategoryListParams(new Term("test2")); + clParams1 = new CategoryListParams("test1"); + clParams2 = new CategoryListParams("test2"); assertFalse( "2 CategoryListParams with the different terms should NOT equal each other.", clParams1.equals(clParams2)); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/index/params/FacetIndexingParamsTest.java b/lucene/facet/src/test/org/apache/lucene/facet/index/params/FacetIndexingParamsTest.java index c04d68d737c..de5f233072e 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/index/params/FacetIndexingParamsTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/index/params/FacetIndexingParamsTest.java @@ -35,8 +35,7 @@ public class FacetIndexingParamsTest extends LuceneTestCase { assertNotNull("Missing default category list", dfip.getAllCategoryListParams()); assertEquals("all categories have the same CategoryListParams by default", dfip.getCategoryListParams(null), dfip.getCategoryListParams(new CategoryPath("a"))); - assertEquals("Expected default category list term is $facets:$fulltree$", - new Term("$facets", "$fulltree$"), dfip.getCategoryListParams(null).getTerm()); + assertEquals("Expected default category list field is $facets", "$facets", dfip.getCategoryListParams(null).field); String expectedDDText = "a" + dfip.getFacetDelimChar() + "b"; CategoryPath cp = new CategoryPath("a", "b"); @@ -48,13 +47,13 @@ public class FacetIndexingParamsTest extends LuceneTestCase { assertEquals("wrong drill-down term text", expectedDDText, new String( buf, 0, numchars)); CategoryListParams clParams = dfip.getCategoryListParams(null); - assertEquals("partition for all ordinals is the first", "$fulltree$", - PartitionsUtils.partitionNameByOrdinal(dfip, clParams , 250)); + assertEquals("partition for all ordinals is the first", "", + PartitionsUtils.partitionNameByOrdinal(dfip, 250)); assertEquals("for partition 0, the same name should be returned", - "$fulltree$", PartitionsUtils.partitionName(clParams, 0)); + "", PartitionsUtils.partitionName(0)); assertEquals( "for any other, it's the concatenation of name + partition", - "$fulltree$1", PartitionsUtils.partitionName(clParams, 1)); + PartitionsUtils.PART_NAME_PREFIX + "1", PartitionsUtils.partitionName(1)); assertEquals("default partition number is always 0", 0, PartitionsUtils.partitionNumber(dfip,100)); assertEquals("default partition size is unbounded", Integer.MAX_VALUE, @@ -63,11 +62,9 @@ public class FacetIndexingParamsTest extends LuceneTestCase { @Test public void testCategoryListParamsWithDefaultIndexingParams() { - CategoryListParams clp = new CategoryListParams( - new Term("clp", "value")); + CategoryListParams clp = new CategoryListParams("clp"); FacetIndexingParams dfip = new FacetIndexingParams(clp); - assertEquals("Expected default category list term is " + clp.getTerm(), - clp.getTerm(), dfip.getCategoryListParams(null).getTerm()); + assertEquals("Expected default category list field is " + clp.field, clp.field, dfip.getCategoryListParams(null).field); } @Test diff --git a/lucene/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java b/lucene/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java index 
c73cf8db0fb..6db5e22b262 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java @@ -32,44 +32,31 @@ public class PerDimensionIndexingParamsTest extends LuceneTestCase { public void testTopLevelSettings() { FacetIndexingParams ifip = new PerDimensionIndexingParams(Collections.emptyMap()); assertNotNull("Missing default category list", ifip.getAllCategoryListParams()); - assertEquals( - "Expected default category list term is $facets:$fulltree$", - new Term("$facets", "$fulltree$"), ifip.getCategoryListParams( - null).getTerm()); - String expectedDDText = "a" - + ifip.getFacetDelimChar() + "b"; + assertEquals("Expected default category list field is $facets", "$facets", ifip.getCategoryListParams(null).field); + String expectedDDText = "a" + ifip.getFacetDelimChar() + "b"; CategoryPath cp = new CategoryPath("a", "b"); - assertEquals("wrong drill-down term", new Term("$facets", - expectedDDText), DrillDown.term(ifip,cp)); + assertEquals("wrong drill-down term", new Term("$facets", expectedDDText), DrillDown.term(ifip,cp)); char[] buf = new char[20]; int numchars = ifip.drillDownTermText(cp, buf); assertEquals("3 characters should be written", 3, numchars); - assertEquals("wrong drill-down term text", expectedDDText, new String( - buf, 0, numchars)); + assertEquals("wrong drill-down term text", expectedDDText, new String(buf, 0, numchars)); CategoryListParams clParams = ifip.getCategoryListParams(null); - assertEquals("partition for all ordinals is the first", "$fulltree$", - PartitionsUtils.partitionNameByOrdinal(ifip, clParams , 250)); - assertEquals("for partition 0, the same name should be returned", - "$fulltree$", PartitionsUtils.partitionName(clParams, 0)); - assertEquals( - "for any other, it's the concatenation of name + partition", - "$fulltree$1", PartitionsUtils.partitionName(clParams, 1)); - assertEquals("default partition number is always 0", 0, - PartitionsUtils.partitionNumber(ifip,100)); - - assertEquals("default partition size is unbounded", Integer.MAX_VALUE, - ifip.getPartitionSize()); + assertEquals("partition for all ordinals is the first", "", PartitionsUtils.partitionNameByOrdinal(ifip, 250)); + assertEquals("for partition 0, the same name should be returned", "", PartitionsUtils.partitionName(0)); + assertEquals("for any other, it's the concatenation of name + partition", PartitionsUtils.PART_NAME_PREFIX + "1", PartitionsUtils.partitionName(1)); + assertEquals("default partition number is always 0", 0, PartitionsUtils.partitionNumber(ifip,100)); + assertEquals("default partition size is unbounded", Integer.MAX_VALUE, ifip.getPartitionSize()); } @Test public void testCategoryListParamsAddition() { - CategoryListParams clp = new CategoryListParams(new Term("clp", "value")); + CategoryListParams clp = new CategoryListParams("clp"); PerDimensionIndexingParams tlfip = new PerDimensionIndexingParams( Collections. 
singletonMap(new CategoryPath("a"), clp)); - assertEquals("Expected category list term is " + clp.getTerm(), - clp.getTerm(), tlfip.getCategoryListParams(new CategoryPath("a")).getTerm()); - assertNotSame("Unexpected default category list " + clp.getTerm(), clp, tlfip.getCategoryListParams(null)); + assertEquals("Expected category list field is " + clp.field, + clp.field, tlfip.getCategoryListParams(new CategoryPath("a")).field); + assertNotSame("Unexpected default category list " + clp.field, clp, tlfip.getCategoryListParams(null)); } } \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java index c326bbb30c6..9f3c977e398 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java @@ -1,23 +1,15 @@ package org.apache.lucene.facet.search; -import java.io.IOException; -import java.io.Reader; import java.util.HashSet; import java.util.Set; -import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.TextField; +import org.apache.lucene.document.StraightBytesDocValuesField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; @@ -48,42 +40,6 @@ import org.junit.Test; public class CategoryListIteratorTest extends LuceneTestCase { - private static final class DataTokenStream extends TokenStream { - - private final PayloadAttribute payload = addAttribute(PayloadAttribute.class); - private final BytesRef buf; - private final IntEncoder encoder; - private final CharTermAttribute term = addAttribute(CharTermAttribute.class); - - private int idx; - private boolean exhausted = false; - - public DataTokenStream(String text, IntEncoder encoder) { - this.encoder = encoder; - term.setEmpty().append(text); - buf = new BytesRef(); - payload.setPayload(buf); - } - - public void setIdx(int idx) { - this.idx = idx; - exhausted = false; - } - - @Override - public boolean incrementToken() throws IOException { - if (exhausted) { - return false; - } - - // must copy because encoders may change the buffer - encoder.encode(IntsRef.deepCopyOf(data[idx]), buf); - exhausted = true; - return true; - } - - } - static final IntsRef[] data = new IntsRef[] { new IntsRef(new int[] { 1, 2 }, 0, 2), new IntsRef(new int[] { 3, 4 }, 0, 2), @@ -95,13 +51,13 @@ public class CategoryListIteratorTest extends LuceneTestCase { public void testPayloadCategoryListIteraor() throws Exception { Directory dir = newDirectory(); final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); - DataTokenStream dts = new DataTokenStream("1",encoder); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), 
MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())); + BytesRef buf = new BytesRef(); for (int i = 0; i < data.length; i++) { - dts.setIdx(i); Document doc = new Document(); - doc.add(new TextField("f", dts)); + encoder.encode(IntsRef.deepCopyOf(data[i]), buf); + doc.add(new StraightBytesDocValuesField("f", buf)); writer.addDocument(doc); } IndexReader reader = writer.getReader(); @@ -109,9 +65,9 @@ public class CategoryListIteratorTest extends LuceneTestCase { int totalCategories = 0; IntsRef ordinals = new IntsRef(); - CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder()); + CategoryListIterator cli = new DocValuesCategoryListIterator("f", encoder.createMatchingDecoder()); for (AtomicReaderContext context : reader.leaves()) { - cli.setNextReader(context); + assertTrue("failed to initalize iterator", cli.setNextReader(context)); int maxDoc = context.reader().maxDoc(); int dataIdx = context.docBase; for (int doc = 0; doc < maxDoc; doc++, dataIdx++) { @@ -136,24 +92,17 @@ public class CategoryListIteratorTest extends LuceneTestCase { public void testPayloadIteratorWithInvalidDoc() throws Exception { Directory dir = newDirectory(); final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); - DataTokenStream dts = new DataTokenStream("1", encoder); - // this test requires that no payloads ever be randomly present! - final Analyzer noPayloadsAnalyzer = new Analyzer() { - @Override - public TokenStreamComponents createComponents(String fieldName, Reader reader) { - return new TokenStreamComponents(new MockTokenizer(reader, MockTokenizer.KEYWORD, false)); - } - }; // NOTE: test is wired to LogMP... because test relies on certain docids having payloads RandomIndexWriter writer = new RandomIndexWriter(random(), dir, - newIndexWriterConfig(TEST_VERSION_CURRENT, noPayloadsAnalyzer).setMergePolicy(newLogMergePolicy())); + newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy())); for (int i = 0; i < data.length; i++) { Document doc = new Document(); if (i == 0) { - dts.setIdx(i); - doc.add(new TextField("f", dts)); // only doc 0 has payloads! 
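
This seam is the heart of the migration in this test: instead of driving ordinals through a payload-bearing TokenStream, the encoded bytes go directly into a binary DocValues field. A standalone sketch of the new write path, using the same encoder chain as these tests (all class names are from this patch):

    IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(
        new DGapIntEncoder(new VInt8IntEncoder())));
    BytesRef buf = new BytesRef();
    // sorts, dedups, delta-encodes (d-gap) and VInt8-packs the ordinals
    encoder.encode(new IntsRef(new int[] { 3, 1, 2, 2 }, 0, 4), buf);
    Document doc = new Document();
    doc.add(new StraightBytesDocValuesField("f", buf));
    // the matching decoder reverses the chain: {3, 1, 2, 2} comes back as {1, 2, 3}
    IntsRef decoded = new IntsRef();
    encoder.createMatchingDecoder().decode(buf, decoded);

Note that the chain mutates the IntsRef it is given (sorting and dedup happen in place), which is why the tests pass IntsRef.deepCopyOf(data[i]) rather than data[i] itself.
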
+ BytesRef buf = new BytesRef(); + encoder.encode(IntsRef.deepCopyOf(data[i]), buf ); + doc.add(new StraightBytesDocValuesField("f", buf)); } else { - doc.add(new TextField("f", "1", Field.Store.NO)); + doc.add(new StraightBytesDocValuesField("f", new BytesRef())); } writer.addDocument(doc); writer.commit(); @@ -164,9 +113,9 @@ public class CategoryListIteratorTest extends LuceneTestCase { int totalCategories = 0; IntsRef ordinals = new IntsRef(); - CategoryListIterator cli = new PayloadCategoryListIteraor(new Term("f","1"), encoder.createMatchingDecoder()); + CategoryListIterator cli = new DocValuesCategoryListIterator("f", encoder.createMatchingDecoder()); for (AtomicReaderContext context : reader.leaves()) { - cli.setNextReader(context); + assertTrue("failed to initalize iterator", cli.setNextReader(context)); int maxDoc = context.reader().maxDoc(); int dataIdx = context.docBase; for (int doc = 0; doc < maxDoc; doc++, dataIdx++) { @@ -176,13 +125,13 @@ public class CategoryListIteratorTest extends LuceneTestCase { } cli.getOrdinals(doc, ordinals); if (dataIdx == 0) { - assertTrue("document 0 must have a payload", ordinals.length > 0); + assertTrue("document 0 must have ordinals", ordinals.length > 0); for (int j = 0; j < ordinals.length; j++) { assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); } totalCategories += ordinals.length; } else { - assertTrue("only document 0 should have a payload", ordinals.length == 0); + assertTrue("only document 0 should have ordinals", ordinals.length == 0); } } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java index c20a98c41fe..9a881cbad20 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java @@ -60,8 +60,8 @@ public class DrillDownTest extends LuceneTestCase { public DrillDownTest() { Map paramsMap = new HashMap(); - paramsMap.put(new CategoryPath("a"), new CategoryListParams(new Term("testing_facets_a", "a"))); - paramsMap.put(new CategoryPath("b"), new CategoryListParams(new Term("testing_facets_b", "b"))); + paramsMap.put(new CategoryPath("a"), new CategoryListParams("testing_facets_a")); + paramsMap.put(new CategoryPath("b"), new CategoryListParams("testing_facets_b")); nonDefaultParams = new PerDimensionIndexingParams(paramsMap); } @@ -113,8 +113,8 @@ public class DrillDownTest extends LuceneTestCase { } @Test - public void testTermDefault() { - String defaultField = CategoryListParams.DEFAULT_TERM.field(); + public void testDefaultField() { + String defaultField = CategoryListParams.DEFAULT_FIELD; Term termA = DrillDown.term(defaultParams, new CategoryPath("a")); assertEquals(new Term(defaultField, "a"), termA); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java deleted file mode 100644 index 0d7d3173aa1..00000000000 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java +++ /dev/null @@ -1,145 +0,0 @@ -package org.apache.lucene.facet.search; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.util.IntsRef; - -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import 
org.apache.lucene.facet.FacetTestBase; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.search.cache.CategoryListCache; -import org.apache.lucene.facet.search.cache.CategoryListData; -import org.apache.lucene.facet.search.params.CountFacetRequest; -import org.apache.lucene.facet.search.params.FacetRequest; -import org.apache.lucene.facet.search.params.FacetSearchParams; -import org.apache.lucene.facet.search.results.FacetResult; -import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.index.AtomicReaderContext; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -public class TestCategoryListCache extends FacetTestBase { - - public TestCategoryListCache() { - super(); - } - - @Before - @Override - public void setUp() throws Exception { - super.setUp(); - initIndex(); - } - - @After - @Override - public void tearDown() throws Exception { - closeAll(); - super.tearDown(); - } - - @Test - public void testNoClCache() throws Exception { - doTest(false,false); - } - - @Test - public void testCorrectClCache() throws Exception { - doTest(true,false); - } - - @Test - public void testWrongClCache() throws Exception { - doTest(true,true); - } - - private void doTest(boolean withCache, boolean plantWrongData) throws Exception { - Map truth = facetCountsTruth(); - CategoryPath cp = (CategoryPath) truth.keySet().toArray()[0]; // any category path will do for this test - FacetIndexingParams iParams = FacetIndexingParams.ALL_PARENTS; - final CategoryListCache clCache; - if (withCache) { - //let's use a cached cl data - CategoryListParams clp = new CategoryListParams(); // default term ok as only single list - clCache = new CategoryListCache(); - clCache.loadAndRegister(clp, indexReader, taxoReader, iParams); - if (plantWrongData) { - // let's mess up the cached data and then expect a wrong result... 
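
For orientation while reading this deleted cache test and the migrated tests elsewhere in the patch: the CategoryListIterator contract is unchanged. setNextReader() reports whether the segment carries the category list at all, and getOrdinals() fills an IntsRef per document. A sketch of the usual consumption loop (the method name is illustrative; it mirrors the totalCategories counting in CategoryListIteratorTest):

    static int countAllOrdinals(IndexReader reader, CategoryListIterator cli) throws IOException {
      int totalCategories = 0;
      IntsRef ordinals = new IntsRef(32);
      for (AtomicReaderContext context : reader.leaves()) {
        if (!cli.setNextReader(context)) {
          continue; // this segment carries no category list data
        }
        int maxDoc = context.reader().maxDoc();
        for (int doc = 0; doc < maxDoc; doc++) {
          cli.getOrdinals(doc, ordinals); // length == 0 for docs without categories
          totalCategories += ordinals.length;
        }
      }
      return totalCategories;
    }
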
- messCachedData(clCache, clp); - } - } else { - clCache = null; - } - List req = new ArrayList(); - req.add(new CountFacetRequest(cp, 10)); - final FacetSearchParams sParams = new FacetSearchParams(req, iParams) { - @Override - public CategoryListCache getCategoryListCache() { - return clCache; - } - }; - FacetsCollector fc = new FacetsCollector(sParams, indexReader, taxoReader); - searcher.search(new MatchAllDocsQuery(), fc); - List res = fc.getFacetResults(); - try { - assertCountsAndCardinality(truth, res); - assertFalse("Correct results not expected when wrong data was cached", plantWrongData); - } catch (Throwable e) { - assertTrue("Wrong results not expected unless wrong data was cached", withCache); - assertTrue("Wrong results not expected unless wrong data was cached", plantWrongData); - } - } - - /** Mess the cached data for this {@link CategoryListParams} */ - private void messCachedData(CategoryListCache clCache, CategoryListParams clp) { - final CategoryListData cld = clCache.get(clp); - CategoryListData badCld = new CategoryListData() { - @Override - public CategoryListIterator iterator(int partition) throws IOException { - final CategoryListIterator it = cld.iterator(partition); - return new CategoryListIterator() { - @Override - public void getOrdinals(int docID, IntsRef ints) throws IOException { - it.getOrdinals(docID, ints); - for (int i = 0; i < ints.length; i++) { - if (ints.ints[i] > 1) { - ints.ints[i]--; - } else { - ints.ints[i]++; - } - } - } - @Override - public boolean setNextReader(AtomicReaderContext context) throws IOException { - return it.setNextReader(context); - } - }; - } - }; - clCache.register(clp, badCld); - } - -} diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java index 12a054c5be2..a614dc707ba 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java @@ -10,14 +10,18 @@ import java.util.Map; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.facet.FacetTestUtils; +import org.apache.lucene.facet.index.FacetFields; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; import org.apache.lucene.facet.search.params.CountFacetRequest; import org.apache.lucene.facet.search.params.FacetRequest; -import org.apache.lucene.facet.search.params.FacetSearchParams; import org.apache.lucene.facet.search.params.FacetRequest.ResultMode; +import org.apache.lucene.facet.search.params.FacetSearchParams; import org.apache.lucene.facet.search.results.FacetResult; import org.apache.lucene.facet.search.results.FacetResultNode; import org.apache.lucene.facet.taxonomy.CategoryPath; @@ -25,23 +29,19 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.taxonomy.TaxonomyWriter; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; -import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.AtomicReader; +import 
org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig.OpenMode; -import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiCollector; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -63,6 +63,18 @@ import org.junit.Test; public class TestMultipleCategoryLists extends LuceneTestCase { + private static final CategoryPath[] CATEGORIES = new CategoryPath[] { + new CategoryPath("Author", "Mark Twain"), + new CategoryPath("Author", "Stephen King"), + new CategoryPath("Author", "Kurt Vonnegut"), + new CategoryPath("Band", "Rock & Pop", "The Beatles"), + new CategoryPath("Band", "Punk", "The Ramones"), + new CategoryPath("Band", "Rock & Pop", "U2"), + new CategoryPath("Band", "Rock & Pop", "REM"), + new CategoryPath("Band", "Rock & Pop", "Dave Matthews Band"), + new CategoryPath("Composer", "Bach"), + }; + @Test public void testDefault() throws Exception { Directory[][] dirs = getDirs(); @@ -72,9 +84,6 @@ public class TestMultipleCategoryLists extends LuceneTestCase { // create and open a taxonomy writer TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], OpenMode.CREATE); - /** - * Configure with no custom counting lists - */ PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(Collections.emptyMap()); seedIndex(iw, tw, iParams); @@ -88,58 +97,48 @@ public class TestMultipleCategoryLists extends LuceneTestCase { // prepare searcher to search against IndexSearcher searcher = newSearcher(ir); - FacetsCollector facetsCollector = performSearch(iParams, tr, ir, - searcher); - - // Obtain facets results and hand-test them - assertCorrectResults(facetsCollector); - - DocsEnum td = _TestUtil.docs(random(), ir, "$facets", new BytesRef("$fulltree$"), MultiFields.getLiveDocs(ir), null, DocsEnum.FLAG_NONE); - assertTrue(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); - - tr.close(); - ir.close(); - iw.close(); - tw.close(); - IOUtils.close(dirs[0]); - } - - @Test - public void testCustom() throws Exception { - Directory[][] dirs = getDirs(); - // create and open an index writer - RandomIndexWriter iw = new RandomIndexWriter(random(), dirs[0][0], newIndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))); - // create and open a taxonomy writer - TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], - OpenMode.CREATE); - - PerDimensionIndexingParams iParams = new PerDimensionIndexingParams( - Collections.singletonMap(new CategoryPath("Author"), - new CategoryListParams(new Term("$author", "Authors")))); - seedIndex(iw, tw, iParams); - - IndexReader ir = iw.getReader(); - tw.commit(); - - // prepare index reader and taxonomy. 
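
The recurring rewrite in this file: CategoryListParams used to be keyed by a full Term (field plus text), but after this patch only the field name matters, because ordinals now live in binary DocValues rather than in a posting's payloads. A sketch of the new setup, with the values this test uses:

    // before: new CategoryListParams(new Term("$author", "Authors"))
    CategoryListParams authorParams = new CategoryListParams("$author");
    FacetIndexingParams iParams = new PerDimensionIndexingParams(
        Collections.singletonMap(new CategoryPath("Author"), authorParams));
    // dimensions without an explicit mapping fall back to the default field
    assertEquals(CategoryListParams.DEFAULT_FIELD, iParams.getCategoryListParams(null).field);
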
- TaxonomyReader tr = new DirectoryTaxonomyReader(dirs[0][1]); - - // prepare searcher to search against - IndexSearcher searcher = newSearcher(ir); - FacetsCollector facetsCollector = performSearch(iParams, tr, ir, searcher); // Obtain facets results and hand-test them assertCorrectResults(facetsCollector); - assertPostingListExists("$facets", "$fulltree$", ir); - assertPostingListExists("$author", "Authors", ir); + assertOrdinalsExist("$facets", ir); - tr.close(); - ir.close(); - iw.close(); - tw.close(); + IOUtils.close(tr, ir, iw, tw); + IOUtils.close(dirs[0]); + } + + @Test + public void testCustom() throws Exception { + Directory[][] dirs = getDirs(); + // create and open an index writer + RandomIndexWriter iw = new RandomIndexWriter(random(), dirs[0][0], newIndexWriterConfig( + TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))); + // create and open a taxonomy writer + TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], OpenMode.CREATE); + + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams( + Collections.singletonMap(new CategoryPath("Author"), new CategoryListParams("$author"))); + seedIndex(iw, tw, iParams); + + IndexReader ir = iw.getReader(); + tw.commit(); + + // prepare index reader and taxonomy. + TaxonomyReader tr = new DirectoryTaxonomyReader(dirs[0][1]); + + // prepare searcher to search against + IndexSearcher searcher = newSearcher(ir); + + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, searcher); + + // Obtain facets results and hand-test them + assertCorrectResults(facetsCollector); + + assertOrdinalsExist("$facets", ir); + assertOrdinalsExist("$author", ir); + + IOUtils.close(tr, ir, iw, tw); IOUtils.close(dirs[0]); } @@ -150,12 +149,11 @@ public class TestMultipleCategoryLists extends LuceneTestCase { RandomIndexWriter iw = new RandomIndexWriter(random(), dirs[0][0], newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))); // create and open a taxonomy writer - TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], - OpenMode.CREATE); + TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], OpenMode.CREATE); Map paramsMap = new HashMap(); - paramsMap.put(new CategoryPath("Band"), new CategoryListParams(new Term("$music", "Bands"))); - paramsMap.put(new CategoryPath("Composer"), new CategoryListParams(new Term("$music", "Composers"))); + paramsMap.put(new CategoryPath("Band"), new CategoryListParams("$music")); + paramsMap.put(new CategoryPath("Composer"), new CategoryListParams("$music")); PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(paramsMap); seedIndex(iw, tw, iParams); @@ -168,26 +166,27 @@ public class TestMultipleCategoryLists extends LuceneTestCase { // prepare searcher to search against IndexSearcher searcher = newSearcher(ir); - FacetsCollector facetsCollector = performSearch(iParams, tr, ir, - searcher); + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, searcher); // Obtain facets results and hand-test them assertCorrectResults(facetsCollector); - assertPostingListExists("$facets", "$fulltree$", ir); - assertPostingListExists("$music", "Bands", ir); - assertPostingListExists("$music", "Composers", ir); + assertOrdinalsExist("$facets", ir); + assertOrdinalsExist("$music", ir); + assertOrdinalsExist("$music", ir); - tr.close(); - ir.close(); - iw.close(); - tw.close(); + IOUtils.close(tr, ir, iw, tw); IOUtils.close(dirs[0]); } - private void assertPostingListExists(String field, 
String text, IndexReader ir) throws IOException { - DocsEnum de = _TestUtil.docs(random(), ir, field, new BytesRef(text), null, null, DocsEnum.FLAG_NONE); - assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); + private void assertOrdinalsExist(String field, IndexReader ir) throws IOException { + for (AtomicReaderContext context : ir.leaves()) { + AtomicReader r = context.reader(); + if (r.getBinaryDocValues(field) != null) { + return; // not all segments must have this DocValues + } + } + fail("no ordinals found for " + field); } @Test @@ -200,8 +199,8 @@ public class TestMultipleCategoryLists extends LuceneTestCase { TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], OpenMode.CREATE); Map paramsMap = new HashMap(); - paramsMap.put(new CategoryPath("Band"), new CategoryListParams(new Term("$bands", "Bands"))); - paramsMap.put(new CategoryPath("Composer"), new CategoryListParams(new Term("$composers", "Composers"))); + paramsMap.put(new CategoryPath("Band"), new CategoryListParams("$bands")); + paramsMap.put(new CategoryPath("Composer"), new CategoryListParams("$composers")); PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(paramsMap); seedIndex(iw, tw, iParams); @@ -214,18 +213,15 @@ public class TestMultipleCategoryLists extends LuceneTestCase { // prepare searcher to search against IndexSearcher searcher = newSearcher(ir); - FacetsCollector facetsCollector = performSearch(iParams, tr, ir, - searcher); + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, searcher); // Obtain facets results and hand-test them assertCorrectResults(facetsCollector); - assertPostingListExists("$facets", "$fulltree$", ir); - assertPostingListExists("$bands", "Bands", ir); - assertPostingListExists("$composers", "Composers", ir); - tr.close(); - ir.close(); - iw.close(); - tw.close(); + assertOrdinalsExist("$facets", ir); + assertOrdinalsExist("$bands", ir); + assertOrdinalsExist("$composers", ir); + + IOUtils.close(tr, ir, iw, tw); IOUtils.close(dirs[0]); } @@ -236,13 +232,12 @@ public class TestMultipleCategoryLists extends LuceneTestCase { RandomIndexWriter iw = new RandomIndexWriter(random(), dirs[0][0], newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))); // create and open a taxonomy writer - TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], - OpenMode.CREATE); + TaxonomyWriter tw = new DirectoryTaxonomyWriter(dirs[0][1], OpenMode.CREATE); Map paramsMap = new HashMap(); - paramsMap.put(new CategoryPath("Band"), new CategoryListParams(new Term("$music", "music"))); - paramsMap.put(new CategoryPath("Composer"), new CategoryListParams(new Term("$music", "music"))); - paramsMap.put(new CategoryPath("Author"), new CategoryListParams(new Term("$literature", "Authors"))); + paramsMap.put(new CategoryPath("Band"), new CategoryListParams("$music")); + paramsMap.put(new CategoryPath("Composer"), new CategoryListParams("$music")); + paramsMap.put(new CategoryPath("Author"), new CategoryListParams("$literature")); PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(paramsMap); seedIndex(iw, tw, iParams); @@ -256,18 +251,14 @@ public class TestMultipleCategoryLists extends LuceneTestCase { // prepare searcher to search against IndexSearcher searcher = newSearcher(ir); - FacetsCollector facetsCollector = performSearch(iParams, tr, ir, - searcher); + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, searcher); // Obtain facets results and hand-test them 
assertCorrectResults(facetsCollector); - assertPostingListExists("$music", "music", ir); - assertPostingListExists("$literature", "Authors", ir); + assertOrdinalsExist("$music", ir); + assertOrdinalsExist("$literature", ir); - tr.close(); - ir.close(); - iw.close(); - tw.close(); + IOUtils.close(tr, ir, iw, tw); IOUtils.close(dirs[0]); } @@ -275,14 +266,12 @@ public class TestMultipleCategoryLists extends LuceneTestCase { return FacetTestUtils.createIndexTaxonomyDirs(1); } - private void assertCorrectResults(FacetsCollector facetsCollector) - throws IOException { + private void assertCorrectResults(FacetsCollector facetsCollector) throws IOException { List res = facetsCollector.getFacetResults(); FacetResult results = res.get(0); FacetResultNode resNode = results.getFacetResultNode(); - Iterable subResults = resNode - .getSubResults(); + Iterable subResults = resNode.getSubResults(); Iterator subIter = subResults.iterator(); checkResult(resNode, "Band", 5.0); @@ -325,9 +314,8 @@ public class TestMultipleCategoryLists extends LuceneTestCase { checkResult(subIter.next(), "Band/Rock & Pop/The Beatles", 1.0); } - private FacetsCollector performSearch(FacetIndexingParams iParams, - TaxonomyReader tr, IndexReader ir, - IndexSearcher searcher) throws IOException { + private FacetsCollector performSearch(FacetIndexingParams iParams, TaxonomyReader tr, IndexReader ir, + IndexSearcher searcher) throws IOException { // step 1: collect matching documents into a collector Query q = new MatchAllDocsQuery(); TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true); @@ -344,7 +332,6 @@ public class TestMultipleCategoryLists extends LuceneTestCase { // Faceted search parameters indicate which facets are we interested in FacetSearchParams facetSearchParams = new FacetSearchParams(facetRequests, iParams); - // perform documents search and facets accumulation FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, ir, tr); @@ -352,27 +339,19 @@ public class TestMultipleCategoryLists extends LuceneTestCase { return facetsCollector; } - private void seedIndex(RandomIndexWriter iw, TaxonomyWriter tw, - FacetIndexingParams iParams) throws IOException { - FacetTestUtils.add(iParams, iw, tw, "Author", "Mark Twain"); - FacetTestUtils.add(iParams, iw, tw, "Author", "Stephen King"); - FacetTestUtils.add(iParams, iw, tw, "Author", "Kurt Vonnegut"); - FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", - "The Beatles"); - FacetTestUtils.add(iParams, iw, tw, "Band", "Punk", "The Ramones"); - FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", "U2"); - FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", "REM"); - FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", - "Dave Matthews Band"); - FacetTestUtils.add(iParams, iw, tw, "Composer", "Bach"); + private void seedIndex(RandomIndexWriter iw, TaxonomyWriter tw, FacetIndexingParams iParams) throws IOException { + FacetFields facetFields = new FacetFields(tw, iParams); + for (CategoryPath cp : CATEGORIES) { + Document doc = new Document(); + facetFields.addFields(doc, Collections.singletonList(cp)); + doc.add(new TextField("content", "alpha", Field.Store.YES)); + iw.addDocument(doc); + } } private static void checkResult(FacetResultNode sub, String label, double value) { - assertEquals("Label of subresult " + sub.getLabel() + " was incorrect", - label, sub.getLabel().toString()); - assertEquals( - "Value for " + sub.getLabel() + " subresult was incorrect", - value, sub.getValue(), 0.0); + assertEquals("Label of 
subresult " + sub.getLabel() + " was incorrect", label, sub.getLabel().toString()); + assertEquals("Value for " + sub.getLabel() + " subresult was incorrect", value, sub.getValue(), 0.0); } } \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestSameRequestAccumulation.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestSameRequestAccumulation.java similarity index 97% rename from lucene/facet/src/test/org/apache/lucene/facet/TestSameRequestAccumulation.java rename to lucene/facet/src/test/org/apache/lucene/facet/search/TestSameRequestAccumulation.java index edafeb52369..1306731249c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestSameRequestAccumulation.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestSameRequestAccumulation.java @@ -1,7 +1,8 @@ -package org.apache.lucene.facet; +package org.apache.lucene.facet.search; import java.util.List; +import org.apache.lucene.facet.FacetTestBase; import org.apache.lucene.facet.search.FacetsCollector; import org.apache.lucene.facet.search.params.CountFacetRequest; import org.apache.lucene.facet.search.params.FacetSearchParams; diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java index db88c7359f3..71769e9ede2 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestStandardFacetsAccumulator.java @@ -93,6 +93,7 @@ public class TestStandardFacetsAccumulator extends LuceneTestCase { indexTwoDocs(indexWriter, null, false); // 4th segment, no content, or categories indexTwoDocs(indexWriter, null, true); // 5th segment, with content, no categories indexTwoDocs(indexWriter, facetFields, true); // 6th segment, with content, with categories + indexTwoDocs(indexWriter, null, true); // 7th segment, with content, no categories IOUtils.close(indexWriter, taxoWriter); DirectoryReader indexReader = DirectoryReader.open(indexDir); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java index 6f9ccabd618..c1642456b9b 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java @@ -178,7 +178,7 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } FacetResult fr = facetResults.get(0); // a, depth=3, K=2 - boolean hasDoctor = "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + boolean hasDoctor = "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(9, fr.getNumValidDescendants()); FacetResultNode parentRes = fr.getFacetResultNode(); assertEquals(16.0, parentRes.getValue(), Double.MIN_VALUE); @@ -219,7 +219,7 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } fr = facetResults.get(1); // a, depth=2, K=2. 
same result as before - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(9, fr.getNumValidDescendants()); parentRes = fr.getFacetResultNode(); assertEquals(16.0, parentRes.getValue(), Double.MIN_VALUE); @@ -239,7 +239,7 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } fr = facetResults.get(2); // a, depth=1, K=2 - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(4, fr.getNumValidDescendants(), 4); parentRes = fr.getFacetResultNode(); assertEquals(16.0, parentRes.getValue(), Double.MIN_VALUE); @@ -257,7 +257,7 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } fr = facetResults.get(3); // a/b, depth=3, K=2 - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(4, fr.getNumValidDescendants()); parentRes = fr.getFacetResultNode(); assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); @@ -272,7 +272,7 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } fr = facetResults.get(4); // a/b, depth=2, K=2 - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(4, fr.getNumValidDescendants()); parentRes = fr.getFacetResultNode(); assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); @@ -286,7 +286,7 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } fr = facetResults.get(5); // a/b, depth=1, K=2 - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(4, fr.getNumValidDescendants()); parentRes = fr.getFacetResultNode(); assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); @@ -300,13 +300,13 @@ public class TestTopKInEachNodeResultHandler extends LuceneTestCase { } fr = facetResults.get(6); // a/b, depth=0, K=2 - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); assertEquals(0, fr.getNumValidDescendants()); // 0 descendants but rootnode parentRes = fr.getFacetResultNode(); assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); assertEquals(0.0, parentRes.getResidue(), Double.MIN_VALUE); assertEquals(0, parentRes.getNumSubResults()); - hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().components[0]); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().categoryPath.components[0]); // doctor, depth=1, K=2 assertFalse("Shouldn't have found anything for a FacetRequest " + diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java index bc4922fd31d..7dfd8c6c0de 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java @@ -85,12 +85,12 @@ public class TestTotalFacetCounts extends LuceneTestCase { TotalFacetCountsCache tfcc = TotalFacetCountsCache.getSingleton(); File tmpFile = _TestUtil.createTempFile("test", "tmp", 
TEMP_DIR); - tfcc.store(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams, null); + tfcc.store(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams); tfcc.clear(); // not really required because TFCC overrides on load(), but in the test we need not rely on this. tfcc.load(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams); // now retrieve the one just loaded - TotalFacetCounts totalCounts = tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + TotalFacetCounts totalCounts = tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams); int partition = 0; for (int i=0; i clps = new HashMap(); for (String dim : dimensions) { CategoryPath cp = new CategoryPath(dim); - CategoryListParams clp = new CategoryListParams(new Term("$" + dim, CategoryListParams.DEFAULT_TERM.bytes())); + CategoryListParams clp = new CategoryListParams("$" + dim); clps.put(cp, clp); } PerDimensionIndexingParams indexingParams = new PerDimensionIndexingParams(clps); @@ -86,23 +84,13 @@ public class MultiCategoryListIteratorTest extends LuceneTestCase { IOUtils.close(indexWriter, taxoWriter); // test the multi iterator - CategoryListCache clCache = null; - if (random.nextBoolean()) { - clCache = new CategoryListCache(); - } - DirectoryReader indexReader = DirectoryReader.open(indexDir); TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); CategoryListIterator[] iterators = new CategoryListIterator[numDimensions]; for (int i = 0; i < iterators.length; i++) { CategoryListParams clp = indexingParams.getCategoryListParams(new CategoryPath(dimensions[i])); IntDecoder decoder = clp.createEncoder().createMatchingDecoder(); - if (clCache != null && random.nextBoolean()) { - clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams); - iterators[i] = clCache.get(clp).iterator(0); // no partitions - } else { - iterators[i] = new PayloadCategoryListIteraor(clp.getTerm(), decoder); - } + iterators[i] = new DocValuesCategoryListIterator(clp.field, decoder); } MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators); for (AtomicReaderContext context : indexReader.leaves()) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java index 5ea3c111b8f..ccaaf4d33ec 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java @@ -163,16 +163,22 @@ public class TestCategoryPath extends LuceneTestCase { CategoryPath p = new CategoryPath("a/b/c/d", '/'); CategoryPath pother = new CategoryPath("a/b/c/d", '/'); assertEquals(0, pother.compareTo(p)); + assertEquals(0, p.compareTo(pother)); pother = new CategoryPath("", '/'); assertTrue(pother.compareTo(p) < 0); + assertTrue(p.compareTo(pother) > 0); pother = new CategoryPath("a/b_/c/d", '/'); assertTrue(pother.compareTo(p) > 0); + assertTrue(p.compareTo(pother) < 0); pother = new CategoryPath("a/b/c", '/'); assertTrue(pother.compareTo(p) < 0); + assertTrue(p.compareTo(pother) > 0); pother = new CategoryPath("a/b/c/e", '/'); assertTrue(pother.compareTo(p) > 0); + assertTrue(p.compareTo(pother) < 0); pother = new CategoryPath("a/b/c//e", '/'); assertTrue(pother.compareTo(p) < 0); + assertTrue(p.compareTo(pother) > 0); } } diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java 
b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java index 7b58ff4d0d7..9b34bdac834 100644 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java @@ -77,6 +77,7 @@ public class EncodingSpeed { encoderTest(new VInt8IntEncoder(), facetIDs, loopFactor); encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder())), facetIDs, loopFactor); encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapVInt8IntEncoder())), facetIDs, loopFactor); encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()))), facetIDs, loopFactor); encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder()))), facetIDs, loopFactor); encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))), facetIDs, loopFactor); diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java index 57689d5731c..411b9aec79e 100644 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java @@ -64,9 +64,12 @@ public class EncodingTest extends LuceneTestCase { BytesRef bytes = new BytesRef(100); // some initial capacity - encoders should grow the byte[] IntsRef values = new IntsRef(100); // some initial capacity - decoders should grow the int[] - encoding(encoder, data, bytes); - decoding(bytes, values, encoder.createMatchingDecoder()); - assertTrue(expected.intsEquals(values)); + for (int i = 0; i < 2; i++) { + // run 2 iterations to catch encoders/decoders which don't reset properly + encoding(encoder, data, bytes); + decoding(bytes, values, encoder.createMatchingDecoder()); + assertTrue(expected.intsEquals(values)); + } } private static void encoding(IntEncoder encoder, IntsRef data, BytesRef bytes) throws IOException { @@ -147,5 +150,10 @@ public class EncodingTest extends LuceneTestCase { public void testSortingUniqueDGapNOnes3() throws Exception { encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))), data, uniqueSortedData); } + + @Test + public void testSortingUniqueDGapVInt() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapVInt8IntEncoder())), data, uniqueSortedData); + } } diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java deleted file mode 100644 index f0556f3c718..00000000000 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java +++ /dev/null @@ -1,54 +0,0 @@ -package org.apache.lucene.util.encoding; - -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Test; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Tests the {@link VInt8} class. - */ -public class Vint8Test extends LuceneTestCase { - - private static final int[] TEST_VALUES = { - -1000000000, - -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14, - (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28 - }; - private static int[] BYTES_NEEDED_TEST_VALUES = { - 5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5 - }; - - @Test - public void testBytesRef() throws Exception { - BytesRef bytes = new BytesRef(256); - int expectedSize = 0; - for (int j = 0; j < TEST_VALUES.length; j++) { - VInt8.encode(TEST_VALUES[j], bytes); - expectedSize += BYTES_NEEDED_TEST_VALUES[j]; - } - assertEquals(expectedSize, bytes.length); - - for (int j = 0; j < TEST_VALUES.length; j++) { - assertEquals(TEST_VALUES[j], VInt8.decode(bytes)); - } - assertEquals(bytes.offset, bytes.length); - } - -} diff --git a/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java b/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java index 9a4cabdbfb8..0d5b21184f7 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java @@ -40,11 +40,13 @@ import java.util.Arrays; import java.util.Comparator; /** - * * HighFreqTerms class extracts the top n most frequent terms - * (by document frequency ) from an existing Lucene index and reports their document frequencey. - * If the -t flag is and reports both their document frequency and their total tf (total number of occurences) - * in order of highest total tf + * (by document frequency) from an existing Lucene index and reports their + * document frequency. + *
<p>
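+ * For example, a typical command line looks like this (the index path,
+ * term count and field name are illustrative values, not defaults of
+ * this tool):
+ * <pre>
+ * java org.apache.lucene.misc.HighFreqTerms /path/to/index -t 100 body
+ * </pre>
+ * <p>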
+ * If the -t flag is given, both document frequency and total tf (total + * number of occurrences) are reported, ordered by descending total tf. + * */ public class HighFreqTerms { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index cb9d0c34d3c..b88a8c0ba6f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -17,20 +17,12 @@ package org.apache.lucene.search.spell; * limitations under the License. */ -import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.Locale; -import java.util.PriorityQueue; - import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; import org.apache.lucene.index.Terms; -import org.apache.lucene.search.FuzzyTermsEnum; import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.FuzzyTermsEnum; import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeSource; @@ -39,6 +31,14 @@ import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.LevenshteinAutomata; +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Locale; +import java.util.PriorityQueue; + /** * Simple automaton-based spellchecker. *
<p>
@@ -384,9 +384,22 @@ public class DirectSpellChecker { } return suggestions; } - - private Collection<ScoreTerm> suggestSimilar(Term term, int numSug, - IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRef spare) throws IOException { + + /** + * Provide spelling corrections based on several parameters. + * + * @param term The term to suggest spelling corrections for + * @param numSug The maximum number of spelling corrections + * @param ir The index reader to fetch the candidate spelling corrections from + * @param docfreq The minimum document frequency a potential suggestion needs to have in order to be included + * @param editDistance The maximum edit distance candidates are allowed to have + * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included + * @param spare a CharsRef scratch buffer + * @return a collection of spelling corrections sorted by ScoreTerm's natural order. + * @throws IOException If I/O related errors occur + */ + protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, + float accuracy, final CharsRef spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = @@ -449,15 +462,43 @@ public class DirectSpellChecker { return stQueue; } - - private static class ScoreTerm implements Comparable<ScoreTerm> { + + /** + * Holds a spelling correction for internal usage inside {@link DirectSpellChecker}. + */ + protected static class ScoreTerm implements Comparable<ScoreTerm> { + + /** + * The actual spellcheck correction. + */ public BytesRef term; + + /** + * The boost representing the similarity from the FuzzyTermsEnum (internal similarity score). + */ public float boost; + + /** + * The document frequency (df) of the spellcheck correction. + */ public int docfreq; - + + /** + * The spellcheck correction represented as a string; can be null. + */ public String termAsString; + + /** + * The similarity score. + */ public float score; - + + /** + * Constructor. 
+ */ + public ScoreTerm() { + } + @Override public int compareTo(ScoreTerm other) { if (term.bytesEquals(other.term)) diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 552cfea245c..0bbbef90b07 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -587,7 +587,7 @@ public class AnalyzingSuggester extends Lookup { //System.out.println(" prefixPaths: " + prefixPaths.size()); - BytesReader bytesReader = fst.getBytesReader(0); + BytesReader bytesReader = fst.getBytesReader(); FST.Arc> scratchArc = new FST.Arc>(); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java index 686ae3b59e8..3bb21c25a33 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java @@ -77,7 +77,7 @@ public class FSTUtil { new IntsRef())); final FST.Arc scratchArc = new FST.Arc(); - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FST.BytesReader fstReader = fst.getBytesReader(); while (queue.size() != 0) { final Path path = queue.remove(queue.size() - 1); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java index 8d8d8b7b3a8..eea10423442 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletion.java @@ -139,7 +139,7 @@ public class FSTCompletion { try { List> rootArcs = new ArrayList>(); Arc arc = automaton.getFirstArc(new Arc()); - FST.BytesReader fstReader = automaton.getBytesReader(0); + FST.BytesReader fstReader = automaton.getBytesReader(); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { rootArcs.add(new Arc().copyFrom(arc)); @@ -173,7 +173,7 @@ public class FSTCompletion { // Get the UTF-8 bytes representation of the input key. try { final FST.Arc scratch = new FST.Arc(); - FST.BytesReader fstReader = automaton.getBytesReader(0); + FST.BytesReader fstReader = automaton.getBytesReader(); for (; rootArcIndex < rootArcs.length; rootArcIndex++) { final FST.Arc rootArc = rootArcs[rootArcIndex]; final FST.Arc arc = scratch.copyFrom(rootArc); @@ -338,7 +338,7 @@ public class FSTCompletion { final int max = utf8.offset + utf8.length; // Cannot save as instance var since multiple threads // can use FSTCompletion at once... - final FST.BytesReader fstReader = automaton.getBytesReader(0); + final FST.BytesReader fstReader = automaton.getBytesReader(); for (int i = utf8.offset; i < max; i++) { if (automaton.findTargetArc(utf8.bytes[i] & 0xff, arc, arc, fstReader) == null) { // No matching prefixes, return an empty result. 
@@ -362,7 +362,7 @@ public class FSTCompletion { } assert output.offset == 0; output.bytes[output.length++] = (byte) arc.label; - FST.BytesReader fstReader = automaton.getBytesReader(0); + FST.BytesReader fstReader = automaton.getBytesReader(); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { if (arc.label == FST.END_LABEL) { diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index 1a7ff36cd1c..e6d69621e0f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -200,7 +200,7 @@ public class WFSTCompletionLookup extends Lookup { private Long lookupPrefix(BytesRef scratch, Arc arc) throws /*Bogus*/IOException { assert 0 == fst.outputs.getNoOutput().longValue(); long output = 0; - BytesReader bytesReader = fst.getBytesReader(0); + BytesReader bytesReader = fst.getBytesReader(); fst.getFirstArc(arc); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java index 4d3924d667d..b68e97ffb2a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/fst/FSTTester.java @@ -204,7 +204,7 @@ public class FSTTester { final FST.Arc arc = fst.getFirstArc(new FST.Arc()); final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FST.BytesReader fstReader = fst.getBytesReader(); for(int i=0;i<=term.length;i++) { final int label; @@ -241,7 +241,7 @@ public class FSTTester { in.offset = 0; final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; - final FST.BytesReader fstReader = fst.getBytesReader(0); + final FST.BytesReader fstReader = fst.getBytesReader(); while(true) { // read all arcs: diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1578a75a5ec..b916a12fc60 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -42,6 +42,40 @@ Other Changes ---------------------- +================== 4.2.0 ================== + +Versions of Major Components +--------------------- +Apache Tika 1.2 +Carrot2 3.6.2 +Velocity 1.7 and Velocity Tools 2.0 +Apache UIMA 2.3.1 +Apache ZooKeeper 3.4.5 + +Upgrading from Solr 4.1.0 +---------------------- + +(No upgrade instructions yet) + +Detailed Change List +---------------------- + +New Features +---------------------- + +Bug Fixes +---------------------- + +* SOLR-4309: /browse: Improve JQuery autosuggest behavior (janhoy) + +Optimizations +---------------------- + +Other Changes +---------------------- + + + ================== 4.1.0 ================== Versions of Major Components @@ -210,6 +244,30 @@ New Features * SOLR-4302: New parameter 'indexInfo' (defaults to true) in CoreAdmin STATUS command can be used to omit index specific information (Shahar Davidson via shalin) +* SOLR-2592: Collection specific document routing. The "compositeId" + router is the default for collections with hash based routing (i.e. when + numShards=N is specified on collection creation). Documents with ids sharing + the same domain (prefix) will be routed to the same shard, allowing for + efficient querying. 
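+   As a rough sketch of the idea only (this is not Solr's actual hash
+   implementation; "numShards" stands in for the collection's configured
+   shard count):
+
+     // All ids with the same domain prefix map to the same shard.
+     String id = "customerB!doc1";
+     int sep = id.indexOf('!');
+     String routeKey = sep >= 0 ? id.substring(0, sep) : id;
+     int numShards = 4;  // assumed collection setting
+     int shard = (routeKey.hashCode() & Integer.MAX_VALUE) % numShards;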
+ + Example: + The following two documents will be indexed to the same shard + since they share the same domain "customerB!". + + {"id" : "customerB!doc1" [...] } + {"id" : "customerB!doc2" [...] } + + At query time, one can specify a "shard.keys" parameter that lists what + shards the query should cover. + http://.../query?q=my_query&shard.keys=customerB! + + Collections that do not specify numShards at collection creation time + use custom sharding and default to the "implicit" router. Document updates + received by a shard will be indexed to that shard, unless a "_shard_" parameter + or document field names a different shard. + (Michael Garski, Dan Rosher, yonik) + + Optimizations ---------------------- @@ -258,6 +316,9 @@ Optimizations Bug Fixes ---------------------- +* SOLR-4288: Improve logging for FileDataSource (basePath, relative + resources). (Dawid Weiss) + * SOLR-4007: Morfologik dictionaries not available in Solr field type due to class loader lookup problems. (Lance Norskog, Dawid Weiss) @@ -348,7 +409,7 @@ Bug Fixes * SOLR-4064: When there is an unexpected exception while trying to run the new leader process, the SolrCore will not correctly rejoin the election. - (Po Rui Via Mark Miller) + (Po Rui via Mark Miller) * SOLR-3989: SolrZkClient constructor dropped exception cause when throwing a new RuntimeException. (Colin Bartolome, yonik) @@ -518,6 +579,9 @@ Bug Fixes * SOLR-4303: On replication, if the generation of the master is lower than the slave we need to force a full copy of the index. (Mark Miller, Gregg Donovan) + +* SOLR-4266: HttpSolrServer does not release connection properly on exception + when no response parser is used. (Steve Molloy via Mark Miller) Other Changes ---------------------- @@ -606,6 +670,9 @@ Other Changes disallow atomic update requests which change signature generating fields. (Joel Nothman, yonik, shalin) +* SOLR-4308: Remove the problematic and now unnecessary log4j-over-slf4j. + (Mark Miller) + ================== 4.0.0 ================== Versions of Major Components diff --git a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/FileDataSource.java b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/FileDataSource.java index d9892186bb0..8001935e889 100644 --- a/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/FileDataSource.java +++ b/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/FileDataSource.java @@ -92,22 +92,35 @@ public class FileDataSource extends DataSource { static File getFile(String basePath, String query) { try { - File file0 = new File(query); - File file = file0; + File file = new File(query); - if (!file.isAbsolute()) - file = new File(basePath + query); - - if (file.isFile() && file.canRead()) { - LOG.debug("Accessing File: " + file.toString()); - return file; - } else if (file != file0) - if (file0.isFile() && file0.canRead()) { - LOG.debug("Accessing File0: " + file0.toString()); - return file0; + // If it's not an absolute path, try relative from basePath. + if (!file.isAbsolute()) { + // Resolve and correct basePath. + File basePathFile; + if (basePath == null) { + basePathFile = new File(".").getAbsoluteFile(); + LOG.warn("FileDataSource.basePath is empty. " + + "Resolving to: " + basePathFile.getAbsolutePath()); + } else { + basePathFile = new File(basePath); + if (!basePathFile.isAbsolute()) { + basePathFile = basePathFile.getAbsoluteFile(); + LOG.warn("FileDataSource.basePath is not absolute. 
Resolving to: " + + basePathFile.getAbsolutePath()); + } } - throw new FileNotFoundException("Could not find file: " + query); + file = new File(basePathFile, query).getAbsoluteFile(); + } + + if (file.isFile() && file.canRead()) { + LOG.debug("Accessing File: " + file.getAbsolutePath()); + return file; + } else { + throw new FileNotFoundException("Could not find file: " + query + + " (resolved to: " + file.getAbsolutePath() + ")"); + } } catch (FileNotFoundException e) { throw new RuntimeException(e); } } diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index 68674b3b9f3..ec97c5b45e3 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -128,6 +128,8 @@ public class CoreContainer protected final Map<String, CoreDescriptor> dynamicDescriptors = new LinkedHashMap<String, CoreDescriptor>(); + protected final Set<String> pendingDynamicCoreLoads = new HashSet<String>(); + protected final Map<String, Exception> coreInitFailures = Collections.synchronizedMap(new LinkedHashMap<String, Exception>()); @@ -1245,17 +1247,8 @@ } } } - - /** Gets a core by name and increase its refcount. - * @see SolrCore#close() - * @param name the core name - * @return the core if found - */ - public SolrCore getCore(String name) { - name = checkDefault(name); - // Do this in two phases since we don't want to lock access to the cores over a load. + private SolrCore getCoreFromAnyList(String name) { SolrCore core; - synchronized (cores) { core = cores.get(name); if (core != null) { @@ -1274,20 +1267,64 @@ return core; } } + return null; + } + /** Gets a core by name and increases its refcount. + * @see SolrCore#close() + * @param name the core name + * @return the core if found + */ + public SolrCore getCore(String name) { + name = checkDefault(name); + // Do this in two phases since we don't want to lock access to the cores over a load. + SolrCore core = getCoreFromAnyList(name); + + if (core != null) return core; + + // OK, it's not presently in any list, is it in the list of dynamic cores but not loaded yet? If so, load it. CoreDescriptor desc = dynamicDescriptors.get(name); if (desc == null) { //Nope, no transient core with this name return null; } + + // Keep multiple threads from loading the same core at the same time. try { - core = create(desc); // This should throw an error if it fails. - core.open(); - if (desc.isTransient()) { - registerLazyCore(name, core, false); // This is a transient core - } else { - register(name, core, false); // This is a "permanent", although deferred-load core + boolean isPending; + synchronized (pendingDynamicCoreLoads) { + isPending = pendingDynamicCoreLoads.contains(name); + if (! isPending) { + pendingDynamicCoreLoads.add(name); + } } - } catch (Exception ex) { - throw recordAndThrow(name, "Unable to create core" + name, ex); + + while (isPending) { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + return null; // Seems best not to do anything at all if the thread is interrupted + } + + synchronized (pendingDynamicCoreLoads) { + if (!pendingDynamicCoreLoads.contains(name)) { + // NOTE: If, for some reason, the load failed, we'll return null here and presumably the log will show + // why. We'll fail all over again next time if the problem isn't corrected. + return getCoreFromAnyList(name); + } + } + } + try { + core = create(desc); // This should throw an error if it fails.
+ core.open(); + if (desc.isTransient()) { + registerLazyCore(name, core, false); // This is a transient core + } else { + register(name, core, false); // This is a "permanent", although deferred-load core + } + } catch (Exception ex) { + throw recordAndThrow(name, "Unable to create core " + name, ex); + } + } finally { + // All access to the pending set must be synchronized; removing here also clears the marker when the load fails, so waiting threads stop polling. + synchronized (pendingDynamicCoreLoads) { + pendingDynamicCoreLoads.remove(name); + } } return core; } diff --git a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java index 4b9461f7d52..be8fe32a925 100644 --- a/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java +++ b/solr/core/src/java/org/apache/solr/update/processor/DistributedUpdateProcessor.java @@ -227,7 +227,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor { DistribPhase phase = DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM)); - doDefensiveChecks(shardId, phase); + doDefensiveChecks(phase); if (DistribPhase.FROMLEADER == phase) { @@ -279,7 +279,7 @@ } - private void doDefensiveChecks(String shardId, DistribPhase phase) { + private void doDefensiveChecks(DistribPhase phase) { boolean isReplayOrPeersync = (updateCommand.getFlags() & (UpdateCommand.REPLAY | UpdateCommand.REPLAY)) != 0; if (isReplayOrPeersync) return; diff --git a/solr/core/src/test/org/apache/solr/core/TestLazyCores.java b/solr/core/src/test/org/apache/solr/core/TestLazyCores.java index bc9da080eaf..0fb95608acf 100644 --- a/solr/core/src/test/org/apache/solr/core/TestLazyCores.java +++ b/solr/core/src/test/org/apache/solr/core/TestLazyCores.java @@ -34,8 +34,10 @@ import org.junit.Test; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.List; public class TestLazyCores extends SolrTestCaseJ4 { @@ -246,6 +248,44 @@ } } + // Test case for SOLR-4300 + @Test + public void testRace() throws Exception { + final List<SolrCore> _theCores = new ArrayList<SolrCore>(); + final CoreContainer cc = init(); + try { + + Thread[] threads = new Thread[15]; + for (int idx = 0; idx < threads.length; idx++) { + threads[idx] = new Thread() { + @Override + public void run() { + SolrCore core = cc.getCore("collectionLazy3"); + synchronized (_theCores) { + _theCores.add(core); + } + } + }; + threads[idx].start(); + } + + for (Thread thread : threads) { + thread.join(); + } + + for (int idx = 0; idx < _theCores.size() - 1; ++idx) { + assertEquals("Cores should be the same!", _theCores.get(idx), _theCores.get(idx + 1)); + } + + for (SolrCore core : _theCores) { + core.close(); + } + + } finally { + cc.shutdown(); + } + } + private void checkNotInCores(CoreContainer cc, String... 
nameCheck) { Collection names = cc.getCoreNames(); for (String name : nameCheck) { diff --git a/solr/example/example-DIH/solr/rss/conf/rss-data-config.xml b/solr/example/example-DIH/solr/rss/conf/rss-data-config.xml index 1e3a5b42355..cb6d395f350 100644 --- a/solr/example/example-DIH/solr/rss/conf/rss-data-config.xml +++ b/solr/example/example-DIH/solr/rss/conf/rss-data-config.xml @@ -5,22 +5,22 @@ pk="link" url="http://rss.slashdot.org/Slashdot/slashdot" processor="XPathEntityProcessor" - forEach="/RDF/channel | /RDF/item" + forEach="/rss/channel | /rss/item" transformer="DateFormatTransformer"> - - - + + + - - - - - - - - - + + + + + + + + + diff --git a/solr/example/solr/collection1/conf/solrconfig.xml b/solr/example/solr/collection1/conf/solrconfig.xml index 41c8304d348..fa13413c956 100755 --- a/solr/example/solr/collection1/conf/solrconfig.xml +++ b/solr/example/solr/collection1/conf/solrconfig.xml @@ -239,12 +239,9 @@ --> + + + ${solr.ulog.dir:} + + 15000 @@ -370,14 +380,6 @@ --> - - - ${solr.ulog.dir:} - - -
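The CoreContainer.getCore() change earlier in this patch coordinates concurrent
loads of a not-yet-loaded core through a set of pending names: the first caller
performs the load while later callers poll until the marker disappears. A
minimal, self-contained sketch of that single-flight pattern follows; the class
and method names are illustrative only, not part of Solr.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class SingleFlightLoader {
  private final Map<String, Object> loaded = new HashMap<String, Object>();
  private final Set<String> pendingLoads = new HashSet<String>();

  /** First caller loads the value; concurrent callers poll until it is done. */
  public Object get(String name) throws InterruptedException {
    synchronized (loaded) {
      Object v = loaded.get(name);
      if (v != null) {
        return v;
      }
    }
    boolean weLoad;
    synchronized (pendingLoads) {
      weLoad = pendingLoads.add(name); // true only for the first requester
    }
    if (!weLoad) {
      // Another thread is loading this name; poll until its marker is gone.
      while (true) {
        Thread.sleep(100);
        synchronized (pendingLoads) {
          if (!pendingLoads.contains(name)) {
            break;
          }
        }
      }
      synchronized (loaded) {
        return loaded.get(name); // null if the other thread's load failed
      }
    }
    try {
      Object v = expensiveCreate(name);
      synchronized (loaded) {
        loaded.put(name, v);
      }
      return v;
    } finally {
      synchronized (pendingLoads) {
        pendingLoads.remove(name); // lets pollers proceed, even on failure
      }
    }
  }

  private Object expensiveCreate(String name) {
    return new Object(); // stand-in for the expensive SolrCore creation
  }
}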